Last active November 6, 2017 11:50
Class Sentence: string manipulation
* This file is part of the Inserm Radico package.
* @author Stéphane IBANEZ <>
* This class is used to manipulate strings; new methods will be added later.
* To use this class outside of a Symfony environment, delete the line namespace InsermRadicoBundle\Services;
namespace InsermRadicoBundle\Services;
use \Normalizer;
class Sentence {
/** Separator placed between the words of a sentence. */
const SPACE = ' ';

/** @var int Weighted count of substitutions (1 per single-letter, 2 per two-letter replacement) from the last normalizeUtf8String() call. */
private $_substitutions = 0;

/** @var string[] "original => replacement" entries recorded by the last normalizeUtf8String() call. */
private $_substitutionList = array();

/** @var int Number of words counted by the last normalizeUtf8String() call. */
private $_nbWords = 0;

/**
 * @var float Elapsed time in seconds written by extSearch().
 *
 * Declared explicitly because extSearch() assigns it; creating properties
 * dynamically is deprecated as of PHP 8.2.
 */
private $_processTime = 0.0;
/**
 * Word count stored by the most recent normalizeUtf8String() call.
 *
 * @return int
 */
public function getNbWords()
{
    return $this->_nbWords;
}
/**
 * Substitution total stored by the most recent normalizeUtf8String() call.
 *
 * @return int
 */
public function getSubstitutions()
{
    return $this->_substitutions;
}
/**
 * Substitution log stored by the most recent normalizeUtf8String() call.
 *
 * @return string[] entries formatted as "original => replacement"
 */
public function getSubstitutionList()
{
    return $this->_substitutionList;
}
/**
 * Splits a UTF-8 string into an array holding one character per element.
 *
 * @param string $str text to split
 *
 * @return string[] characters in their original order; empty for an empty string
 */
protected function utf8_str_split($str)
{
    $length = mb_strlen($str, 'UTF-8');
    $characters = array();
    $offset = 0;

    while ($offset < $length) {
        $characters[] = mb_substr($str, $offset, 1, 'UTF-8');
        $offset++;
    }

    return $characters;
}
/**
 * Breaks a sentence into words on every single SPACE separator.
 *
 * @param string $sentence
 *
 * @return string[] consecutive separators yield empty-string entries
 */
public function getWords($sentence)
{
    $words = explode(self::SPACE, $sentence);

    return $words;
}
/**
 * Joins words back into a sentence using the SPACE separator.
 *
 * @param string[] $words
 *
 * @return string
 */
public function getSentence(array $words)
{
    $sentence = implode(self::SPACE, $words);

    return $sentence;
}
/**
 * Transliterates a UTF-8 string to upper-case ASCII letters and whitespace.
 *
 * Accented letters and "-" are replaced through two lookup tables (one
 * replacement character, then two replacement characters); everything that
 * is still not an ASCII letter or whitespace afterwards is removed. The
 * substitution total, the substitution log and the word count of the result
 * are stored on the instance (see getSubstitutions(), getSubstitutionList()
 * and getNbWords()).
 *
 * Changes compared with the previous implementation:
 *  - with $withSqlWildCards the pattern was '/[^a-zA-Z\s]%_/', which only
 *    matched a non-letter followed by the literal "%_", so digits and
 *    punctuation were kept; "%" and "_" now sit inside the character class;
 *  - the word count is reset to 0 for an empty result instead of keeping the
 *    value of an earlier call, and runs of whitespace no longer count as
 *    extra empty words.
 *
 * @param string $s                text to normalise
 * @param bool   $withSqlWildCards keep the SQL LIKE wildcards "%" and "_"
 *
 * @return string normalised, trimmed, upper-cased text
 */
public function normalizeUtf8String($s = '', $withSqlWildCards = false)
{
    $characters = $this->utf8_str_split(trim($s));

    // Single-character table: the n-th character of the first string is
    // replaced by the n-th character of the second one.
    $singleMap = array_combine(
        $this->utf8_str_split("-ŋžðÞáàâäãåçéèêëíìîïóòôöõøúùûüýÿŊŽÐþÁÀÂÄÃÅÇÉÈÊËÍÌÎÏÓÒÔÖÕØÚÙÛÜÝŸ"),
        $this->utf8_str_split(" nzdtaaaaaaceeeeiiiioooooouuuuyynzDTAAAAAACEEEEIIIIOOOOOOUUUUYY")
    );

    // Two-character table. Entries whose key is also in the single-character
    // table (Ö, ö, Ü, ü, ÿ) never fire because that table is applied first.
    // NOTE(review): 'IJ'/'ij' can never equal a single character; they were
    // presumably the ligatures U+0132/U+0133 originally - confirm. Lower-case
    // 'œ' has no entry and is therefore stripped - confirm whether intended.
    $doubleMap = array(
        'ß' => 'ss',
        'Ñ' => 'NY',
        'ñ' => 'ny',
        'Ö' => 'OE',
        'ö' => 'oe',
        'Ü' => 'UE',
        'ü' => 'ue',
        'Æ' => 'AE',
        'æ' => 'ae',
        'IJ' => 'IJ',
        'ij' => 'ij',
        'ÿ' => 'yu',
        'Œ' => 'OE',
    );

    $substitutions = 0;
    $subList = array();

    // Two full passes (single table, then double table) so the substitution
    // log keeps its historical order. The array key is the weight each
    // replacement adds to the substitution total.
    foreach (array(1 => $singleMap, 2 => $doubleMap) as $weight => $map) {
        foreach ($characters as $index => $character) {
            if (!isset($map[$character])) {
                continue;
            }
            $replacement = $map[$character];
            $characters[$index] = $replacement;
            $substitutions += $weight;
            $subList[] = "$character => $replacement";
        }
    }

    $regex = $withSqlWildCards ? '/[^a-zA-Z\s%_]/' : '/[^a-zA-Z\s]/';
    $strUniformized = preg_replace($regex, '', implode('', $characters));
    $strUniformized = trim(mb_strtoupper($strUniformized));

    $this->_substitutions = $substitutions;
    $this->_substitutionList = $subList;
    $this->_nbWords = ($strUniformized === '')
        ? 0
        : count(preg_split('/\s+/', $strUniformized, -1, PREG_SPLIT_NO_EMPTY));

    return $strUniformized;
}
/**
 * Builds a word => index lookup from a list of words.
 *
 * When a word occurs several times, the index of its last occurrence wins.
 * The "{begin}" / "{end}" sentinel entries that used to be sketched here in
 * commented-out code are not added.
 *
 * @param string[] $words list indexed 0..n-1
 *
 * @return array<string,int>
 */
public function position(array $words)
{
    $positions = array();
    $total = count($words);
    $index = 0;

    while ($index < $total) {
        $positions[$words[$index]] = $index;
        $index++;
    }

    return $positions;
}
/**
 * usort() comparator ordering result rows by 'score', highest first.
 *
 * Returns an integer as usort() requires: the previous boolean result could
 * never signal "less than" and is deprecated as a comparator return value
 * since PHP 8.0. The method is static because it is registered through the
 * "self::sortByScore" callable; instance-style calls keep working.
 *
 * @param array $a row with a 'score' key
 * @param array $b row with a 'score' key
 *
 * @return int -1, 0 or 1
 */
public static function sortByScore($a, $b)
{
    if ($a['score'] == $b['score']) {
        return 0;
    }

    return ($a['score'] < $b['score']) ? 1 : -1;
}
/**
 * Swaps keys and values of a map: each value becomes the key of its former key.
 *
 * Applied to a word => position map (see position()) it yields
 * position => word. When several keys share a value, the last one wins.
 *
 * @param array $positions
 *
 * @return array
 */
public function words(array $positions)
{
    $byPosition = array();

    foreach (array_keys($positions) as $word) {
        $byPosition[$positions[$word]] = $word;
    }

    return $byPosition;
}
/**
 * Finds the first word stored at a given position in a word => position map.
 *
 * The position is compared strictly (type and value).
 *
 * @param array $words    word => position map
 * @param mixed $position position to look up
 *
 * @return string|int the matching key, or '' when no entry has that position
 */
public function getWordAt($words, $position)
{
    $found = '';

    foreach ($words as $candidate => $index) {
        if ($index === $position) {
            $found = $candidate;
            break;
        }
    }

    return $found;
}
/**
 * Scores candidate strings against a reference sentence, word by word.
 *
 * Both the sentence and each candidate are split on single spaces and turned
 * into word => position maps with position(). For every candidate word that
 * also exists in the sentence, the words at the same, previous and next
 * positions are compared. Candidates whose score is greater than 0 are
 * collected together with a Levenshtein distance and ordered with
 * sortByScore().
 *
 * NOTE(review): in this copy of the file the closing braces and some
 * statements are missing, so the exact nesting cannot be read from the code.
 * In particular no statement that increments $match, $prev or $next is
 * visible - confirm against the original source.
 *
 * @param string         $sentence     reference sentence
 * @param array|iterable $combinations candidate strings (keys are not used)
 *
 * @return array|false rows with the keys 'sentence', 'candidate', 'score',
 *                     'match', 'prev', 'next' and 'levenstein'; false when
 *                     no candidate produced a row
 */
public function score($sentence, $combinations) {
$orgWords = explode(' ', $sentence);
$orgWordPos[$sentence] = $this->position($orgWords);
$end = count($orgWords);
// Stays false unless at least one candidate row is appended below.
$result = false;
foreach ($combinations as $combinaison => $string) {
// echo "<hr>";
// Per-candidate counters.
$score = 0;
$match = 0;
$next = 0;
$prev = 0;
$destWords = explode(' ', $string);
$diff = count($orgWords) - count($destWords);
// fill array to max length
// A shorter candidate is padded with "###<index>" placeholders up to $end.
if ($diff > 0) {
$beg = count($destWords);
for ($i = $beg; $i < $end; $i++) {
$destWords[$i] = "###$i";
$destWordPos[$string] = $this->position($destWords);
} else {
$destWordPos[$string] = $this->position($destWords);
// words() swaps keys and values, so both word lists become word => index maps here.
// NOTE(review): $orgWords is reassigned inside the candidate loop, so a later
// iteration would receive the already swapped array - confirm this is intended.
$orgWords = $this->words($orgWords);
// echo "\norg words<pre>";print_r($orgWords);
// echo "\norg org words position<pre>";print_r($orgWordPos[$sentence]);
$destWords = $this->words($destWords);
// echo "\ndest words<pre>";print_r($destWords);
// echo "\ndest words position<pre>";print_r($destWordPos[$string]);
// echo "</pre>";
foreach($destWordPos[$string] as $destWord => $position) {
// echo "<br>POS: $destWord -> $position";
// Only candidate words that also occur in the reference sentence are examined.
if( array_key_exists($destWord, $orgWordPos[$sentence]) ) {
// echo "$string ------- $destWord ---------------- YES --- ";
// Neighbouring words around $position; getWordAt() yields '' when there is none.
$prevDestWord = $this->getWordAt($destWordPos[$string], $position - 1);
$nextDestWord = $this->getWordAt($destWordPos[$string], $position + 1);
$inPosOrgWord = $this->getWordAt($orgWordPos[$sentence], $position);
$prevOrgWord = $this->getWordAt($orgWordPos[$sentence], $position - 1);
$nextOrgWord = $this->getWordAt($orgWordPos[$sentence], $position + 1);
// echo "IN POS: $prevOrgWord PREV $prevDestWord $prevOrgWord NEXT $nextDestWord $nextOrgWord";
// Same word at the same position in both sentences.
if($inPosOrgWord === $destWord) {
// echo "<br>$sentence|$string INPLACE=> $inPosOrgWord === $destWord $match";
// Same, non-empty previous word on both sides.
if($prevOrgWord === $prevDestWord && $prevOrgWord.$prevDestWord != '') {
// echo "<br>$sentence|$string PREV => $prevOrgWord === $prevDestWord $prev";
// Same, non-empty next word on both sides.
if($nextOrgWord === $nextDestWord && $nextOrgWord.$nextDestWord != '') {
// echo "<br>$sentence|$string NEXT => $nextOrgWord === $nextDestWord $next";
$score = $match + $next + $prev;
// Insertion, replacement and deletion each cost 5.
$levenshtein = (int) levenshtein($sentence, $string, 5, 5, 5);
// var_dump($result);
} // each destWord
if($score > 0) {
// echo "<br>SCORE > 0 $score";
// The 'levenstein' key spelling is part of the returned data.
$result[] = array('sentence'=> $sentence, 'candidate' => $string, 'score' => $score, 'match' => $match, 'prev' => $prev, 'next' => $next, 'levenstein' => $levenshtein);
} // each combinations
// usort() is skipped when nothing was collected ($result is still false).
if($result) {
usort($result, "self::sortByScore");
// var_dump($result);
return $result;
/**
 * Earlier scoring routine, kept next to score().
 *
 * Builds word => position maps for the sentence and every candidate, then
 * walks all sentence/candidate pairs and increments a score for each
 * candidate word found at the same position in the sentence. It prints debug
 * output with echo/print_r while running.
 *
 * NOTE(review): closing braces are missing in this copy of the file, so the
 * nesting of the branches below cannot be confirmed. $result is never
 * initialised before being appended to or sorted - confirm against the
 * original source.
 *
 * @param string         $sentence     reference sentence
 * @param array|iterable $combinations candidate strings (keys are not used)
 *
 * @return array rows with the keys 'sentence' and 'score', passed through
 *               usort() with sortByScore()
 */
public function scoreOld($sentence, $combinations) {
$orgWords = explode(' ', $sentence);
$orgWordPos[$sentence] = $this->position($orgWords);
$end = count($orgWords);
foreach ($combinations as $combinaison => $string) {
$destWords = explode(' ', $string);
$diff = count($orgWords) - count($destWords);
// fill array to max length
// A shorter candidate is padded with "###<index>" placeholders up to $end.
if ($diff > 0) {
$beg = count($destWords);
for ($i = $beg; $i < $end; $i++) {
$destWords[$i] = "###$i";
$destWordPos[$string] = $this->position($destWords);
} else {
$destWordPos[$string] = $this->position($destWords);
foreach ($orgWordPos as $orgSentence => $orgData) {
// words() turns the word => position map back into position => word.
$orgWords = $this->words($orgData);
// Debug output written straight to the response.
echo "\norg words";print_r($orgWords);
foreach ($destWordPos as $destSentence => $destData) {
$score = 0;
foreach ($destData as $destWord => $destPos) {
//echo "\n\n<br>checking $destPos $destWord ";
if (array_key_exists($destWord, $orgData)) {
// Sentinel entries are skipped; position() does not currently add them.
if ($destWord === "{begin}" || $destWord === "{end}") continue;
$next = false;
$prev = false;
$exact = false;
$orgPos = $orgData[$destWord];
$destwords = $this->words($destData);
// Debug output written straight to the response.
echo "\n\nposition: $destPos\n";
echo "\ndest words";print_r($destwords);
// NOTE(review): at the first and last position these offsets (-1 / +1)
// point outside the arrays - confirm how that case was handled.
$prevDestWord = $destwords[$destPos - 1];
$prevOrgDestWord = $orgWords[$orgPos - 1];
$nextDestWord = $destwords[$destPos + 1];
$nextOrgDestWord = $orgWords[$orgPos + 1];
$orgWord = $orgWords[$orgPos];
// echo "\n\n<br>$destPos $destWord $orgWord ----- $prevDestWord $prevOrgDestWord, $nextDestWord $nextOrgDestWord";
if ($destWord === $orgWord) {
// echo "\n<br>exta match $orgPos $destWord $orgWord";
$exact = true;
$score ++;
// if the word before in the candidate equals the word before in the original: score += 1
if ($prevDestWord === $prevOrgDestWord) {
//echo "<br> <font color='red'>PREV DEST === PREV ORG => SCORE + 1</font>";
// $score ++;
$prev = true;
// if the word after in the candidate equals the word after in the original: score += 1
if ($nextDestWord === $nextOrgDestWord) {
//echo "<br> <font color='green'>NEXT DEST === NEXT ORG=> SCORE + 1</font>";
// $score ++;
$next = true;
// The score increments in the next two branches are commented out.
if ($next && $prev) {
// $score++;
if ($next || $prev) {
// $score++;
} else {
$result[] = array('sentence' => $string, 'score' => $score);
usort($result, "self::sortByScore");
return $result;
/**
 * Removes duplicate word lists, keeping the order of first appearance.
 *
 * Two lists are duplicates when they produce the same sentence through
 * getSentence(). An empty input now returns an empty array; previously the
 * accumulators were never initialised, which raised warnings and returned
 * null in that case.
 *
 * @param array $list list of word arrays
 *
 * @return array re-indexed list of unique word arrays
 */
public function deduplicate(array $list)
{
    $unique = array();

    foreach ($list as $words) {
        // Keyed by the joined sentence: a repeat overwrites its own slot and
        // therefore keeps the position of the first occurrence.
        $unique[$this->getSentence($words)] = $words;
    }

    return array_values($unique);
}
/**
 * Converts a list of word arrays into a list of sentences.
 *
 * An empty input now returns an empty array; previously the result variable
 * was never initialised, which raised a warning and returned null.
 *
 * @param array $input list of word arrays
 *
 * @return string[] one sentence per input entry, in the same order
 */
public function makeSentenceList(array $input)
{
    $sentences = array();

    foreach ($input as $words) {
        $sentences[] = $this->getSentence($words);
    }

    return $sentences;
}
/**
 * Returns every ordering of the given values.
 *
 * Each value in turn is placed first and combined with every permutation of
 * the remaining values. The result grows factorially with the input size, so
 * this is only practical for short inputs. An empty input yields an empty
 * array.
 *
 * @param array $input values to permute (keys are discarded)
 *
 * @return array list of permutations, each a list of values
 */
public function permuteArray(array $input)
{
    $items = array_values($input);

    // A single value has exactly one ordering: itself.
    if (count($items) === 1) {
        return array($items);
    }

    $permutations = [];
    foreach ($items as $index => $head) {
        // Everything except the value chosen as head.
        $rest = $items;
        array_splice($rest, $index, 1);

        foreach ($this->permuteArray($rest) as $tail) {
            $permutations[] = array_merge(array($head), $tail);
        }
    }

    return $permutations;
}
/**
 * Returns every word ordering of a sentence, keeping all of its words.
 *
 * @param string $sentence
 *
 * @return array list of word arrays
 */
public function isoPermute($sentence)
{
    return $this->permuteArray($this->getWords($sentence));
}
/**
 * Returns all orderings of the sentence plus all orderings obtained after
 * dropping the last word of each of those orderings, without duplicates.
 *
 * The amount of work grows factorially with the number of words.
 *
 * @param string $sentence
 *
 * @return array de-duplicated list of word arrays
 */
public function dropPermute($sentence)
{
    $fullLength = $this->isoPermute($sentence);
    $combined = $fullLength;

    foreach ($fullLength as $permutation) {
        // Same words without the last one, re-permuted as a shorter sentence.
        $shortened = $this->getSentence(array_slice($permutation, 0, -1));
        $combined = array_merge($combined, $this->isoPermute($shortened));
    }

    return $this->deduplicate($combined);
}
/**
 * Extends dropPermute() by also permuting progressively shorter prefixes of
 * every word list it returned, then de-duplicates the combined list.
 *
 * NOTE(review): closing braces are missing in this copy of the file; the
 * statements after the inner "for" are presumably all part of its body -
 * confirm against the original source.
 *
 * @param string $sentence
 *
 * @return array de-duplicated list of word arrays
 */
public function fullPermute($sentence) {
$result = array();
$list = $this->dropPermute($sentence);
// Only the entries present before the loop are visited; merged entries are appended after them.
$nbList = count($list);
for ($l = 0; $l < $nbList; $l++) {
$words = $list[$l];
$nbWords = count($words) - 1;
for ($w = $nbWords; $w > 1; $w--) {
// array_splice() returns the removed part, i.e. every word except the last one.
$words = array_splice($words, 0, -1);
$sentence = $this->getSentence($words);
$list = array_merge($list, $this->dropPermute($sentence));
$result = $this->deduplicate($list);
return $result;
/**
 * Builds every variant of a string in which exactly one character is doubled.
 *
 * Works on UTF-8 characters rather than bytes, so multibyte letters are no
 * longer cut in half, and returns an empty array for an empty string (the
 * result used to be undefined in that case).
 *
 * @param string $sentence
 *
 * @return string[] one variant per character position, in position order
 */
public function getDoubleChar($sentence)
{
    $altered = array();
    $length = mb_strlen($sentence, 'UTF-8');

    for ($c = 0; $c < $length; $c++) {
        $head = mb_substr($sentence, 0, $c, 'UTF-8');
        $character = mb_substr($sentence, $c, 1, 'UTF-8');
        // The tail starts at $c, so the character at $c appears twice in a row.
        $tail = mb_substr($sentence, $c, $length, 'UTF-8');
        $altered[] = $head . $character . $tail;
    }

    return $altered;
}
/**
 * Counts, for every candidate, how many characters equal the character at
 * the same position in the sentence.
 *
 * Still prints the HTML debug markers ("<pre>", "<hr>", "<br>… match …")
 * that the method has always emitted.
 *
 * Fixes compared with the previous implementation:
 *  - the bound was `$i < mb_strlen($candidateLen)`, i.e. the number of
 *    digits of the candidate length, so only the first one or two
 *    characters were ever compared;
 *  - characters are read with mb_substr() instead of byte offsets, in line
 *    with the mb_strlen() lengths;
 *  - matches are counted and the result map is initialised and returned.
 *    NOTE(review): the visible source neither incremented $match nor
 *    returned $results - confirm this matches the original intent.
 *
 * @param string         $sentence     reference string
 * @param array|iterable $combinations candidate strings (keys are not used)
 *
 * @return array<string,int> candidate => number of matching positions
 */
public function parseDoubleChar($sentence, $combinations)
{
    $results = array();
    $sentenceLen = mb_strlen($sentence);

    echo "<pre>";
    echo "</pre>";

    foreach ($combinations as $string) {
        echo "<hr>";
        $match = 0;
        // Positions beyond the shorter of the two strings cannot match.
        $limit = min($sentenceLen, mb_strlen($string));

        for ($i = 0; $i < $limit; $i++) {
            $candidateChar = mb_substr($string, $i, 1);
            $sentenceChar = mb_substr($sentence, $i, 1);
            if ($candidateChar === $sentenceChar) {
                echo "<br>$i match " . $candidateChar . "==" . $sentenceChar;
                $match++;
            }
        }

        $results[$string] = $match;
    }

    echo "<pre>";
    echo "</pre>";

    return $results;
}
/**
 * Builds every variant of a string in which exactly one character is removed.
 *
 * Works on UTF-8 characters rather than bytes, so multibyte letters are no
 * longer cut in half, and returns an empty array for an empty string (the
 * result used to be undefined in that case).
 *
 * @param string $sentence
 *
 * @return string[] one variant per character position, in position order
 */
public function getDroppedChar($sentence)
{
    $altered = array();
    $length = mb_strlen($sentence, 'UTF-8');

    for ($c = 0; $c < $length; $c++) {
        $head = mb_substr($sentence, 0, $c, 'UTF-8');
        // The tail starts one character later, which drops the character at $c.
        $tail = mb_substr($sentence, $c + 1, $length, 'UTF-8');
        $altered[] = $head . $tail;
    }

    return $altered;
}
/**
 * Counts, for every candidate, how many characters equal the character at
 * the same position in the sentence.
 *
 * Still prints the HTML debug markers ("<pre>", "<hr>", "<br>… match …")
 * that the method has always emitted.
 *
 * Fixes compared with the previous implementation:
 *  - the bound was `$i < mb_strlen($candidateLen)`, i.e. the number of
 *    digits of the candidate length, so only the first one or two
 *    characters were ever compared;
 *  - characters are read with mb_substr() instead of byte offsets, in line
 *    with the mb_strlen() lengths;
 *  - matches are counted and the result map is initialised and returned.
 *    NOTE(review): the visible source neither incremented $match nor
 *    returned $results - confirm this matches the original intent.
 *
 * @param string         $sentence     reference string
 * @param array|iterable $combinations candidate strings (keys are not used)
 *
 * @return array<string,int> candidate => number of matching positions
 */
public function parseDropChar($sentence, $combinations)
{
    $results = array();
    $sentenceLen = mb_strlen($sentence);

    echo "<pre>";
    echo "</pre>";

    foreach ($combinations as $string) {
        echo "<hr>";
        $match = 0;
        // Positions beyond the shorter of the two strings cannot match.
        $limit = min($sentenceLen, mb_strlen($string));

        for ($i = 0; $i < $limit; $i++) {
            $candidateChar = mb_substr($string, $i, 1);
            $sentenceChar = mb_substr($sentence, $i, 1);
            if ($candidateChar === $sentenceChar) {
                echo "<br>$i match " . $candidateChar . "==" . $sentenceChar;
                $match++;
            }
        }

        $results[$string] = $match;
    }

    echo "<pre>";
    echo "</pre>";

    return $results;
}
/**
 * Placeholder: not implemented yet, currently returns nothing.
 *
 * @param string $sentence
 * @param array  $combinations
 * @param float  $ratio
 *
 * @return void
 */
public function lettersMatch($sentence, $combinations, $ratio = .8)
{
    // @todo
}
/**
 * Placeholder: not implemented yet, currently returns nothing.
 *
 * @param string $sentence
 * @param array  $combinations
 * @param int    $ratio
 *
 * @return void
 */
public function subSentenceMatch($sentence, $combinations, $ratio = 1)
{
    // @todo
}
/**
 * Unfinished helper: the visible body only creates an empty result list.
 *
 * NOTE(review): neither parameter is used and no return statement is visible
 * in this copy of the file - confirm against the original source.
 *
 * @param string $string
 * @param mixed  $candidate
 */
public function getMatch5on5($string, $candidate) {
// Empty accumulator; nothing is added to it in the visible code.
$results = array();
/**
 * Searches a list of persons for names resembling a search string.
 *
 * For every person two checks are prepared:
 *  - "5/5": every word of the search string is looked for in the person's
 *    'fullNameUniformized' value with a regular expression, and the other
 *    way round;
 *  - "4/5": the two strings are compared character by character and the
 *    share of matching characters is compared with $percent.
 * Persons satisfying either check are collected, sorted and returned. The
 * match counters are rendered as HTML <font> snippets inside each result.
 *
 * NOTE(review): this copy of the file has lost its closing braces and
 * several statements (for example the end of the $result array literal and
 * every counter increment), and the method reads variables that are never
 * assigned in the visible code ($searchFamilyNameUniformized,
 * $fullNameUniformized, $searchFullNameUniformized). Treat the comments below
 * as a reading aid, not as a specification - confirm against the original.
 *
 * @param string         $string  search string, presumably already uniformized - TODO confirm
 * @param array|iterable $persons rows providing at least 'fullNameUniformized'
 * @param int|float      $percent threshold (in percent) for the 4/5 check
 *
 * @return array list of result rows (see the $result template below)
 */
public function extSearch($string, $persons, $percent = 80) {
// Start of the timing that is stored in $this->_processTime at the end.
$begin = microtime(true);
//echo "<br>$string ";
$results = array();
// print_r($persons);
foreach ($persons as $id => $person) {
// Result template for one person.
$result = array(
'fMatch4_5' => "&nbsp;",
'bMatch4_5' => "&nbsp;",
'condition4_5' => false,
'fMatch5_5' => 0,
'bMatch5_5' => 0,
'condition5_5' => false,
'person' => $person
$matchFwd = 0;
$conditionFwd = 0;
$fullName = $person['fullNameUniformized'];
/// CONDITION 5 / 5 of included words
$fMatch5_5 = 0;
$searchWords = explode(' ', $string);
// NOTE(review): this inner loop reuses $id and so overwrites the key of the outer $persons loop.
foreach ($searchWords as $id => $word) {
if ($word != '') {
// NOTE(review): $word is inserted into the pattern without preg_quote(), so
// regex metacharacters or "/" in the search string change or break the pattern.
$regex = "/$word/";
if (preg_match_all($regex, $fullName, $matches)) {
//echo "<br>fwd found $word in ".$person['fullName'];
// Full match when the forward counter equals the number of search words.
if($fMatch5_5 > 0 && ($fMatch5_5 / count($searchWords) == 1)) {
$result['fMatch5_5'] = "<font color='green'>" . $fMatch5_5 . "/" . count($searchWords) . "</font>";
$result['condition5_5'] = true;
$result['percent'] = 100;
} else {
$result['fMatch5_5'] = "<font color='red'>" . $fMatch5_5 . "/" . count($searchWords) . "</font>";
// back search
$bMatch5_5 = 0;
$searchWordsBack = explode(' ', $fullName);
foreach ($searchWordsBack as $id => $word) {
if ($word != '') {
// Same unescaped pattern construction as in the forward search.
$regex = "/$word/";
if (preg_match_all($regex, $string, $matches)) {
//echo "<br>bck found $word in $string";
// Full match when the backward counter equals the number of words in the full name.
if($bMatch5_5 > 0 && ($bMatch5_5 / count($searchWordsBack) == 1)) {
$result['bMatch5_5'] = "<font color='green'>" . $bMatch5_5 . "/" . count($searchWordsBack) . "</font>";
$result['condition5_5'] = true;
$result['percent'] = 100;
} else {
$result['bMatch5_5'] = "<font color='red'>" . $bMatch5_5 . "/" . count($searchWordsBack) . "</font>";
// print_r($result);
// continue;
// NOTE(review): $conditionBck is set here, while the checks further down read $conditionBack.
$conditionFwd = false;
$conditionBck = false;
// if(!$result['condition5_5']) {
// Forward search 4 / 5
// NOTE(review): $searchFamilyNameUniformized is not assigned anywhere in the visible method.
if($searchFamilyNameUniformized == $fullName) {
// NOTE(review): the forward section assigns $matchBack here, presumably $matchFwd was meant - confirm.
$matchBack = mb_strlen($fullName);
} else {
for ($i = 0; $i < mb_strlen($searchFamilyNameUniformized); $i++) {
if ($i < mb_strlen($fullName)) {
// Byte-offset comparison, while the loop bounds come from mb_strlen().
if ($searchFamilyNameUniformized[$i] == $fullName[$i]) {
} else
// NOTE(review): $fullNameUniformized is not assigned anywhere in the visible method.
$countFwd = mb_strlen($fullNameUniformized);
if ($matchFwd > 0) $conditionFwd = ( $matchFwd / $countFwd ) * 100;
if ($conditionFwd >= $percent) {
$fMatch4_5 = "<font color='green'>" . $matchFwd . "/" . $countFwd . "</font>";
} else {
$fMatch4_5 = "<font color='red'>" . $matchFwd . "/" . $countFwd . "</font>";
// Backward search 4 / 5
$matchBack = 0;
$conditionBack = 0;
if($searchFamilyNameUniformized == $fullName) {
$matchBack = mb_strlen($fullName);
} else {
for ($i = 0; $i < mb_strlen($fullName); $i++) {
if ($i < mb_strlen($searchFamilyNameUniformized)) {
if ($searchFamilyNameUniformized[$i] == $fullName[$i]) {
} else
// NOTE(review): $searchFullNameUniformized is not assigned anywhere in the visible method.
$countBack = mb_strlen($searchFullNameUniformized);
if ($matchBack > 0) $conditionBack = ($matchBack / $countBack) * 100;
if ($conditionBack >= $percent) {
// NOTE(review): the success branch writes $fMatch4_5, the failure branch $bMatch4_5;
// on success $bMatch4_5 is therefore never assigned before it is read below - confirm.
$fMatch4_5 = "<font color='green'>" . $matchBack . "/" . $countBack . "</font>";
} else {
$bMatch4_5 = "<font color='red'>" . $matchBack . "/" . $countBack . "</font>";
$result['percent'] = ($matchBack / $countBack) * 100;
// if($conditionFwd >= $percent) echo "<br><font color='green'>$id: $string in ".$person['fullName']." match forward: $fMatch4_5 condition forward: $conditionFwd%</font>";
// if($conditionBack >= $percent) echo "<br><font color='red'>$id: ".$person['fullName']." in $string match backward: $bMatch4_5 condition backward: $conditionBack%</font>";
$result['fMatch4_5'] = $fMatch4_5;
$result['bMatch4_5'] = $bMatch4_5;
// }
// conditions check
// The 4/5 condition requires both directions to reach the threshold.
if ($conditionFwd >= $percent && $conditionBack >= $percent) $result['condition4_5'] = true;
// A person is kept when either condition holds.
if ($result['condition4_5'] || $result['condition5_5']) {
$results[] = $result;
// usort($results, "InsermRadicoBundle\Entity\ExtendedSearch::sortByPercent");
$end = microtime(true);
// NOTE(review): the comparator belongs to another class that is not part of this file.
usort($results, "InsermRadicoBundle\Entity\ExtendedSearch::sortByPercent");
// Elapsed time of this search in seconds.
$this->_processTime = $end - $begin;
return $results;
/********************* USAGE ************************
$string = 'j\'appelais la classe SENTENCE de Stéphane Ibáñez.12345 #?';
$sentence = new Sentence();
$sentenceUniformized = $sentence->normalizeUtf8String($string); // get uniformized sentence
$sentenceSubstitutions = $sentence->getSubstitutions(); // get number of substitutions
$sentenceSubstitutionList = $sentence->getSubstitutionList(); // get substitutions list
$sentenceNbWords = $sentence->getNbWords(); // get number of words
