Last active
November 6, 2017 11:50
-
-
Save stephaneIBANEZ/16fcb796ffdbf6e970b7215da9138e3a to your computer and use it in GitHub Desktop.
Class Sentence: string manipulation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* | |
* This file is part of the Inserm Radico package. | |
* | |
* @author Stéphane IBANEZ <stephane.ibanez@aezan.com> | |
* | |
* This class provides string-manipulation helpers; more methods will be added later.
* | |
* To use this class outside of a Symfony environment, delete the line `namespace InsermRadicoBundle\Services;`.
*/ | |
namespace InsermRadicoBundle\Services; | |
use \Normalizer; | |
class Sentence { | |
/** Word separator used when splitting and joining sentences. */
const SPACE = ' ';
/** @var int Substitution weight total stored by normalizeUtf8String(). */
private $_substitutions = 0;
/** @var array "original => replacement" entries stored by normalizeUtf8String(). */
private $_substitutionList = array();
/** @var int Word count stored by normalizeUtf8String(). */
private $_nbWords = 0;
/**
 * @var float Elapsed seconds of the last extSearch() run.
 *
 * extSearch() assigns this property; it used to be created dynamically
 * (deprecated since PHP 8.2). It is declared public because a dynamically
 * created property was publicly readable and no getter exists for it.
 */
public $_processTime = 0.0;
/**
 * Returns the word count stored by normalizeUtf8String().
 *
 * @return int 0 until a normalization has produced a non-empty result.
 */
public function getNbWords()
{
    $wordCount = $this->_nbWords;

    return $wordCount;
}
/**
 * Returns the substitution weight total stored by normalizeUtf8String().
 *
 * @return int
 */
public function getSubstitutions()
{
    $total = $this->_substitutions;

    return $total;
}
/**
 * Returns the "original => replacement" entries stored by normalizeUtf8String().
 *
 * @return array
 */
public function getSubstitutionList()
{
    $entries = $this->_substitutionList;

    return $entries;
}
/**
 * Splits a UTF-8 string into an array of single characters.
 *
 * @param string $str UTF-8 encoded text.
 *
 * @return array One entry per character, in order; empty for an empty string.
 */
protected function utf8_str_split($str)
{
    $characters = array();
    $total = mb_strlen($str, 'UTF-8');
    $offset = 0;

    while ($offset < $total) {
        $characters[] = mb_substr($str, $offset, 1, 'UTF-8');
        $offset++;
    }

    return $characters;
}
/**
 * Splits a sentence into words on the single-space separator.
 *
 * @param string $sentence
 *
 * @return array Consecutive spaces yield empty-string entries.
 */
public function getWords($sentence)
{
    $words = explode(self::SPACE, $sentence);

    return $words;
}
/**
 * Joins words into a sentence using the single-space separator.
 *
 * @param array $words
 *
 * @return string
 */
public function getSentence(array $words)
{
    $sentence = implode(self::SPACE, $words);

    return $sentence;
}
/**
 * Normalizes a UTF-8 string to upper-case ASCII letters and whitespace.
 *
 * Steps: trim, replace accented/special characters using the two tables
 * below, strip every remaining character that is not a letter or whitespace
 * (optionally keeping the SQL wildcards % and _), upper-case, trim again.
 *
 * Side effects: stores the substitution weight total, the substitution list
 * and the word count, readable through getSubstitutions(),
 * getSubstitutionList() and getNbWords().
 *
 * @param string $s                Text to normalize.
 * @param bool   $withSqlWildCards When true, '%' and '_' survive the cleanup.
 *
 * @return string The normalized text ('' when nothing is left).
 */
public function normalizeUtf8String($s = '', $withSqlWildCards=false) {
    $chars = $this->utf8_str_split(trim((string) $s));

    // One-character replacements: the n-th character of the first string is
    // replaced by the n-th character of the second. Each counts for 1.
    $singleMap = array_combine(
        $this->utf8_str_split("-ŋžðÞáàâäãåçéèêëíìîïóòôöõøúùûüýÿŊŽÐþÁÀÂÄÃÅÇÉÈÊËÍÌÎÏÓÒÔÖÕØÚÙÛÜÝŸ"),
        $this->utf8_str_split(" nzdtaaaaaaceeeeiiiioooooouuuuyynzDTAAAAAACEEEEIIIIOOOOOOUUUUYY")
    );
    // Two-character replacements, each counting for 2. Keys that also appear
    // in the one-character table (Ö, ö, Ü, ü, ÿ) never apply here because
    // the first pass has already replaced them.
    $doubleMap = array_combine(
        array('ß', 'Ñ', 'ñ', 'Ö', 'ö', 'Ü', 'ü', 'Æ', 'æ', 'IJ', 'ij', 'ÿ', 'Œ'),
        array('ss', 'NY', 'ny', 'OE', 'oe', 'UE', 'ue', 'AE', 'ae', 'IJ', 'ij', 'yu', 'OE')
    );

    $substitutions = 0;
    $subList = array();
    // Two full passes (not one merged pass) so the substitution list keeps
    // its historical order: all one-character entries, then the others.
    $passes = array(array($singleMap, 1), array($doubleMap, 2));
    foreach ($passes as $pass) {
        list($map, $weight) = $pass;
        foreach ($chars as $index => $char) {
            if (isset($map[$char])) {
                $replacement = $map[$char];
                $chars[$index] = $replacement;
                $substitutions += $weight;
                $subList[] = "$char => $replacement";
            }
        }
    }
    $strUniformized = implode('', $chars);

    // The wildcards belong INSIDE the negated class; the previous pattern
    // '/[^a-zA-Z\s]%_/' only matched a non-letter followed by "%_".
    $regex = $withSqlWildCards ? '/[^a-zA-Z\s%_]/' : '/[^a-zA-Z\s]/';
    $strUniformized = trim(mb_strtoupper(preg_replace($regex, '', $strUniformized)));

    $this->_substitutions = $substitutions;
    $this->_substitutionList = $subList;
    // Always refresh the word count so an empty result does not leave the
    // value of a previous call behind.
    $this->_nbWords = ($strUniformized === '') ? 0 : count(explode(' ', $strUniformized));

    return $strUniformized;
}
/**
 * Builds a word => index map from a list of words.
 *
 * When a word occurs several times, the index of its last occurrence wins.
 *
 * @param array $words List of words indexed 0..n-1.
 *
 * @return array
 */
public function position(array $words)
{
    $indexByWord = array();
    $total = count($words);
    $cursor = 0;

    while ($cursor < $total) {
        $indexByWord[$words[$cursor]] = $cursor;
        $cursor++;
    }

    return $indexByWord;
}
/**
 * usort() comparator ordering rows by their 'score' entry, highest first.
 *
 * Returns an integer as usort() requires; the former boolean result is
 * deprecated since PHP 8.0 and could never report "less than". The method is
 * static because it uses no instance state and is referenced through a
 * "self::sortByScore" callable; instance-style calls keep working.
 *
 * @param array $a Row containing a 'score' key.
 * @param array $b Row containing a 'score' key.
 *
 * @return int 1 when $a scores lower than $b, -1 when higher, 0 when equal.
 */
public static function sortByScore($a, $b) {
    if ($a['score'] == $b['score']) {
        return 0;
    }

    return ($a['score'] < $b['score']) ? 1 : -1;
}
/**
 * Inverts a map: every value of $positions becomes a key pointing to its
 * former key.
 *
 * @param array $positions Typically a word => index map.
 *
 * @return array The inverted map (later entries overwrite earlier ones).
 */
public function words(array $positions)
{
    $inverted = array();

    foreach (array_keys($positions) as $key) {
        $inverted[$positions[$key]] = $key;
    }

    return $inverted;
}
/**
 * Looks up the word stored at a given position in a word => position map.
 *
 * @param array $words    Word => position map.
 * @param mixed $position Position to look for (compared with ===).
 *
 * @return mixed The first matching key, or '' when no entry has that position.
 */
public function getWordAt($words, $position)
{
    foreach ($words as $candidate => $index) {
        if ($index !== $position) {
            continue;
        }

        return $candidate;
    }

    return '';
}
/**
 * Scores candidate strings against a reference sentence, word by word.
 *
 * For every distinct word of a candidate that also occurs in the sentence:
 *  - 'match' grows when the sentence holds the same word at the same position,
 *  - 'prev' grows when both have the same (non-empty) word just before it,
 *  - 'next' grows when both have the same (non-empty) word just after it.
 * The score is match + prev + next. Candidates shorter than the sentence are
 * padded with "###<index>" placeholders so positions stay comparable.
 * Only candidates scoring above 0 are returned, highest score first.
 *
 * Fixes over the previous version: the sentence word list is no longer
 * flipped on every iteration (which altered its count for later candidates),
 * the result is built as an array instead of appending to `false`, and the
 * sort uses an integer-returning comparator.
 *
 * @param string $sentence     Reference sentence (words separated by ' ').
 * @param array  $combinations Candidate strings.
 *
 * @return array|false Rows with keys sentence, candidate, score, match, prev,
 *                     next, levenstein; false when no candidate scores.
 */
public function score($sentence, $combinations) {
    $orgWords = explode(' ', $sentence);
    $orgWordPos = $this->position($orgWords);
    $orgCount = count($orgWords);
    $result = array();

    foreach ($combinations as $string) {
        $match = 0;
        $next = 0;
        $prev = 0;

        // Pad the candidate up to the sentence length (no-op when it is
        // already at least as long).
        $destWords = explode(' ', $string);
        for ($i = count($destWords); $i < $orgCount; $i++) {
            $destWords[$i] = "###$i";
        }
        $destWordPos = $this->position($destWords);

        foreach ($destWordPos as $destWord => $position) {
            if (!array_key_exists($destWord, $orgWordPos)) {
                continue;
            }

            $prevDestWord = $this->getWordAt($destWordPos, $position - 1);
            $nextDestWord = $this->getWordAt($destWordPos, $position + 1);
            $inPosOrgWord = $this->getWordAt($orgWordPos, $position);
            $prevOrgWord = $this->getWordAt($orgWordPos, $position - 1);
            $nextOrgWord = $this->getWordAt($orgWordPos, $position + 1);

            if ($inPosOrgWord === $destWord) {
                $match++;
            }
            // getWordAt() returns '' when there is no neighbour; two missing
            // neighbours must not count as a match.
            if ($prevOrgWord === $prevDestWord && $prevOrgWord.$prevDestWord != '') {
                $prev++;
            }
            if ($nextOrgWord === $nextDestWord && $nextOrgWord.$nextDestWord != '') {
                $next++;
            }
        }

        $score = $match + $next + $prev;
        if ($score > 0) {
            $result[] = array(
                'sentence' => $sentence,
                'candidate' => $string,
                'score' => $score,
                'match' => $match,
                'prev' => $prev,
                'next' => $next,
                // Key spelling kept as-is: callers read 'levenstein'.
                'levenstein' => (int) levenshtein($sentence, $string, 5, 5, 5),
            );
        }
    }

    if (!$result) {
        return false;
    }

    // Highest score first.
    usort($result, function ($a, $b) {
        if ($a['score'] == $b['score']) {
            return 0;
        }

        return ($a['score'] < $b['score']) ? 1 : -1;
    });

    return $result;
}
/**
 * Legacy word-presence scoring (presumably superseded by score() - confirm
 * before removing).
 *
 * Each candidate is padded with "###<index>" placeholders up to the sentence
 * length; then every distinct candidate word adds 1 when it also occurs in
 * the sentence and subtracts 1 when it does not. The marker words "{begin}"
 * and "{end}" are ignored when present on both sides. Rows are returned
 * highest score first.
 *
 * Compared with the previous body, the scores are unchanged for distinct
 * candidates, but: debug echo/print_r output is gone, out-of-range neighbour
 * reads (which had no effect on the score) are removed, each candidate is
 * scored once instead of re-walking all earlier candidates, padding always
 * uses the real sentence length, and an empty $combinations returns an empty
 * array instead of sorting an undefined variable.
 *
 * @param string $sentence     Reference sentence (words separated by ' ').
 * @param array  $combinations Candidate strings.
 *
 * @return array Rows with keys 'sentence' (the candidate) and 'score'.
 */
public function scoreOld($sentence, $combinations) {
    $orgWords = explode(' ', $sentence);
    $orgWordPos = $this->position($orgWords);
    $orgCount = count($orgWords);
    $result = array();

    foreach ($combinations as $string) {
        // Pad the candidate up to the sentence length.
        $destWords = explode(' ', $string);
        for ($i = count($destWords); $i < $orgCount; $i++) {
            $destWords[$i] = "###$i";
        }

        $score = 0;
        foreach ($this->position($destWords) as $destWord => $destPos) {
            if (!array_key_exists($destWord, $orgWordPos)) {
                $score--;
                continue;
            }
            if ($destWord === "{begin}" || $destWord === "{end}") {
                continue;
            }
            $score++;
        }

        $result[] = array('sentence' => $string, 'score' => $score);
    }

    // Highest score first.
    usort($result, function ($a, $b) {
        if ($a['score'] == $b['score']) {
            return 0;
        }

        return ($a['score'] < $b['score']) ? 1 : -1;
    });

    return $result;
}
/**
 * Removes duplicate word lists, treating two lists as equal when they join
 * to the same sentence.
 *
 * The first occurrence fixes the position in the output; an empty input now
 * yields an empty array instead of iterating an undefined variable.
 *
 * @param array $list List of word arrays.
 *
 * @return array Re-indexed list of unique word arrays.
 */
public function deduplicate(array $list) {
    $unique = array();

    foreach ($list as $words) {
        // Keyed by the joined sentence so identical sentences collapse.
        $unique[$this->getSentence($words)] = $words;
    }

    return array_values($unique);
}
/**
 * Joins every word array of the input into a sentence.
 *
 * An empty input now yields an empty array instead of returning an
 * undefined variable.
 *
 * @param array $input List of word arrays.
 *
 * @return array List of sentences, in input order.
 */
public function makeSentenceList(array $input) {
    $sentences = array();

    foreach ($input as $words) {
        $sentences[] = $this->getSentence($words);
    }

    return $sentences;
}
/**
 * Returns every ordering of the given values.
 *
 * Recursive scheme: each value in turn is placed first and combined with
 * every permutation of the remaining values. The amount of work grows
 * factorially with the input size, so this only suits short inputs.
 *
 * @param array $input Values to permute (keys are ignored).
 *
 * @return array List of permutations; empty for an empty input.
 */
public function permuteArray(array $input)
{
    $items = array_values($input);

    // A single value has exactly one ordering.
    if (count($items) === 1) {
        return array($items);
    }

    $permutations = [];
    foreach ($items as $index => $head) {
        // Everything except the chosen head; the recursive call re-indexes it.
        $rest = $items;
        unset($rest[$index]);

        foreach ($this->permuteArray($rest) as $tail) {
            $permutations[] = array_merge(array($head), $tail);
        }
    }

    return $permutations;
}
/**
 * Returns every ordering of the words of a sentence (same word count).
 *
 * @param string $sentence Words separated by single spaces.
 *
 * @return array List of word arrays.
 */
public function isoPermute($sentence)
{
    return $this->permuteArray($this->getWords($sentence));
}
/**
 * Returns the permutations of a sentence plus the permutations obtained
 * after dropping the last word of each of them, without duplicates.
 *
 * @param string $sentence Words separated by single spaces.
 *
 * @return array List of unique word arrays.
 */
public function dropPermute($sentence)
{
    $fullLength = $this->isoPermute($sentence);
    $pool = $fullLength;

    foreach ($fullLength as $permutation) {
        // Keep everything but the last word, then permute what is left.
        $shortened = array_slice($permutation, 0, -1);
        $pool = array_merge($pool, $this->isoPermute($this->getSentence($shortened)));
    }

    return $this->deduplicate($pool);
}
/**
 * Extends dropPermute() by repeatedly trimming the last word of each
 * permutation and collecting the dropPermute() results of the shorter
 * sentences, without duplicates.
 *
 * @param string $sentence Words separated by single spaces.
 *
 * @return array List of unique word arrays.
 */
public function fullPermute($sentence)
{
    $seed = $this->dropPermute($sentence);
    $pool = $seed;

    foreach ($seed as $words) {
        $remaining = count($words) - 1;

        while ($remaining > 1) {
            array_pop($words);
            $pool = array_merge($pool, $this->dropPermute($this->getSentence($words)));
            $remaining--;
        }
    }

    return $this->deduplicate($pool);
}
/**
 * Builds every variant of a string in which exactly one character is doubled.
 *
 * Works on UTF-8 characters rather than bytes, so multibyte characters are
 * never cut in half; results for ASCII input are unchanged. An empty string
 * yields an empty array.
 *
 * @param string $sentence UTF-8 text.
 *
 * @return array One variant per character position, in order.
 */
public function getDoubleChar($sentence) {
    $chars = $this->utf8_str_split($sentence);
    $altered = array();

    foreach ($chars as $index => $char) {
        $head = implode('', array_slice($chars, 0, $index));
        $tail = implode('', array_slice($chars, $index));
        // $tail starts with $char, so inserting it once more doubles it.
        $altered[] = $head . $char . $tail;
    }

    return $altered;
}
/**
 * Counts, for each candidate, the characters identical to the sentence at
 * the same position, and prints an HTML debug trace of the comparison.
 *
 * Fixes: the loop bound was `mb_strlen($candidateLen)` (the number of digits
 * of the length), so only the first character was ever compared; characters
 * are now compared as UTF-8 characters instead of bytes; an empty
 * $combinations no longer prints an undefined variable. The counts are also
 * returned (previously nothing was returned).
 *
 * @param string $sentence     Reference text.
 * @param array  $combinations Candidate strings.
 *
 * @return array Candidate string => number of positional matches.
 */
public function parseDoubleChar($sentence, $combinations) {
    $sentenceChars = $this->utf8_str_split($sentence);
    $results = array();

    echo "<pre>";
    print_r($combinations);
    echo "</pre>";

    foreach ($combinations as $string) {
        echo "<hr>";
        $candidateChars = $this->utf8_str_split($string);
        // Only positions present in both strings can match.
        $limit = min(count($sentenceChars), count($candidateChars));
        $match = 0;

        for ($i = 0; $i < $limit; $i++) {
            if ($candidateChars[$i] === $sentenceChars[$i]) {
                echo "<br>$i match ".$candidateChars[$i] ."==". $sentenceChars[$i];
                $match++;
            }
        }
        $results[$string] = $match;
    }

    echo "<pre>";
    print_r($results);
    echo "</pre>";

    return $results;
}
/**
 * Builds every variant of a string in which exactly one character is removed.
 *
 * Works on UTF-8 characters rather than bytes, so multibyte characters are
 * removed whole; results for ASCII input are unchanged. An empty string
 * yields an empty array.
 *
 * @param string $sentence UTF-8 text.
 *
 * @return array One variant per character position, in order.
 */
public function getDroppedChar($sentence) {
    $chars = $this->utf8_str_split($sentence);
    $altered = array();

    foreach (array_keys($chars) as $index) {
        $remaining = $chars;
        unset($remaining[$index]);
        $altered[] = implode('', $remaining);
    }

    return $altered;
}
/**
 * Counts, for each candidate, the characters identical to the sentence at
 * the same position, and prints an HTML debug trace of the comparison.
 *
 * Fixes: the loop bound was `mb_strlen($candidateLen)` (the number of digits
 * of the length), so only the first character was ever compared; characters
 * are now compared as UTF-8 characters instead of bytes; an empty
 * $combinations no longer prints an undefined variable. The counts are also
 * returned (previously nothing was returned).
 *
 * @param string $sentence     Reference text.
 * @param array  $combinations Candidate strings.
 *
 * @return array Candidate string => number of positional matches.
 */
public function parseDropChar($sentence, $combinations) {
    $referenceChars = $this->utf8_str_split($sentence);
    $results = array();

    echo "<pre>";
    print_r($combinations);
    echo "</pre>";

    foreach ($combinations as $string) {
        echo "<hr>";
        $candidateChars = $this->utf8_str_split($string);
        // Only positions present in both strings can match.
        $limit = min(count($referenceChars), count($candidateChars));
        $match = 0;

        for ($i = 0; $i < $limit; $i++) {
            if ($candidateChars[$i] === $referenceChars[$i]) {
                echo "<br>$i match ".$candidateChars[$i] ."==". $referenceChars[$i];
                $match++;
            }
        }
        $results[$string] = $match;
    }

    echo "<pre>";
    print_r($results);
    echo "</pre>";

    return $results;
}
/**
 * Placeholder: not implemented yet, always returns null.
 *
 * @param mixed $sentence     Unused for now.
 * @param mixed $combinations Unused for now.
 * @param float $ratio        Unused for now.
 *
 * @return null
 */
public function lettersMatch($sentence, $combinations, $ratio=.8)
{
    // @todo
}
/**
 * Placeholder: not implemented yet, always returns null.
 *
 * @param mixed $sentence     Unused for now.
 * @param mixed $combinations Unused for now.
 * @param int   $ratio        Unused for now.
 *
 * @return null
 */
public function subSentenceMatch($sentence, $combinations, $ratio=1)
{
    // @todo
}
/**
 * Placeholder: prepares an empty result list but returns nothing (null).
 *
 * NOTE(review): the local list is never returned - confirm whether callers
 * expect it before relying on this method.
 *
 * @param mixed $string    Unused for now.
 * @param mixed $candidate Unused for now.
 *
 * @return null
 */
public function getMatch5on5($string, $candidate)
{
    $results = [];
}
/**
 * Searches a list of persons for names matching a (normalized) search string.
 *
 * Two criteria are evaluated for every person:
 *  - "5/5": every word of the search string occurs inside the full name, or
 *    every word of the full name occurs inside the search string;
 *  - "4/5": the share of characters identical at the same position reaches
 *    $percent both relative to the full-name length (forward) and relative
 *    to the search-string length (backward).
 * A person is kept when either criterion holds.
 *
 * Fixes over the previous body:
 *  - the undefined variables $searchFamilyNameUniformized,
 *    $searchFullNameUniformized and $fullNameUniformized are replaced by the
 *    values actually in scope ($string and the person's full name), which
 *    also removes the division by zero they caused
 *    (NOTE(review): mapping inferred from the names - confirm);
 *  - the forward/backward counters and the bMatch4_5 label are no longer
 *    assigned to the wrong variable;
 *  - words are matched literally instead of being injected into a regex;
 *  - a 5/5 match keeps its percent of 100 instead of being overwritten;
 *  - zero-length strings no longer divide by zero.
 *
 * Side effect: stores the elapsed time in $this->_processTime.
 *
 * @param string $string  Search text, words separated by ' '.
 * @param array  $persons Rows that each contain a 'fullNameUniformized' key.
 * @param int    $percent Threshold (0-100) for the 4/5 criterion.
 *
 * @return array Matching rows with keys fMatch4_5, bMatch4_5, condition4_5,
 *               fMatch5_5, bMatch5_5, condition5_5, person, percent.
 */
public function extSearch($string, $persons, $percent = 80) {
    $begin = microtime(true);
    $results = array();
    $searchWords = explode(' ', $string);
    $searchLength = mb_strlen($string, 'UTF-8');

    foreach ($persons as $person) {
        $fullName = $person['fullNameUniformized'];
        $nameWords = explode(' ', $fullName);
        $nameLength = mb_strlen($fullName, 'UTF-8');

        $result = array(
            'fMatch4_5' => " ",
            'bMatch4_5' => " ",
            'condition4_5' => false,
            'fMatch5_5' => 0,
            'bMatch5_5' => 0,
            'condition5_5' => false,
            'person' => $person
        );

        // --- 5/5: all words of one side contained in the other side.
        $fMatch5_5 = $this->_countWordsFoundIn($searchWords, $fullName);
        $forwardComplete = $fMatch5_5 > 0 && $fMatch5_5 === count($searchWords);
        $result['fMatch5_5'] = $this->_colorRatio($fMatch5_5, count($searchWords), $forwardComplete);

        $bMatch5_5 = $this->_countWordsFoundIn($nameWords, $string);
        $backwardComplete = $bMatch5_5 > 0 && $bMatch5_5 === count($nameWords);
        $result['bMatch5_5'] = $this->_colorRatio($bMatch5_5, count($nameWords), $backwardComplete);

        if ($forwardComplete || $backwardComplete) {
            $result['condition5_5'] = true;
            $result['percent'] = 100;
        }

        // --- 4/5: characters identical at the same position. The count is
        // the same in both directions; only the reference length differs.
        $aligned = $this->_countAlignedChars($string, $fullName);
        $conditionFwd = ($nameLength > 0) ? ($aligned / $nameLength) * 100 : 0;
        $conditionBack = ($searchLength > 0) ? ($aligned / $searchLength) * 100 : 0;

        $result['fMatch4_5'] = $this->_colorRatio($aligned, $nameLength, $conditionFwd >= $percent);
        $result['bMatch4_5'] = $this->_colorRatio($aligned, $searchLength, $conditionBack >= $percent);
        if (!$result['condition5_5']) {
            $result['percent'] = $conditionBack;
        }
        if ($conditionFwd >= $percent && $conditionBack >= $percent) {
            $result['condition4_5'] = true;
        }

        if ($result['condition4_5'] || $result['condition5_5']) {
            $results[] = $result;
        }
    }

    $end = microtime(true);

    // Prefer the project comparator when it can be loaded; otherwise fall
    // back to ordering by 'percent', highest first
    // (NOTE(review): assumed to match what sortByPercent does - confirm).
    $comparator = "InsermRadicoBundle\Entity\ExtendedSearch::sortByPercent";
    if (!is_callable($comparator)) {
        $comparator = function ($a, $b) {
            if ($a['percent'] == $b['percent']) {
                return 0;
            }

            return ($a['percent'] < $b['percent']) ? 1 : -1;
        };
    }
    usort($results, $comparator);

    $this->_processTime = $end - $begin;

    return $results;
}

/**
 * Counts the non-empty words that occur literally inside $haystack.
 */
private function _countWordsFoundIn(array $words, $haystack) {
    $found = 0;
    foreach ($words as $word) {
        if ($word !== '' && strpos($haystack, $word) !== false) {
            $found++;
        }
    }

    return $found;
}

/**
 * Counts the positions at which two UTF-8 strings hold the same character.
 */
private function _countAlignedChars($left, $right) {
    $leftChars = $this->utf8_str_split($left);
    $rightChars = $this->utf8_str_split($right);
    $limit = min(count($leftChars), count($rightChars));
    $aligned = 0;
    for ($i = 0; $i < $limit; $i++) {
        if ($leftChars[$i] === $rightChars[$i]) {
            $aligned++;
        }
    }

    return $aligned;
}

/**
 * Formats "count/total" as an HTML label, green when $ok and red otherwise.
 */
private function _colorRatio($count, $total, $ok) {
    $color = $ok ? 'green' : 'red';

    return "<font color='" . $color . "'>" . $count . "/" . $total . "</font>";
}
} | |
/********************* USAGE ************************ | |
$string = 'j\'appelais la classe SENTENCE de Stéphane Ibáñez.12345 #?'; | |
$sentence = new Sentence(); | |
$sentenceUniformized = $sentence->normalizeUtf8String($string); // get uniformized sentence | |
$sentenceSubstitutions = $sentence->getSubstitutions(); // get number of substitutions | |
$sentenceSubstitutionList = $sentence->getSubstitutionList(); // get substitutions list | |
$sentenceNbWords = $sentence->getNbWords(); // get number of words | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Last release of Sentence class with all basic functions