Improve search plugin and add support for UTF8 to Fuzz algorithm

This commit is contained in:
Diego Najar 2019-12-07 14:24:13 +01:00
parent b510c59bab
commit 7c8bad72c5
2 changed files with 25 additions and 26 deletions

View File

@ -116,7 +116,7 @@ EOF;
global $url;
// Change the whereAmI to avoid loading pages in the rule 69.pages
// This is only for performance propose
// This is only for performance purpose
$url->setWhereAmI('search');
// Get the string to search from the URL
@ -125,7 +125,6 @@ EOF;
// Search the string in the cache and get all pages with matches
$list = $this->search($stringToSearch);
$this->numberOfItems = count($list);
// Split the content in pages
@ -220,7 +219,7 @@ EOF;
// Include Fuzz algorithm
require_once($this->phpPath().'vendors/fuzz.php');
$fuzz = new Fuzz($cache, 10, 1, true);
$results = $fuzz->search($text, 3);
$results = $fuzz->search($text, 5);
return(array_keys($results));
}

View File

@ -125,8 +125,8 @@ class Fuzz
{
$suffix = [];
$result = 0;
$n = strlen($source);
$m = strlen($target);
$n = mb_strlen($source, CHARSET);
$m = mb_strlen($target, CHARSET);
for ($i = 0; $i <= $n; $i++) {
for ($j = 0; $j <= $m; $j++) {
@ -155,8 +155,8 @@ class Fuzz
public function getLevenshtein($source, $target)
{
$matrix = [];
$n = strlen($source);
$m = strlen($target);
$n = mb_strlen($source, CHARSET);
$m = mb_strlen($target, CHARSET);
if ($n === 0) {
return $m;
@ -208,35 +208,35 @@ class Fuzz
$shorter;
$longer;
if (strlen($first) > strlen($second)) {
$longer = strtolower($first);
$shorter = strtolower($second);
if (mb_strlen($first, CHARSET) > mb_strlen($second, CHARSET)) {
$longer = mb_strtolower($first, CHARSET);
$shorter = mb_strtolower($second, CHARSET);
} else {
$longer = strtolower($second);
$shorter = strtolower($first);
$longer = mb_strtolower($second, CHARSET);
$shorter = mb_strtolower($first, CHARSET);
}
// Get half the length distance of shorter string
$halfLen = intval((strlen($shorter) / 2) + 1);
$halfLen = intval((mb_strlen($shorter,CHARSET) / 2) + 1);
$match1 = $this->_getCharMatch($shorter, $longer, $halfLen);
$match2 = $this->_getCharMatch($longer, $shorter, $halfLen);
if ((strlen($match1) == 0 || strlen($match2) == 0)
|| (strlen($match1) != strlen($match2))
if ((mb_strlen($match1, CHARSET) == 0 || mb_strlen($match2, CHARSET) == 0)
|| (mb_strlen($match1, CHARSET) != mb_strlen($match2, CHARSET))
) {
return 0.0;
}
$trans = $this->_getTranspositions($match1, $match2);
$distance = (strlen($match1) / strlen($shorter)
+ strlen($match2) / strlen($longer)
+ (strlen($match1) - $trans)
/ strlen($match1)) / 3.0;
$distance = (mb_strlen($match1, CHARSET) / mb_strlen($shorter, CHARSET)
+ mb_strlen($match2, CHARSET) / mb_strlen($longer, CHARSET)
+ (mb_strlen($match1, CHARSET) - $trans)
/ mb_strlen($match1, CHARSET)) / 3.0;
// Apply Winkler Adjustment
$prefixLen = min(strlen($this->_getPrefix($first, $second)), 4);
$prefixLen = min(mb_strlen($this->_getPrefix($first, $second),CHARSET), 4);
$jaroWinkler = round(($distance + (0.1 * $prefixLen * (1.0 - $distance))) * 100.0) / 100.0;
return $jaroWinkler;
@ -255,8 +255,8 @@ class Fuzz
{
$common = '';
$copy = $second;
$firstLen = strlen($first);
$secondLen = strlen($second);
$firstLen = mb_strlen($first, CHARSET);
$secondLen = mb_strlen($second, CHARSET);
for ($i = 0; $i < $firstLen; $i++) {
$char = $first[$i];
@ -285,7 +285,7 @@ class Fuzz
private function _getTranspositions($first, $second)
{
$trans = 0;
$firstLen = strlen($first);
$firstLen = mb_strlen($first, CHARSET);
for ($i = 0; $i < $firstLen; $i++) {
if ($first[$i] != $second[$i]) {
@ -307,7 +307,7 @@ class Fuzz
*/
private function _getPrefix($first, $second)
{
if (strlen($first) == 0 || strlen($second) == 0) {
if (mb_strlen($first, CHARSET) == 0 || mb_strlen($second, CHARSET) == 0) {
return '';
}
@ -317,7 +317,7 @@ class Fuzz
} elseif ($index == 0) {
return '';
} else {
return substr($first, 0, $index);
return mb_substr($first, 0, $index, CHARSET);
}
}
@ -335,7 +335,7 @@ class Fuzz
return -1;
}
$maxLen = min(strlen($first), strlen($second));
$maxLen = min(mb_strlen($first, CHARSET), mb_strlen($second, CHARSET));
for ($i = 0; $i < $maxLen; $i++) {
if ($first[$i] != $second[$i]) {
return $i;