From 7c8bad72c574b1d2a3cd1b499f7672e231973ea3 Mon Sep 17 00:00:00 2001 From: Diego Najar Date: Sat, 7 Dec 2019 14:24:13 +0100 Subject: [PATCH] Improve search plugin and add support for UTF8 to Fuzz algorithm --- bl-plugins/search/plugin.php | 5 ++-- bl-plugins/search/vendors/fuzz.php | 46 +++++++++++++++--------------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/bl-plugins/search/plugin.php b/bl-plugins/search/plugin.php index a183c1ee..9985fcae 100644 --- a/bl-plugins/search/plugin.php +++ b/bl-plugins/search/plugin.php @@ -116,7 +116,7 @@ EOF; global $url; // Change the whereAmI to avoid load pages in the rule 69.pages - // This is only for performance propose + // This is only for performance purpose $url->setWhereAmI('search'); // Get the string to search from the URL @@ -125,7 +125,6 @@ EOF; // Search the string in the cache and get all pages with matches $list = $this->search($stringToSearch); - $this->numberOfItems = count($list); // Split the content in pages @@ -220,7 +219,7 @@ EOF; // Inlcude Fuzz algorithm require_once($this->phpPath().'vendors/fuzz.php'); $fuzz = new Fuzz($cache, 10, 1, true); - $results = $fuzz->search($text, 3); + $results = $fuzz->search($text, 5); return(array_keys($results)); } diff --git a/bl-plugins/search/vendors/fuzz.php b/bl-plugins/search/vendors/fuzz.php index c1ab7761..72c5eaf7 100644 --- a/bl-plugins/search/vendors/fuzz.php +++ b/bl-plugins/search/vendors/fuzz.php @@ -125,8 +125,8 @@ class Fuzz { $suffix = []; $result = 0; - $n = strlen($source); - $m = strlen($target); + $n = mb_strlen($source, CHARSET); + $m = mb_strlen($target, CHARSET); for ($i = 0; $i <= $n; $i++) { for ($j = 0; $j <= $m; $j++) { @@ -155,8 +155,8 @@ class Fuzz public function getLevenshtein($source, $target) { $matrix = []; - $n = strlen($source); - $m = strlen($target); + $n = mb_strlen($source, CHARSET); + $m = mb_strlen($target, CHARSET); if ($n === 0) { return $m; @@ -208,35 +208,35 @@ class Fuzz $shorter; $longer; - if (strlen($first) > strlen($second)) { - $longer = strtolower($first); - $shorter = strtolower($second); + if (mb_strlen($first, CHARSET) > mb_strlen($second, CHARSET)) { + $longer = mb_strtolower($first, CHARSET); + $shorter = mb_strtolower($second, CHARSET); } else { - $longer = strtolower($second); - $shorter = strtolower($first); + $longer = mb_strtolower($second, CHARSET); + $shorter = mb_strtolower($first, CHARSET); } // Get half the length distance of shorter string - $halfLen = intval((strlen($shorter) / 2) + 1); + $halfLen = intval((mb_strlen($shorter,CHARSET) / 2) + 1); $match1 = $this->_getCharMatch($shorter, $longer, $halfLen); $match2 = $this->_getCharMatch($longer, $shorter, $halfLen); - if ((strlen($match1) == 0 || strlen($match2) == 0) - || (strlen($match1) != strlen($match2)) + if ((mb_strlen($match1, CHARSET) == 0 || mb_strlen($match2, CHARSET) == 0) + || (mb_strlen($match1, CHARSET) != mb_strlen($match2, CHARSET)) ) { return 0.0; } $trans = $this->_getTranspositions($match1, $match2); - $distance = (strlen($match1) / strlen($shorter) - + strlen($match2) / strlen($longer) - + (strlen($match1) - $trans) - / strlen($match1)) / 3.0; + $distance = (mb_strlen($match1, CHARSET) / mb_strlen($shorter, CHARSET) + + mb_strlen($match2, CHARSET) / mb_strlen($longer, CHARSET) + + (mb_strlen($match1, CHARSET) - $trans) + / mb_strlen($match1, CHARSET)) / 3.0; // Apply Winkler Adjustment - $prefixLen = min(strlen($this->_getPrefix($first, $second)), 4); + $prefixLen = min(mb_strlen($this->_getPrefix($first, $second),CHARSET), 4); $jaroWinkler = round(($distance + (0.1 * $prefixLen * (1.0 - $distance))) * 100.0) / 100.0; return $jaroWinkler; @@ -255,8 +255,8 @@ class Fuzz { $common = ''; $copy = $second; - $firstLen = strlen($first); - $secondLen = strlen($second); + $firstLen = mb_strlen($first, CHARSET); + $secondLen = mb_strlen($second, CHARSET); for ($i = 0; $i < $firstLen; $i++) { $char = $first[$i]; @@ -285,7 +285,7 @@ class Fuzz private function _getTranspositions($first, $second) { $trans = 0; - $firstLen = strlen($first); + $firstLen = mb_strlen($first, CHARSET); for ($i = 0; $i < $firstLen; $i++) { if ($first[$i] != $second[$i]) { @@ -307,7 +307,7 @@ class Fuzz */ private function _getPrefix($first, $second) { - if (strlen($first) == 0 || strlen($second) == 0) { + if (mb_strlen($first, CHARSET) == 0 || mb_strlen($second, CHARSET) == 0) { return ''; } @@ -317,7 +317,7 @@ class Fuzz } elseif ($index == 0) { return ''; } else { - return substr($first, 0, $index); + return mb_substr($first, 0, $index, CHARSET); } } @@ -335,7 +335,7 @@ class Fuzz return -1; } - $maxLen = min(strlen($first), strlen($second)); + $maxLen = min(mb_strlen($first, CHARSET), mb_strlen($second, CHARSET)); for ($i = 0; $i < $maxLen; $i++) { if ($first[$i] != $second[$i]) { return $i;