Improve search plugin and add support for UTF8 to Fuzz algorithm

This commit is contained in:
Diego Najar 2019-12-07 14:24:13 +01:00
parent b510c59bab
commit 7c8bad72c5
2 changed files with 25 additions and 26 deletions

View File

@ -116,7 +116,7 @@ EOF;
global $url; global $url;
// Change the whereAmI to avoid loading pages in the rule 69.pages // Change the whereAmI to avoid loading pages in the rule 69.pages
// This is only for performance propose // This is only for performance purpose
$url->setWhereAmI('search'); $url->setWhereAmI('search');
// Get the string to search from the URL // Get the string to search from the URL
@ -125,7 +125,6 @@ EOF;
// Search the string in the cache and get all pages with matches // Search the string in the cache and get all pages with matches
$list = $this->search($stringToSearch); $list = $this->search($stringToSearch);
$this->numberOfItems = count($list); $this->numberOfItems = count($list);
// Split the content in pages // Split the content in pages
@ -220,7 +219,7 @@ EOF;
// Include Fuzz algorithm // Include Fuzz algorithm
require_once($this->phpPath().'vendors/fuzz.php'); require_once($this->phpPath().'vendors/fuzz.php');
$fuzz = new Fuzz($cache, 10, 1, true); $fuzz = new Fuzz($cache, 10, 1, true);
$results = $fuzz->search($text, 3); $results = $fuzz->search($text, 5);
return(array_keys($results)); return(array_keys($results));
} }

View File

@ -125,8 +125,8 @@ class Fuzz
{ {
$suffix = []; $suffix = [];
$result = 0; $result = 0;
$n = strlen($source); $n = mb_strlen($source, CHARSET);
$m = strlen($target); $m = mb_strlen($target, CHARSET);
for ($i = 0; $i <= $n; $i++) { for ($i = 0; $i <= $n; $i++) {
for ($j = 0; $j <= $m; $j++) { for ($j = 0; $j <= $m; $j++) {
@ -155,8 +155,8 @@ class Fuzz
public function getLevenshtein($source, $target) public function getLevenshtein($source, $target)
{ {
$matrix = []; $matrix = [];
$n = strlen($source); $n = mb_strlen($source, CHARSET);
$m = strlen($target); $m = mb_strlen($target, CHARSET);
if ($n === 0) { if ($n === 0) {
return $m; return $m;
@ -208,35 +208,35 @@ class Fuzz
$shorter; $shorter;
$longer; $longer;
if (strlen($first) > strlen($second)) { if (mb_strlen($first, CHARSET) > mb_strlen($second, CHARSET)) {
$longer = strtolower($first); $longer = mb_strtolower($first, CHARSET);
$shorter = strtolower($second); $shorter = mb_strtolower($second, CHARSET);
} else { } else {
$longer = strtolower($second); $longer = mb_strtolower($second, CHARSET);
$shorter = strtolower($first); $shorter = mb_strtolower($first, CHARSET);
} }
// Get half the length distance of shorter string // Get half the length distance of shorter string
$halfLen = intval((strlen($shorter) / 2) + 1); $halfLen = intval((mb_strlen($shorter,CHARSET) / 2) + 1);
$match1 = $this->_getCharMatch($shorter, $longer, $halfLen); $match1 = $this->_getCharMatch($shorter, $longer, $halfLen);
$match2 = $this->_getCharMatch($longer, $shorter, $halfLen); $match2 = $this->_getCharMatch($longer, $shorter, $halfLen);
if ((strlen($match1) == 0 || strlen($match2) == 0) if ((mb_strlen($match1, CHARSET) == 0 || mb_strlen($match2, CHARSET) == 0)
|| (strlen($match1) != strlen($match2)) || (mb_strlen($match1, CHARSET) != mb_strlen($match2, CHARSET))
) { ) {
return 0.0; return 0.0;
} }
$trans = $this->_getTranspositions($match1, $match2); $trans = $this->_getTranspositions($match1, $match2);
$distance = (strlen($match1) / strlen($shorter) $distance = (mb_strlen($match1, CHARSET) / mb_strlen($shorter, CHARSET)
+ strlen($match2) / strlen($longer) + mb_strlen($match2, CHARSET) / mb_strlen($longer, CHARSET)
+ (strlen($match1) - $trans) + (mb_strlen($match1, CHARSET) - $trans)
/ strlen($match1)) / 3.0; / mb_strlen($match1, CHARSET)) / 3.0;
// Apply Winkler Adjustment // Apply Winkler Adjustment
$prefixLen = min(strlen($this->_getPrefix($first, $second)), 4); $prefixLen = min(mb_strlen($this->_getPrefix($first, $second),CHARSET), 4);
$jaroWinkler = round(($distance + (0.1 * $prefixLen * (1.0 - $distance))) * 100.0) / 100.0; $jaroWinkler = round(($distance + (0.1 * $prefixLen * (1.0 - $distance))) * 100.0) / 100.0;
return $jaroWinkler; return $jaroWinkler;
@ -255,8 +255,8 @@ class Fuzz
{ {
$common = ''; $common = '';
$copy = $second; $copy = $second;
$firstLen = strlen($first); $firstLen = mb_strlen($first, CHARSET);
$secondLen = strlen($second); $secondLen = mb_strlen($second, CHARSET);
for ($i = 0; $i < $firstLen; $i++) { for ($i = 0; $i < $firstLen; $i++) {
$char = $first[$i]; $char = $first[$i];
@ -285,7 +285,7 @@ class Fuzz
private function _getTranspositions($first, $second) private function _getTranspositions($first, $second)
{ {
$trans = 0; $trans = 0;
$firstLen = strlen($first); $firstLen = mb_strlen($first, CHARSET);
for ($i = 0; $i < $firstLen; $i++) { for ($i = 0; $i < $firstLen; $i++) {
if ($first[$i] != $second[$i]) { if ($first[$i] != $second[$i]) {
@ -307,7 +307,7 @@ class Fuzz
*/ */
private function _getPrefix($first, $second) private function _getPrefix($first, $second)
{ {
if (strlen($first) == 0 || strlen($second) == 0) { if (mb_strlen($first, CHARSET) == 0 || mb_strlen($second, CHARSET) == 0) {
return ''; return '';
} }
@ -317,7 +317,7 @@ class Fuzz
} elseif ($index == 0) { } elseif ($index == 0) {
return ''; return '';
} else { } else {
return substr($first, 0, $index); return mb_substr($first, 0, $index, CHARSET);
} }
} }
@ -335,7 +335,7 @@ class Fuzz
return -1; return -1;
} }
$maxLen = min(strlen($first), strlen($second)); $maxLen = min(mb_strlen($first, CHARSET), mb_strlen($second, CHARSET));
for ($i = 0; $i < $maxLen; $i++) { for ($i = 0; $i < $maxLen; $i++) {
if ($first[$i] != $second[$i]) { if ($first[$i] != $second[$i]) {
return $i; return $i;