445287f04c
Fix for #889
379 lines
10 KiB
PHP
379 lines
10 KiB
PHP
<?php
|
|
|
|
/**
|
|
* This script was modified to fit in Bludit
|
|
*/
|
|
|
|
/**
|
|
* Fuzz Class
|
|
* kevinfiol\fuzzget
|
|
*
|
|
* @category Class
|
|
* @package None
|
|
* @author Kevin Fiol <fiolkevin@gmail.com>
|
|
* @license https://opensource.org/licenses/MIT MIT License
|
|
* @link http://github.com/kevinfiol
|
|
*/
|
|
|
|
class Fuzz
|
|
{
|
|
private $_source;
|
|
private $_sourceLen;
|
|
private $_maxResults;
|
|
private $_searchMode;
|
|
private $_useLCS;
|
|
|
|
/**
|
|
* Fuzz Object Constructor
|
|
* Initialize private variables
|
|
*
|
|
* @param array $source An array of associative arrays
|
|
* @param int $maxResults The maximum number of results to retrieve upon a search
|
|
* @param int $searchMode 0 = Levenshtein, 1 = Jaro-Winkler
|
|
* @param boolean $useLCS Factor in Longest Common Substring in search results
|
|
*/
|
|
public function __construct($source, $maxResults, $searchMode, $useLCS)
|
|
{
|
|
$this->_source = $source;
|
|
$this->_sourceLen = count($source);
|
|
$this->_maxResults = max($maxResults, 1);
|
|
$this->_useLCS = $useLCS;
|
|
|
|
if ($searchMode < 0 || $searchMode > 1) {
|
|
throw new \Exception('Invalid search mode');
|
|
} else {
|
|
$this->_searchMode = $searchMode;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Search Method
|
|
* Initiate Search
|
|
*
|
|
* @param string $search Term to search for
|
|
* @param int $minLCS (if using LCS) Specify the minimum longest common substring
|
|
* @param int $maxDistance (if using Levenshtein) Specify the maximum distance allowed
|
|
*
|
|
* @return array $results Array of associative arrays containing search matches
|
|
*/
|
|
public function search($search, $minLCS = null, $maxDistance = null)
|
|
{
|
|
$results = [];
|
|
$scores = [];
|
|
|
|
// Nullify these parameters if they are irrelevant to searchMode
|
|
if (!$this->_useLCS) $minLCS = null;
|
|
if ($this->_searchMode != 0) $maxDistance = null;
|
|
|
|
// Cycle through result pool
|
|
//for ($i = 0; $i < $this->_sourceLen; $i++) {
|
|
foreach ($this->_source as $pageKey => $data) {
|
|
$allLev = [];
|
|
$allJaros = [];
|
|
$allLCSs = [];
|
|
|
|
// Cycle through each object's properties
|
|
foreach ($data as $key => $val) {
|
|
if ($this->_searchMode == 0) {
|
|
$allLev[] = $this->getLevenshtein(strval($val), $search);
|
|
} elseif ($this->_searchMode == 1) {
|
|
$allJaros[] = $this->getJaroWinkler(strval($val), $search);
|
|
}
|
|
|
|
if ($this->_useLCS) {
|
|
$allLCSs[] = $this->getLCS(strval($val), $search);
|
|
}
|
|
}
|
|
|
|
$lowestLev = $allLev ? min($allLev) : null;
|
|
$highestJaro = $allJaros ? max($allJaros) : null;
|
|
$highestLCS = $allLCSs ? max($allLCSs) : null;
|
|
|
|
// Get Score
|
|
if ($this->_searchMode == 0) {
|
|
$score = $lowestLev;
|
|
} else {
|
|
$score = -1 * abs($highestJaro);
|
|
}
|
|
|
|
if ($this->_useLCS) {
|
|
$score -= $highestLCS;
|
|
}
|
|
|
|
// Append Index of object + Best Score
|
|
if (($maxDistance == null || $lowestLev <= $maxDistance)
|
|
&& ($minLCS == null || $highestLCS >= $minLCS)
|
|
) {
|
|
$scores[$pageKey] = $score;
|
|
}
|
|
}
|
|
|
|
// Sort by score
|
|
asort($scores);
|
|
return $scores;
|
|
}
|
|
|
|
/**
|
|
* Get Longest Common Substring
|
|
*
|
|
* @param string $source Term to search for
|
|
* @param string $target Target term to search against
|
|
*
|
|
* @return int $result LCS Score
|
|
*/
|
|
public function getLCS($source, $target)
|
|
{
|
|
$suffix = [];
|
|
$result = 0;
|
|
$n = strlen($source);
|
|
$m = strlen($target);
|
|
|
|
for ($i = 0; $i <= $n; $i++) {
|
|
for ($j = 0; $j <= $m; $j++) {
|
|
if ($i === 0 || $j === 0) {
|
|
$suffix[$i][$j] = 0;
|
|
} elseif ($source[$i - 1] == $target[$j - 1]) {
|
|
$suffix[$i][$j] = $suffix[$i - 1][$j - 1] + 1;
|
|
$result = max($result, $suffix[$i][$j]);
|
|
} else {
|
|
$suffix[$i][$j] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
return $result;
|
|
}
|
|
|
|
/**
|
|
* Get Levenshtein Distance
|
|
*
|
|
* @param string $source Term to search for
|
|
* @param string $target Target term to search against
|
|
*
|
|
* @return int Levenshtein Distance
|
|
*/
|
|
public function getLevenshtein($source, $target)
|
|
{
|
|
$matrix = [];
|
|
$n = strlen($source);
|
|
$m = strlen($target);
|
|
|
|
if ($n === 0) {
|
|
return $m;
|
|
} elseif ($m === 0) {
|
|
return $n;
|
|
}
|
|
|
|
// Initialize First Row
|
|
for ($i = 0; $i <= $n; $i++) {
|
|
$matrix[0][$i] = $i;
|
|
}
|
|
// Initialize First Column
|
|
for ($i = 0; $i <= $m; $i++) {
|
|
$matrix[$i][0] = $i;
|
|
}
|
|
|
|
for ($i = 1; $i <= $n; $i++) {
|
|
for ($j = 1; $j <= $m; $j++) {
|
|
if ($source[$i - 1] === $target[$j - 1]) {
|
|
$cost = 0;
|
|
} else {
|
|
$cost = 1;
|
|
}
|
|
|
|
// Cell immediately above + 1
|
|
$up = $matrix[$j - 1][$i] + 1;
|
|
// Cell immediately to the left + 1
|
|
$left = $matrix[$j][$i - 1] + 1;
|
|
// Cell diagnolly above and to the left + cost
|
|
$upleft = $matrix[$j - 1][$i - 1] + $cost;
|
|
|
|
$matrix[$j][$i] = min($up, $left, $upleft);
|
|
}
|
|
}
|
|
|
|
return $matrix[$m][$n];
|
|
}
|
|
|
|
/**
|
|
* Get Jaro-Winkler Score
|
|
*
|
|
* @param string $first String to match
|
|
* @param string $second String to match
|
|
*
|
|
* @return double $jaroWinkler Jaro-Winkler score between 0.0 and 1.0
|
|
*/
|
|
public function getJaroWinkler($first, $second)
|
|
{
|
|
$shorter;
|
|
$longer;
|
|
|
|
if (strlen($first) > strlen($second)) {
|
|
$longer = strtolower($first);
|
|
$shorter = strtolower($second);
|
|
} else {
|
|
$longer = strtolower($second);
|
|
$shorter = strtolower($first);
|
|
}
|
|
|
|
// Get half the length distance of shorter string
|
|
$halfLen = intval((strlen($shorter) / 2) + 1);
|
|
|
|
$match1 = $this->_getCharMatch($shorter, $longer, $halfLen);
|
|
$match2 = $this->_getCharMatch($longer, $shorter, $halfLen);
|
|
|
|
if ((strlen($match1) == 0 || strlen($match2) == 0)
|
|
|| (strlen($match1) != strlen($match2))
|
|
) {
|
|
return 0.0;
|
|
}
|
|
|
|
$trans = $this->_getTranspositions($match1, $match2);
|
|
|
|
$distance = (strlen($match1) / strlen($shorter)
|
|
+ strlen($match2) / strlen($longer)
|
|
+ (strlen($match1) - $trans)
|
|
/ strlen($match1)) / 3.0;
|
|
|
|
// Apply Winkler Adjustment
|
|
$prefixLen = min(strlen($this->_getPrefix($first, $second)), 4);
|
|
$jaroWinkler = round(($distance + (0.1 * $prefixLen * (1.0 - $distance))) * 100.0) / 100.0;
|
|
|
|
return $jaroWinkler;
|
|
}
|
|
|
|
/**
|
|
* Get Character Matches
|
|
*
|
|
* @param string $first String to match
|
|
* @param string $second String to match
|
|
* @param int $limit Limit of characters to match
|
|
*
|
|
* @return string $common Common substring
|
|
*/
|
|
private function _getCharMatch($first, $second, $limit)
|
|
{
|
|
$common = '';
|
|
$copy = $second;
|
|
$firstLen = strlen($first);
|
|
$secondLen = strlen($second);
|
|
|
|
for ($i = 0; $i < $firstLen; $i++) {
|
|
$char = $first[$i];
|
|
$found = false;
|
|
|
|
for ($j = max(0, $i - $limit); !$found && $j < min($i + $limit, $secondLen); $j++) {
|
|
if ($copy[$j] == $char) {
|
|
$found = true;
|
|
$common .= $char;
|
|
$copy[$j] = '*';
|
|
}
|
|
}
|
|
}
|
|
|
|
return $common;
|
|
}
|
|
|
|
/**
|
|
* Get Transpositions
|
|
*
|
|
* @param string $first String to match
|
|
* @param string $second String to match
|
|
*
|
|
* @return int $trans Number of transpositions between strings
|
|
*/
|
|
private function _getTranspositions($first, $second)
|
|
{
|
|
$trans = 0;
|
|
$firstLen = strlen($first);
|
|
|
|
for ($i = 0; $i < $firstLen; $i++) {
|
|
if ($first[$i] != $second[$i]) {
|
|
$trans += 1;
|
|
}
|
|
}
|
|
|
|
$trans /= 2;
|
|
return $trans;
|
|
}
|
|
|
|
/**
|
|
* Get Prefix
|
|
*
|
|
* @param string $first String to match
|
|
* @param string $second String to match
|
|
*
|
|
* @return string Returns substring representing the longest prefix
|
|
*/
|
|
private function _getPrefix($first, $second)
|
|
{
|
|
if (strlen($first) == 0 || strlen($second) == 0) {
|
|
return '';
|
|
}
|
|
|
|
$index = $this->_getDiffIndex($first, $second);
|
|
if ($index == -1) {
|
|
return $first;
|
|
} elseif ($index == 0) {
|
|
return '';
|
|
} else {
|
|
return substr($first, 0, $index);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get Difference Index
|
|
*
|
|
* @param string $first String to match
|
|
* @param string $second String to match
|
|
*
|
|
* @return Return index of first difference
|
|
*/
|
|
private function _getDiffIndex($first, $second)
|
|
{
|
|
if ($first == $second) {
|
|
return -1;
|
|
}
|
|
|
|
$maxLen = min(strlen($first), strlen($second));
|
|
for ($i = 0; $i < $maxLen; $i++) {
|
|
if ($first[$i] != $second[$i]) {
|
|
return $i;
|
|
}
|
|
}
|
|
|
|
return $maxLen;
|
|
}
|
|
|
|
/**
|
|
* Print Matrix
|
|
* Utility / Testing function for testing purposes
|
|
*
|
|
* @param array $arr 2-dimensional array representing a matrix
|
|
*
|
|
* @return void
|
|
*/
|
|
private function _printMatrix($arr)
|
|
{
|
|
$str = '';
|
|
$width = count($arr[0]);
|
|
$height = count($arr);
|
|
|
|
for ($i = 0; $i < $height; $i++) {
|
|
for ($j = 0; $j < $width; $j++) {
|
|
if (!isset($arr[$i][$j])) {
|
|
$arr[$i][$j] = ' ';
|
|
}
|
|
|
|
$str = $str . "[{$arr[$i][$j]}]";
|
|
|
|
if ($j === $width - 1) {
|
|
$str = $str . PHP_EOL;
|
|
}
|
|
}
|
|
}
|
|
|
|
print($str);
|
|
}
|
|
}
|