We had the same problem and made 2 relatively fast stored procedures.
Levenshtein distance is good but not great for autocompleting searching
"catema" will match quite well with "catamaran" even though it has a bad Levenshtein distance. Making our function well suited for type as you go searches.
We have 2 versions, 1 is optimized to work with a big indexed word table and uses the first letter to narrow down the search for significant performance gain.
SELECT fuzzy_match_first_word('catema', `word`, 80) FROM `dictionary` WHERE (`word` LIKE 'c%') AND (fuzzy_match_first_word('catema', `word`, 80)>=80)
The other version will word on bigger strings, not just comparing single words.
SELECT fuzzy_match('catema', `subject`, 80) FROM `dictionary` WHERE (fuzzy_match('catema', `subject`, 80)>=80)
Stored Procedures:
DELIMITER //
CREATE OR REPLACE FUNCTION `fuzzy_match_first_word`(`str_needle` VARCHAR(64), `str_haystack` VARCHAR(4096), `minimum_quality` INT(11)) RETURNS INT(11)
DETERMINISTIC
BEGIN
DECLARE needleLen, haystackLen, iIdx, cLen, mLen, penalty, checkSpan, shiftAmount INT DEFAULT 0;
DECLARE sChar, subCharNeedle CHAR(1) DEFAULT ' ';
DECLARE res INT DEFAULT 100;
DECLARE n INT DEFAULT 2; -- assume first letter to be ok, needs to be checked by outer like on indexed field
DECLARE shifted INT DEFAULT 4; -- how often we allow letters being moved
SET needleLen = CHAR_LENGTH(str_needle);
SET haystackLen = CHAR_LENGTH(str_haystack);
SET checkSpan = 2; -- Check_span decides how wide to check - Min: 1, Max: Not sensible beyond 5.
IF (needleLen < 1) OR (haystackLen < 1) THEN SET res = 0; ELSE
SET sChar= LEFT(str_needle,1);
IF (haystackLen <= needleLen) THEN
SET cLen = haystackLen;
SET res = res-(20*(needleLen-haystackLen)); -- 30 penalty for each missing letter
if(res < minimum_quality) THEN RETURN 0; END IF;
SET mLen = cLen;
ELSE
SET cLen = needleLen;
SET mLen = haystackLen;
END IF;
WHILE n <= cLen DO
SET subCharNeedle = SUBSTRING(str_needle, n, 1);
IF(SUBSTRING(str_haystack, n + shiftAmount, 1) <> subCharNeedle) THEN
`fail_check`:
BEGIN -- check if not correct
SET penalty = 20; -- 20% reduction for each missed letter, 5% for closeness a close hit
FOR i IN 1..checkSpan DO
-- positive (assume missing letter more likely than a added letter)
SET iIdx = (n + i);
IF (iIdx > 0) AND (iIdx <= mLen) THEN
IF (SUBSTRING(str_haystack, iIdx + shiftAmount, 1) = subCharNeedle) THEN
SET penalty = 5*i;
IF shifted > 0 THEN
SET shifted = shifted-1;
SET shiftAmount = i + shiftAmount;
END IF;
LEAVE `fail_check`;
END IF;
END IF;
-- negative
SET iIdx = (n - i);
IF (iIdx > 0) AND (iIdx <= mLen) THEN
IF (SUBSTRING(str_haystack, iIdx + shiftAmount, 1) = subCharNeedle) THEN
SET penalty = 5*i;
IF shifted > 0 THEN
SET shifted = shifted-1;
SET shiftAmount = -i + shiftAmount;
END IF;
LEAVE `fail_check`;
END IF;
END IF;
END FOR;
END; -- end of fail_check
SET res = res - penalty;
if(res < minimum_quality) THEN RETURN 0; END IF;
END IF;
SET n = n + 1;
END WHILE;
END IF;
RETURN res;
END //
DELIMITER ;
DELIMITER //
CREATE OR REPLACE FUNCTION fuzzy_match(str_needle VARCHAR(64), str_haystack VARCHAR(4096), minimum_quality INT)
RETURNS INT DETERMINISTIC CONTAINS SQL
BEGIN
DECLARE needle_len, haystack_len, cIdx, iIdx, cLen, loop_abort, n INT DEFAULT 0;
DECLARE sub_len, check_span INT;
DECLARE sSub VARCHAR(4096);
DECLARE sChar, subChar_needle, subChar_tmp CHAR(1) DEFAULT ' ';
DECLARE res, rmatch_score, minq FLOAT DEFAULT 0;
SET str_needle = UPPER(REPLACE(TRIM(str_needle),' ',''));
SET str_haystack = UPPER(REPLACE(TRIM(str_haystack),' ',''));
SET needle_len = CHAR_LENGTH(str_needle);
SET haystack_len = CHAR_LENGTH(str_haystack);
SET minq = (minimum_quality / 100.0);
SET check_span = 2; -- Check_span decides how wide to check - Min: 1, Max: Not sensible beyond 5.
SET sChar= LEFT(str_needle,1);
IF (needle_len > 0) AND (haystack_len > 0) THEN
REPEAT
SET cIdx = IFNULL(LOCATE(sChar, str_haystack, cIdx+1), 0);
IF (cIdx > 0) THEN
SET sSub = SUBSTRING(str_haystack, cIdx, needle_len+1);
SET cLen = CHAR_LENGTH(sSub);
SET sub_len = CHAR_LENGTH(sSub);
SET cLen = (sub_len * (sub_len < needle_len)) + (needle_len * (sub_len >= needle_len));
SET rmatch_score = 0;
WHILE (loop_abort = 0) AND (n < cLen) DO
SET n = n + 1;
SET subChar_needle = SUBSTRING(str_needle, n, 1);
IF (subChar_tmp <> subChar_needle) THEN
SET subChar_tmp = subChar_needle;
FOR i IN -check_span..check_span DO
SET iIdx = (n + i - 1);
IF (iIdx >= 0) AND (iIdx < cLen) THEN
IF (subChar_needle = SUBSTRING(sSub, iIdx + 1, 1)) THEN
SET rmatch_score = rmatch_score + (check_span + 1 - ABS(i));
END IF;
END IF;
END FOR;
SET loop_abort = ((rmatch_score / (check_span * n)) < minq);
ELSE
SET rmatch_score = rmatch_score + check_span;
END IF;
END WHILE;
SET res = (rmatch_score / ((check_span + 1) * needle_len));
END IF;
UNTIL (cIdx <= 0) OR (res >= 1) END REPEAT;
END IF;
RETURN (res >= minq) * ROUND(res * 100);
END //
DELIMITER ;