算法实现/字符串/Levenshtein 距离
本页面上的 Levenshtein 算法实现仅供说明。在大多数情况下,应用程序将使用在堆分配上谨慎使用的实现,特别是在比较大量单词时。以下备注说明了这种方法和相关主题的一些变化。
- 大多数实现使用一维或二维数组来存储所比较单词的前缀的距离。在大多数应用程序中,这些结构的大小是预先知道的。例如,当距离仅在低于某个最大允许距离时才相关时,就会出现这种情况(当从字典中选择单词以近似匹配给定单词时,就会发生这种情况)。在这种情况下,数组可以预先分配并在对连续单词进行算法的多次运行中重复使用。
- 使用最大允许距离对搜索时间设置上限。一旦字符串前缀之间的最小 Levenshtein 距离超过最大允许距离,搜索就可以停止。
- 字符的删除、插入和替换可以分配不同的权重。通常的选择是将所有三个权重设置为 1。这些权重的不同值允许在单词列表中进行更灵活的搜索策略。
function levenshtein(s1:String, s2:String):uint
{
const len1:uint = s1.length, len2:uint = s2.length;
var d:Vector.<Vector.<uint> >=new Vector.<Vector.<uint> >(len1+1);
for(i=0; i<=len1; ++i)
d[i] = new Vector.<uint>(len2+1);
d[0][0]=0;
var i:int;
var j:int;
for(i=1; i<=len1; ++i) d[i][0]=i; //int faster than uint
for(j=1; j<=len2; ++j) d[0][j]=j;
for(i = 1; i <= len1; ++i)
for(j = 1; j <= len2; ++j)
d[i][j] = Math.min( Math.min(d[i - 1][j] + 1,d[i][j - 1] + 1),
d[i - 1][j - 1] + (s1.charAt(i - 1) == s2.charAt(j - 1) ? 0 : 1) );
return(d[len1][len2]);
}
function Levenshtein(Left, Right : String) return Natural is
D : array(0 .. Left'Last, 0 .. Right'Last) of Natural;
begin
for I in D'range(1) loop D(I, 0) := I;end loop;
for J in D'range(2) loop D(0, J) := J;end loop;
for I in Left'range loop
for J in Right'range loop
D(I, J) := Natural'Min(D(I - 1, J), D(I, J - 1)) + 1;
D(I, J) := Natural'Min(D(I, J), D(I - 1, J - 1) + Boolean'Pos(Left(I) /= Right(J)));
end loop;
end loop;
return D(D'Last(1), D'Last(2));
end Levenshtein;
function levenshtein(str1, str2, cost_ins, cost_rep, cost_del, str1_len, str2_len, matrix, is_match, i, j, x, y, z) {
if(cost_ins == "") cost_ins = 1
if(cost_rep == "") cost_rep = 1
if(cost_del == "") cost_del = 1
str1_len = length(str1)
str2_len = length(str2)
if(str1_len == 0) return str2_len * cost_ins
if(str2_len == 0) return str1_len * cost_del
matrix[0, 0] = 0
for(i = 1; i <= str1_len; i++) {
matrix[i, 0] = i * cost_del
for(j = 1; j <= str2_len; j++) {
matrix[0, j] = j * cost_ins
x = matrix[i - 1, j] + cost_del
y = matrix[i, j - 1] + cost_ins
z = matrix[i - 1, j - 1] + (substr(str1, i, 1) == substr(str2, j, 1) ? 0 : cost_rep)
x = x < y ? x : y
matrix[i, j] = x < z ? x : z
}
}
return matrix[str1_len, str2_len]
}
#!/bin/bash
function levenshtein {
if (( $# != 2 )); then
echo "Usage: $0 word1 word2" >&2
elif (( ${#1} < ${#2} )); then
levenshtein "$2" "$1"
else
local str1len=${#1}
local str2len=${#2}
local d
for (( i = 0; i <= (str1len+1)*(str2len+1); i++ )); do
d[i]=0
done
for (( i = 0; i <= str1len; i++ )); do
d[i+0*str1len]=$i
done
for (( j = 0; j <= str2len; j++ )); do
d[0+j*(str1len+1)]=$j
done
for (( j = 1; j <= str2len; j++ )); do
for (( i = 1; i <= str1len; i++ )); do
[ "${1:i-1:1}" = "${2:j-1:1}" ] && local cost=0 || local cost=1
del=$(( d[(i-1)+str1len*j]+1 ))
ins=$(( d[i+str1len*(j-1)]+1 ))
alt=$(( d[(i-1)+str1len*(j-1)]+cost ))
d[i+str1len*j]=$( echo -e "$del\n$ins\n$alt" | sort -n | head -1 )
done
done
echo ${d[str1len+str1len*(str2len)]}
fi
}
(defun levenshtein-distance (str1 str2)
"Calculates the Levenshtein distance between str1 and str2, returns an editing distance (int)."
(let ((n (length str1))
(m (length str2)))
;; Check trivial cases
(cond ((= 0 n) (return-from levenshtein-distance m))
((= 0 m) (return-from levenshtein-distance n)))
(let ((col (make-array (1+ m) :element-type 'integer))
(prev-col (make-array (1+ m) :element-type 'integer)))
;; We need to store only two columns---the current one that
;; is being built and the previous one
(dotimes (i (1+ m))
(setf (svref prev-col i) i))
;; Loop across all chars of each string
(dotimes (i n)
(setf (svref col 0) (1+ i))
(dotimes (j m)
(setf (svref col (1+ j))
(min (1+ (svref col j))
(1+ (svref prev-col (1+ j)))
(+ (svref prev-col j)
(if (char-equal (schar str1 i) (schar str2 j)) 0 1)))))
(rotatef col prev-col))
(svref prev-col m))))
int min(int a, int b, int c)
{
if(a <= b && a <= c)
{
return a;
}
else if(b <= a && b <= c)
{
return b;
}
else if(c <= a && c <= b)
{
return c;
}
}
int levenshtein(char *s1, char *s2) {
unsigned int x, y, s1len, s2len;
s1len = strlen(s1);
s2len = strlen(s2);
unsigned int matrix[s2len+1][s1len+1];
matrix[0][0] = 0;
for (x = 1; x <= s2len; x++)
matrix[x][0] = matrix[x-1][0] + 1;
for (y = 1; y <= s1len; y++)
matrix[0][y] = matrix[0][y-1] + 1;
for (x = 1; x <= s2len; x++)
for (y = 1; y <= s1len; y++)
matrix[x][y] = min(matrix[x-1][y] + 1, matrix[x][y-1] + 1, matrix[x-1][y-1] + (s1[y-1] == s2[x-1] ? 0 : 1));
return(matrix[s2len][s1len]);
}
以上代码可以优化为使用 O(min(m,n)) 空间而不是 O(mn)。关键观察是,我们只需要在逐列填充矩阵时访问前一列的内容。因此,我们可以一遍又一遍地重复使用一列,在继续时覆盖其内容。
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
int levenshtein(char *s1, char *s2) {
unsigned int s1len, s2len, x, y, lastdiag, olddiag;
s1len = strlen(s1);
s2len = strlen(s2);
unsigned int column[s1len + 1];
for (y = 1; y <= s1len; y++)
column[y] = y;
for (x = 1; x <= s2len; x++) {
column[0] = x;
for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
olddiag = column[y];
column[y] = MIN3(column[y] + 1, column[y - 1] + 1, lastdiag + (s1[y-1] == s2[x - 1] ? 0 : 1));
lastdiag = olddiag;
}
}
return column[s1len];
}
unsigned int edit_distance(const std::string& s1, const std::string& s2)
{
const std::size_t len1 = s1.size(), len2 = s2.size();
std::vector<std::vector<unsigned int>> d(len1 + 1, std::vector<unsigned int>(len2 + 1));
d[0][0] = 0;
for(unsigned int i = 1; i <= len1; ++i) d[i][0] = i;
for(unsigned int i = 1; i <= len2; ++i) d[0][i] = i;
for(unsigned int i = 1; i <= len1; ++i)
for(unsigned int j = 1; j <= len2; ++j)
// note that std::min({arg1, arg2, arg3}) works only in C++11,
// for C++98 use std::min(std::min(arg1, arg2), arg3)
d[i][j] = std::min({ d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) });
return d[len1][len2];
}
请注意,使用
vector<vector<>>
在这里并不高效,因为您不需要二维数组。
以下是另一个只使用 内存的实现(它还利用了 这个事实,因此不需要从 3 个可能性中取最小值)。
#include <algorithm>
#include <vector>
template<typename T>
typename T::size_type LevenshteinDistance(const T &source, const T &target) {
if (source.size() > target.size()) {
return LevenshteinDistance(target, source);
}
using TSizeType = typename T::size_type;
const TSizeType min_size = source.size(), max_size = target.size();
std::vector<TSizeType> lev_dist(min_size + 1);
for (TSizeType i = 0; i <= min_size; ++i) {
lev_dist[i] = i;
}
for (TSizeType j = 1; j <= max_size; ++j) {
TSizeType previous_diagonal = lev_dist[0], previous_diagonal_save;
++lev_dist[0];
for (TSizeType i = 1; i <= min_size; ++i) {
previous_diagonal_save = lev_dist[i];
if (source[i - 1] == target[j - 1]) {
lev_dist[i] = previous_diagonal;
} else {
lev_dist[i] = std::min(std::min(lev_dist[i - 1], lev_dist[i]), previous_diagonal) + 1;
}
previous_diagonal = previous_diagonal_save;
}
}
return lev_dist[min_size];
}
以下是具有不同插入、删除和替换成本的广义 Levenshtein 距离的实现。
#include <algorithm>
#include <vector>
template<typename T>
typename T::size_type GeneralizedLevenshteinDistance(const T &source,
const T &target,
typename T::size_type insert_cost = 1,
typename T::size_type delete_cost = 1,
typename T::size_type replace_cost = 1) {
if (source.size() > target.size()) {
return GeneralizedLevenshteinDistance(target, source, delete_cost, insert_cost, replace_cost);
}
using TSizeType = typename T::size_type;
const TSizeType min_size = source.size(), max_size = target.size();
std::vector<TSizeType> lev_dist(min_size + 1);
lev_dist[0] = 0;
for (TSizeType i = 1; i <= min_size; ++i) {
lev_dist[i] = lev_dist[i - 1] + delete_cost;
}
for (TSizeType j = 1; j <= max_size; ++j) {
TSizeType previous_diagonal = lev_dist[0], previous_diagonal_save;
lev_dist[0] += insert_cost;
for (TSizeType i = 1; i <= min_size; ++i) {
previous_diagonal_save = lev_dist[i];
if (source[i - 1] == target[j - 1]) {
lev_dist[i] = previous_diagonal;
} else {
lev_dist[i] = std::min(std::min(lev_dist[i - 1] + delete_cost, lev_dist[i] + insert_cost), previous_diagonal + replace_cost);
}
previous_diagonal = previous_diagonal_save;
}
}
return lev_dist[min_size];
}
private int levenshtein(string a, string b)
{
if (string.IsNullOrEmpty(a))
{
if (!string.IsNullOrEmpty(b))
{
return b.Length;
}
return 0;
}
if (string.IsNullOrEmpty(b))
{
if (!string.IsNullOrEmpty(a))
{
return a.Length;
}
return 0;
}
int cost;
int[,] d = new int[a.Length + 1, b.Length + 1];
int min1;
int min2;
int min3;
for (int i = 0; i <= d.GetUpperBound(0); i += 1)
{
d[i, 0] = i;
}
for (int i = 0; i <= d.GetUpperBound(1); i += 1)
{
d[0, i] = i;
}
for (int i = 1; i <= d.GetUpperBound(0); i += 1)
{
for (int j = 1; j <= d.GetUpperBound(1); j += 1)
{
cost = (a[i-1] != b[j-1])? 1 : 0;
min1 = d[i - 1, j] + 1;
min2 = d[i, j - 1] + 1;
min3 = d[i - 1, j - 1] + cost;
d[i, j] = Math.Min(Math.Min(min1, min2), min3);
}
}
return d[d.GetUpperBound(0), d.GetUpperBound(1)];
}
具有减少内存使用量的实现
public int LevenshteinDistance(string source, string target){
if(string.IsNullOrEmpty(source)){
if(string.IsNullOrEmpty(target)) return 0;
return target.Length;
}
if(string.IsNullOrEmpty(target)) return source.Length;
if(source.Length > target.Length){
var temp = target;
target = source;
source = temp;
}
var m = target.Length;
var n = source.Length;
var distance = new int[2, m + 1];
// Initialize the distance matrix
for(var j = 1; j <= m; j++) distance[0, j] = j;
var currentRow = 0;
for(var i = 1; i <= n; ++i){
currentRow = i & 1;
distance[currentRow, 0] = i;
var previousRow = currentRow ^ 1;
for(var j = 1; j <= m; j++) {
var cost = (target[j - 1] == source[i - 1] ? 0 : 1);
distance[currentRow, j] = Math.Min(Math.Min(
distance[previousRow, j] + 1,
distance[currentRow, j - 1] + 1),
distance[previousRow, j - 1] + cost);
}
}
return distance[currentRow, m];
}
Damerau-Levenshtein 距离在渐进时间 O ((max + 1) * min (first.length (), second.length ())) 内计算。
/// <summary>
/// Damerau-Levenshtein distance
/// </summary>
public class DamerauLevensteinMetric
{
private const int DEFAULT_LENGTH = 255;
private int[] _currentRow;
private int[] _previousRow;
private int[] _transpositionRow;
public DamerauLevensteinMetric()
: this(DEFAULT_LENGTH)
{
}
/// <summary>
///
/// </summary>
/// <param name="maxLength"></param>
public DamerauLevensteinMetric(int maxLength)
{
_currentRow = new int[maxLength + 1];
_previousRow = new int[maxLength + 1];
_transpositionRow = new int[maxLength + 1];
}
/// <summary>
/// Damerau-Levenshtein distance is computed in asymptotic time O((max + 1) * min(first.length(), second.length()))
/// </summary>
/// <param name="first"></param>
/// <param name="second"></param>
/// <param name="max"></param>
/// <returns></returns>
public int GetDistance(string first, string second, int max)
{
int firstLength = first.Length;
int secondLength = second.Length;
if (firstLength == 0)
return secondLength;
if (secondLength == 0) return firstLength;
if (firstLength > secondLength)
{
string tmp = first;
first = second;
second = tmp;
firstLength = secondLength;
secondLength = second.Length;
}
if (max < 0) max = secondLength;
if (secondLength - firstLength > max) return max + 1;
if (firstLength > _currentRow.Length)
{
_currentRow = new int[firstLength + 1];
_previousRow = new int[firstLength + 1];
_transpositionRow = new int[firstLength + 1];
}
for (int i = 0; i <= firstLength; i++)
_previousRow[i] = i;
char lastSecondCh = (char) 0;
for (int i = 1; i <= secondLength; i++)
{
char secondCh = second[i - 1];
_currentRow[0] = i;
// Compute only diagonal stripe of width 2 * (max + 1)
int from = Math.Max(i - max - 1, 1);
int to = Math.Min(i + max + 1, firstLength);
char lastFirstCh = (char) 0;
for (int j = from; j <= to; j++)
{
char firstCh = first[j - 1];
// Compute minimal cost of state change to current state from previous states of deletion, insertion and swapping
int cost = firstCh == secondCh ? 0 : 1;
int value = Math.Min(Math.Min(_currentRow[j - 1] + 1, _previousRow[j] + 1), _previousRow[j - 1] + cost);
// If there was transposition, take in account its cost
if (firstCh == lastSecondCh && secondCh == lastFirstCh)
value = Math.Min(value, _transpositionRow[j - 2] + cost);
_currentRow[j] = value;
lastFirstCh = firstCh;
}
lastSecondCh = secondCh;
int[] tempRow = _transpositionRow;
_transpositionRow = _previousRow;
_previousRow = _currentRow;
_currentRow = tempRow;
}
return _previousRow[firstLength];
}
}
(defn nextelt
"Given two characters, the previous row, and a row we are
building, determine out the next element for this row."
[char1 char2 prevrow thisrow position]
(if (= char1 char2)
(prevrow (- position 1))
(+ 1 (min
(prevrow (- position 1))
(prevrow position)
(last thisrow))))
)
(defn nextrow
"Based on the next character from string1 and the whole of string2
calculate the next row. Initially thisrow contains one number."
[char1 str2 prevrow thisrow]
(let [char2 (first str2)
position (count thisrow)]
(if (= (count thisrow) (count prevrow))
thisrow
(recur
char1
(rest str2)
prevrow
(conj thisrow (nextelt char1 char2 prevrow thisrow position))))))
(defn levenshtein
"Calculate the Levenshtein distance between two strings."
([str1 str2]
(let [row0 (vec (map first (map vector (iterate inc 1) str2)))]
(levenshtein 1 (vec (cons 0 row0)) str1 str2)))
([row-nr prevrow str1 str2]
(let [next-row (nextrow (first str1) str2 prevrow (vector row-nr))
str1-remainder (.substring str1 1)]
(if (= "" str1-remainder)
(last next-row)
(recur (inc row-nr) next-row str1-remainder str2))))
)
另一个使用瞬态数据结构的实现,灵感来自 Common Lisp 版本
(defn levenshtein [str1 str2]
"a Clojure levenshtein implementation using transient data structure"
(let [n (count str1) m (count str2)]
(cond
(= 0 n) m
(= 0 m) n
:else
(let [prev-col (transient (vec (range (inc m)))) col (transient [])] ; initialization for the first column.
(dotimes [i n]
(assoc! col 0 (inc i)) ; update col[0]
(dotimes [j m]
(assoc! col (inc j) ; update col[1..m]
(min (inc (nth col j))
(inc (nth prev-col (inc j)))
(+ (get prev-col j) (if (= (nth str1 i) (nth str2 j)) 0 1)))))
(dotimes [i (count prev-col)]
(assoc! prev-col i (get col i)))) ;
(last (persistent! col)))))) ; last element of last column
import std.string;
import std.algorithm; // for min, already defines levenshteinDistance in library
pure nothrow size_t damerauDistance(const string source, const string target) {
immutable auto sourceLength = source.length;
immutable auto targetLength = target.length;
auto distance = new size_t[][](sourceLength + 1, targetLength+1);
for (auto iSource = 0; iSource <= sourceLength; ++iSource) {
distance[iSource][0] = iSource;
}
for (auto iTarget = 1; iTarget <= targetLength; ++iTarget) {
distance[0][iTarget] = iTarget;
}
for (auto iSource = 1; iSource <= sourceLength; ++iSource) {
for (auto iTarget = 1; iTarget <= targetLength; ++iTarget) {
auto delete1 = distance[iSource - 1][iTarget] + 1;
auto insert1 = distance[iSource][iTarget - 1] + 1;
size_t marginalCost = source[iSource - 1] == target[iTarget - 1] ? 0 : 1;
auto substitute = distance[iSource - 1][iTarget - 1] + marginalCost;
distance[iSource][iTarget] = min(delete1, insert1, substitute);
if (iSource > 1 && iTarget > 1 &&
source[iSource - 1] == target[iTarget - 2] &&
source[iSource - 2] == target[iTarget-1]) {
distance[iSource][iTarget] =
min(distance[iSource][iTarget], distance[iSource - 2][iTarget - 2] + marginalCost);
}
}
}
return distance[sourceLength][targetLength];
}
一个简单的实现,当然可以改进。
function Levenshtein(Word1, Word2: String): integer;
var lev : array of array of integer;
i,j : integer;
begin
// Word1 := LowerCase(Word1);
// Word2 := LowerCase(Word2);
// If the words are identical, do nothing
if Word1 = Word2 then
begin
result := 0;
exit;
end;
SetLength(lev, length(Word1) + 1);
for i := low(lev) to high(lev) do
setLength(lev[i], length(Word2) + 1);
for i := low(lev) to high(lev) do lev[i][0] := i;
for j := low(lev[low(lev)]) to high(lev[low(lev)]) do lev[0][j] := j;
for i := low(lev)+1 to high(lev) do
for j := low(lev[i])+1 to high(lev[i]) do
lev[i][j] := min(min(lev[i-1][j] + 1,lev[i][j-1] + 1)
,lev[i-1][j-1] + ifthen(Word1[i] = Word2[j], 0, 1));
result := lev[length(word1)][length(word2)];
end;
distance (a, b: STRING): INTEGER
-- Compute Levenshtein distance between strings `a' and `b'.
note
synopsis: "[
Example: Calculate distance between "sitting" and "kitten":
k i t t e n
0 1 2 3 4 5 6 <-- set top row to [0 .. m]
s 1 1 2 3 4 5 6
i 2 2 1 2 3 4 5
t 3 3 2 1 2 3 4
t 4 4 3 2 1 2 3
i 5 5 4 3 2 2 3
n 6 6 5 4 3 3 2
g 7 7 6 5 4 4 3! <-- Answer
^
|
set first column to [0 .. n]
Compute top row and left-most columns first. Then go cell-by-cell
(e.g. 1 .. n over 1 .. m), row-by-row, column-by-column, computing:
d[i-1, j] + 1 -- a deletion
d[i, j-1] + 1 -- an insertion
d[i-1, j-1] + 1 -- a substitution
Taking the minimum of each and putting it in its proper place in
the d[i, j] array, unless the char's are the same (e.g. a [i - 1] = b [j - 1])
]"
language_note: "[
Eiffel is a 1-based array system and not 0-based, so do not
read the array element accesses from the 0-based mindset.
]"
local
d: ARRAY2 [INTEGER]
i, j, m, n: INTEGER
do
m := a.count + 1; n := b.count + 1
create d.make_filled (0, m, n)
from i := 0 until i = m loop d [i + 1, 1] := i; i := i + 1 end
from j := 0 until j = n loop d [1, j + 1] := j; j := j + 1 end
from j := 2 until j > n loop
from i := 2 until i > m loop
if a [i - 1] = b [j - 1] then
d[i, j] := d[i-1, j-1]
else
d[i, j] := (d[i-1, j-1] + 1).min (d[i, j-1] + 1).min (d[i-1, j] + 1)
end
i := i + 1
end
j := j + 1
end
Result := d[m, n]
end
(defun levenshtein-distance (str1 str2)
"Return the edit distance between strings STR1 and STR2."
(if (not (stringp str1))
(error "Argument was not a string: %s" str1))
(if (not (stringp str2))
(error "Argument was not a string: %s" str2))
(let* ((make-table (function (lambda (columns rows init)
(make-vector rows (make-vector columns init)))))
(tref (function (lambda (table x y)
(aref (aref table y) x))))
(tset (function (lambda (table x y object)
(let ((row (copy-sequence (aref table y))))
(aset row x object)
(aset table y row) object))))
(length-str1 (length str1))
(length-str2 (length str2))
(d (funcall make-table (1+ length-str1) (1+ length-str2) 0)))
(let ((i 0) (j 0))
(while (<= i length-str1)
(funcall tset d i 0 i)
(setq i (1+ i)))
(while (<= j length-str2)
(funcall tset d 0 j j)
(setq j (1+ j))))
(let ((i 1))
(while (<= i length-str1)
(let ((j 1))
(while (<= j length-str2)
(let* ((cost (if (equal (aref str1 (1- i)) (aref str2 (1- j)))
0
1))
(deletion (1+ (funcall tref d (1- i) j)))
(insertion (1+ (funcall tref d i (1- j))))
(substitution (+ (funcall tref d (1- i) (1- j)) cost)))
(funcall tset d i j (min insertion deletion substitution)))
(setq j (1+ j))))
(setq i (1+ i))))
(funcall tref d length-str1 length-str2)))
levenshtein(A, []) -> length(A);
levenshtein([], B) -> length(B);
levenshtein([A | TA] = AA, [B | TB] = BA) ->
lists:min([levenshtein(TA, BA) + 1,
levenshtein(AA, TB) + 1,
levenshtein(TA, TB) + delta(A, B)]).
delta(_A, _A) -> 0;
delta(_A, _B) -> 1.
内联的 min 函数提供了很大的速度提升。
let inline min3 one two three =
if one < two && one < three then one
elif two < three then two
else three
let wagnerFischer (s: string) (t: string) =
let m = s.Length
let n = t.Length
let d = Array2D.create (m + 1) (n + 1) 0
for i = 0 to m do d.[i, 0] <- i
for j = 0 to n do d.[0, j] <- j
for j = 1 to n do
for i = 1 to m do
if s.[i-1] = t.[j-1] then
d.[i, j] <- d.[i-1, j-1]
else
d.[i, j] <-
min3
(d.[i-1, j ] + 1) // a deletion
(d.[i , j-1] + 1) // an insertion
(d.[i-1, j-1] + 1) // a substitution
d.[m,n]
以下是一个稍微快一点的延迟版本。
let wagnerFischerLazy (s: string) (t: string) =
let m = s.Length
let n = t.Length
let d = Array2D.create (m+1) (n+1) -1
let rec dist =
function
| i, 0 -> i
| 0, j -> j
| i, j when d.[i,j] <> -1 -> d.[i,j]
| i, j ->
let dval =
if s.[i-1] = t.[j-1] then dist (i-1, j-1)
else
min3
(dist (i-1, j) + 1) // a deletion
(dist (i, j-1) + 1) // an insertion
(dist (i-1, j-1) + 1) // a substitution
d.[i, j] <- dval; dval
dist (m, n)
此版本使用动态规划,时间复杂度为 ,其中 和 分别是 和 的长度,空间复杂度为 个整数加上一些常数空间(即 )。
import "unicode/utf8"
func Levenshtein(a, b string) int {
f := make([]int, utf8.RuneCountInString(b)+1)
for j := range f {
f[j] = j
}
for _, ca := range a {
j := 1
fj1 := f[0] // fj1 is the value of f[j - 1] in last iteration
f[0]++
for _, cb := range b {
mn := min(f[j]+1, f[j-1]+1) // delete & insert
if cb != ca {
mn = min(mn, fj1+1) // change
} else {
mn = min(mn, fj1) // matched
}
fj1, f[j] = f[j], mn // save f[j] to fj1(j is about to increase), update f[j] to mn
j++
}
}
return f[len(f)-1]
}
func min(a, b int) int {
if a <= b {
return a
} else {
return b
}
}
此版本基于下面的 Java 版本。
class Levenshtein {
def static int distance(String str1, String str2) {
def str1_len = str1.length()
def str2_len = str2.length()
int[][] distance = new int[str1_len + 1][str2_len + 1]
(str1_len + 1).times { distance[it][0] = it }
(str2_len + 1).times { distance[0][it] = it }
(1..str1_len).each { i ->
(1..str2_len).each { j ->
distance[i][j] = [distance[i-1][j]+1, distance[i][j-1]+1, str1[i-1]==str2[j-1]?distance[i-1][j-1]:distance[i-1][j-1]+1].min()
}
}
distance[str1_len][str2_len]
}
}
使用 GHCi 测试。
levenshtein::String->String->Int
levenshtein s t =
d!!(length s)!!(length t)
where d = [[distance m n|n<-[0..length t]]|m<-[0..length s]]
distance i 0 = i
distance 0 j = j
distance i j = minimum [d!!(i-1)!!j+1, d!!i!!(j-1)+1, d!!(i-1)!!(j-1) + (if s!!(i-1)==t!!(j-1) then 0 else 1)]
这里有一个稍微快一点的版本。
levenshtein::String->String->Int
levenshtein s t =
d!!(length s)!!(length t)
where d = [[distance m n|n<-[0..length t]]|m<-[0..length s]]
distance i 0 = i
distance 0 j = j
distance i j = if s!!(i-1)==t!!(j-1) then d!!(i-1)!!(j-1) else
1 + minimum [d!!(i-1)!!j, d!!i!!(j-1), d!!(i-1)!!(j-1)]
对于大型字符串,使用数组快得多
import Data.Array.Unboxed
import Data.Array.ST
import Control.Monad.ST
for_ xs f = mapM_ f xs
levenshtein :: [Char] -> [Char] -> Int
levenshtein s t = d ! (ls , lt)
where s' = array (0,ls) [ (i,x) | (i,x) <- zip [0..] s ]::UArray Int Char
t' = array (0,lt) [ (i,x) | (i,x) <- zip [0..] t ]::UArray Int Char
ls = length s
lt = length t
(l,h) = ((0,0),(length s,length t))
d = runSTUArray $ do
m <- newArray (l,h) 0 :: ST s (STUArray s (Int,Int) Int)
for_ [0..ls] $ \i -> writeArray m (i,0) i
for_ [0..lt] $ \j -> writeArray m (0,j) j
for_ [1..lt] $ \j -> do
for_ [1..ls] $ \i -> do
let c = if s'!(i-1)==t'! (j-1)
then 0 else 1
x <- readArray m (i-1,j)
y <- readArray m (i,j-1)
z <- readArray m (i-1,j-1)
writeArray m (i,j) $ minimum [x+1, y+1, z+c ]
return m
作为递归定义的数组
import Array
toArray l = listArray (0, length l - 1) l -- makes an Array from a List
mkArray f bnds = array bnds [ (i, f i) | i <- range bnds ] -- defines an Array over its indices
levenshtein sa sb = table
where
arrA = toArray sa
arrB = toArray sb
table = mkArray f ((0,0), (length sa , length sb))
f (ia, 0) = ia
f (0 ,ib) = ib
f (ia,ib)
| a == b = table ! (ia-1,ib-1)
| otherwise = 1 + minimum [ table ! x | x <- [(ia-1,ib-1),(ia-1,ib),(ia,ib-1)] ]
where
a = arrA ! (ia - 1)
b = arrB ! (ib - 1)
最后:快速但难以理解的实现
levenshtein2 sa sb = last $ foldl transform [0..length sa] sb
where
transform xs@(x:xs') c = scanl compute (x+1) (zip3 sa xs xs')
where
compute z (c', x, y) = minimum [y+1, z+1, x + fromEnum (c' /= c)]
Levenshtein := method(left, right,
if(right size < left size, return Levenshtein(right, left))
current := 0 to(left size) asList
right foreach(i, righti,
previous := current
current = List clone with(i + 1)
left foreach(j, leftj,
current append((current at(j) + 1) min(previous at(j + 1) + 1) min(previous at(j) + if(leftj == righti, 0, 1)))
)
)
current last
)
慢速递归版本
function levenshteinDistance (s, t) {
if (s.length === 0) return t.length;
if (t.length === 0) return s.length;
return Math.min(
levenshteinDistance(s.substr(1), t) + 1,
levenshteinDistance(t.substr(1), s) + 1,
levenshteinDistance(s.substr(1), t.substr(1)) + (s[0] !== t[0] ? 1 : 0)
);
}
使用动态规划的更快方法
// Compute the edit distance between the two given strings
function getEditDistance(a, b) {
if (a.length === 0) return b.length;
if (b.length === 0) return a.length;
var matrix = [];
// increment along the first column of each row
var i;
for (i = 0; i <= b.length; i++) {
matrix[i] = [i];
}
// increment each column in the first row
var j;
for (j = 0; j <= a.length; j++) {
matrix[0][j] = j;
}
// Fill in the rest of the matrix
for (i = 1; i <= b.length; i++) {
for (j = 1; j <= a.length; j++) {
if (b.charAt(i-1) == a.charAt(j-1)) {
matrix[i][j] = matrix[i-1][j-1];
} else {
matrix[i][j] = Math.min(matrix[i-1][j-1] + 1, // substitution
Math.min(matrix[i][j-1] + 1, // insertion
matrix[i-1][j] + 1)); // deletion
}
}
}
return matrix[b.length][a.length];
};
public class LevenshteinDistance {
private static int minimum(int a, int b, int c) {
return Math.min(Math.min(a, b), c);
}
public static int computeLevenshteinDistance(CharSequence lhs, CharSequence rhs) {
int[][] distance = new int[lhs.length() + 1][rhs.length() + 1];
for (int i = 0; i <= lhs.length(); i++)
distance[i][0] = i;
for (int j = 1; j <= rhs.length(); j++)
distance[0][j] = j;
for (int i = 1; i <= lhs.length(); i++)
for (int j = 1; j <= rhs.length(); j++)
distance[i][j] = minimum(
distance[i - 1][j] + 1,
distance[i][j - 1] + 1,
distance[i - 1][j - 1] + ((lhs.charAt(i - 1) == rhs.charAt(j - 1)) ? 0 : 1));
return distance[lhs.length()][rhs.length()];
}
}
非递归,更快
public int levenshteinDistance (CharSequence lhs, CharSequence rhs) {
int len0 = lhs.length() + 1;
int len1 = rhs.length() + 1;
// the array of distances
int[] cost = new int[len0];
int[] newcost = new int[len0];
// initial cost of skipping prefix in String s0
for (int i = 0; i < len0; i++) cost[i] = i;
// dynamically computing the array of distances
// transformation cost for each letter in s1
for (int j = 1; j < len1; j++) {
// initial cost of skipping prefix in String s1
newcost[0] = j;
// transformation cost for each letter in s0
for(int i = 1; i < len0; i++) {
// matching current letters in both strings
int match = (lhs.charAt(i - 1) == rhs.charAt(j - 1)) ? 0 : 1;
// computing cost for each transformation
int cost_replace = cost[i - 1] + match;
int cost_insert = cost[i] + 1;
int cost_delete = newcost[i - 1] + 1;
// keep minimum cost
newcost[i] = Math.min(Math.min(cost_insert, cost_delete), cost_replace);
}
// swap cost/newcost arrays
int[] swap = cost; cost = newcost; newcost = swap;
}
// the distance is the cost for transforming all letters in both strings
return cost[len0 - 1];
}
直接递归实现。不要用于大型字符串!
function levenshtein(lhs::String, rhs::String, lhsi::Int = length(lhs), rhsi::Int = length(rhs))
if lhsi == 0
return rhsi
end
if rhsi == 0
return lhsi
end
cost = lhs[lhsi] == rhs[rhsi] ? 0 : 1
min(
levenshtein(lhs, rhs, lhsi - 1, rhsi) + 1,
levenshtein(lhs, rhs, lhsi, rhsi - 1) + 1,
levenshtein(lhs, rhs, lhsi - 1, rhsi - 1) + cost
)
end
使用以下测试
@show levenshtein("aaa", "ab")
@show levenshtein("kitten", "sitting")
fun levenshtein(lhs : CharSequence, rhs : CharSequence) : Int {
val lhsLength = lhs.length
val rhsLength = rhs.length
var cost = IntArray(lhsLength + 1) { it }
var newCost = IntArray(lhsLength + 1) { 0 }
for (i in 1..rhsLength) {
newCost[0] = i
for (j in 1..lhsLength) {
val editCost= if(lhs[j - 1] == rhs[i - 1]) 0 else 1
val costReplace = cost[j - 1] + editCost
val costInsert = cost[j] + 1
val costDelete = newCost[j - 1] + 1
newCost[j] = minOf(costInsert, costDelete, costReplace)
}
val swap = cost
cost = newCost
newCost = swap
}
return cost[lhsLength]
}
此实现似乎已损坏;但我不能确定如何修复它。
DROP FUNCTION IF EXISTS `levenshtein`;
CREATE FUNCTION `levenshtein`(`s1` VARCHAR(255) CHARACTER SET utf8, `s2` VARCHAR(255) CHARACTER SET utf8)
RETURNS TINYINT UNSIGNED
NO SQL
DETERMINISTIC
BEGIN
DECLARE s1_len, s2_len, i, j, c, c_temp TINYINT UNSIGNED;
-- max strlen=255 for this function
DECLARE cv0, cv1 VARBINARY(256);
-- if any param is NULL return NULL
-- (consistent with builtin functions)
IF (s1 + s2) IS NULL THEN
RETURN NULL;
END IF;
SET s1_len = CHAR_LENGTH(s1),
s2_len = CHAR_LENGTH(s2),
cv1 = 0x00,
j = 1,
i = 1,
c = 0;
-- if any string is empty,
-- distance is the length of the other one
IF (s1 = s2) THEN
RETURN 0;
ELSEIF (s1_len = 0) THEN
RETURN s2_len;
ELSEIF (s2_len = 0) THEN
RETURN s1_len;
END IF;
WHILE (j <= s2_len) DO
SET cv1 = CONCAT(cv1, CHAR(j)),
j = j + 1;
END WHILE;
WHILE (i <= s1_len) DO
SET c = i,
cv0 = CHAR(i),
j = 1;
-- What for syntax of WHILE is that?? MYSQL do not now. Please help.
WHILE (j <= c_temp) THEN
SET c = c_temp;
END IF;
SET c_temp = ORD(SUBSTRING(cv1, j+1, 1)) + 1;
IF (c > c_temp) THEN
SET c = c_temp;
END IF;
SET cv0 = CONCAT(cv0, CHAR(c)),
j = j + 1;
END WHILE;
SET cv1 = cv0,
i = i + 1;
END WHILE;
RETURN c;
END;
下面的实现,感谢 Code Janitor 的 Jason Rust,似乎更完整。
CREATE FUNCTION levenshtein( s1 VARCHAR(255), s2 VARCHAR(255) )
RETURNS INT
DETERMINISTIC
BEGIN
DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
DECLARE s1_char CHAR;
-- max strlen=255
DECLARE cv0, cv1 VARBINARY(256);
SET s1_len = CHAR_LENGTH(s1), s2_len = CHAR_LENGTH(s2), cv1 = 0x00, j = 1, i = 1, c = 0;
IF s1 = s2 THEN
RETURN 0;
ELSEIF s1_len = 0 THEN
RETURN s2_len;
ELSEIF s2_len = 0 THEN
RETURN s1_len;
ELSE
WHILE j <= s2_len DO
SET cv1 = CONCAT(cv1, UNHEX(HEX(j))), j = j + 1;
END WHILE;
WHILE i <= s1_len DO
SET s1_char = SUBSTRING(s1, i, 1), c = i, cv0 = UNHEX(HEX(i)), j = 1;
WHILE j <= s2_len DO
SET c = c + 1;
IF s1_char = SUBSTRING(s2, j, 1) THEN
SET cost = 0; ELSE SET cost = 1;
END IF;
SET c_temp = CONV(HEX(SUBSTRING(cv1, j, 1)), 16, 10) + cost;
IF c > c_temp THEN SET c = c_temp; END IF;
SET c_temp = CONV(HEX(SUBSTRING(cv1, j+1, 1)), 16, 10) + 1;
IF c > c_temp THEN
SET c = c_temp;
END IF;
SET cv0 = CONCAT(cv0, UNHEX(HEX(c))), j = j + 1;
END WHILE;
SET cv1 = cv0, i = i + 1;
END WHILE;
END IF;
RETURN c;
END;
- (NSInteger)computeLevenshteinDistanceFrom:(NSString *)source to:(NSString *)target {
NSUInteger sourceLength = [source length] + 1;
NSUInteger targetLength = [target length] + 1;
NSMutableArray *cost = [NSMutableArray new];
NSMutableArray *newCost = [NSMutableArray new];
for (NSUInteger i = 0; i < sourceLength; i++) {
cost[i] = @(i);
}
for (NSUInteger j = 1; j < targetLength; j++) {
newCost[0] = @(j - 1);
for (NSUInteger i = 1; i < sourceLength; i++) {
NSInteger match = ([source characterAtIndex:i - 1] == [target characterAtIndex:j - 1]) ? 0 : 1;
NSInteger costReplace = [cost[i - 1] integerValue] + match;
NSInteger costInsert = [cost[i] integerValue] + 1;
NSInteger costDelete = [newCost[i - 1] integerValue] + 1;
newCost[i] = @(MIN(MIN(costInsert, costDelete), costReplace));
}
NSMutableArray *swap = cost;
cost = newCost;
newCost = swap;
}
return [cost[sourceLength - 1] integerValue];
}
或者使用扩展“和 C 风格”
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
@implementation NSString (LevensteinDistance)
- (NSUInteger)distance:(NSString *)query {
const char *selfCharArray = [self UTF8String];
const char *queryCharArray = [query UTF8String];
NSUInteger x, y;
NSUInteger selfLength = strlen(selfCharArray);
NSUInteger queryLength = strlen(queryCharArray);
NSUInteger matrix[queryLength + 1][selfLength + 1];
matrix[0][0] = 0;
for (x = 1; x <= queryLength; x++) {
matrix[x][0] = matrix[x - 1][0] + 1;
}
for (y = 1; y <= selfLength; y++) {
matrix[0][y] = matrix[0][y - 1] + 1;
}
for (x = 1; x <= queryLength; x++) {
for (y = 1; y <= selfLength; y++) {
matrix[x][y] = MIN3(matrix[x - 1][y] + 1, matrix[x][y - 1] + 1, matrix[x - 1][y - 1] + (selfCharArray[y - 1] == queryCharArray[x - 1] ? 0 : 1));
}
}
return (matrix[queryLength][selfLength]);
}
@end
(* Minimum of three integers. This function is deliberately
* not polymorphic because (1) we only need to compare integers
* and (2) the OCaml compilers do not perform type specialization
* for user-defined functions. *)
let minimum (x:int) y z =
let m' (a:int) b = if a < b then a else b in
m' (m' x y) z
(* Matrix initialization. *)
let init_matrix n m =
let init_col = Array.init m in
Array.init n (function
| 0 -> init_col (function j -> j)
| i -> init_col (function 0 -> i | _ -> 0)
)
(* Computes the Levenshtein distance between two unistring.
* If you want to run it faster, add the -unsafe option when
* compiling or use Array.unsafe_* functions (but be carefull
* with these well-named unsafe features). *)
let distance_utf8 x y =
match Array.length x, Array.length y with
| 0, n -> n
| m, 0 -> m
| m, n ->
let matrix = init_matrix (m + 1) (n + 1) in
for i = 1 to m do
let s = matrix.(i) and t = matrix.(i - 1) in
for j = 1 to n do
let cost = abs (compare x.(i - 1) y.(j - 1)) in
s.(j) <- minimum (t.(j) + 1) (s.(j - 1) + 1) (t.(j - 1) + cost)
done
done;
matrix.(m).(n)
(* This function takes two strings, convert them to unistring (int array)
* and then call distance_utf8, so we can compare utf8 strings. Please
* note that you need Glib (see LablGTK). *)
let distance x y =
distance_utf8 (Glib.Utf8.to_unistring x) (Glib.Utf8.to_unistring y)
function [dist,L]=levenshtein_distance(str1,str2)
L1=length(str1)+1;
L2=length(str2)+1;
L=zeros(L1,L2);
g=+1;%just constant
m=+0;%match is cheaper, we seek to minimize
d=+1;%not-a-match is more costly.
%do BC's
L(:,1)=([0:L1-1]*g)';
L(1,:)=[0:L2-1]*g;
m4=0;%loop invariant
for idx=2:L1;
for idy=2:L2
if(str1(idx-1)==str2(idy-1))
score=m;
else
score=d;
end
m1=L(idx-1,idy-1) + score;
m2=L(idx-1,idy) + g;
m3=L(idx,idy-1) + g;
L(idx,idy)=min(m1,min(m2,m3));
end
end
dist=L(L1,L2);
return
end
%I think this function generates errors:
%>> levenshtein('aaa','ab')
% ans =
% 1
%correct answer here is 2, I believe: 1 deletion, 1 replacement?
function q = levenshtein(s1,s2)
d = toeplitz(find(s1),find(s2))-1;
for j = 1:numel(s2)-1
for i = 1:numel(s1)-1
d(i+1,j+1) = min(min(d(i,j+1),d(i+1,j)) + 1,d(i,j) + ne(s1(i), s2(j)));
end
end
q = d(end) + eq(i,j)*ne(s1(end),s2(end)); % check if last chars are different
use List::Util qw(min);
sub levenshtein {
my ($str1, $str2) = @_;
my @ar1 = split //, $str1;
my @ar2 = split //, $str2;
my @dist;
$dist[$_][0] = $_ foreach (0 .. @ar1);
$dist[0][$_] = $_ foreach (0 .. @ar2);
foreach my $i (1 .. @ar1){
foreach my $j (1 .. @ar2){
my $cost = $ar1[$i - 1] eq $ar2[$j - 1] ? 0 : 1;
$dist[$i][$j] = min(
$dist[$i - 1][$j] + 1,
$dist[$i][$j - 1] + 1,
$dist[$i - 1][$j - 1] + $cost );
}
}
return $dist[@ar1][@ar2];
}
function levenshtein(string a, b)
sequence costs = tagset(length(b)+1,0)
for i=1 to length(a) do
costs[1] = i
integer newcost = i-1, pj = i, cj
for j=1 to length(b) do
cj = costs[j+1]
pj = min({pj+1, cj+1, newcost+(a[i]!=b[j])})
{costs[j+1],newcost} = {pj,cj}
end for
end for
return costs[$-1]
end function
请注意,从 PHP 4.0.1 版开始,有一个标准库调用 levenshtein()[1]。然而,它仅限于比较长度不超过 255 个字符的字符串,因此限制了它的实用性。
function leven($s1, $s2){
$l1 = strlen($s1); // Length of string $s1
$l2 = strlen($s2); // Length of $s2
$dis = range(0, $l2); // Array with (0,1,2,...,n)
for ($x = 1; $x <= $l1; $x++) {
$dis_new[0] = $x;
for ($y = 1; $y <= $l2; $y++) {
$c = ($s1[$x - 1] == $s2[$y - 1]) ? 0 : 1;
$dis_new[$y] = min($dis[$y] + 1, $dis_new[$y - 1] + 1, $dis[$y - 1] + $c);
}
$dis = $dis_new;
}
return $dis[$l2];
}
使用一维数组复制 PHP 版本
CREATE OR REPLACE FUNCTION levenshtein(s text, t text)
RETURNS integer AS $$
DECLARE i integer;
DECLARE j integer;
DECLARE m integer;
DECLARE n integer;
DECLARE d integer[];
DECLARE c integer;
BEGIN
m := char_length(s);
n := char_length(t);
i := 0;
j := 0;
FOR i IN 0..m LOOP
d[i*(n+1)] = i;
END LOOP;
FOR j IN 0..n LOOP
d[j] = j;
END LOOP;
FOR i IN 1..m LOOP
FOR j IN 1..n LOOP
IF SUBSTRING(s,i,1) = SUBSTRING(t, j,1) THEN
c := 0;
ELSE
c := 1;
END IF;
d[i*(n+1)+j] := LEAST(d[(i-1)*(n+1)+j]+1, d[i*(n+1)+j-1]+1, d[(i-1)*(n+1)+j-1]+c);
END LOOP;
END LOOP;
return d[m*(n+1)+n];
END;
$$ LANGUAGE plpgsql IMMUTABLE
第一个版本是一个动态规划算法,添加了只使用动态规划矩阵最后两行进行计算的优化。
def levenshtein(s1, s2):
if len(s1) < len(s2):
return levenshtein(s2, s1)
# len(s1) >= len(s2)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
deletions = current_row[j] + 1 # than s2
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
第二个版本
def lev(a, b):
if not a: return len(b)
if not b: return len(a)
return min(lev(a[1:], b[1:])+(a[0] != b[0]), lev(a[1:], b)+1, lev(a, b[1:])+1)
(注意,虽然非常紧凑,但此实现的运行时间非常差,在实际应用中不可用。一种使其明显更快的方法是添加 @functools.cache
装饰器,但由于许多字符串复制,它并不能使其最优。)
第三个版本(有效)
def LD(s,t):
s = ' ' + s
t = ' ' + t
d = {}
S = len(s)
T = len(t)
for i in range(S):
d[i, 0] = i
for j in range (T):
d[0, j] = j
for j in range(1,T):
for i in range(1,S):
if s[i] == t[j]:
d[i, j] = d[i-1, j-1]
else:
d[i, j] = min(d[i-1, j], d[i, j-1], d[i-1, j-1]) + 1
return d[S-1, T-1]
(注意,虽然紧凑,但此实现的运行时间相对较差。)
第四个版本
def levenshtein(seq1, seq2):
oneago = None
thisrow = range(1, len(seq2) + 1) + [0]
for x in xrange(len(seq1)):
twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
for y in xrange(len(seq2)):
delcost = oneago[y] + 1
addcost = thisrow[y - 1] + 1
subcost = oneago[y - 1] + (seq1[x] != seq2[y])
thisrow[y] = min(delcost, addcost, subcost)
return thisrow[len(seq2) - 1]
(注意,此实现的时间复杂度为 O(N*M),空间复杂度为 O(M),其中 N 和 M 分别是两个序列的长度。)
第五个版本,使用 NumPy 对第一个版本进行向量化。在我的测试用例中,速度提高了约 40%。但是,这个 NumPy 向量化的 Python 版本在 levenshtein('aabcc', 'bccdd')
的情况下错误地返回 5(应该为 4,因为有两个 'a' 插入 + 两个 'd' 删除)。原因是删除步骤 np.minimum(current_row[1:], current_row[0:-1] + 1)
没有处理 current_row[j]
的最小值可能是 current_row[j-2] + 2
的情况,因为在此示例中存在两个连续的删除。参见 讨论。
def levenshtein(source, target):
if len(source) < len(target):
return levenshtein(target, source)
# So now we have len(source) >= len(target).
if len(target) == 0:
return len(source)
# We call tuple() to force strings to be used as sequences
# ('c', 'a', 't', 's') - numpy uses them as values by default.
source = np.array(tuple(source))
target = np.array(tuple(target))
# We use a dynamic programming algorithm, but with the
# added optimization that we only need the last two rows
# of the matrix.
previous_row = np.arange(target.size + 1)
for s in source:
# Insertion (target grows longer than source):
current_row = previous_row + 1
# Substitution or matching:
# Target and source items are aligned, and either
# are different (cost of 1), or are the same (cost of 0).
current_row[1:] = np.minimum(
current_row[1:],
np.add(previous_row[:-1], target != s))
# Deletion (target grows shorter than source):
current_row[1:] = np.minimum(
current_row[1:],
current_row[0:-1] + 1)
previous_row = current_row
return previous_row[-1]
(注意,此实现仅在权重不依赖于编辑的字符时才有效。)
第六个版本,来自维基百科关于莱文斯坦距离的文章;使用两行矩阵的迭代方法。
# Christopher P. Matthews
# [email protected]
# Sacramento, CA, USA
def levenshtein(s, t):
''' From Wikipedia article; Iterative with two matrix rows. '''
if s == t: return 0
elif len(s) == 0: return len(t)
elif len(t) == 0: return len(s)
v0 = [None] * (len(t) + 1)
v1 = [None] * (len(t) + 1)
for i in range(len(v0)):
v0[i] = i
for i in range(len(s)):
v1[0] = i + 1
for j in range(len(t)):
cost = 0 if s[i] == t[j] else 1
v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
for j in range(len(v0)):
v0[j] = v1[j]
return v1[len(t)]
function (str1, str2)
{
if (typeof(str1) != "character" && class(str1) != "factor")
stop(sprintf("Illegal data type: %s", typeof(str1)))
if (class(str1) == "factor")
str = as.character(str1)
if (typeof(str2) != "character" && class(str2) != "factor")
stop(sprintf("Illegal data type: %s", typeof(str2)))
if (class(str2) == "factor")
str = as.character(str2)
if ((is.array(str1) || is.array(str2)) && dim(str1) != dim(str2))
stop("non-conformable arrays")
if (length(str1) == 0 || length(str2) == 0)
return(integer(0))
l1 <- length(str1)
l2 <- length(str2)
out <- .C("levenshtein", as.character(str1), as.character(str2),
l1, l2, ans = integer(max(l1, l2)), PACKAGE = "RecordLinkage")
if (any(is.na(str1), is.na(str2)))
out$ans[is.na(str1) | is.na(str2)] = NA
if (is.array(str1))
return(array(out$ans, dim(str1)))
if (is.array(str2))
return(array(out$ans, dim(str2)))
return(out$ans)
}
"(注意) 这只是莱文斯坦距离的许多实现之一,但我无法让其他实现工作。
/* rexx levenshtein calculates the edit distance Karlocai 2009-01-18
between 2 strings s and t
This implementation of the Levenshtein algorithm
uses one row only (0..n), containing
- values of the previous line in columns [i-1]..n and
- values of the current line in columns 1..[i-2]
The current left value is kept in the variable lc
i: column 1..n of s, 1st index
j: row 1..m of t, 2nd index
*/
Parse Arg s,t -- gets 2 strings as parameter
n = Length(s) -- checks the parameters
m = Length(t)
If n = 0 Then
Return m
If m = 0 Then
Return n
Do i = 0 To n -- initializes the 1st row
r.i = i
End
Do j = 1 To m -- for each row
lc = j -- left column start value
Do i = 1 To n -- for each column
nv = Min(r.i + 1, ,
lc + 1, ,
r.[i-1] + (Substr(s,i,1) <> Substr(t,j,1)))
r.[i-1] = lc -- sets previous left column
lc = nv -- current left column
End
r.n = lc -- sets last current value
End
Return r.n
此 Ruby 版本很简单,但速度非常慢,尽管它可以与任何实现了 '==' 的元素的数组一起使用。
def levenshtein(first:, second:)
case
when first.empty? then second.length
when second.empty? then first.length
else [(first[0] == second[0] ? 0 : 1) + levenshtein(first: first[1..-1], second: second[1..-1]),
1 + levenshtein(first: first[1..-1], second: second),
1 + levenshtein(first: first, second: second[1..-1])].min
end
end
此记忆化版本快得多。
def levenshtein(first:, second:)
matrix = [(0..first.length).to_a]
(1..second.length).each do |j|
matrix << [j] + [0] * (first.length)
end
(1..second.length).each do |i|
(1..first.length).each do |j|
if first[j-1] == second[i-1]
matrix[i][j] = matrix[i-1][j-1]
else
matrix[i][j] = [
matrix[i-1][j],
matrix[i][j-1],
matrix[i-1][j-1],
].min + 1
end
end
end
return matrix.last.last
end
另一个记忆化版本。
def levenshtein(first:, second:)
m, n = first.length, second.length
return m if n == 0
return n if m == 0
# Create our distance matrix
d = Array.new(m+1) {Array.new(n+1)}
0.upto(m) { |i| d[i][0] = i }
0.upto(n) { |j| d[0][j] = j }
1.upto(n) do |j|
1.upto(m) do |i|
d[i][j] = first[i-1] == second[j-1] ? d[i-1][j-1] : [d[i-1][j]+1,d[i][j-1]+1,d[i-1][j-1]+1,].min
end
end
d[m][n]
end
可以使用 C 绑定获得字符串的莱文斯坦距离的更快实现 此处
Rust(beta2)中的简单实现,从 C 版本转换而来。它比绝对最小值更通用,事实上它接受任何可以转换为 &[u8] 的类型。
fn levenshtein<T1: AsRef<[u8]>, T2: AsRef<[u8]>>(s1: T1, s2: T2) -> usize {
let v1 = s1.as_ref();
let v2 = s2.as_ref();
// Early exit if one of the strings is empty
let v1len = v1.len();
let v2len = v2.len();
if v1len == 0 { return v2len; }
if v2len == 0 { return v1len; }
#[inline]
fn min3<T: Ord>(v1: T, v2: T, v3: T) -> T{
std::cmp::min(v1, std::cmp::min(v2, v3))
}
#[inline]
fn delta(x: u8, y: u8) -> usize {
if x == y { 0 } else { 1 }
}
let mut column: Vec<usize> = (0..v1len+1).collect();
for x in 1..v2len+1 {
column[0] = x;
let mut lastdiag = x-1;
for y in 1..v1len+1 {
let olddiag = column[y];
column[y] = min3(
column[y] + 1,
column[y-1] + 1,
lastdiag + delta(v1[y-1], v2[x-1]),
);
lastdiag = olddiag;
}
}
column[v1len]
}
您可以在 Rust Playpen 上对其进行实时测试。
函数式版本可能会更加简洁。
def levenshtein(str1: String, str2: String): Int = {
val lenStr1 = str1.length
val lenStr2 = str2.length
val d: Array[Array[Int]] = Array.ofDim(lenStr1 + 1, lenStr2 + 1)
for (i <- 0 to lenStr1) d(i)(0) = i
for (j <- 0 to lenStr2) d(0)(j) = j
for (i <- 1 to lenStr1; j <- 1 to lenStr2) {
val cost = if (str1(i - 1) == str2(j - 1)) 0 else 1
d(i)(j) = min(
d(i-1)(j ) + 1, // deletion
d(i )(j-1) + 1, // insertion
d(i-1)(j-1) + cost // substitution
)
}
d(lenStr1)(lenStr2)
}
def min(nums: Int*): Int = nums.min
import scala.collection.mutable
import scala.collection.parallel.ParSeq
def levenshtein(s1: String, s2: String): Int = {
val memorizedCosts = mutable.Map[(Int, Int), Int]()
def lev: ((Int, Int)) => Int = {
case (k1, k2) =>
memorizedCosts.getOrElseUpdate((k1, k2), (k1, k2) match {
case (i, 0) => i
case (0, j) => j
case (i, j) =>
ParSeq(1 + lev((i - 1, j)),
1 + lev((i, j - 1)),
lev((i - 1, j - 1))
+ (if (s1(i - 1) != s2(j - 1)) 1 else 0)).min
})
}
lev((s1.length, s2.length))
}
;; using the `match' macro as defined in http://synthcode.com/scheme/match.scm
(define/memoized (edit-distance a b)
(match `(,a ,b)
(`(,a ())
(length a))
(`(() ,b)
(length b))
(`((,a0 . ,a*) (,b0 . ,b*))
(min (+ (edit-distance a* b) 1)
(+ (edit-distance a b*) 1)
(+ (edit-distance a* b*)
(if (equal? a0 b0) 0 2))))))
;; where the `define/memoized' special form can be defined in the following way:
(define (memoized proc)
(let ((cache (make-hash-table)))
(lambda args
(match (hash-get-handle cache args)
(`(,key . ,memoized-result)
(apply values memoized-result))
(_
(call-with-values (lambda () (apply proc args))
(lambda result
(hash-set! cache args result)
(apply values result))))))))
(define-syntax-rule (define/memoized (name . args) . body)
(define name (memoized (lambda args . body))))
levenshteinDistanceBetween: stringOne and: stringTwo
"Iterative calculation of the Levenshtein distance between two strings."
| arrayTwo arrayOne|
" degenerate cases"
stringTwo = stringOne
ifTrue:[^0].
(stringTwo size) = 0
ifTrue:[^(stringOne size)].
(stringOne size) = 0
ifTrue:[^(stringTwo size)].
"create two work vectors of integer distances"
arrayOne := Array new:((stringTwo size) + 1).
arrayTwo := Array new:((stringTwo size) + 1).
"initialize v0 (the previous row of distances)
this row is A[0][i]: edit distance for an empty s
the distance is just the number of characters to delete from t"
(1 to: (arrayOne size)) do:[ :i | arrayOne at: i put: i-1 ].
(1 to: (stringOne size)) do: [ : i |
" calculate v1 (current row distances) from the previous row v0
first element of v1 is A[i+1][0] edit distance is delete (i+1) chars from s to match empty t"
arrayTwo at: 1 put: i.
" use formula to fill in the rest of the row"
(1 to: (stringTwo size)) do: [ :j || cost minimum minimumAux |
((stringOne at: i) = (stringTwo at: j))
ifTrue:[cost:=0]
ifFalse:[cost:=1].
minimumAux := ((arrayTwo at: j) + 1) min: ((arrayOne at: (j+1)) + 1).
minimum := minimumAux min: ((arrayOne at: j) + cost).
arrayTwo at: (j+1) put: minimum.].
(1 to: (arrayOne size)) do: [ :j | arrayOne at: j put: (arrayTwo at: j) ].
].
^arrayTwo at: ((stringTwo size)+1).
/**
* Levenshtein edit distance calculator
*
* Inspired by https://gist.github.com/bgreenlee/52d93a1d8fa1b8c1f38b
* Improved with http://stackoverflow.com/questions/26990394/slow-swift-arrays-and-strings-performance
*/
class Levenshtein {
private class func min(numbers: Int...) -> Int {
return numbers.reduce(numbers[0], combine: {$0 < $1 ? $0 : $1})
}
class Array2D {
var cols:Int, rows:Int
var matrix: [Int]
init(cols:Int, rows:Int) {
self.cols = cols
self.rows = rows
matrix = Array(count:cols*rows, repeatedValue:0)
}
subscript(col:Int, row:Int) -> Int {
get {
return matrix[cols * row + col]
}
set {
matrix[cols*row+col] = newValue
}
}
func colCount() -> Int {
return self.cols
}
func rowCount() -> Int {
return self.rows
}
}
class func distanceBetween(aStr: String, and bStr: String) -> Int {
let a = Array(aStr.utf16)
let b = Array(bStr.utf16)
let dist = Array2D(cols: a.count + 1, rows: b.count + 1)
for i in 1...a.count {
dist[i, 0] = i
}
for j in 1...b.count {
dist[0, j] = j
}
for i in 1...a.count {
for j in 1...b.count {
if a[i-1] == b[j-1] {
dist[i, j] = dist[i-1, j-1] // noop
} else {
dist[i, j] = min(
dist[i-1, j] + 1, // deletion
dist[i, j-1] + 1, // insertion
dist[i-1, j-1] + 1 // substitution
)
}
}
}
return dist[a.count, b.count]
}
}
%find the levenshtein distance
function LD (s, t : string) : int
% a couple variables
var cell_above := 0
var cell_left := 0
var cell_diagonal := 0
var cost := 0
%lengths
var n := length (s)
var m := length (t)
%if a string is zero
if n = 0 then
result m
elsif m = 0 then
result n
else
end if
% make an array with both words
var matrix : array 0 .. n, 0 .. m of int
% initialize array to the length of each word
for i : 0 .. n
matrix (i, 0) := i
end for
for i : 0 .. m
matrix (0, i) := i
end for
% calculate total changes
for i : 1 .. n
for j : 1 .. m
if s (i) = t (j) then
cost := 0
else
cost := 1
end if
cell_above := matrix (i - 1, j) + 1
cell_left := matrix (i, j - 1) + 1
cell_diagonal := matrix (i - 1, j - 1) + cost
var mini := min (cell_above, cell_left)
matrix (i, j) := min (mini, cell_diagonal)
end for
end for
result matrix (n, m)
end LD
put LD ("s", "t")
此版本与本文中的 JavaScript 和 PHP 实现相同。
Function levenshtein( a, b )
Dim i,j,cost,d,min1,min2,min3
' Avoid calculations where there there are empty words
If Len( a ) = 0 Then levenshtein = Len( b ): Exit Function
If Len( b ) = 0 Then levenshtein = Len( a ): Exit Function
' Array initialization
ReDim d( Len( a ), Len( b ) )
For i = 0 To Len( a ): d( i, 0 ) = i: Next
For j = 0 To Len( b ): d( 0, j ) = j: Next
' Actual calculation
For i = 1 To Len( a )
For j = 1 To Len( b )
If Mid(a, i, 1) = Mid(b, j, 1) Then cost = 0 Else cost = 1 End If
' Since min() function is not a part of VBScript, we'll "emulate" it below
min1 = ( d( i - 1, j ) + 1 )
min2 = ( d( i, j - 1 ) + 1 )
min3 = ( d( i - 1, j - 1 ) + cost )
If min1 <= min2 And min1 <= min3 Then
d( i, j ) = min1
ElseIf min2 <= min1 And min2 <= min3 Then
d( i, j ) = min2
Else
d( i, j ) = min3
End If
Next
Next
levenshtein = d( Len( a ), Len( b ) )
End Function
此版本与本文中的 JavaScript 和 PHP 实现相同。当我尝试使用本文中的其他 VBA 实现时遇到问题,因此我不得不采用下面的版本。
Application.WorksheetFunction.Min() 方法是 Excel 特定的。如果您在其他启用 VBA 的应用程序中实现它,请取消注释条件块并注释 Application.WorksheetFunction.Min() 行。
Function levenshtein(a As String, b As String) As Integer
Dim i As Integer
Dim j As Integer
Dim cost As Integer
Dim d(,) As Integer
Dim min1 As Integer
Dim min2 As Integer
Dim min3 As Integer
If Len( a ) = 0 Then
levenshtein = Len( b )
Exit Function
End If
If Len( b ) = 0 Then
levenshtein = Len( a )
Exit Function
End If
ReDim d(Len(a), Len(b))
For i = 0 To Len(a)
d(i, 0) = i
Next
For j = 0 To Len(b)
d(0, j) = j
Next
For i = 1 To Len(a)
For j = 1 To Len(b)
If Mid(a, i, 1) = Mid(b, j, 1) Then
cost = 0
Else
cost = 1
End If
' Since Min() function is not a part of VBA, we'll "emulate" it below
min1 = (d(i - 1, j) + 1)
min2 = (d(i, j - 1) + 1)
min3 = (d(i - 1, j - 1) + cost)
' If min1 <= min2 And min1 <= min3 Then
' d(i, j) = min1
' ElseIf min2 <= min1 And min2 <= min3 Then
' d(i, j) = min2
' Else
' d(i, j) = min3
' End If
' In Excel we can use Min() function that is included
' as a method of WorksheetFunction object
d(i, j) = Application.WorksheetFunction.Min(min1, min2, min3)
Next
Next
levenshtein = d(Len(a), Len(b))
End Function
此版本与本文中的 VB 实现相同。
type twoDimArrayType
secondArray() as integer
end type
dim FirstArray() as twoDimArrayType
Dim i As Integer
Dim j As Integer
Dim cost As Integer
Dim d() As Integer
Dim min1 As Integer
Dim min2 As Integer
Dim min3 As Integer
declare Function calculateLevenshteinDistance(byval firstString As String, byval secondString As String) As Integer
Function calculateLevenshteinDistance(byval firstString As String, byval secondString As String) As Integer
print chr$(12)
'If one of the parameter is null, then the result will be the length of the other parameter...
if Len(firstString) = 0 Then
calculateLevenshteinDistance = Len(secondString)
exit Function
end if
if Len(secondString) = 0 Then
calculateLevenshteinDistance = Len(firstString)
exit Function
end if
'Initializing array...
redim FirstArray(len(firstString)+1)
for i=1 to ubound(FirstArray)
redim FirstArray(i).secondArray(len(secondString)+1)
next
'Deletion...
For i = 1 To ubound(FirstArray)
FirstArray(i).secondArray(1) = i-1
Next
'Insertion ...
For i = 1 To ubound(FirstArray(ubound(FirstArray)).secondArray)
FirstArray(1).secondArray(i) = i-1
Next
'Actual calculation...
for i=2 to ubound(FirstArray)
for j=2 to ubound(FirstArray(i).secondArray)
If StringCompare(Mid$(firstString, i-1, 1), Mid$(secondString, j-1, 1))=0 Then
cost = 0
Else
cost = 1
End If
min1 = FirstArray(i-1).secondArray(j) + 1
min2 = FirstArray(i).secondArray(j-1) + 1
min3 = FirstArray(i-1).secondArray(j-1) + cost
If min1 <= min2 And min1 <= min3 Then
FirstArray(i).secondArray(j) = min1
ElseIf min2 <= min1 And min2 <= min3 Then
FirstArray(i).secondArray(j) = min2
Else
FirstArray(i).secondArray(j) = min3
End If
Next
Next
'Calculating Return Value...
calculateLevenshteinDistance = FirstArray(ubound(FirstArray)).secondArray(ubound(FirstArray(ubound(FirstArray)).secondArray))
End Function
这是 Teslock 机器语言中的莱文斯坦距离计算。
.declare singlecall virtual LevenshteinDistance[args(2) string s1, string s2]: out unsigned int main ( define unsigned int constant "cost_ins" == 1; define unsigned int constant "cost_del" == 1; define unsigned int constant "cost_sub" == 1; define unsigned int variable "n1" == calculate::string_operations>length("s1"); define unsigned int variable "n2" == calculate::string_operations>length("s2"); define unsigned int_array(calculate::string_operations>array_instantiation("p")>fixed_length("n2", ++1)); define unsigned int_array(calculate::string_operations>array_instantiation("q")>fixed_length("n2", ++1)); define unsigned int_array(calculate::string_operations>array_instantiation("r")>variable_length); p>array_vector>0 == 0; loop_for>finalized(define unsigned int variable "j" == 1)>break_condition("j" <= "n2")>forward_condition(++j) ( p>array_vector>j == p>array_vector>j::access>+constants::cost_ins; ) q>array_vector>0 == 0; loop_for>finalized(define unsigned int variable "i" == 1)>break_condition("i" <= "n1")>forward_condition(++i) ( q>array_vector>0 == p>array_vector>0::access>+constants::cost_del; loop for>finalized(define unsigned int variable "j" == 1)>break_condition("j" <= "n2")>forward_condition(++j) ( define unsigned int variable "d_del" == p>array_vector>j::access>+constants::cost_del; define unsigned int variable "d_ins" == q>array_vector>j[delegate_handle::j::access>-1]>+constants::cost_ins; define unsigned int veriable "d_sub" == p>array_vector>j[delegate_handle::j::access>-1]>+logical_operations>xor_result["s1"::access>"s1"[delegate_handle::j::access>-1]?0>return_handle_as_result constants::"cost_sub"]; q>array_vector>j == dll_extern::math_interop_singlecall(min)::[args(3) (d_del, d_ins), d_sub; ) local>"r" == "p"(self_typecast::ignore_unsafe_condition); local>"p" == "q"(self_typecast); local>"q" == "r"(self_typecast); ) logical_result(param p::singlecall)::define>"return" == p>array_vector>"n2"; )
请注意,存在一个标准的 ABAP 函数 distance,它可以计算莱文斯坦距离,无需下面的代码。
REPORT zlevenshtein.
*----------------------------------------------------------------------*
* CLASS lcl_levenshtein DEFINITION
*----------------------------------------------------------------------*
*
*----------------------------------------------------------------------*
CLASS lcl_levenshtein DEFINITION.
PUBLIC SECTION.
CLASS-METHODS:
distance IMPORTING i_s TYPE csequence
i_t TYPE csequence
RETURNING value(r) TYPE i.
ENDCLASS. "lcl_c DEFINITION
*----------------------------------------------------------------------*
* CLASS lcl_levenshtein IMPLEMENTATION
*----------------------------------------------------------------------*
*
*----------------------------------------------------------------------*
CLASS lcl_levenshtein IMPLEMENTATION.
METHOD distance.
DEFINE m_get.
l_m_index = ( ( l_l_t * ( l_m_i + ( &2 ) ) ) + l_m_j + ( &1 ) ) + 1 .
read table l_d into r index l_m_index.
add &3 to r.
insert r into table l_v.
END-OF-DEFINITION.
DATA: l_d TYPE STANDARD TABLE OF i,
l_v TYPE SORTED TABLE OF i WITH UNIQUE KEY table_line,
l_cost TYPE i,
l_m_i TYPE i,
l_m_j TYPE i,
l_m_index TYPE i,
l_l_s TYPE i,
l_l_t TYPE i.
l_l_s = STRLEN( i_s ).
l_l_t = STRLEN( i_t ).
DO l_l_s TIMES.
l_m_i = sy-index - 1.
DO l_l_t TIMES. "#EC CI_NESTED
l_m_j = sy-index - 1.
IF i_s+l_m_i(1) = i_t+l_m_j(1).
l_cost = 0.
ELSE.
l_cost = 1.
ENDIF.
IF l_m_j = 0.
r = l_cost.
ELSEIF l_m_i = 0.
r = l_cost.
ELSE.
CLEAR l_v.
m_get: -1 0 1, 0 -1 1, -1 -1 l_cost.
READ TABLE l_v INTO r INDEX 1.
ENDIF.
APPEND r TO l_d.
ENDDO.
ENDDO.
ENDMETHOD. "distance
ENDCLASS. "lcl_levenshtein IMPLEMENTATION
START-OF-SELECTION.
DATA: d TYPE i.
d = lcl_levenshtein=>distance( i_s = 'sitting' i_t = 'kitten' ).
WRITE: / d.
IF STRING1 = STRING2 THEN
LD = 0
END ELSE
S.LEN = LEN(STRING1)
C.LEN = LEN(STRING2)
MAT LD.MTX = ''
DIM LD.MTX(100,100)
FOR I = 3 TO S.LEN + 2
LD.MTX(I,1) = STRING1[I-2,1]
NEXT I
FOR I = 3 TO S.LEN + 2
LD.MTX(I,2) = I - 2
NEXT I
FOR I = 3 TO C.LEN + 2
LD.MTX(1,I) = STRING2[I-2,1]
NEXT I
LD.MTX(2,2) = 0
FOR I = 3 TO C.LEN + 2
LD.MTX(2,I) = I - 2
NEXT I
FOR I = 3 TO (S.LEN+2)
S.LETTER = LD.MTX(I,1)
FOR J = 3 TO (C.LEN+2)
C.LETTER = LD.MTX(1,J)
IF C.LETTER = S.LETTER THEN COST = 0 ELSE COST = 1
P1 = LD.MTX(I-1,J) + 1
P2 = LD.MTX(I,J-1) + 1
P3 = LD.MTX(I-1,J-1) + COST
IF P1 < P2 THEN LD.NUM = P1 ELSE LD.NUM = P2
IF P3 < LD.NUM THEN LD.NUM = P3
LD.MTX(I,J) = LD.NUM
NEXT J
NEXT I
LD = LD.MTX(S.LEN+2,C.LEN+2)
END
C 版本的实现
// Damerau-Levenshtein includes transposition, remove for normal Levenshtein
pub fn levenshteinDistance(a: []const u8, b: []const u8) !u8 {
const length: usize = a.len + 1;
var column = try std.ArrayList(u8).initCapacity(std.heap.page_allocator, length);
try column.appendNTimes(0, length);
defer column.deinit();
var y: u8 = 1;
while (y <= a.len) : (y += 1) {
column.items[y] = y;
var x: u8 = 1;
while (x <= b.len) : (x += 1) {
column.items[0] = x;
var last_diag = x - 1;
var j: u8 = 1;
while (j <= a.len) : (j += 1) {
const old_diag = column.items[j];
const cost: u8 = if (a[j - 1] == b[x - 1]) 0 else 1;
const deletion: u8 = column.items[j] + 1;
const insertion: u8 = column.items[j - 1] + 1;
const substitution: u8 = last_diag + cost;
const transposition: u8 = if (j > 1 and x > 1 and a[j - 1] == b[x - 2] and a[j - 2] == b[x - 1]) column.items[j - 2] + 1 else std.math.maxInt(u8);
column.items[j] = std.mem.min(u8, &.{ deletion, insertion, substitution, transposition });
last_diag = old_diag;
}
}
}
return column.items[length - 1];
}