維基百科對字符串相似度(Damerau–Levenshtein distance)的定義是:
簡單翻譯下,兩個字符序列的DL距離,就是從一個變換到另一個的最小操作次數(shù)。這個變換包括插入一個字符、刪除一個字符、替換一個字符、或互換兩個相鄰字符。 而所謂“編輯距離(edit distance,或叫Levenshtein distance)”,并不包含互換兩個相鄰字符。 主要應(yīng)用是在字符拼寫檢查上,當(dāng)然也可以用在其他地方,比方不少輸入法就提供類似的校正功能(搜狗拼音輸入法即實現(xiàn)了此功能)。 看起來簡單,實現(xiàn)還是有一定困難的,好在有牛人已經(jīng)做好相應(yīng)的函數(shù),如 Kambiz 在 How to match two strings approximately 中提供了兩個函數(shù): 計算DL距離的函數(shù)DamerauLevenshteinDistance(Str1, Str2) function DamerauLevenshteinDistance(const Str1, Str2: string): Integer; var LenStr1, LenStr2: Integer; I, J, T, Cost, Minimum: Integer; pStr1, pStr2, S1, S2: PChar; D, RowPrv2, RowPrv1, RowCur, Temp: PIntegerArray; begin LenStr1 := Length(Str1); LenStr2 := Length(Str2); // to save some space, make sure the second index points to the shorter string if LenStr1 < LenStr2 then begin T := LenStr1; LenStr1 := LenStr2; LenStr2 := T; pStr1 := PChar(Str2); pStr2 := PChar(Str1); end else begin pStr1 := PChar(Str1); pStr2 := PChar(Str2); end; // to save some time and space, look for exact match while (LenStr2 <> 0) and (pStr1^ = pStr2^) do begin Inc(pStr1); Inc(pStr2); Dec(LenStr1); Dec(LenStr2); end; // when one string is empty, length of the other is the distance if LenStr2 = 0 then begin Result := LenStr1; Exit; end; // calculate the edit distance T := LenStr2 + 1; GetMem(D, 3 * T * SizeOf(Integer)); FillChar(D^, 2 * T * SizeOf(Integer), 0); RowCur := D; RowPrv1 := @D[T]; RowPrv2 := @D[2 * T]; S1 := pStr1; for I := 1 to LenStr1 do begin Temp := RowPrv2; RowPrv2 := RowPrv1; RowPrv1 := RowCur; RowCur := Temp; RowCur[0] := I; S2 := pStr2; for J := 1 to LenStr2 do begin Cost := Ord(S1^ <> S2^); Minimum := RowPrv1[J - 1] + Cost; // substitution T := RowCur[J - 1] + 1; // insertion if T < Minimum then Minimum := T; T := RowPrv1[J] + 1; // deletion if T < Minimum then Minimum := T; if (I <> 1) and (J <> 1) and (S1^ = (S2 - 1)^) and (S2^ = (S1 - 1)^) then begin T := RowPrv2[J - 2] + Cost; // transposition if T < Minimum then Minimum := T; end; RowCur[J] := Minimum; Inc(S2); end; Inc(S1); end; Result := RowCur[LenStr2]; FreeMem(D); end; 還有計算字符串相似度的函數(shù) StringSimilarityRatio(Str1, Str2, IgnoreCase): Double; 返回值在0到1之間,0表示不相似,1表示完全相似。 function StringSimilarityRatio(const Str1, Str2: string; IgnoreCase: Boolean): Double; var MaxLen: Integer; Distance: Integer; begin Result := 1.0; if Length(Str1) > Length(Str2) then MaxLen := Length(Str1) else MaxLen := Length(Str2); if MaxLen <> 0 then begin if IgnoreCase then Distance := DamerauLevenshteinDistance(LowerCase(Str1), LowerCase(Str2)) else Distance := DamerauLevenshteinDistance(Str1, Str2); Result := Result - (Distance / MaxLen); end; end; 后來data man 參考一個德國人的ApproxStrUtils單元(該單元計算的是L距離,不是DL距離),給出一個據(jù)說效率更高的DL距離函數(shù),遺憾的是調(diào)用它會有“Invalid Pointer Operation”的報錯,還沒有Debug出問題所在,所以暫時先用前一個版本吧。 function DamerauLevenshteinDistance2(const Str1, Str2: string): Integer; function Min(const A, B, C: Integer): Integer; inline; begin Result := A; if B < A then Result := B; if C < Result then Result := C; end; var LenStr1, LenStr2: Integer; I, J, T, Cost, PrevCost: Integer; pStr1, pStr2, S1, S2: PChar; D: PIntegerArray; begin LenStr1 := Length(Str1); LenStr2 := Length(Str2); // to save some space, make sure the second index points to the shorter string if LenStr1 < LenStr2 then begin T := LenStr1; LenStr1 := LenStr2; LenStr2 := T; pStr1 := PChar(Str2); pStr2 := PChar(Str1); end else begin pStr1 := PChar(Str1); pStr2 := PChar(Str2); end; // to save some time and space, look for exact match while (LenStr2 <> 0) and (pStr1^ = pStr2^) do begin Inc(pStr1); Inc(pStr2); Dec(LenStr1); Dec(LenStr2); end; while (LenStr2 <> 0) and ((pStr1 + LenStr1 - 1)^ = (pStr2 + LenStr2 - 1)^) do begin Dec(LenStr1); Dec(LenStr2); end; if LenStr2 = 0 then begin Result := LenStr1; Exit; end; // calculate the edit distance T := LenStr2 + 1; GetMem(D, T * SizeOf(Integer)); for I := 0 to T do D[I] := I; S1 := pStr1; for I := 1 to LenStr1 do begin PrevCost := I - 1; Cost := I; S2 := pStr2; for J := 1 to LenStr2 do begin if (S1^ = S2^) or ((I > 1) and (J > 1) and (S1^ = (S2 - 1)^) and (S2^ = (S1 - 1)^)) then Cost := PrevCost else Cost := 1 + min(Cost, PrevCost, D[J]); PrevCost := D[J]; D[J] := Cost; Inc(S2); end; Inc(S1); end; Result := D[LenStr2]; FreeMem(D); end; 參考文獻:
|
|