字符串编辑距离

01 /* This is the standard Levenstein string edit distance algorithm */
02 int LevensteinDistance(const vector<string>& vQuery, const vector<string>& vData)
03 {
04     int nQueryLen = vQuery.size();
05     int nDataLen = vData.size();
06
07     // first consider the length factor
08     int nDifference = abs(nQueryLen nTransLen);
09     if (nDifference >= MAXDISTANCE)
10         return max(nQueryLen, nTransLen);
11    
12     if (nQueryLen == 0)
13         return nDataLen ;
14     else if (nDataLen == 0)
15         return nQueryLen;
16
17     // make a edit distance matrix to store local edit distance and
18     // calculate the edit distance using DP algorithm
19     // entry D[i, j] is the edit distance ED(Ai,Bj)
20     int **ppEditDistMatrix;
21     ppEditDistMatrix = new int* [nQueryLen + 1];
22     ppEditDistMatrix[0] = new int[(nQueryLen + 1) * (nTransLen + 1)]; // allocate all space once
23
24     for (int i = 1; i <= nQueryLen; ++i)
25     {
26         ppEditDistMatrix[i] = ppEditDistMatrix[i 1] + nTransLen + 1;  
27     }
28
29     // Init the matrix of edit distance
30     for (int nRow = 1; nRow <= nQueryLen; ++nRow)
31     {
32         ppEditDistMatrix[nRow][0] = nRow; // INF
33     }
34     for (int nCol = 0; nCol <= nDataLen; ++nCol)
35     {
36         ppEditDistMatrix[0][nCol] = nCol;
37     }
38
39     int comp[3];
40
41     for (int nRow = 0; nRow < nQueryLen; ++nRow)
42     {
43         for (int nCol = 0; nCol < nDataLen; ++nCol)
44         {
45             comp[0] = ppEditDistMatrix[nRow + 1][nCol] + 1; // insert
46             comp[1] = ppEditDistMatrix[nRow][nCol + 1] + 1; // delete
47
48             if (vQuery[nRow] == nData[nCol])   // substitution
49                 comp[2] = ppEditDistMatrix[nRow][nCol];
50             else
51                 comp[2] = ppEditDistMatrix[nRow][nCol] + 1;
52
53             ppEditDistMatrix[nRow + 1][nCol + 1] = min(min(comp[0], comp[1]), comp[2]);
54         }
55     }
56
57     float nEditDist = (float)ppEditDistMatrix[nQueryLen][nDataLen];
58
59     delete[] ppEditDistMatrix[0];
60     delete[] ppEditDistMatrix;
61
62     return nEditDist;
63 }

下面这个算法同样实现了标准的编辑距离计算,但是不用开辟矩阵空间,只用以为数组可以表示,更节省空间

01 /* This is the standard Levenstein string edit distance algorithm, but save more space */
02 int GetDistance(const vector<string>& vQuery, const vector<string>& vData)
03 {
04     int nQueryLen = vQuery.size();
05     int nDataLen = vData.size();
06
07     // first consider the length factor
08     int nDifference = abs(nQueryLen nTransLen);
09     if (nDifference >= MAXDISTANCE)
10         return max(nQueryLen, nTransLen);
11    
12     if (nQueryLen == 0)
13         return nDataLen ;
14     else if (nDataLen == 0)
15         return nQueryLen;
16
17     int* pDistanceArray = new int[nDataLen + 1];
18
19     for (int i = 0; i <= nDataLen; ++i)
20     {
21         pDistanceArray[i] = i;
22     }
23
24     int cur = 0;
25     int next = 0;
26
27     for (int i = 0; i < nQueryLen; i++)
28     {
29         cur = i + 1;
30         for (int j = 0; j < nDataLen; j++)
31         {
32             next = min(min(pDistanceArray[j + 1] + 1, cur + 1), pDistanceArray[j] + (vQuery[i] == vData[j] ? 0 : 1));
33             pDistanceArray[j] = cur;
34             cur = next;
35         }
36         pDistanceArray[nTransLen] = next;
37     }
38
39     delete pDistanceArray;
40
41     return next;
42 }

下面这个算法可以作为标准编辑距离算法的一种替代算法,它可以极大提高速度,但会失去一定精度。

01 /* This is an alternative to the Levenstein string edit distance algorithm.
02 /* By looking for similarity locally to made it really fast!
03 /* Its complexity is O(n*constant) rather than O(n^2).
04 /* The basic idea behind it using a fast and dirty LCS (lowest common substring) length to
05 /* determine the similarity between two strings. But it must ensure the size of vQuery than the size of vData or equal */
06 float GetFastDistance(const vector<string>& vQuery, const vector<string>& vData)
07 {
08     int nQueryLen = vQuery.size();
09     int nDataLen = vData.size();
10
11     // first consider the length factor
12     int nDifference = abs(nQueryLen nTransLen);
13     if (nDifference >= MAXDISTANCE)
14         return max(nQueryLen, nTransLen);
15    
16     if (nQueryLen == 0)
17         return nDataLen ;
18     else if (nDataLen == 0)
19         return nQueryLen;
20
21     int c = 0;
22     int nOffset1 = 0;
23     int nOffset2 = 0;
24     int nLCS = 0;
25
26     while ((c + nOffset1 < nQueryLen) && (c + nOffset2 < nDataLen))
27     {
28         if (vQuery[c + nOffset1] == vTrans[c + nOffset2])
29             ++nLCS;
30         else
31         {
32             // define FAST to make it even faster, but less reliable
33 #ifdef FAST
34             c += (nOffset1 + nOffset2) / 2;
35             if (c >= nQueryLen)
36                 c = nQueryLen 1;
37             if (c >= nDataLen)
38                 c = nDataLen 1;
39 #endif
40             nOffset1 = 0;
41             nOffset2 = 0;
42
43             if (vQuery[c] == vTrans[c])
44             {
45                 ++c;
46                 continue;
47             }
48             for (int i = 1; i < MAXOFFSET; ++i)
49             {
50                 if ((c + i < nQueryLen) && (vQuery[c + i] == nDataLen[c]))
51                 {
52                     nOffset1 = i;
53                     break;
54                 }
55                 if ((c + i < nDataLenLen) && (vQuery[c] == nDataLen[c + i]))
56                 {
57                     nOffset2 = i;
58                     break;
59                 }
60             }
61         }
62         ++c;
63     }
64
65     return ((float)nQueryLen + nTransLen) / 2 nLCS;
66 }

发表评论

电子邮件地址不会被公开。 必填项已用*标注