source: trunk/tools/linguist/shared/simtexth.cpp@ 779

Last change on this file since 779 was 651, checked in by Dmitry A. Kuminov, 16 years ago

trunk: Merged in qt 4.6.2 sources.

File size: 9.7 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation (qt-info@nokia.com)
6**
7** This file is part of the Qt Linguist of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at qt-info@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "simtexth.h"
43#include "translator.h"
44
45#include <QtCore/QByteArray>
46#include <QtCore/QString>
47#include <QtCore/QList>
48
49
50QT_BEGIN_NAMESPACE
51
52typedef QList<TranslatorMessage> TML;
53
54/*
55 How similar are two texts? The approach used here relies on co-occurrence
56 matrices and is very efficient.
57
58 Let's see with an example: how similar are "here" and "hither"? The
59 co-occurrence matrix M for "here" is M[h,e] = 1, M[e,r] = 1, M[r,e] = 1, and 0
60 elsewhere; the matrix N for "hither" is N[h,i] = 1, N[i,t] = 1, ...,
61 N[h,e] = 1, N[e,r] = 1, and 0 elsewhere. The union U of both matrices is the
62 matrix U[i,j] = max { M[i,j], N[i,j] }, and the intersection V is
63 V[i,j] = min { M[i,j], N[i,j] }. The score for a pair of texts is
64
65 score = (sum of V[i,j] over all i, j) / (sum of U[i,j] over all i, j),
66
67 a formula suggested by Arnt Gulbrandsen. Here we have
68
69 score = 2 / 6,
70
71 or one third.
72
73 The implementation differs from this in a few details. Most importantly,
74 repetitions are ignored; for input "xxx", M[x,x] equals 1, not 2.
75*/
76
77/*
78 Every character is assigned to one of 20 buckets so that the co-occurrence
79 matrix requires only 20 * 20 = 400 bits, not 256 * 256 = 65536 bits or even
80 more if we want the whole Unicode. Which character falls in which bucket is
81 arbitrary.
82
83 The second half of the table is a replica of the first half, because of
84 laziness.
85*/
86static const int indexOf[256] = {
87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89// ! " # $ % & ' ( ) * + , - . /
90 0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
91// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
92 1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
93// @ A B C D E F G H I J K L M N O
94 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
95// P Q R S T U V W X Y Z [ \ ] ^ _
96 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
97// ` a b c d e f g h i j k l m n o
98 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
99// p q r s t u v w x y z { | } ~
100 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
101
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
105 1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
106 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
107 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
108 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
109 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0
110};
111
112/*
113 The entry bitCount[i] (for i between 0 and 255) is the number of bits used to
114 represent i in binary.
115*/
116static const int bitCount[256] = {
117 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
118 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
119 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
120 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
121 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
122 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
123 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
124 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
125 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
126 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
127 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
128 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
129 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
130 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
131 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
132 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
133};
134
135struct CoMatrix
136{
137 /*
138 The matrix has 20 * 20 = 400 entries. This requires 50 bytes, or 13
139 words. Some operations are performed on words for more efficiency.
140 */
141 union {
142 quint8 b[52];
143 quint32 w[13];
144 };
145
146 CoMatrix() { memset( b, 0, 52 ); }
147
148 CoMatrix(const QString &str)
149 {
150 QByteArray ba = str.toUtf8();
151 const char *text = ba.constData();
152 char c = '\0', d;
153 memset( b, 0, 52 );
154 /*
155 The Knuth books are not in the office only for show; they help make
156 loops 30% faster and 20% as readable.
157 */
158 while ( (d = *text) != '\0' ) {
159 setCoOccurence( c, d );
160 if ( (c = *++text) != '\0' ) {
161 setCoOccurence( d, c );
162 text++;
163 }
164 }
165 }
166
167 void setCoOccurence( char c, char d ) {
168 int k = indexOf[(uchar) c] + 20 * indexOf[(uchar) d];
169 b[k >> 3] |= (1 << (k & 0x7));
170 }
171
172 int worth() const {
173 int w = 0;
174 for ( int i = 0; i < 50; i++ )
175 w += bitCount[b[i]];
176 return w;
177 }
178};
179
180static inline CoMatrix reunion(const CoMatrix &m, const CoMatrix &n)
181{
182 CoMatrix p;
183 for (int i = 0; i < 13; ++i)
184 p.w[i] = m.w[i] | n.w[i];
185 return p;
186}
187
188static inline CoMatrix intersection(const CoMatrix &m, const CoMatrix &n)
189{
190 CoMatrix p;
191 for (int i = 0; i < 13; ++i)
192 p.w[i] = m.w[i] & n.w[i];
193 return p;
194}
195
196StringSimilarityMatcher::StringSimilarityMatcher(const QString &stringToMatch)
197{
198 m_cm = new CoMatrix(stringToMatch);
199 m_length = stringToMatch.length();
200}
201
202int StringSimilarityMatcher::getSimilarityScore(const QString &strCandidate)
203{
204 CoMatrix cmTarget(strCandidate);
205 int delta = qAbs(m_length - strCandidate.size());
206 int score = ( (intersection(*m_cm, cmTarget).worth() + 1) << 10 ) /
207 ( reunion(*m_cm, cmTarget).worth() + (delta << 1) + 1 );
208 return score;
209}
210
211StringSimilarityMatcher::~StringSimilarityMatcher()
212{
213 delete m_cm;
214}
215
216/**
217 * Checks how similar two strings are.
218 * The return value is the score, and a higher score is more similar
219 * than one with a low score.
220 * Linguist considers a score over 190 to be a good match.
221 * \sa StringSimilarityMatcher
222 */
223int getSimilarityScore(const QString &str1, const QString &str2)
224{
225 CoMatrix cmTarget(str2);
226 CoMatrix cm(str1);
227 int delta = qAbs(str1.size() - str2.size());
228
229 int score = ( (intersection(cm, cmTarget).worth() + 1) << 10 )
230 / ( reunion(cm, cmTarget).worth() + (delta << 1) + 1 );
231
232 return score;
233}
234
235CandidateList similarTextHeuristicCandidates(const Translator *tor,
236 const QString &text, int maxCandidates)
237{
238 QList<int> scores;
239 CandidateList candidates;
240
241 TML all = tor->translatedMessages();
242
243 foreach (const TranslatorMessage &mtm, all) {
244 if (mtm.type() == TranslatorMessage::Unfinished
245 || mtm.translation().isEmpty())
246 continue;
247
248 QString s = mtm.sourceText();
249 int score = getSimilarityScore(s, text);
250
251 if (candidates.size() == maxCandidates && score > scores[maxCandidates - 1] )
252 candidates.removeLast();
253
254 if (candidates.size() < maxCandidates && score >= textSimilarityThreshold) {
255 Candidate cand( s, mtm.translation() );
256
257 int i;
258 for (i = 0; i < candidates.size(); i++) {
259 if (score >= scores.at(i)) {
260 if (score == scores.at(i)) {
261 if (candidates.at(i) == cand)
262 goto continue_outer_loop;
263 } else {
264 break;
265 }
266 }
267 }
268 scores.insert(i, score);
269 candidates.insert(i, cand);
270 }
271 continue_outer_loop:
272 ;
273 }
274 return candidates;
275}
276
277QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.