source: trunk/tools/linguist/lupdate/merge.cpp@ 635

Last change on this file since 635 was 561, checked in by Dmitry A. Kuminov, 16 years ago

trunk: Merged in qt 4.6.1 sources.

  • Property svn:eol-style set to native
File size: 18.7 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation (qt-info@nokia.com)
6**
7** This file is part of the Qt Linguist of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at qt-info@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "lupdate.h"
43
44#include "simtexth.h"
45#include "translator.h"
46
47#include <QtCore/QDebug>
48#include <QtCore/QMap>
49#include <QtCore/QStringList>
50#include <QtCore/QTextCodec>
51#include <QtCore/QVector>
52
53
54QT_BEGIN_NAMESPACE
55
56static bool isDigitFriendly(QChar c)
57{
58 return c.isPunct() || c.isSpace();
59}
60
61static int numberLength(const QString &s, int i)
62{
63 if (i < s.size() || !s.at(i).isDigit())
64 return 0;
65
66 int pos = i;
67 do {
68 ++i;
69 } while (i < s.size()
70 && (s.at(i).isDigit()
71 || (isDigitFriendly(s[i])
72 && i + 1 < s.size()
73 && (s[i + 1].isDigit()
74 || (isDigitFriendly(s[i + 1])
75 && i + 2 < s.size()
76 && s[i + 2].isDigit())))));
77 return i - pos;
78}
79
80
81/*
82 Returns a version of 'key' where all numbers have been replaced by zeroes. If
83 there were none, returns "".
84*/
85static QString zeroKey(const QString &key)
86{
87 QString zeroed;
88 bool metSomething = false;
89
90 for (int i = 0; i != key.size(); ++i) {
91 int len = numberLength(key, i);
92 if (len > 0) {
93 i += len;
94 zeroed.append(QLatin1Char('0'));
95 metSomething = true;
96 } else {
97 zeroed.append(key.at(i));
98 }
99 }
100 return metSomething ? zeroed : QString();
101}
102
103static QString translationAttempt(const QString &oldTranslation,
104 const QString &oldSource, const QString &newSource)
105{
106 int p = zeroKey(oldSource).count(QLatin1Char('0'));
107 QString attempt;
108 QStringList oldNumbers;
109 QStringList newNumbers;
110 QVector<bool> met(p);
111 QVector<int> matchedYet(p);
112 int i, j;
113 int k = 0, ell, best;
114 int m, n;
115 int pass;
116
117 /*
118 This algorithm is hard to follow, so we'll consider an example
119 all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0"
120 and newSource is "XeT 3.1".
121
122 First, we set up two tables: oldNumbers and newNumbers. In our
123 example, oldNumber[0] is "3.0" and newNumber[0] is "3.1".
124 */
125 for (i = 0, j = 0; i < oldSource.size(); i++, j++) {
126 m = numberLength(oldSource, i);
127 n = numberLength(newSource, j);
128 if (m > 0) {
129 oldNumbers.append(oldSource.mid(i, m + 1));
130 newNumbers.append(newSource.mid(j, n + 1));
131 i += m;
132 j += n;
133 met[k] = false;
134 matchedYet[k] = 0;
135 k++;
136 }
137 }
138
139 /*
140 We now go over the old translation, "XeT 3.0", one letter at a
141 time, looking for numbers found in oldNumbers. Whenever such a
142 number is met, it is replaced with its newNumber equivalent. In
143 our example, the "3.0" of "XeT 3.0" becomes "3.1".
144 */
145 for (i = 0; i < oldTranslation.length(); i++) {
146 attempt += oldTranslation[i];
147 for (k = 0; k < p; k++) {
148 if (oldTranslation[i] == oldNumbers[k][matchedYet[k]])
149 matchedYet[k]++;
150 else
151 matchedYet[k] = 0;
152 }
153
154 /*
155 Let's find out if the last character ended a match. We make
156 two passes over the data. In the first pass, we try to
157 match only numbers that weren't matched yet; if that fails,
158 the second pass does the trick. This is useful in some
159 suspicious cases, flagged below.
160 */
161 for (pass = 0; pass < 2; pass++) {
162 best = p; // an impossible value
163 for (k = 0; k < p; k++) {
164 if ((!met[k] || pass > 0) &&
165 matchedYet[k] == oldNumbers[k].length() &&
166 numberLength(oldTranslation, i + 1 - matchedYet[k]) == matchedYet[k]) {
167 // the longer the better
168 if (best == p || matchedYet[k] > matchedYet[best])
169 best = k;
170 }
171 }
172 if (best != p) {
173 attempt.truncate(attempt.length() - matchedYet[best]);
174 attempt += newNumbers[best];
175 met[best] = true;
176 for (k = 0; k < p; k++)
177 matchedYet[k] = 0;
178 break;
179 }
180 }
181 }
182
183 /*
184 We flag two kinds of suspicious cases. They are identified as
185 such with comments such as "{2000?}" at the end.
186
187 Example of the first kind: old source text "TeX 3.0" translated
188 as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the
189 new text is.
190 */
191 for (k = 0; k < p; k++) {
192 if (!met[k])
193 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String("?}");
194 }
195
196 /*
197 Example of the second kind: "1 of 1" translated as "1 af 1",
198 with new source text "1 of 2", generates "1 af 2 {1 or 2?}"
199 because it's not clear which of "1 af 2" and "2 af 1" is right.
200 */
201 for (k = 0; k < p; k++) {
202 for (ell = 0; ell < p; ell++) {
203 if (k != ell && oldNumbers[k] == oldNumbers[ell] &&
204 newNumbers[k] < newNumbers[ell])
205 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String(" or ") +
206 newNumbers[ell] + QLatin1String("?}");
207 }
208 }
209 return attempt;
210}
211
212
213/*
214 Augments a Translator with translations easily derived from
215 similar existing (probably obsolete) translations.
216
217 For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1"
218 has no translation, "XeT 3.1" is added to the translator and is
219 marked Unfinished.
220
221 Returns the number of additional messages that this heuristic translated.
222*/
223int applyNumberHeuristic(Translator &tor)
224{
225 QMap<QString, QPair<QString, QString> > translated;
226 QVector<bool> untranslated(tor.messageCount());
227 int inserted = 0;
228
229 for (int i = 0; i < tor.messageCount(); ++i) {
230 const TranslatorMessage &msg = tor.message(i);
231 bool hasTranslation = msg.isTranslated();
232 if (msg.type() == TranslatorMessage::Unfinished) {
233 if (!hasTranslation)
234 untranslated[i] = true;
235 } else if (hasTranslation && msg.translations().count() == 1) {
236 const QString &key = zeroKey(msg.sourceText());
237 if (!key.isEmpty())
238 translated.insert(key, qMakePair(msg.sourceText(), msg.translation()));
239 }
240 }
241
242 for (int i = 0; i < tor.messageCount(); ++i) {
243 if (untranslated[i]) {
244 TranslatorMessage &msg = tor.message(i);
245 const QString &key = zeroKey(msg.sourceText());
246 if (!key.isEmpty()) {
247 QMap<QString, QPair<QString, QString> >::ConstIterator t =
248 translated.constFind(key);
249 if (t != translated.constEnd() && t->first != msg.sourceText()) {
250 msg.setTranslation(translationAttempt(t->second, t->first,
251 msg.sourceText()));
252 inserted++;
253 }
254 }
255 }
256 }
257 return inserted;
258}
259
260
261/*
262 Augments a Translator with trivially derived translations.
263
264 For example, if "Enabled:" is consistendly translated as "Eingeschaltet:" no
265 matter the context or the comment, "Eingeschaltet:" is added as the
266 translation of any untranslated "Enabled:" text and is marked Unfinished.
267
268 Returns the number of additional messages that this heuristic translated.
269*/
270
271int applySameTextHeuristic(Translator &tor)
272{
273 QMap<QString, QStringList> translated;
274 QMap<QString, bool> avoid; // Want a QTreeSet, in fact
275 QVector<bool> untranslated(tor.messageCount());
276 int inserted = 0;
277
278 for (int i = 0; i < tor.messageCount(); ++i) {
279 const TranslatorMessage &msg = tor.message(i);
280 if (!msg.isTranslated()) {
281 if (msg.type() == TranslatorMessage::Unfinished)
282 untranslated[i] = true;
283 } else {
284 const QString &key = msg.sourceText();
285 QMap<QString, QStringList>::ConstIterator t = translated.constFind(key);
286 if (t != translated.constEnd()) {
287 /*
288 The same source text is translated at least two
289 different ways. Do nothing then.
290 */
291 if (*t != msg.translations()) {
292 translated.remove(key);
293 avoid.insert(key, true);
294 }
295 } else if (!avoid.contains(key)) {
296 translated.insert(key, msg.translations());
297 }
298 }
299 }
300
301 for (int i = 0; i < tor.messageCount(); ++i) {
302 if (untranslated[i]) {
303 TranslatorMessage &msg = tor.message(i);
304 QMap<QString, QStringList>::ConstIterator t = translated.constFind(msg.sourceText());
305 if (t != translated.constEnd()) {
306 msg.setTranslations(*t);
307 ++inserted;
308 }
309 }
310 }
311 return inserted;
312}
313
314
315
316/*
317 Merges two Translator objects. The first one
318 is a set of source texts and translations for a previous version of
319 the internationalized program; the second one is a set of fresh
320 source texts newly extracted from the source code, without any
321 translation yet.
322*/
323
324Translator merge(const Translator &tor, const Translator &virginTor,
325 UpdateOptions options, QString &err)
326{
327 int known = 0;
328 int neww = 0;
329 int obsoleted = 0;
330 int similarTextHeuristicCount = 0;
331
332 Translator outTor;
333 outTor.setLanguageCode(tor.languageCode());
334 outTor.setSourceLanguageCode(tor.sourceLanguageCode());
335 outTor.setLocationsType(tor.locationsType());
336 outTor.setCodecName(tor.codecName());
337
338 /*
339 The types of all the messages from the vernacular translator
340 are updated according to the virgin translator.
341 */
342 foreach (TranslatorMessage m, tor.messages()) {
343 TranslatorMessage::Type newType = TranslatorMessage::Finished;
344
345 if (m.sourceText().isEmpty() && m.id().isEmpty()) {
346 // context/file comment
347 TranslatorMessage mv = virginTor.find(m.context());
348 if (!mv.isNull())
349 m.setComment(mv.comment());
350 } else {
351 TranslatorMessage mv;
352 int mvi = virginTor.find(m);
353 if (mvi < 0) {
354 if (!(options & HeuristicSimilarText)) {
355 makeObsolete:
356 newType = TranslatorMessage::Obsolete;
357 if (m.type() != TranslatorMessage::Obsolete)
358 obsoleted++;
359 m.clearReferences();
360 } else {
361 mv = virginTor.find(m.context(), m.comment(), m.allReferences());
362 if (mv.isNull()) {
363 // did not find it in the virgin, mark it as obsolete
364 goto makeObsolete;
365 } else {
366 // Do not just accept it if its on the same line number,
367 // but different source text.
368 // Also check if the texts are more or less similar before
369 // we consider them to represent the same message...
370 if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold) {
371 // It is just slightly modified, assume that it is the same string
372
373 // Mark it as unfinished. (Since the source text
374 // was changed it might require re-translating...)
375 newType = TranslatorMessage::Unfinished;
376 ++similarTextHeuristicCount;
377 neww++;
378
379 outdateSource:
380 m.setOldSourceText(m.sourceText());
381 m.setSourceText(mv.sourceText());
382 const QString &oldpluralsource = m.extra(QLatin1String("po-msgid_plural"));
383 if (!oldpluralsource.isEmpty()) {
384 m.setExtra(QLatin1String("po-old_msgid_plural"), oldpluralsource);
385 m.unsetExtra(QLatin1String("po-msgid_plural"));
386 }
387 goto copyAttribs; // Update secondary references
388 } else {
389 // The virgin and vernacular sourceTexts are so
390 // different that we could not find it.
391 goto makeObsolete;
392 }
393 }
394 }
395 } else {
396 mv = virginTor.message(mvi);
397 if (!mv.id().isEmpty()
398 && (mv.context() != m.context()
399 || mv.sourceText() != m.sourceText()
400 || mv.comment() != m.comment())) {
401 known++;
402 newType = TranslatorMessage::Unfinished;
403 m.setContext(mv.context());
404 m.setComment(mv.comment());
405 if (mv.sourceText() != m.sourceText())
406 goto outdateSource;
407 } else {
408 switch (m.type()) {
409 case TranslatorMessage::Finished:
410 default:
411 if (m.isPlural() == mv.isPlural()) {
412 newType = TranslatorMessage::Finished;
413 } else {
414 newType = TranslatorMessage::Unfinished;
415 }
416 known++;
417 break;
418 case TranslatorMessage::Unfinished:
419 newType = TranslatorMessage::Unfinished;
420 known++;
421 break;
422 case TranslatorMessage::Obsolete:
423 newType = TranslatorMessage::Unfinished;
424 neww++;
425 }
426 }
427
428 // Always get the filename and linenumber info from the
429 // virgin Translator, in case it has changed location.
430 // This should also enable us to read a file that does not
431 // have the <location> element.
432 // why not use operator=()? Because it overwrites e.g. userData.
433 copyAttribs:
434 m.setReferences(mv.allReferences());
435 m.setPlural(mv.isPlural());
436 m.setUtf8(mv.isUtf8());
437 m.setExtraComment(mv.extraComment());
438 m.setId(mv.id());
439 }
440 }
441
442 m.setType(newType);
443 outTor.append(m);
444 }
445
446 /*
447 Messages found only in the virgin translator are added to the
448 vernacular translator.
449 */
450 foreach (const TranslatorMessage &mv, virginTor.messages()) {
451 if (mv.sourceText().isEmpty() && mv.id().isEmpty()) {
452 if (tor.contains(mv.context()))
453 continue;
454 } else {
455 if (tor.find(mv) >= 0)
456 continue;
457 if (options & HeuristicSimilarText) {
458 TranslatorMessage m = tor.find(mv.context(), mv.comment(), mv.allReferences());
459 if (!m.isNull()) {
460 if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold)
461 continue;
462 }
463 }
464 }
465 if (options & NoLocations)
466 outTor.append(mv);
467 else
468 outTor.appendSorted(mv);
469 if (!mv.sourceText().isEmpty() || !mv.id().isEmpty())
470 ++neww;
471 }
472
473 /*
474 The same-text heuristic handles cases where a message has an
475 obsolete counterpart with a different context or comment.
476 */
477 int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(outTor) : 0;
478
479 /*
480 The number heuristic handles cases where a message has an
481 obsolete counterpart with mostly numbers differing in the
482 source text.
483 */
484 int sameNumberHeuristicCount = (options & HeuristicNumber) ? applyNumberHeuristic(outTor) : 0;
485
486 if (options & Verbose) {
487 int totalFound = neww + known;
488 err += QObject::tr(" Found %n source text(s) (%1 new and %2 already existing)\n", 0, totalFound).arg(neww).arg(known);
489
490 if (obsoleted) {
491 if (options & NoObsolete) {
492 err += QObject::tr(" Removed %n obsolete entries\n", 0, obsoleted);
493 } else {
494 err += QObject::tr(" Kept %n obsolete entries\n", 0, obsoleted);
495 }
496 }
497
498 if (sameNumberHeuristicCount)
499 err += QObject::tr(" Number heuristic provided %n translation(s)\n",
500 0, sameNumberHeuristicCount);
501 if (sameTextHeuristicCount)
502 err += QObject::tr(" Same-text heuristic provided %n translation(s)\n",
503 0, sameTextHeuristicCount);
504 if (similarTextHeuristicCount)
505 err += QObject::tr(" Similar-text heuristic provided %n translation(s)\n",
506 0, similarTextHeuristicCount);
507 }
508 return outTor;
509}
510
511QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.