source: trunk/tools/linguist/lupdate/merge.cpp@ 1147

Last change on this file since 1147 was 846, checked in by Dmitry A. Kuminov, 14 years ago

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

  • Property svn:eol-style set to native
File size: 18.7 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation (qt-info@nokia.com)
6**
7** This file is part of the Qt Linguist of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at qt-info@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "lupdate.h"
43
44#include "simtexth.h"
45#include "translator.h"
46
47#include <QtCore/QCoreApplication>
48#include <QtCore/QDebug>
49#include <QtCore/QMap>
50#include <QtCore/QStringList>
51#include <QtCore/QTextCodec>
52#include <QtCore/QVector>
53
54QT_BEGIN_NAMESPACE
55
56class LU {
57 Q_DECLARE_TR_FUNCTIONS(LUpdate)
58};
59
60static bool isDigitFriendly(QChar c)
61{
62 return c.isPunct() || c.isSpace();
63}
64
65static int numberLength(const QString &s, int i)
66{
67 if (i >= s.size() || !s.at(i).isDigit())
68 return 0;
69
70 int pos = i;
71 do {
72 ++i;
73 } while (i < s.size()
74 && (s.at(i).isDigit()
75 || (isDigitFriendly(s[i])
76 && i + 1 < s.size()
77 && (s[i + 1].isDigit()
78 || (isDigitFriendly(s[i + 1])
79 && i + 2 < s.size()
80 && s[i + 2].isDigit())))));
81 return i - pos;
82}
83
84
85/*
86 Returns a version of 'key' where all numbers have been replaced by zeroes. If
87 there were none, returns "".
88*/
89static QString zeroKey(const QString &key)
90{
91 QString zeroed;
92 bool metSomething = false;
93
94 for (int i = 0; i < key.size(); ++i) {
95 int len = numberLength(key, i);
96 if (len > 0) {
97 i += len;
98 zeroed.append(QLatin1Char('0'));
99 metSomething = true;
100 } else {
101 zeroed.append(key.at(i));
102 }
103 }
104 return metSomething ? zeroed : QString();
105}
106
107static QString translationAttempt(const QString &oldTranslation,
108 const QString &oldSource, const QString &newSource)
109{
110 int p = zeroKey(oldSource).count(QLatin1Char('0'));
111 QString attempt;
112 QStringList oldNumbers;
113 QStringList newNumbers;
114 QVector<bool> met(p);
115 QVector<int> matchedYet(p);
116 int i, j;
117 int k = 0, ell, best;
118 int m, n;
119 int pass;
120
121 /*
122 This algorithm is hard to follow, so we'll consider an example
123 all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0"
124 and newSource is "XeT 3.1".
125
126 First, we set up two tables: oldNumbers and newNumbers. In our
127 example, oldNumber[0] is "3.0" and newNumber[0] is "3.1".
128 */
129 for (i = 0, j = 0; i < oldSource.size(); i++, j++) {
130 m = numberLength(oldSource, i);
131 n = numberLength(newSource, j);
132 if (m > 0) {
133 oldNumbers.append(oldSource.mid(i, m + 1));
134 newNumbers.append(newSource.mid(j, n + 1));
135 i += m;
136 j += n;
137 met[k] = false;
138 matchedYet[k] = 0;
139 k++;
140 }
141 }
142
143 /*
144 We now go over the old translation, "XeT 3.0", one letter at a
145 time, looking for numbers found in oldNumbers. Whenever such a
146 number is met, it is replaced with its newNumber equivalent. In
147 our example, the "3.0" of "XeT 3.0" becomes "3.1".
148 */
149 for (i = 0; i < oldTranslation.length(); i++) {
150 attempt += oldTranslation[i];
151 for (k = 0; k < p; k++) {
152 if (oldTranslation[i] == oldNumbers[k][matchedYet[k]])
153 matchedYet[k]++;
154 else
155 matchedYet[k] = 0;
156 }
157
158 /*
159 Let's find out if the last character ended a match. We make
160 two passes over the data. In the first pass, we try to
161 match only numbers that weren't matched yet; if that fails,
162 the second pass does the trick. This is useful in some
163 suspicious cases, flagged below.
164 */
165 for (pass = 0; pass < 2; pass++) {
166 best = p; // an impossible value
167 for (k = 0; k < p; k++) {
168 if ((!met[k] || pass > 0) &&
169 matchedYet[k] == oldNumbers[k].length() &&
170 numberLength(oldTranslation, i + 1 - matchedYet[k]) == matchedYet[k]) {
171 // the longer the better
172 if (best == p || matchedYet[k] > matchedYet[best])
173 best = k;
174 }
175 }
176 if (best != p) {
177 attempt.truncate(attempt.length() - matchedYet[best]);
178 attempt += newNumbers[best];
179 met[best] = true;
180 for (k = 0; k < p; k++)
181 matchedYet[k] = 0;
182 break;
183 }
184 }
185 }
186
187 /*
188 We flag two kinds of suspicious cases. They are identified as
189 such with comments such as "{2000?}" at the end.
190
191 Example of the first kind: old source text "TeX 3.0" translated
192 as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the
193 new text is.
194 */
195 for (k = 0; k < p; k++) {
196 if (!met[k])
197 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String("?}");
198 }
199
200 /*
201 Example of the second kind: "1 of 1" translated as "1 af 1",
202 with new source text "1 of 2", generates "1 af 2 {1 or 2?}"
203 because it's not clear which of "1 af 2" and "2 af 1" is right.
204 */
205 for (k = 0; k < p; k++) {
206 for (ell = 0; ell < p; ell++) {
207 if (k != ell && oldNumbers[k] == oldNumbers[ell] &&
208 newNumbers[k] < newNumbers[ell])
209 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String(" or ") +
210 newNumbers[ell] + QLatin1String("?}");
211 }
212 }
213 return attempt;
214}
215
216
217/*
218 Augments a Translator with translations easily derived from
219 similar existing (probably obsolete) translations.
220
221 For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1"
222 has no translation, "XeT 3.1" is added to the translator and is
223 marked Unfinished.
224
225 Returns the number of additional messages that this heuristic translated.
226*/
227int applyNumberHeuristic(Translator &tor)
228{
229 QMap<QString, QPair<QString, QString> > translated;
230 QVector<bool> untranslated(tor.messageCount());
231 int inserted = 0;
232
233 for (int i = 0; i < tor.messageCount(); ++i) {
234 const TranslatorMessage &msg = tor.message(i);
235 bool hasTranslation = msg.isTranslated();
236 if (msg.type() == TranslatorMessage::Unfinished) {
237 if (!hasTranslation)
238 untranslated[i] = true;
239 } else if (hasTranslation && msg.translations().count() == 1) {
240 const QString &key = zeroKey(msg.sourceText());
241 if (!key.isEmpty())
242 translated.insert(key, qMakePair(msg.sourceText(), msg.translation()));
243 }
244 }
245
246 for (int i = 0; i < tor.messageCount(); ++i) {
247 if (untranslated[i]) {
248 TranslatorMessage &msg = tor.message(i);
249 const QString &key = zeroKey(msg.sourceText());
250 if (!key.isEmpty()) {
251 QMap<QString, QPair<QString, QString> >::ConstIterator t =
252 translated.constFind(key);
253 if (t != translated.constEnd() && t->first != msg.sourceText()) {
254 msg.setTranslation(translationAttempt(t->second, t->first,
255 msg.sourceText()));
256 inserted++;
257 }
258 }
259 }
260 }
261 return inserted;
262}
263
264
265/*
266 Augments a Translator with trivially derived translations.
267
268 For example, if "Enabled:" is consistendly translated as "Eingeschaltet:" no
269 matter the context or the comment, "Eingeschaltet:" is added as the
270 translation of any untranslated "Enabled:" text and is marked Unfinished.
271
272 Returns the number of additional messages that this heuristic translated.
273*/
274
275int applySameTextHeuristic(Translator &tor)
276{
277 QMap<QString, QStringList> translated;
278 QMap<QString, bool> avoid; // Want a QTreeSet, in fact
279 QVector<bool> untranslated(tor.messageCount());
280 int inserted = 0;
281
282 for (int i = 0; i < tor.messageCount(); ++i) {
283 const TranslatorMessage &msg = tor.message(i);
284 if (!msg.isTranslated()) {
285 if (msg.type() == TranslatorMessage::Unfinished)
286 untranslated[i] = true;
287 } else {
288 const QString &key = msg.sourceText();
289 QMap<QString, QStringList>::ConstIterator t = translated.constFind(key);
290 if (t != translated.constEnd()) {
291 /*
292 The same source text is translated at least two
293 different ways. Do nothing then.
294 */
295 if (*t != msg.translations()) {
296 translated.remove(key);
297 avoid.insert(key, true);
298 }
299 } else if (!avoid.contains(key)) {
300 translated.insert(key, msg.translations());
301 }
302 }
303 }
304
305 for (int i = 0; i < tor.messageCount(); ++i) {
306 if (untranslated[i]) {
307 TranslatorMessage &msg = tor.message(i);
308 QMap<QString, QStringList>::ConstIterator t = translated.constFind(msg.sourceText());
309 if (t != translated.constEnd()) {
310 msg.setTranslations(*t);
311 ++inserted;
312 }
313 }
314 }
315 return inserted;
316}
317
318
319
320/*
321 Merges two Translator objects. The first one
322 is a set of source texts and translations for a previous version of
323 the internationalized program; the second one is a set of fresh
324 source texts newly extracted from the source code, without any
325 translation yet.
326*/
327
328Translator merge(const Translator &tor, const Translator &virginTor,
329 UpdateOptions options, QString &err)
330{
331 int known = 0;
332 int neww = 0;
333 int obsoleted = 0;
334 int similarTextHeuristicCount = 0;
335
336 Translator outTor;
337 outTor.setLanguageCode(tor.languageCode());
338 outTor.setSourceLanguageCode(tor.sourceLanguageCode());
339 outTor.setLocationsType(tor.locationsType());
340 outTor.setCodecName(tor.codecName());
341
342 /*
343 The types of all the messages from the vernacular translator
344 are updated according to the virgin translator.
345 */
346 foreach (TranslatorMessage m, tor.messages()) {
347 TranslatorMessage::Type newType = TranslatorMessage::Finished;
348
349 if (m.sourceText().isEmpty() && m.id().isEmpty()) {
350 // context/file comment
351 TranslatorMessage mv = virginTor.find(m.context());
352 if (!mv.isNull())
353 m.setComment(mv.comment());
354 } else {
355 TranslatorMessage mv;
356 int mvi = virginTor.find(m);
357 if (mvi < 0) {
358 if (!(options & HeuristicSimilarText)) {
359 makeObsolete:
360 newType = TranslatorMessage::Obsolete;
361 if (m.type() != TranslatorMessage::Obsolete)
362 obsoleted++;
363 m.clearReferences();
364 } else {
365 mv = virginTor.find(m.context(), m.comment(), m.allReferences());
366 if (mv.isNull()) {
367 // did not find it in the virgin, mark it as obsolete
368 goto makeObsolete;
369 } else {
370 // Do not just accept it if its on the same line number,
371 // but different source text.
372 // Also check if the texts are more or less similar before
373 // we consider them to represent the same message...
374 if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold) {
375 // It is just slightly modified, assume that it is the same string
376
377 // Mark it as unfinished. (Since the source text
378 // was changed it might require re-translating...)
379 newType = TranslatorMessage::Unfinished;
380 ++similarTextHeuristicCount;
381 neww++;
382
383 outdateSource:
384 m.setOldSourceText(m.sourceText());
385 m.setSourceText(mv.sourceText());
386 const QString &oldpluralsource = m.extra(QLatin1String("po-msgid_plural"));
387 if (!oldpluralsource.isEmpty()) {
388 m.setExtra(QLatin1String("po-old_msgid_plural"), oldpluralsource);
389 m.unsetExtra(QLatin1String("po-msgid_plural"));
390 }
391 goto copyAttribs; // Update secondary references
392 } else {
393 // The virgin and vernacular sourceTexts are so
394 // different that we could not find it.
395 goto makeObsolete;
396 }
397 }
398 }
399 } else {
400 mv = virginTor.message(mvi);
401 if (!mv.id().isEmpty()
402 && (mv.context() != m.context()
403 || mv.sourceText() != m.sourceText()
404 || mv.comment() != m.comment())) {
405 known++;
406 newType = TranslatorMessage::Unfinished;
407 m.setContext(mv.context());
408 m.setComment(mv.comment());
409 if (mv.sourceText() != m.sourceText())
410 goto outdateSource;
411 } else {
412 switch (m.type()) {
413 case TranslatorMessage::Finished:
414 default:
415 if (m.isPlural() == mv.isPlural()) {
416 newType = TranslatorMessage::Finished;
417 } else {
418 newType = TranslatorMessage::Unfinished;
419 }
420 known++;
421 break;
422 case TranslatorMessage::Unfinished:
423 newType = TranslatorMessage::Unfinished;
424 known++;
425 break;
426 case TranslatorMessage::Obsolete:
427 newType = TranslatorMessage::Unfinished;
428 neww++;
429 }
430 }
431
432 // Always get the filename and linenumber info from the
433 // virgin Translator, in case it has changed location.
434 // This should also enable us to read a file that does not
435 // have the <location> element.
436 // why not use operator=()? Because it overwrites e.g. userData.
437 copyAttribs:
438 m.setReferences(mv.allReferences());
439 m.setPlural(mv.isPlural());
440 m.setUtf8(mv.isUtf8());
441 m.setExtraComment(mv.extraComment());
442 m.setId(mv.id());
443 }
444 }
445
446 m.setType(newType);
447 outTor.append(m);
448 }
449
450 /*
451 Messages found only in the virgin translator are added to the
452 vernacular translator.
453 */
454 foreach (const TranslatorMessage &mv, virginTor.messages()) {
455 if (mv.sourceText().isEmpty() && mv.id().isEmpty()) {
456 if (tor.contains(mv.context()))
457 continue;
458 } else {
459 if (tor.find(mv) >= 0)
460 continue;
461 if (options & HeuristicSimilarText) {
462 TranslatorMessage m = tor.find(mv.context(), mv.comment(), mv.allReferences());
463 if (!m.isNull()) {
464 if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold)
465 continue;
466 }
467 }
468 }
469 if (options & NoLocations)
470 outTor.append(mv);
471 else
472 outTor.appendSorted(mv);
473 if (!mv.sourceText().isEmpty() || !mv.id().isEmpty())
474 ++neww;
475 }
476
477 /*
478 The same-text heuristic handles cases where a message has an
479 obsolete counterpart with a different context or comment.
480 */
481 int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(outTor) : 0;
482
483 /*
484 The number heuristic handles cases where a message has an
485 obsolete counterpart with mostly numbers differing in the
486 source text.
487 */
488 int sameNumberHeuristicCount = (options & HeuristicNumber) ? applyNumberHeuristic(outTor) : 0;
489
490 if (options & Verbose) {
491 int totalFound = neww + known;
492 err += LU::tr(" Found %n source text(s) (%1 new and %2 already existing)\n", 0, totalFound).arg(neww).arg(known);
493
494 if (obsoleted) {
495 if (options & NoObsolete) {
496 err += LU::tr(" Removed %n obsolete entries\n", 0, obsoleted);
497 } else {
498 err += LU::tr(" Kept %n obsolete entries\n", 0, obsoleted);
499 }
500 }
501
502 if (sameNumberHeuristicCount)
503 err += LU::tr(" Number heuristic provided %n translation(s)\n",
504 0, sameNumberHeuristicCount);
505 if (sameTextHeuristicCount)
506 err += LU::tr(" Same-text heuristic provided %n translation(s)\n",
507 0, sameTextHeuristicCount);
508 if (similarTextHeuristicCount)
509 err += LU::tr(" Similar-text heuristic provided %n translation(s)\n",
510 0, similarTextHeuristicCount);
511 }
512 return outTor;
513}
514
515QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.