source: vendor/trolltech/current/src/codecs/qrtlcodec.cpp

Last change on this file was 2, checked in by dmik, 20 years ago

Imported xplatform parts of the official release 3.3.1 from Trolltech

  • Property svn:keywords set to Id
File size: 18.0 KB
Line 
1/****************************************************************************
2** $Id: qrtlcodec.cpp 2 2005-11-16 15:49:26Z dmik $
3**
4** Implementation of QTextCodec class
5**
6** Created : 981015
7**
8** Copyright (C) 1998-2002 Trolltech AS. All rights reserved.
9**
10** This file is part of the tools module of the Qt GUI Toolkit.
11**
12** This file may be distributed under the terms of the Q Public License
13** as defined by Trolltech AS of Norway and appearing in the file
14** LICENSE.QPL included in the packaging of this file.
15**
16** This file may be distributed and/or modified under the terms of the
17** GNU General Public License version 2 as published by the Free Software
18** Foundation and appearing in the file LICENSE.GPL included in the
19** packaging of this file.
20**
21** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22** licenses may use this file in accordance with the Qt Commercial License
23** Agreement provided with the Software.
24**
25** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27**
28** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29** information about Qt Commercial License Agreements.
30** See http://www.trolltech.com/qpl/ for QPL licensing information.
31** See http://www.trolltech.com/gpl/ for GPL licensing information.
32**
33** Contact info@trolltech.com if any conditions of this licensing are
34** not clear to you.
35**
36**********************************************************************/
37
38#include "qrtlcodec.h"
39#include <private/qtextengine_p.h>
40
41#ifndef QT_NO_CODEC_HEBREW
42
43// NOT REVISED
44
45static const uchar unkn = '?'; // BLACK SQUARE (94) would be better
46
47static const ushort heb_to_unicode[128] = {
48 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
49 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
50 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
51 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
52 0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
53 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x203E,
54 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
55 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFD,
56 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
57 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
58 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
59 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x2017,
60 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
61 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
62 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
63 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
64};
65
66static const uchar unicode_to_heb_00[32] = {
67 0xA0, unkn, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
68 0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, unkn,
69 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
70 0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, unkn,
71};
72
73static const uchar unicode_to_heb_05[32] = {
74 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
75 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
76 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
77 0xF8, 0xF9, 0xFA, unkn, unkn, unkn, unkn, unkn
78};
79
80static bool to8bit(const QChar ch, QCString *rstr)
81{
82 bool converted = FALSE;
83
84 if( ch.isMark() ) return TRUE; // ignore marks for conversion
85
86 if ( ch.row() ) {
87 if ( ch.row() == 0x05 ) {
88 if ( ch.cell() > 0x91 )
89 converted = TRUE;
90 // 0x0591 - 0x05cf: Hebrew punctuation... dropped
91 if ( ch.cell() >= 0xD0 )
92 *rstr += (char)unicode_to_heb_05[ch.cell()- 0xD0];
93 } else if ( ch.row() == 0x20 ) {
94 if ( ch.cell() == 0x3E ) {
95 *rstr += (char)0xAF;
96 converted = TRUE;
97 } else if ( ch.cell() == 0x17 ) {
98 *rstr += (char)0xCF;
99 converted = TRUE;
100 }
101 } else {
102 converted = FALSE;
103 }
104 } else {
105 if ( ch.cell() < 0x80 ) {
106 *rstr += (char)ch.cell();
107 converted = TRUE;
108 } else if( ch.cell() < 0xA0 ) {
109 *rstr += (char)unicode_to_heb_00[ch.cell() - 0xA0];
110 converted = TRUE;
111 }
112 }
113
114 if(converted) return TRUE;
115
116 // couldn't convert the char... lets try its decomposition
117 QString d = ch.decomposition();
118 if(d.isNull())
119 return FALSE;
120
121 int l = d.length();
122 for (int i=0; i<l; i++) {
123 const QChar ch = d[i];
124
125 if(to8bit(ch, rstr))
126 converted = TRUE;
127 }
128
129 return converted;
130}
131
132#if 0
133static QString run(const QString &input, unsigned int from, unsigned int to, QChar::Direction runDir)
134{
135 if ( to <= from )
136 return QString::null;
137
138 QString out;
139 if ( runDir == QChar::DirR ) {
140 const QChar *ch = input.unicode() + to - 1;
141 int len = to - from;
142 while (len--) {
143 out += *ch;
144 ch--;
145 }
146 } else {
147 out = input.mid(from, to - from );
148 }
149 return out;
150}
151
152/*
153 we might do better here, but I'm currently not sure if it's worth the effort. It will hopefully convert
154 90% of the visually ordered Hebrew correctly.
155*/
156static QString reverseLine(const QString &str, unsigned int from, unsigned int to, QChar::Direction dir)
157{
158 QString out;
159
160 if ( to <= from ) {
161 out += str.at(from);
162 return out;
163 }
164
165 // since we don't have embedding marks, we get around with bidi levels up to 2.
166
167 // simple case: dir = RTL:
168 // go through the line from right to left, and reverse all continuous Hebrew strings.
169 if ( dir == QChar::DirR ) {
170 unsigned int pos = to;
171 to = from;
172 from = pos;
173 QChar::Direction runDir = QChar::DirON;
174
175 while ( pos > to ) {
176 QChar::Direction d = str.at(pos).direction();
177 switch ( d ) {
178 case QChar::DirL:
179 case QChar::DirAN:
180 case QChar::DirEN:
181 if ( runDir != QChar::DirL ) {
182 out += run( str, pos, from, runDir );
183 from = pos - 1;
184 }
185 runDir = QChar::DirL;
186 break;
187 case QChar::DirON:
188 if ( runDir == QChar::DirON ) {
189 runDir = QChar::DirR;
190 break;
191 }
192 // fall through
193 case QChar::DirR:
194 if ( runDir != QChar::DirR ) {
195 out += run( str, pos, from, runDir );
196 from = pos - 1;
197 }
198 runDir = QChar::DirR;
199 default:
200 break;
201 }
202 pos--;
203 }
204 out += run( str, pos, from, runDir );
205 } else {
206 // basicDir == DirL. A bit more complicated, as we might need to reverse two times for numbers.
207 unsigned int pos = from;
208 QChar::Direction runDir = QChar::DirON;
209
210 // first reversing. Ignore numbers
211 while ( pos < to ) {
212 QChar::Direction d = str.at(pos).direction();
213 switch ( d ) {
214 case QChar::DirL:
215 if ( runDir != QChar::DirL && runDir != QChar::DirON ) {
216 out += run( str, from, pos, runDir );
217 qDebug( "out = %s", out.latin1() );
218 from = pos;
219 }
220 runDir = QChar::DirL;
221 break;
222 case QChar::DirON:
223 if ( runDir == QChar::DirON ) {
224 runDir = QChar::DirL;
225 break;
226 }
227 // fall through
228 case QChar::DirR:
229 case QChar::DirAN:
230 case QChar::DirEN:
231 if ( runDir != QChar::DirR && runDir != QChar::DirON ) {
232 out += run( str, from, pos, runDir );
233 qDebug( "out = %s", out.latin1() );
234 from = pos;
235 }
236 runDir = QChar::DirR;
237 default:
238 break;
239 }
240 pos++;
241 }
242 out += run( str, from, pos, runDir );
243 qDebug( "out = %s", out.latin1() );
244 // second reversing for numbers
245 QString in = out;
246 out = "";
247 pos = 0;
248 from = 0;
249 to = in.length() - 1;
250 runDir = QChar::DirON;
251 while ( pos < to ) {
252 QChar::Direction d = str.at(pos).direction();
253 switch ( d ) {
254 case QChar::DirL:
255 case QChar::DirON:
256 case QChar::DirR:
257 if ( runDir == QChar::DirEN && runDir != QChar::DirON ) {
258 out += run( in, from, pos, QChar::DirR ); //DirR ensures reversing
259 qDebug( "out = %s", out.latin1() );
260 runDir = QChar::DirR;
261 from = pos;
262 }
263 runDir = QChar::DirL;
264 break;
265 case QChar::DirAN:
266 case QChar::DirEN:
267 if ( runDir != QChar::DirEN && runDir != QChar::DirON ) {
268 out += in.mid(from, pos-from+1);
269 qDebug( "out = %s", out.latin1() );
270 from = pos;
271 }
272 runDir = QChar::DirEN;
273 default:
274 break;
275 }
276 pos++;
277 }
278 out += run( str, from, pos, runDir );
279
280 }
281 return out;
282}
283#endif
284
285/* this function assuems the QString is still visually ordered.
286 * Finding the basic direction of the text is not easy in this case, since
287 * a string like "my friend MOLAHS" could (in logical order) mean aswell
288 * "SHALOM my friend" or "my friend SHALOM", depending on the basic direction
289 * one assumes for the text.
290 *
291 * So this function uses some heuristics to find the right answer...
292 */
293static QChar::Direction findBasicDirection(QString str)
294{
295 unsigned int pos;
296 unsigned int len = str.length();
297 QChar::Direction dir1 = QChar::DirON;
298 QChar::Direction dir2 = QChar::DirON;
299
300 unsigned int startLine = 0;
301 // If the visual representation of the first line starts and ends with the same
302 // directionality, we know the answer.
303 pos = 0;
304 while (pos < len) {
305 if ( str.at(pos) == '\n' )
306 startLine = pos;
307 if (str.at(pos).direction() < 2) { // DirR or DirL
308 dir1 = str.at(pos).direction();
309 break;
310 }
311 pos++;
312 }
313
314 if( pos == len ) // no directional chars, assume QChar::DirL
315 return QChar::DirL;
316
317 // move to end of line
318 while( pos < len && str.at(pos) != '\n' )
319 pos++;
320
321 while (pos > startLine) {
322 if (str.at(pos).direction() < 2) { // DirR or DirL
323 dir2 = str.at(pos).direction();
324 break;
325 }
326 pos--;
327 }
328
329 // both are the same, so we have the direction!
330 if ( dir1 == dir2 ) return dir1;
331
332 // guess with the help of punktuation marks...
333 // if the sentence ends with a punktuation, we should have a mark
334 // at one side of the text...
335
336 pos = 0;
337 while (pos < len-1 ) {
338 if(str.at(pos).category() == QChar::Punctuation_Other) {
339 if( str.at(pos) != (char)0xbf && str.at(pos) != (char)0xa1 ) // spanish inverted question and exclamation mark
340 if( str.at(pos+1).direction() < 2 ) return QChar::DirR;
341 }
342 pos++;
343 }
344
345 pos = len;
346 while (pos < 1 && str.at(pos).direction() < 2 ) {
347 if(str.at(pos).category() == QChar::Punctuation_Other) {
348 if( str.at(pos-1).direction() < 2 ) return QChar::DirL;
349 }
350 pos--;
351 }
352
353 // don't know try DirR...
354 return QChar::DirR;
355}
356
357
358/*!
359 \class QHebrewCodec qrtlcodec.h
360 \reentrant
361 \ingroup i18n
362
363 \brief The QHebrewCodec class provides conversion to and from
364 visually ordered Hebrew.
365
366 Hebrew as a semitic language is written from right to left.
367 Because older computer systems couldn't handle reordering a string
368 so that the first letter appears on the right, many older
369 documents were encoded in visual order, so that the first letter
370 of a line is the rightmost one in the string.
371
372 In contrast to this, Unicode defines characters to be in logical
373 order (the order you would read the string). This codec tries to
374 convert visually ordered Hebrew (8859-8) to Unicode. This might
375 not always work perfectly, because reversing the \e bidi
376 (bi-directional) algorithm that transforms from logical to visual
377 order is non-trivial.
378
379 Transformation from Unicode to visual Hebrew (8859-8) is done
380 using the bidi algorithm in Qt, and will produce correct results,
381 so long as the codec is given the text a whole paragraph at a
382 time. Places where newlines are supposed to go can be indicated by
383 a newline character ('\n'). Note that these newline characters
384 change the reordering behaviour of the algorithm, since the bidi
385 reordering only takes place within one line of text, whereas
386 line breaks are determined in visual order.
387
388 Visually ordered Hebrew is still used quite often in some places,
389 mainly in email communication (since most email programs still
390 don't understand logically ordered Hebrew) and on web pages. The
391 use on web pages is rapidly decreasing, due to the availability of
392 browsers that correctly support logically ordered Hebrew.
393
394 This codec has the name "iso8859-8". If you don't want any bidi
395 reordering to happen during conversion, use the "iso8859-8-i"
396 codec, which assumes logical order for the 8-bit string.
397*/
398
399/*! \reimp */
400int QHebrewCodec::mibEnum() const
401{
402 return 11;
403}
404
405/*! \reimp */
406const char* QHebrewCodec::name() const
407{
408 return "ISO 8859-8";
409}
410
411/*!
412 Returns the codec's mime name.
413*/
414const char* QHebrewCodec::mimeName() const
415{
416 return "ISO-8859-8";
417}
418
419static QString visualOrder(QString logical, QChar::Direction basicDir)
420{
421 logical.replace(QChar('\n'), QChar(0x2028));
422
423 QTextEngine e(logical, 0);
424 e.direction = basicDir;
425 e.itemize();
426 Q_UINT8 l[256];
427 Q_UINT8 *levels = l;
428 int vo[256];
429 int *visualOrder = vo;
430 int nitems = e.items.size();
431 if (nitems > 255) {
432 levels = new Q_UINT8[nitems];
433 visualOrder = new int[nitems];
434 }
435 int i;
436 for (i = 0; i < nitems; ++i) {
437 //qDebug("item %d bidiLevel=%d", i, e.items[i].analysis.bidiLevel);
438 levels[i] = e.items[i].analysis.bidiLevel;
439 }
440 e.bidiReorder(nitems, levels, visualOrder);
441
442 QString visual;
443 for (i = 0; i < nitems; ++i) {
444 QScriptItem &si = e.items[visualOrder[i]];
445 QString sub = logical.mid(si.position, e.length(visualOrder[i]));
446 if (si.analysis.bidiLevel % 2) {
447 // reverse sub
448 QChar *a = (QChar *)sub.unicode();
449 QChar *b = a + sub.length() - 1;
450 while (a < b) {
451 QChar tmp = *a;
452 *a = *b;
453 *b = tmp;
454 ++a;
455 --b;
456 }
457 a = (QChar *)sub.unicode();
458 b = a + sub.length();
459 while (a<b) {
460 *a = a->mirroredChar();
461 ++a;
462 }
463 }
464 visual += sub;
465 }
466 // replace Unicode newline back with \n to compare.
467 visual.replace(QChar(0x2028), QChar('\n'));
468 if (l != levels) {
469 delete [] levels;
470 delete [] visualOrder;
471 }
472 return visual;
473}
474
475/*!
476 \reimp
477
478 Since Hebrew (and Arabic) is written from left to right, but
479 iso8859-8 assumes visual ordering (as opposed to the logical
480 ordering of Unicode), we must reverse the order of the input
481 string (the first \a len characters of \a chars) to put it into
482 logical order.
483
484 One problem is that the basic text direction is unknown. So this
485 function uses some heuristics to guess it, and if it can't guess
486 the right one, it assumes, the basic text direction is right to
487 left.
488
489 This behaviour can be overridden, by putting a control character
490 at the beginning of the text to indicate which basic text
491 direction to use. If the basic text direction is left-to-right,
492 the control character should be (uchar) 0xFE. For right-to-left it
493 should be 0xFF. Both characters are undefined in the iso 8859-8
494 charset.
495
496 Example: A visually ordered string "english WERBEH american" would
497 be recognized as having a basic left to right direction. So the
498 logically ordered QString would be "english HEBREW american".
499
500 By prepending a (uchar)0xFF at the start of the string,
501 QHebrewCodec::toUnicode() would use a basic text direction of
502 right to left, and the string would thus become "american HEBREW
503 english".
504*/
505QString QHebrewCodec::toUnicode(const char* chars, int len ) const
506{
507 QString r;
508 const unsigned char * c = (const unsigned char *)chars;
509 QChar::Direction basicDir = QChar::DirON; // neutral, we don't know
510
511 if( len == 0 ) return QString::null;
512
513 // Test, if the user gives us a directionality.
514 // We use 0xFE and 0xFF in ISO8859-8 for that.
515 // These chars are undefined in the charset, and are mapped to
516 // RTL overwrite
517 if( c[0] == 0xfe ) {
518 basicDir = QChar::DirL;
519 c++; // skip directionality hint
520 }
521 if( c[0] == 0xff ) {
522 basicDir = QChar::DirR;
523 c++; // skip directionality hint
524 }
525
526 for( int i=0; i<len; i++ ) {
527 if ( c[i] > 127 )
528 r[i] = heb_to_unicode[c[i]-128];
529 else
530 r[i] = c[i];
531 }
532
533 // do transformation from visual byte ordering to logical byte
534 // ordering
535 if( basicDir == QChar::DirON )
536 basicDir = findBasicDirection(r);
537
538 return visualOrder(r, basicDir);
539}
540
541/*!
542 Transforms the logically ordered QString, \a uc, into a visually
543 ordered string in the 8859-8 encoding. Qt's bidi algorithm is used
544 to perform this task. Note that newline characters affect the
545 reordering, since reordering is done on a line by line basis.
546
547 The algorithm is designed to work on whole paragraphs of text, so
548 processing a line at a time may produce incorrect results. This
549 approach is taken because the reordering of the contents of a
550 particular line in a paragraph may depend on the previous line in
551 the same paragraph.
552
553 Some encodings (for example Japanese or UTF-8) are multibyte (so
554 one input character is mapped to two output characters). The \a
555 lenInOut argument specifies the number of QChars that should be
556 converted and is set to the number of characters returned.
557*/
558QCString QHebrewCodec::fromUnicode(const QString& uc, int& lenInOut) const
559{
560 // process only len chars...
561 int l;
562 if( lenInOut > 0 )
563 l = QMIN((int)uc.length(),lenInOut);
564 else
565 l = (int)uc.length();
566
567 QCString rstr;
568 if( l == 1 ) {
569 if( !to8bit( uc[0], &rstr ) )
570 rstr += (char)unkn;
571 } else {
572 QString tmp = uc;
573 tmp.truncate(l);
574 QString vis = visualOrder(tmp, QChar::DirON);
575
576 for (int i=0; i<l; i++) {
577 const QChar ch = vis[i];
578
579 if( !to8bit( ch, &rstr ) )
580 rstr += (char)unkn;
581 }
582 // lenInOut = cursor - result;
583 }
584 if( l > 0 && !rstr.length() )
585 rstr += (char)unkn;
586
587 return rstr;
588}
589
590/*! \reimp
591 */
592int QHebrewCodec::heuristicContentMatch(const char* chars, int len) const
593{
594 const unsigned char * c = (const unsigned char *)chars;
595
596 int score = 0;
597 for (int i=0; i<len; i++) {
598 if(c[i] > 0x80 ) {
599 if ( heb_to_unicode[c[i] - 0x80] != 0xFFFD)
600 score++;
601 else
602 return -1;
603 }
604 }
605 return score;
606}
607
608#endif
Note: See TracBrowser for help on using the repository browser.