| 1 | /****************************************************************************
|
|---|
| 2 | ** $Id: qrtlcodec.cpp 2 2005-11-16 15:49:26Z dmik $
|
|---|
| 3 | **
|
|---|
| 4 | ** Implementation of QTextCodec class
|
|---|
| 5 | **
|
|---|
| 6 | ** Created : 981015
|
|---|
| 7 | **
|
|---|
| 8 | ** Copyright (C) 1998-2002 Trolltech AS. All rights reserved.
|
|---|
| 9 | **
|
|---|
| 10 | ** This file is part of the tools module of the Qt GUI Toolkit.
|
|---|
| 11 | **
|
|---|
| 12 | ** This file may be distributed under the terms of the Q Public License
|
|---|
| 13 | ** as defined by Trolltech AS of Norway and appearing in the file
|
|---|
| 14 | ** LICENSE.QPL included in the packaging of this file.
|
|---|
| 15 | **
|
|---|
| 16 | ** This file may be distributed and/or modified under the terms of the
|
|---|
| 17 | ** GNU General Public License version 2 as published by the Free Software
|
|---|
| 18 | ** Foundation and appearing in the file LICENSE.GPL included in the
|
|---|
| 19 | ** packaging of this file.
|
|---|
| 20 | **
|
|---|
| 21 | ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
|
|---|
| 22 | ** licenses may use this file in accordance with the Qt Commercial License
|
|---|
| 23 | ** Agreement provided with the Software.
|
|---|
| 24 | **
|
|---|
| 25 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
|
|---|
| 26 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
|---|
| 27 | **
|
|---|
| 28 | ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
|
|---|
| 29 | ** information about Qt Commercial License Agreements.
|
|---|
| 30 | ** See http://www.trolltech.com/qpl/ for QPL licensing information.
|
|---|
| 31 | ** See http://www.trolltech.com/gpl/ for GPL licensing information.
|
|---|
| 32 | **
|
|---|
| 33 | ** Contact info@trolltech.com if any conditions of this licensing are
|
|---|
| 34 | ** not clear to you.
|
|---|
| 35 | **
|
|---|
| 36 | **********************************************************************/
|
|---|
| 37 |
|
|---|
| 38 | #include "qrtlcodec.h"
|
|---|
| 39 | #include <private/qtextengine_p.h>
|
|---|
| 40 |
|
|---|
| 41 | #ifndef QT_NO_CODEC_HEBREW
|
|---|
| 42 |
|
|---|
| 43 | // NOT REVISED
|
|---|
| 44 |
|
|---|
| 45 | static const uchar unkn = '?'; // BLACK SQUARE (94) would be better
|
|---|
| 46 |
|
|---|
| 47 | static const ushort heb_to_unicode[128] = {
|
|---|
| 48 | 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
|---|
| 49 | 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
|---|
| 50 | 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
|---|
| 51 | 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
|---|
| 52 | 0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
|
|---|
| 53 | 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x203E,
|
|---|
| 54 | 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
|
|---|
| 55 | 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFD,
|
|---|
| 56 | 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
|---|
| 57 | 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
|---|
| 58 | 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
|---|
| 59 | 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x2017,
|
|---|
| 60 | 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
|
|---|
| 61 | 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
|
|---|
| 62 | 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
|
|---|
| 63 | 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
|
|---|
| 64 | };
|
|---|
| 65 |
|
|---|
| 66 | static const uchar unicode_to_heb_00[32] = {
|
|---|
| 67 | 0xA0, unkn, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
|
|---|
| 68 | 0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, unkn,
|
|---|
| 69 | 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
|
|---|
| 70 | 0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, unkn,
|
|---|
| 71 | };
|
|---|
| 72 |
|
|---|
| 73 | static const uchar unicode_to_heb_05[32] = {
|
|---|
| 74 | 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
|---|
| 75 | 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
|---|
| 76 | 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
|
|---|
| 77 | 0xF8, 0xF9, 0xFA, unkn, unkn, unkn, unkn, unkn
|
|---|
| 78 | };
|
|---|
| 79 |
|
|---|
| 80 | static bool to8bit(const QChar ch, QCString *rstr)
|
|---|
| 81 | {
|
|---|
| 82 | bool converted = FALSE;
|
|---|
| 83 |
|
|---|
| 84 | if( ch.isMark() ) return TRUE; // ignore marks for conversion
|
|---|
| 85 |
|
|---|
| 86 | if ( ch.row() ) {
|
|---|
| 87 | if ( ch.row() == 0x05 ) {
|
|---|
| 88 | if ( ch.cell() > 0x91 )
|
|---|
| 89 | converted = TRUE;
|
|---|
| 90 | // 0x0591 - 0x05cf: Hebrew punctuation... dropped
|
|---|
| 91 | if ( ch.cell() >= 0xD0 )
|
|---|
| 92 | *rstr += (char)unicode_to_heb_05[ch.cell()- 0xD0];
|
|---|
| 93 | } else if ( ch.row() == 0x20 ) {
|
|---|
| 94 | if ( ch.cell() == 0x3E ) {
|
|---|
| 95 | *rstr += (char)0xAF;
|
|---|
| 96 | converted = TRUE;
|
|---|
| 97 | } else if ( ch.cell() == 0x17 ) {
|
|---|
| 98 | *rstr += (char)0xCF;
|
|---|
| 99 | converted = TRUE;
|
|---|
| 100 | }
|
|---|
| 101 | } else {
|
|---|
| 102 | converted = FALSE;
|
|---|
| 103 | }
|
|---|
| 104 | } else {
|
|---|
| 105 | if ( ch.cell() < 0x80 ) {
|
|---|
| 106 | *rstr += (char)ch.cell();
|
|---|
| 107 | converted = TRUE;
|
|---|
| 108 | } else if( ch.cell() < 0xA0 ) {
|
|---|
| 109 | *rstr += (char)unicode_to_heb_00[ch.cell() - 0xA0];
|
|---|
| 110 | converted = TRUE;
|
|---|
| 111 | }
|
|---|
| 112 | }
|
|---|
| 113 |
|
|---|
| 114 | if(converted) return TRUE;
|
|---|
| 115 |
|
|---|
| 116 | // couldn't convert the char... lets try its decomposition
|
|---|
| 117 | QString d = ch.decomposition();
|
|---|
| 118 | if(d.isNull())
|
|---|
| 119 | return FALSE;
|
|---|
| 120 |
|
|---|
| 121 | int l = d.length();
|
|---|
| 122 | for (int i=0; i<l; i++) {
|
|---|
| 123 | const QChar ch = d[i];
|
|---|
| 124 |
|
|---|
| 125 | if(to8bit(ch, rstr))
|
|---|
| 126 | converted = TRUE;
|
|---|
| 127 | }
|
|---|
| 128 |
|
|---|
| 129 | return converted;
|
|---|
| 130 | }
|
|---|
| 131 |
|
|---|
| 132 | #if 0
|
|---|
| 133 | static QString run(const QString &input, unsigned int from, unsigned int to, QChar::Direction runDir)
|
|---|
| 134 | {
|
|---|
| 135 | if ( to <= from )
|
|---|
| 136 | return QString::null;
|
|---|
| 137 |
|
|---|
| 138 | QString out;
|
|---|
| 139 | if ( runDir == QChar::DirR ) {
|
|---|
| 140 | const QChar *ch = input.unicode() + to - 1;
|
|---|
| 141 | int len = to - from;
|
|---|
| 142 | while (len--) {
|
|---|
| 143 | out += *ch;
|
|---|
| 144 | ch--;
|
|---|
| 145 | }
|
|---|
| 146 | } else {
|
|---|
| 147 | out = input.mid(from, to - from );
|
|---|
| 148 | }
|
|---|
| 149 | return out;
|
|---|
| 150 | }
|
|---|
| 151 |
|
|---|
| 152 | /*
|
|---|
| 153 | we might do better here, but I'm currently not sure if it's worth the effort. It will hopefully convert
|
|---|
| 154 | 90% of the visually ordered Hebrew correctly.
|
|---|
| 155 | */
|
|---|
| 156 | static QString reverseLine(const QString &str, unsigned int from, unsigned int to, QChar::Direction dir)
|
|---|
| 157 | {
|
|---|
| 158 | QString out;
|
|---|
| 159 |
|
|---|
| 160 | if ( to <= from ) {
|
|---|
| 161 | out += str.at(from);
|
|---|
| 162 | return out;
|
|---|
| 163 | }
|
|---|
| 164 |
|
|---|
| 165 | // since we don't have embedding marks, we get around with bidi levels up to 2.
|
|---|
| 166 |
|
|---|
| 167 | // simple case: dir = RTL:
|
|---|
| 168 | // go through the line from right to left, and reverse all continuous Hebrew strings.
|
|---|
| 169 | if ( dir == QChar::DirR ) {
|
|---|
| 170 | unsigned int pos = to;
|
|---|
| 171 | to = from;
|
|---|
| 172 | from = pos;
|
|---|
| 173 | QChar::Direction runDir = QChar::DirON;
|
|---|
| 174 |
|
|---|
| 175 | while ( pos > to ) {
|
|---|
| 176 | QChar::Direction d = str.at(pos).direction();
|
|---|
| 177 | switch ( d ) {
|
|---|
| 178 | case QChar::DirL:
|
|---|
| 179 | case QChar::DirAN:
|
|---|
| 180 | case QChar::DirEN:
|
|---|
| 181 | if ( runDir != QChar::DirL ) {
|
|---|
| 182 | out += run( str, pos, from, runDir );
|
|---|
| 183 | from = pos - 1;
|
|---|
| 184 | }
|
|---|
| 185 | runDir = QChar::DirL;
|
|---|
| 186 | break;
|
|---|
| 187 | case QChar::DirON:
|
|---|
| 188 | if ( runDir == QChar::DirON ) {
|
|---|
| 189 | runDir = QChar::DirR;
|
|---|
| 190 | break;
|
|---|
| 191 | }
|
|---|
| 192 | // fall through
|
|---|
| 193 | case QChar::DirR:
|
|---|
| 194 | if ( runDir != QChar::DirR ) {
|
|---|
| 195 | out += run( str, pos, from, runDir );
|
|---|
| 196 | from = pos - 1;
|
|---|
| 197 | }
|
|---|
| 198 | runDir = QChar::DirR;
|
|---|
| 199 | default:
|
|---|
| 200 | break;
|
|---|
| 201 | }
|
|---|
| 202 | pos--;
|
|---|
| 203 | }
|
|---|
| 204 | out += run( str, pos, from, runDir );
|
|---|
| 205 | } else {
|
|---|
| 206 | // basicDir == DirL. A bit more complicated, as we might need to reverse two times for numbers.
|
|---|
| 207 | unsigned int pos = from;
|
|---|
| 208 | QChar::Direction runDir = QChar::DirON;
|
|---|
| 209 |
|
|---|
| 210 | // first reversing. Ignore numbers
|
|---|
| 211 | while ( pos < to ) {
|
|---|
| 212 | QChar::Direction d = str.at(pos).direction();
|
|---|
| 213 | switch ( d ) {
|
|---|
| 214 | case QChar::DirL:
|
|---|
| 215 | if ( runDir != QChar::DirL && runDir != QChar::DirON ) {
|
|---|
| 216 | out += run( str, from, pos, runDir );
|
|---|
| 217 | qDebug( "out = %s", out.latin1() );
|
|---|
| 218 | from = pos;
|
|---|
| 219 | }
|
|---|
| 220 | runDir = QChar::DirL;
|
|---|
| 221 | break;
|
|---|
| 222 | case QChar::DirON:
|
|---|
| 223 | if ( runDir == QChar::DirON ) {
|
|---|
| 224 | runDir = QChar::DirL;
|
|---|
| 225 | break;
|
|---|
| 226 | }
|
|---|
| 227 | // fall through
|
|---|
| 228 | case QChar::DirR:
|
|---|
| 229 | case QChar::DirAN:
|
|---|
| 230 | case QChar::DirEN:
|
|---|
| 231 | if ( runDir != QChar::DirR && runDir != QChar::DirON ) {
|
|---|
| 232 | out += run( str, from, pos, runDir );
|
|---|
| 233 | qDebug( "out = %s", out.latin1() );
|
|---|
| 234 | from = pos;
|
|---|
| 235 | }
|
|---|
| 236 | runDir = QChar::DirR;
|
|---|
| 237 | default:
|
|---|
| 238 | break;
|
|---|
| 239 | }
|
|---|
| 240 | pos++;
|
|---|
| 241 | }
|
|---|
| 242 | out += run( str, from, pos, runDir );
|
|---|
| 243 | qDebug( "out = %s", out.latin1() );
|
|---|
| 244 | // second reversing for numbers
|
|---|
| 245 | QString in = out;
|
|---|
| 246 | out = "";
|
|---|
| 247 | pos = 0;
|
|---|
| 248 | from = 0;
|
|---|
| 249 | to = in.length() - 1;
|
|---|
| 250 | runDir = QChar::DirON;
|
|---|
| 251 | while ( pos < to ) {
|
|---|
| 252 | QChar::Direction d = str.at(pos).direction();
|
|---|
| 253 | switch ( d ) {
|
|---|
| 254 | case QChar::DirL:
|
|---|
| 255 | case QChar::DirON:
|
|---|
| 256 | case QChar::DirR:
|
|---|
| 257 | if ( runDir == QChar::DirEN && runDir != QChar::DirON ) {
|
|---|
| 258 | out += run( in, from, pos, QChar::DirR ); //DirR ensures reversing
|
|---|
| 259 | qDebug( "out = %s", out.latin1() );
|
|---|
| 260 | runDir = QChar::DirR;
|
|---|
| 261 | from = pos;
|
|---|
| 262 | }
|
|---|
| 263 | runDir = QChar::DirL;
|
|---|
| 264 | break;
|
|---|
| 265 | case QChar::DirAN:
|
|---|
| 266 | case QChar::DirEN:
|
|---|
| 267 | if ( runDir != QChar::DirEN && runDir != QChar::DirON ) {
|
|---|
| 268 | out += in.mid(from, pos-from+1);
|
|---|
| 269 | qDebug( "out = %s", out.latin1() );
|
|---|
| 270 | from = pos;
|
|---|
| 271 | }
|
|---|
| 272 | runDir = QChar::DirEN;
|
|---|
| 273 | default:
|
|---|
| 274 | break;
|
|---|
| 275 | }
|
|---|
| 276 | pos++;
|
|---|
| 277 | }
|
|---|
| 278 | out += run( str, from, pos, runDir );
|
|---|
| 279 |
|
|---|
| 280 | }
|
|---|
| 281 | return out;
|
|---|
| 282 | }
|
|---|
| 283 | #endif
|
|---|
| 284 |
|
|---|
| 285 | /* this function assuems the QString is still visually ordered.
|
|---|
| 286 | * Finding the basic direction of the text is not easy in this case, since
|
|---|
| 287 | * a string like "my friend MOLAHS" could (in logical order) mean aswell
|
|---|
| 288 | * "SHALOM my friend" or "my friend SHALOM", depending on the basic direction
|
|---|
| 289 | * one assumes for the text.
|
|---|
| 290 | *
|
|---|
| 291 | * So this function uses some heuristics to find the right answer...
|
|---|
| 292 | */
|
|---|
| 293 | static QChar::Direction findBasicDirection(QString str)
|
|---|
| 294 | {
|
|---|
| 295 | unsigned int pos;
|
|---|
| 296 | unsigned int len = str.length();
|
|---|
| 297 | QChar::Direction dir1 = QChar::DirON;
|
|---|
| 298 | QChar::Direction dir2 = QChar::DirON;
|
|---|
| 299 |
|
|---|
| 300 | unsigned int startLine = 0;
|
|---|
| 301 | // If the visual representation of the first line starts and ends with the same
|
|---|
| 302 | // directionality, we know the answer.
|
|---|
| 303 | pos = 0;
|
|---|
| 304 | while (pos < len) {
|
|---|
| 305 | if ( str.at(pos) == '\n' )
|
|---|
| 306 | startLine = pos;
|
|---|
| 307 | if (str.at(pos).direction() < 2) { // DirR or DirL
|
|---|
| 308 | dir1 = str.at(pos).direction();
|
|---|
| 309 | break;
|
|---|
| 310 | }
|
|---|
| 311 | pos++;
|
|---|
| 312 | }
|
|---|
| 313 |
|
|---|
| 314 | if( pos == len ) // no directional chars, assume QChar::DirL
|
|---|
| 315 | return QChar::DirL;
|
|---|
| 316 |
|
|---|
| 317 | // move to end of line
|
|---|
| 318 | while( pos < len && str.at(pos) != '\n' )
|
|---|
| 319 | pos++;
|
|---|
| 320 |
|
|---|
| 321 | while (pos > startLine) {
|
|---|
| 322 | if (str.at(pos).direction() < 2) { // DirR or DirL
|
|---|
| 323 | dir2 = str.at(pos).direction();
|
|---|
| 324 | break;
|
|---|
| 325 | }
|
|---|
| 326 | pos--;
|
|---|
| 327 | }
|
|---|
| 328 |
|
|---|
| 329 | // both are the same, so we have the direction!
|
|---|
| 330 | if ( dir1 == dir2 ) return dir1;
|
|---|
| 331 |
|
|---|
| 332 | // guess with the help of punktuation marks...
|
|---|
| 333 | // if the sentence ends with a punktuation, we should have a mark
|
|---|
| 334 | // at one side of the text...
|
|---|
| 335 |
|
|---|
| 336 | pos = 0;
|
|---|
| 337 | while (pos < len-1 ) {
|
|---|
| 338 | if(str.at(pos).category() == QChar::Punctuation_Other) {
|
|---|
| 339 | if( str.at(pos) != (char)0xbf && str.at(pos) != (char)0xa1 ) // spanish inverted question and exclamation mark
|
|---|
| 340 | if( str.at(pos+1).direction() < 2 ) return QChar::DirR;
|
|---|
| 341 | }
|
|---|
| 342 | pos++;
|
|---|
| 343 | }
|
|---|
| 344 |
|
|---|
| 345 | pos = len;
|
|---|
| 346 | while (pos < 1 && str.at(pos).direction() < 2 ) {
|
|---|
| 347 | if(str.at(pos).category() == QChar::Punctuation_Other) {
|
|---|
| 348 | if( str.at(pos-1).direction() < 2 ) return QChar::DirL;
|
|---|
| 349 | }
|
|---|
| 350 | pos--;
|
|---|
| 351 | }
|
|---|
| 352 |
|
|---|
| 353 | // don't know try DirR...
|
|---|
| 354 | return QChar::DirR;
|
|---|
| 355 | }
|
|---|
| 356 |
|
|---|
| 357 |
|
|---|
| 358 | /*!
|
|---|
| 359 | \class QHebrewCodec qrtlcodec.h
|
|---|
| 360 | \reentrant
|
|---|
| 361 | \ingroup i18n
|
|---|
| 362 |
|
|---|
| 363 | \brief The QHebrewCodec class provides conversion to and from
|
|---|
| 364 | visually ordered Hebrew.
|
|---|
| 365 |
|
|---|
| 366 | Hebrew as a semitic language is written from right to left.
|
|---|
| 367 | Because older computer systems couldn't handle reordering a string
|
|---|
| 368 | so that the first letter appears on the right, many older
|
|---|
| 369 | documents were encoded in visual order, so that the first letter
|
|---|
| 370 | of a line is the rightmost one in the string.
|
|---|
| 371 |
|
|---|
| 372 | In contrast to this, Unicode defines characters to be in logical
|
|---|
| 373 | order (the order you would read the string). This codec tries to
|
|---|
| 374 | convert visually ordered Hebrew (8859-8) to Unicode. This might
|
|---|
| 375 | not always work perfectly, because reversing the \e bidi
|
|---|
| 376 | (bi-directional) algorithm that transforms from logical to visual
|
|---|
| 377 | order is non-trivial.
|
|---|
| 378 |
|
|---|
| 379 | Transformation from Unicode to visual Hebrew (8859-8) is done
|
|---|
| 380 | using the bidi algorithm in Qt, and will produce correct results,
|
|---|
| 381 | so long as the codec is given the text a whole paragraph at a
|
|---|
| 382 | time. Places where newlines are supposed to go can be indicated by
|
|---|
| 383 | a newline character ('\n'). Note that these newline characters
|
|---|
| 384 | change the reordering behaviour of the algorithm, since the bidi
|
|---|
| 385 | reordering only takes place within one line of text, whereas
|
|---|
| 386 | line breaks are determined in visual order.
|
|---|
| 387 |
|
|---|
| 388 | Visually ordered Hebrew is still used quite often in some places,
|
|---|
| 389 | mainly in email communication (since most email programs still
|
|---|
| 390 | don't understand logically ordered Hebrew) and on web pages. The
|
|---|
| 391 | use on web pages is rapidly decreasing, due to the availability of
|
|---|
| 392 | browsers that correctly support logically ordered Hebrew.
|
|---|
| 393 |
|
|---|
| 394 | This codec has the name "iso8859-8". If you don't want any bidi
|
|---|
| 395 | reordering to happen during conversion, use the "iso8859-8-i"
|
|---|
| 396 | codec, which assumes logical order for the 8-bit string.
|
|---|
| 397 | */
|
|---|
| 398 |
|
|---|
| 399 | /*! \reimp */
|
|---|
| 400 | int QHebrewCodec::mibEnum() const
|
|---|
| 401 | {
|
|---|
| 402 | return 11;
|
|---|
| 403 | }
|
|---|
| 404 |
|
|---|
| 405 | /*! \reimp */
|
|---|
| 406 | const char* QHebrewCodec::name() const
|
|---|
| 407 | {
|
|---|
| 408 | return "ISO 8859-8";
|
|---|
| 409 | }
|
|---|
| 410 |
|
|---|
| 411 | /*!
|
|---|
| 412 | Returns the codec's mime name.
|
|---|
| 413 | */
|
|---|
| 414 | const char* QHebrewCodec::mimeName() const
|
|---|
| 415 | {
|
|---|
| 416 | return "ISO-8859-8";
|
|---|
| 417 | }
|
|---|
| 418 |
|
|---|
| 419 | static QString visualOrder(QString logical, QChar::Direction basicDir)
|
|---|
| 420 | {
|
|---|
| 421 | logical.replace(QChar('\n'), QChar(0x2028));
|
|---|
| 422 |
|
|---|
| 423 | QTextEngine e(logical, 0);
|
|---|
| 424 | e.direction = basicDir;
|
|---|
| 425 | e.itemize();
|
|---|
| 426 | Q_UINT8 l[256];
|
|---|
| 427 | Q_UINT8 *levels = l;
|
|---|
| 428 | int vo[256];
|
|---|
| 429 | int *visualOrder = vo;
|
|---|
| 430 | int nitems = e.items.size();
|
|---|
| 431 | if (nitems > 255) {
|
|---|
| 432 | levels = new Q_UINT8[nitems];
|
|---|
| 433 | visualOrder = new int[nitems];
|
|---|
| 434 | }
|
|---|
| 435 | int i;
|
|---|
| 436 | for (i = 0; i < nitems; ++i) {
|
|---|
| 437 | //qDebug("item %d bidiLevel=%d", i, e.items[i].analysis.bidiLevel);
|
|---|
| 438 | levels[i] = e.items[i].analysis.bidiLevel;
|
|---|
| 439 | }
|
|---|
| 440 | e.bidiReorder(nitems, levels, visualOrder);
|
|---|
| 441 |
|
|---|
| 442 | QString visual;
|
|---|
| 443 | for (i = 0; i < nitems; ++i) {
|
|---|
| 444 | QScriptItem &si = e.items[visualOrder[i]];
|
|---|
| 445 | QString sub = logical.mid(si.position, e.length(visualOrder[i]));
|
|---|
| 446 | if (si.analysis.bidiLevel % 2) {
|
|---|
| 447 | // reverse sub
|
|---|
| 448 | QChar *a = (QChar *)sub.unicode();
|
|---|
| 449 | QChar *b = a + sub.length() - 1;
|
|---|
| 450 | while (a < b) {
|
|---|
| 451 | QChar tmp = *a;
|
|---|
| 452 | *a = *b;
|
|---|
| 453 | *b = tmp;
|
|---|
| 454 | ++a;
|
|---|
| 455 | --b;
|
|---|
| 456 | }
|
|---|
| 457 | a = (QChar *)sub.unicode();
|
|---|
| 458 | b = a + sub.length();
|
|---|
| 459 | while (a<b) {
|
|---|
| 460 | *a = a->mirroredChar();
|
|---|
| 461 | ++a;
|
|---|
| 462 | }
|
|---|
| 463 | }
|
|---|
| 464 | visual += sub;
|
|---|
| 465 | }
|
|---|
| 466 | // replace Unicode newline back with \n to compare.
|
|---|
| 467 | visual.replace(QChar(0x2028), QChar('\n'));
|
|---|
| 468 | if (l != levels) {
|
|---|
| 469 | delete [] levels;
|
|---|
| 470 | delete [] visualOrder;
|
|---|
| 471 | }
|
|---|
| 472 | return visual;
|
|---|
| 473 | }
|
|---|
| 474 |
|
|---|
| 475 | /*!
|
|---|
| 476 | \reimp
|
|---|
| 477 |
|
|---|
| 478 | Since Hebrew (and Arabic) is written from left to right, but
|
|---|
| 479 | iso8859-8 assumes visual ordering (as opposed to the logical
|
|---|
| 480 | ordering of Unicode), we must reverse the order of the input
|
|---|
| 481 | string (the first \a len characters of \a chars) to put it into
|
|---|
| 482 | logical order.
|
|---|
| 483 |
|
|---|
| 484 | One problem is that the basic text direction is unknown. So this
|
|---|
| 485 | function uses some heuristics to guess it, and if it can't guess
|
|---|
| 486 | the right one, it assumes, the basic text direction is right to
|
|---|
| 487 | left.
|
|---|
| 488 |
|
|---|
| 489 | This behaviour can be overridden, by putting a control character
|
|---|
| 490 | at the beginning of the text to indicate which basic text
|
|---|
| 491 | direction to use. If the basic text direction is left-to-right,
|
|---|
| 492 | the control character should be (uchar) 0xFE. For right-to-left it
|
|---|
| 493 | should be 0xFF. Both characters are undefined in the iso 8859-8
|
|---|
| 494 | charset.
|
|---|
| 495 |
|
|---|
| 496 | Example: A visually ordered string "english WERBEH american" would
|
|---|
| 497 | be recognized as having a basic left to right direction. So the
|
|---|
| 498 | logically ordered QString would be "english HEBREW american".
|
|---|
| 499 |
|
|---|
| 500 | By prepending a (uchar)0xFF at the start of the string,
|
|---|
| 501 | QHebrewCodec::toUnicode() would use a basic text direction of
|
|---|
| 502 | right to left, and the string would thus become "american HEBREW
|
|---|
| 503 | english".
|
|---|
| 504 | */
|
|---|
| 505 | QString QHebrewCodec::toUnicode(const char* chars, int len ) const
|
|---|
| 506 | {
|
|---|
| 507 | QString r;
|
|---|
| 508 | const unsigned char * c = (const unsigned char *)chars;
|
|---|
| 509 | QChar::Direction basicDir = QChar::DirON; // neutral, we don't know
|
|---|
| 510 |
|
|---|
| 511 | if( len == 0 ) return QString::null;
|
|---|
| 512 |
|
|---|
| 513 | // Test, if the user gives us a directionality.
|
|---|
| 514 | // We use 0xFE and 0xFF in ISO8859-8 for that.
|
|---|
| 515 | // These chars are undefined in the charset, and are mapped to
|
|---|
| 516 | // RTL overwrite
|
|---|
| 517 | if( c[0] == 0xfe ) {
|
|---|
| 518 | basicDir = QChar::DirL;
|
|---|
| 519 | c++; // skip directionality hint
|
|---|
| 520 | }
|
|---|
| 521 | if( c[0] == 0xff ) {
|
|---|
| 522 | basicDir = QChar::DirR;
|
|---|
| 523 | c++; // skip directionality hint
|
|---|
| 524 | }
|
|---|
| 525 |
|
|---|
| 526 | for( int i=0; i<len; i++ ) {
|
|---|
| 527 | if ( c[i] > 127 )
|
|---|
| 528 | r[i] = heb_to_unicode[c[i]-128];
|
|---|
| 529 | else
|
|---|
| 530 | r[i] = c[i];
|
|---|
| 531 | }
|
|---|
| 532 |
|
|---|
| 533 | // do transformation from visual byte ordering to logical byte
|
|---|
| 534 | // ordering
|
|---|
| 535 | if( basicDir == QChar::DirON )
|
|---|
| 536 | basicDir = findBasicDirection(r);
|
|---|
| 537 |
|
|---|
| 538 | return visualOrder(r, basicDir);
|
|---|
| 539 | }
|
|---|
| 540 |
|
|---|
| 541 | /*!
|
|---|
| 542 | Transforms the logically ordered QString, \a uc, into a visually
|
|---|
| 543 | ordered string in the 8859-8 encoding. Qt's bidi algorithm is used
|
|---|
| 544 | to perform this task. Note that newline characters affect the
|
|---|
| 545 | reordering, since reordering is done on a line by line basis.
|
|---|
| 546 |
|
|---|
| 547 | The algorithm is designed to work on whole paragraphs of text, so
|
|---|
| 548 | processing a line at a time may produce incorrect results. This
|
|---|
| 549 | approach is taken because the reordering of the contents of a
|
|---|
| 550 | particular line in a paragraph may depend on the previous line in
|
|---|
| 551 | the same paragraph.
|
|---|
| 552 |
|
|---|
| 553 | Some encodings (for example Japanese or UTF-8) are multibyte (so
|
|---|
| 554 | one input character is mapped to two output characters). The \a
|
|---|
| 555 | lenInOut argument specifies the number of QChars that should be
|
|---|
| 556 | converted and is set to the number of characters returned.
|
|---|
| 557 | */
|
|---|
| 558 | QCString QHebrewCodec::fromUnicode(const QString& uc, int& lenInOut) const
|
|---|
| 559 | {
|
|---|
| 560 | // process only len chars...
|
|---|
| 561 | int l;
|
|---|
| 562 | if( lenInOut > 0 )
|
|---|
| 563 | l = QMIN((int)uc.length(),lenInOut);
|
|---|
| 564 | else
|
|---|
| 565 | l = (int)uc.length();
|
|---|
| 566 |
|
|---|
| 567 | QCString rstr;
|
|---|
| 568 | if( l == 1 ) {
|
|---|
| 569 | if( !to8bit( uc[0], &rstr ) )
|
|---|
| 570 | rstr += (char)unkn;
|
|---|
| 571 | } else {
|
|---|
| 572 | QString tmp = uc;
|
|---|
| 573 | tmp.truncate(l);
|
|---|
| 574 | QString vis = visualOrder(tmp, QChar::DirON);
|
|---|
| 575 |
|
|---|
| 576 | for (int i=0; i<l; i++) {
|
|---|
| 577 | const QChar ch = vis[i];
|
|---|
| 578 |
|
|---|
| 579 | if( !to8bit( ch, &rstr ) )
|
|---|
| 580 | rstr += (char)unkn;
|
|---|
| 581 | }
|
|---|
| 582 | // lenInOut = cursor - result;
|
|---|
| 583 | }
|
|---|
| 584 | if( l > 0 && !rstr.length() )
|
|---|
| 585 | rstr += (char)unkn;
|
|---|
| 586 |
|
|---|
| 587 | return rstr;
|
|---|
| 588 | }
|
|---|
| 589 |
|
|---|
| 590 | /*! \reimp
|
|---|
| 591 | */
|
|---|
| 592 | int QHebrewCodec::heuristicContentMatch(const char* chars, int len) const
|
|---|
| 593 | {
|
|---|
| 594 | const unsigned char * c = (const unsigned char *)chars;
|
|---|
| 595 |
|
|---|
| 596 | int score = 0;
|
|---|
| 597 | for (int i=0; i<len; i++) {
|
|---|
| 598 | if(c[i] > 0x80 ) {
|
|---|
| 599 | if ( heb_to_unicode[c[i] - 0x80] != 0xFFFD)
|
|---|
| 600 | score++;
|
|---|
| 601 | else
|
|---|
| 602 | return -1;
|
|---|
| 603 | }
|
|---|
| 604 | }
|
|---|
| 605 | return score;
|
|---|
| 606 | }
|
|---|
| 607 |
|
|---|
| 608 | #endif
|
|---|