Context Navigation

qrtlcodec.cpp

Last change on this file was 2, checked in by dmik, 20 years ago
Imported xplatform parts of the official release 3.3.1 from Trolltech
Property svn:keywords set to `Id`
File size: 18.0 KB

Line
1	/****************************************************************************
2	** $Id: qrtlcodec.cpp 2 2005-11-16 15:49:26Z dmik $
3	**
4	** Implementation of QTextCodec class
5	**
6	** Created : 981015
7	**
8	** Copyright (C) 1998-2002 Trolltech AS. All rights reserved.
9	**
10	** This file is part of the tools module of the Qt GUI Toolkit.
11	**
12	** This file may be distributed under the terms of the Q Public License
13	** as defined by Trolltech AS of Norway and appearing in the file
14	** LICENSE.QPL included in the packaging of this file.
15	**
16	** This file may be distributed and/or modified under the terms of the
17	** GNU General Public License version 2 as published by the Free Software
18	** Foundation and appearing in the file LICENSE.GPL included in the
19	** packaging of this file.
20	**
21	** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22	** licenses may use this file in accordance with the Qt Commercial License
23	** Agreement provided with the Software.
24	**
25	** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26	** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27	**
28	** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29	** information about Qt Commercial License Agreements.
30	** See http://www.trolltech.com/qpl/ for QPL licensing information.
31	** See http://www.trolltech.com/gpl/ for GPL licensing information.
32	**
33	** Contact info@trolltech.com if any conditions of this licensing are
34	** not clear to you.
35	**
36	**********************************************************************/
37
38	#include "qrtlcodec.h"
39	#include <private/qtextengine_p.h>
40
41	#ifndef QT_NO_CODEC_HEBREW
42
43	// NOT REVISED
44
45	static const uchar unkn = '?'; // BLACK SQUARE (94) would be better
46
47	static const ushort heb_to_unicode[128] = {
48	0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
49	0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
50	0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
51	0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
52	0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
53	0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x203E,
54	0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
55	0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFD,
56	0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
57	0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
58	0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
59	0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x2017,
60	0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
61	0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
62	0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
63	0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
64	};
65
66	static const uchar unicode_to_heb_00[32] = {
67	0xA0, unkn, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
68	0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, unkn,
69	0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
70	0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, unkn,
71	};
72
73	static const uchar unicode_to_heb_05[32] = {
74	0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
75	0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
76	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
77	0xF8, 0xF9, 0xFA, unkn, unkn, unkn, unkn, unkn
78	};
79
80	static bool to8bit(const QChar ch, QCString *rstr)
81	{
82	bool converted = FALSE;
83
84	if( ch.isMark() ) return TRUE; // ignore marks for conversion
85
86	if ( ch.row() ) {
87	if ( ch.row() == 0x05 ) {
88	if ( ch.cell() > 0x91 )
89	converted = TRUE;
90	// 0x0591 - 0x05cf: Hebrew punctuation... dropped
91	if ( ch.cell() >= 0xD0 )
92	*rstr += (char)unicode_to_heb_05[ch.cell()- 0xD0];
93	} else if ( ch.row() == 0x20 ) {
94	if ( ch.cell() == 0x3E ) {
95	*rstr += (char)0xAF;
96	converted = TRUE;
97	} else if ( ch.cell() == 0x17 ) {
98	*rstr += (char)0xCF;
99	converted = TRUE;
100	}
101	} else {
102	converted = FALSE;
103	}
104	} else {
105	if ( ch.cell() < 0x80 ) {
106	*rstr += (char)ch.cell();
107	converted = TRUE;
108	} else if( ch.cell() < 0xA0 ) {
109	*rstr += (char)unicode_to_heb_00[ch.cell() - 0xA0];
110	converted = TRUE;
111	}
112	}
113
114	if(converted) return TRUE;
115
116	// couldn't convert the char... lets try its decomposition
117	QString d = ch.decomposition();
118	if(d.isNull())
119	return FALSE;
120
121	int l = d.length();
122	for (int i=0; i<l; i++) {
123	const QChar ch = d[i];
124
125	if(to8bit(ch, rstr))
126	converted = TRUE;
127	}
128
129	return converted;
130	}
131
132	#if 0
133	static QString run(const QString &input, unsigned int from, unsigned int to, QChar::Direction runDir)
134	{
135	if ( to <= from )
136	return QString::null;
137
138	QString out;
139	if ( runDir == QChar::DirR ) {
140	const QChar *ch = input.unicode() + to - 1;
141	int len = to - from;
142	while (len--) {
143	out += *ch;
144	ch--;
145	}
146	} else {
147	out = input.mid(from, to - from );
148	}
149	return out;
150	}
151
152	/*
153	we might do better here, but I'm currently not sure if it's worth the effort. It will hopefully convert
154	90% of the visually ordered Hebrew correctly.
155	*/
156	static QString reverseLine(const QString &str, unsigned int from, unsigned int to, QChar::Direction dir)
157	{
158	QString out;
159
160	if ( to <= from ) {
161	out += str.at(from);
162	return out;
163	}
164
165	// since we don't have embedding marks, we get around with bidi levels up to 2.
166
167	// simple case: dir = RTL:
168	// go through the line from right to left, and reverse all continuous Hebrew strings.
169	if ( dir == QChar::DirR ) {
170	unsigned int pos = to;
171	to = from;
172	from = pos;
173	QChar::Direction runDir = QChar::DirON;
174
175	while ( pos > to ) {
176	QChar::Direction d = str.at(pos).direction();
177	switch ( d ) {
178	case QChar::DirL:
179	case QChar::DirAN:
180	case QChar::DirEN:
181	if ( runDir != QChar::DirL ) {
182	out += run( str, pos, from, runDir );
183	from = pos - 1;
184	}
185	runDir = QChar::DirL;
186	break;
187	case QChar::DirON:
188	if ( runDir == QChar::DirON ) {
189	runDir = QChar::DirR;
190	break;
191	}
192	// fall through
193	case QChar::DirR:
194	if ( runDir != QChar::DirR ) {
195	out += run( str, pos, from, runDir );
196	from = pos - 1;
197	}
198	runDir = QChar::DirR;
199	default:
200	break;
201	}
202	pos--;
203	}
204	out += run( str, pos, from, runDir );
205	} else {
206	// basicDir == DirL. A bit more complicated, as we might need to reverse two times for numbers.
207	unsigned int pos = from;
208	QChar::Direction runDir = QChar::DirON;
209
210	// first reversing. Ignore numbers
211	while ( pos < to ) {
212	QChar::Direction d = str.at(pos).direction();
213	switch ( d ) {
214	case QChar::DirL:
215	if ( runDir != QChar::DirL && runDir != QChar::DirON ) {
216	out += run( str, from, pos, runDir );
217	qDebug( "out = %s", out.latin1() );
218	from = pos;
219	}
220	runDir = QChar::DirL;
221	break;
222	case QChar::DirON:
223	if ( runDir == QChar::DirON ) {
224	runDir = QChar::DirL;
225	break;
226	}
227	// fall through
228	case QChar::DirR:
229	case QChar::DirAN:
230	case QChar::DirEN:
231	if ( runDir != QChar::DirR && runDir != QChar::DirON ) {
232	out += run( str, from, pos, runDir );
233	qDebug( "out = %s", out.latin1() );
234	from = pos;
235	}
236	runDir = QChar::DirR;
237	default:
238	break;
239	}
240	pos++;
241	}
242	out += run( str, from, pos, runDir );
243	qDebug( "out = %s", out.latin1() );
244	// second reversing for numbers
245	QString in = out;
246	out = "";
247	pos = 0;
248	from = 0;
249	to = in.length() - 1;
250	runDir = QChar::DirON;
251	while ( pos < to ) {
252	QChar::Direction d = str.at(pos).direction();
253	switch ( d ) {
254	case QChar::DirL:
255	case QChar::DirON:
256	case QChar::DirR:
257	if ( runDir == QChar::DirEN && runDir != QChar::DirON ) {
258	out += run( in, from, pos, QChar::DirR ); //DirR ensures reversing
259	qDebug( "out = %s", out.latin1() );
260	runDir = QChar::DirR;
261	from = pos;
262	}
263	runDir = QChar::DirL;
264	break;
265	case QChar::DirAN:
266	case QChar::DirEN:
267	if ( runDir != QChar::DirEN && runDir != QChar::DirON ) {
268	out += in.mid(from, pos-from+1);
269	qDebug( "out = %s", out.latin1() );
270	from = pos;
271	}
272	runDir = QChar::DirEN;
273	default:
274	break;
275	}
276	pos++;
277	}
278	out += run( str, from, pos, runDir );
279
280	}
281	return out;
282	}
283	#endif
284
285	/* this function assuems the QString is still visually ordered.
286	* Finding the basic direction of the text is not easy in this case, since
287	* a string like "my friend MOLAHS" could (in logical order) mean aswell
288	* "SHALOM my friend" or "my friend SHALOM", depending on the basic direction
289	* one assumes for the text.
290	*
291	* So this function uses some heuristics to find the right answer...
292	*/
293	static QChar::Direction findBasicDirection(QString str)
294	{
295	unsigned int pos;
296	unsigned int len = str.length();
297	QChar::Direction dir1 = QChar::DirON;
298	QChar::Direction dir2 = QChar::DirON;
299
300	unsigned int startLine = 0;
301	// If the visual representation of the first line starts and ends with the same
302	// directionality, we know the answer.
303	pos = 0;
304	while (pos < len) {
305	if ( str.at(pos) == '\n' )
306	startLine = pos;
307	if (str.at(pos).direction() < 2) { // DirR or DirL
308	dir1 = str.at(pos).direction();
309	break;
310	}
311	pos++;
312	}
313
314	if( pos == len ) // no directional chars, assume QChar::DirL
315	return QChar::DirL;
316
317	// move to end of line
318	while( pos < len && str.at(pos) != '\n' )
319	pos++;
320
321	while (pos > startLine) {
322	if (str.at(pos).direction() < 2) { // DirR or DirL
323	dir2 = str.at(pos).direction();
324	break;
325	}
326	pos--;
327	}
328
329	// both are the same, so we have the direction!
330	if ( dir1 == dir2 ) return dir1;
331
332	// guess with the help of punktuation marks...
333	// if the sentence ends with a punktuation, we should have a mark
334	// at one side of the text...
335
336	pos = 0;
337	while (pos < len-1 ) {
338	if(str.at(pos).category() == QChar::Punctuation_Other) {
339	if( str.at(pos) != (char)0xbf && str.at(pos) != (char)0xa1 ) // spanish inverted question and exclamation mark
340	if( str.at(pos+1).direction() < 2 ) return QChar::DirR;
341	}
342	pos++;
343	}
344
345	pos = len;
346	while (pos < 1 && str.at(pos).direction() < 2 ) {
347	if(str.at(pos).category() == QChar::Punctuation_Other) {
348	if( str.at(pos-1).direction() < 2 ) return QChar::DirL;
349	}
350	pos--;
351	}
352
353	// don't know try DirR...
354	return QChar::DirR;
355	}
356
357
358	/*!
359	\class QHebrewCodec qrtlcodec.h
360	\reentrant
361	\ingroup i18n
362
363	\brief The QHebrewCodec class provides conversion to and from
364	visually ordered Hebrew.
365
366	Hebrew as a semitic language is written from right to left.
367	Because older computer systems couldn't handle reordering a string
368	so that the first letter appears on the right, many older
369	documents were encoded in visual order, so that the first letter
370	of a line is the rightmost one in the string.
371
372	In contrast to this, Unicode defines characters to be in logical
373	order (the order you would read the string). This codec tries to
374	convert visually ordered Hebrew (8859-8) to Unicode. This might
375	not always work perfectly, because reversing the \e bidi
376	(bi-directional) algorithm that transforms from logical to visual
377	order is non-trivial.
378
379	Transformation from Unicode to visual Hebrew (8859-8) is done
380	using the bidi algorithm in Qt, and will produce correct results,
381	so long as the codec is given the text a whole paragraph at a
382	time. Places where newlines are supposed to go can be indicated by
383	a newline character ('\n'). Note that these newline characters
384	change the reordering behaviour of the algorithm, since the bidi
385	reordering only takes place within one line of text, whereas
386	line breaks are determined in visual order.
387
388	Visually ordered Hebrew is still used quite often in some places,
389	mainly in email communication (since most email programs still
390	don't understand logically ordered Hebrew) and on web pages. The
391	use on web pages is rapidly decreasing, due to the availability of
392	browsers that correctly support logically ordered Hebrew.
393
394	This codec has the name "iso8859-8". If you don't want any bidi
395	reordering to happen during conversion, use the "iso8859-8-i"
396	codec, which assumes logical order for the 8-bit string.
397	*/
398
399	/! \reimp /
400	int QHebrewCodec::mibEnum() const
401	{
402	return 11;
403	}
404
405	/! \reimp /
406	const char* QHebrewCodec::name() const
407	{
408	return "ISO 8859-8";
409	}
410
411	/*!
412	Returns the codec's mime name.
413	*/
414	const char* QHebrewCodec::mimeName() const
415	{
416	return "ISO-8859-8";
417	}
418
419	static QString visualOrder(QString logical, QChar::Direction basicDir)
420	{
421	logical.replace(QChar('\n'), QChar(0x2028));
422
423	QTextEngine e(logical, 0);
424	e.direction = basicDir;
425	e.itemize();
426	Q_UINT8 l[256];
427	Q_UINT8 *levels = l;
428	int vo[256];
429	int *visualOrder = vo;
430	int nitems = e.items.size();
431	if (nitems > 255) {
432	levels = new Q_UINT8[nitems];
433	visualOrder = new int[nitems];
434	}
435	int i;
436	for (i = 0; i < nitems; ++i) {
437	//qDebug("item %d bidiLevel=%d", i, e.items[i].analysis.bidiLevel);
438	levels[i] = e.items[i].analysis.bidiLevel;
439	}
440	e.bidiReorder(nitems, levels, visualOrder);
441
442	QString visual;
443	for (i = 0; i < nitems; ++i) {
444	QScriptItem &si = e.items[visualOrder[i]];
445	QString sub = logical.mid(si.position, e.length(visualOrder[i]));
446	if (si.analysis.bidiLevel % 2) {
447	// reverse sub
448	QChar a = (QChar )sub.unicode();
449	QChar *b = a + sub.length() - 1;
450	while (a < b) {
451	QChar tmp = *a;
452	a = b;
453	*b = tmp;
454	++a;
455	--b;
456	}
457	a = (QChar *)sub.unicode();
458	b = a + sub.length();
459	while (a<b) {
460	*a = a->mirroredChar();
461	++a;
462	}
463	}
464	visual += sub;
465	}
466	// replace Unicode newline back with \n to compare.
467	visual.replace(QChar(0x2028), QChar('\n'));
468	if (l != levels) {
469	delete [] levels;
470	delete [] visualOrder;
471	}
472	return visual;
473	}
474
475	/*!
476	\reimp
477
478	Since Hebrew (and Arabic) is written from left to right, but
479	iso8859-8 assumes visual ordering (as opposed to the logical
480	ordering of Unicode), we must reverse the order of the input
481	string (the first \a len characters of \a chars) to put it into
482	logical order.
483
484	One problem is that the basic text direction is unknown. So this
485	function uses some heuristics to guess it, and if it can't guess
486	the right one, it assumes, the basic text direction is right to
487	left.
488
489	This behaviour can be overridden, by putting a control character
490	at the beginning of the text to indicate which basic text
491	direction to use. If the basic text direction is left-to-right,
492	the control character should be (uchar) 0xFE. For right-to-left it
493	should be 0xFF. Both characters are undefined in the iso 8859-8
494	charset.
495
496	Example: A visually ordered string "english WERBEH american" would
497	be recognized as having a basic left to right direction. So the
498	logically ordered QString would be "english HEBREW american".
499
500	By prepending a (uchar)0xFF at the start of the string,
501	QHebrewCodec::toUnicode() would use a basic text direction of
502	right to left, and the string would thus become "american HEBREW
503	english".
504	*/
505	QString QHebrewCodec::toUnicode(const char* chars, int len ) const
506	{
507	QString r;
508	const unsigned char * c = (const unsigned char *)chars;
509	QChar::Direction basicDir = QChar::DirON; // neutral, we don't know
510
511	if( len == 0 ) return QString::null;
512
513	// Test, if the user gives us a directionality.
514	// We use 0xFE and 0xFF in ISO8859-8 for that.
515	// These chars are undefined in the charset, and are mapped to
516	// RTL overwrite
517	if( c[0] == 0xfe ) {
518	basicDir = QChar::DirL;
519	c++; // skip directionality hint
520	}
521	if( c[0] == 0xff ) {
522	basicDir = QChar::DirR;
523	c++; // skip directionality hint
524	}
525
526	for( int i=0; i<len; i++ ) {
527	if ( c[i] > 127 )
528	r[i] = heb_to_unicode[c[i]-128];
529	else
530	r[i] = c[i];
531	}
532
533	// do transformation from visual byte ordering to logical byte
534	// ordering
535	if( basicDir == QChar::DirON )
536	basicDir = findBasicDirection(r);
537
538	return visualOrder(r, basicDir);
539	}
540
541	/*!
542	Transforms the logically ordered QString, \a uc, into a visually
543	ordered string in the 8859-8 encoding. Qt's bidi algorithm is used
544	to perform this task. Note that newline characters affect the
545	reordering, since reordering is done on a line by line basis.
546
547	The algorithm is designed to work on whole paragraphs of text, so
548	processing a line at a time may produce incorrect results. This
549	approach is taken because the reordering of the contents of a
550	particular line in a paragraph may depend on the previous line in
551	the same paragraph.
552
553	Some encodings (for example Japanese or UTF-8) are multibyte (so
554	one input character is mapped to two output characters). The \a
555	lenInOut argument specifies the number of QChars that should be
556	converted and is set to the number of characters returned.
557	*/
558	QCString QHebrewCodec::fromUnicode(const QString& uc, int& lenInOut) const
559	{
560	// process only len chars...
561	int l;
562	if( lenInOut > 0 )
563	l = QMIN((int)uc.length(),lenInOut);
564	else
565	l = (int)uc.length();
566
567	QCString rstr;
568	if( l == 1 ) {
569	if( !to8bit( uc[0], &rstr ) )
570	rstr += (char)unkn;
571	} else {
572	QString tmp = uc;
573	tmp.truncate(l);
574	QString vis = visualOrder(tmp, QChar::DirON);
575
576	for (int i=0; i<l; i++) {
577	const QChar ch = vis[i];
578
579	if( !to8bit( ch, &rstr ) )
580	rstr += (char)unkn;
581	}
582	// lenInOut = cursor - result;
583	}
584	if( l > 0 && !rstr.length() )
585	rstr += (char)unkn;
586
587	return rstr;
588	}
589
590	/*! \reimp
591	*/
592	int QHebrewCodec::heuristicContentMatch(const char* chars, int len) const
593	{
594	const unsigned char * c = (const unsigned char *)chars;
595
596	int score = 0;
597	for (int i=0; i<len; i++) {
598	if(c[i] > 0x80 ) {
599	if ( heb_to_unicode[c[i] - 0x80] != 0xFFFD)
600	score++;
601	else
602	return -1;
603	}
604	}
605	return score;
606	}
607
608	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/trolltech/current/src/codecs/qrtlcodec.cpp

Download in other formats: