source: trunk/src/codecs/qjiscodec.cpp@ 94

Last change on this file since 94 was 2, checked in by dmik, 20 years ago

Imported xplatform parts of the official release 3.3.1 from Trolltech

  • Property svn:keywords set to Id
File size: 17.6 KB
Line 
1/****************************************************************************
2** $Id: qjiscodec.cpp 2 2005-11-16 15:49:26Z dmik $
3**
4** Implementation of QJisCodec class
5**
6** Created : 990225
7**
8** Copyright (C) 2000-2002 Trolltech AS. All rights reserved.
9**
10** This file is part of the tools module of the Qt GUI Toolkit.
11**
12** This file may be distributed under the terms of the Q Public License
13** as defined by Trolltech AS of Norway and appearing in the file
14** LICENSE.QPL included in the packaging of this file.
15**
16** This file may be distributed and/or modified under the terms of the
17** GNU General Public License version 2 as published by the Free Software
18** Foundation and appearing in the file LICENSE.GPL included in the
19** packaging of this file.
20**
21** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22** licenses may use this file in accordance with the Qt Commercial License
23** Agreement provided with the Software.
24**
25** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27**
28** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29** information about Qt Commercial License Agreements.
30** See http://www.trolltech.com/qpl/ for QPL licensing information.
31** See http://www.trolltech.com/gpl/ for GPL licensing information.
32**
33** Contact info@trolltech.com if any conditions of this licensing are
34** not clear to you.
35**
36**********************************************************************/
37
38// Most of the code here was originally written by Serika Kurusugawa,
39// a.k.a. Junji Takagi, and is included in Qt with the author's permission
40// and the grateful thanks of the Trolltech team.
41
42/*! \class QJisCodec
43 \reentrant
44 \ingroup i18n
45
46 \brief The QJisCodec class provides conversion to and from JIS character sets.
47
48 More precisely, the QJisCodec class subclasses QTextCodec to
49 provide support for JIS X 0201 Latin, JIS X 0201 Kana, JIS X 0208
50 and JIS X 0212.
51
52 The environment variable UNICODEMAP_JP can be used to fine-tune
53 QJisCodec, QSjisCodec and QEucJpCodec. The mapping names are as for
54 the Japanese XML working group's \link
55 http://www.y-adagio.com/public/standards/tr_xml_jpf/toc.htm XML
56 Japanese Profile\endlink, because it names and explains all the
57 widely used mappings. Here are brief descriptions, written by
58 Serika Kurusugawa:
59
60 \list
61
62 \i "unicode-0.9" or "unicode-0201" for Unicode style. This assumes
63 JISX0201 for 0x00-0x7f. (0.9 is a table version of jisx02xx mapping
64 used for Unicode spec version 1.1.)
65
66 \i "unicode-ascii" This assumes US-ASCII for 0x00-0x7f; some
67 chars (JISX0208 0x2140 and JISX0212 0x2237) are different from
68 Unicode 1.1 to avoid conflict.
69
70 \i "open-19970715-0201" ("open-0201" for convenience) or
71 "jisx0221-1995" for JISX0221-JISX0201 style. JIS X 0221 is JIS
72 version of Unicode, but a few chars (0x5c, 0x7e, 0x2140, 0x216f,
73 0x2131) are different from Unicode 1.1. This is used when 0x5c is
74 treated as YEN SIGN.
75
76 \i "open-19970715-ascii" ("open-ascii" for convenience) for
77 JISX0221-ASCII style. This is used when 0x5c is treated as REVERSE
78 SOLIDUS.
79
80 \i "open-19970715-ms" ("open-ms" for convenience) or "cp932" for
81 Microsoft Windows style. Windows Code Page 932. Some chars (0x2140,
82 0x2141, 0x2142, 0x215d, 0x2171, 0x2172) are different from Unicode
83 1.1.
84
85 \i "jdk1.1.7" for Sun's JDK style. Same as Unicode 1.1, except that
86 JIS 0x2140 is mapped to UFF3C. Either ASCII or JISX0201 can be used
87 for 0x00-0x7f.
88
89 \endlist
90
91 In addition, the extensions "nec-vdc", "ibm-vdc" and "udc" are
92 supported.
93
94 For example, if you want to use Unicode style conversion but with
95 NEC's extension, set \c UNICODEMAP_JP to
96 <nobr>\c {unicode-0.9, nec-vdc}.</nobr> (You will probably
97 need to quote that in a shell command.)
98
99 Most of the code here was written by Serika Kurusugawa,
100 a.k.a. Junji Takagi, and is included in Qt with the author's
101 permission and the grateful thanks of the Trolltech team. Here is
102 the copyright statement for that code:
103
104 \legalese
105
106 Copyright (C) 1999 Serika Kurusugawa. All rights reserved.
107
108 Redistribution and use in source and binary forms, with or without
109 modification, are permitted provided that the following conditions
110 are met:
111 \list 1
112 \i Redistributions of source code must retain the above copyright
113 notice, this list of conditions and the following disclaimer.
114 \i Redistributions in binary form must reproduce the above copyright
115 notice, this list of conditions and the following disclaimer in the
116 documentation and/or other materials provided with the distribution.
117 \endlist
118
119 THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS".
120 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
121 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
122 ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
123 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
124 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
125 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
126 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
127 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
128 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
129 SUCH DAMAGE.
130*/
131
132#include "qjiscodec.h"
133
134#ifndef QT_NO_BIG_CODECS
135
136static const uchar Esc = 0x1b;
137static const uchar So = 0x0e; // Shift Out
138static const uchar Si = 0x0f; // Shift In
139
140static const uchar ReverseSolidus = 0x5c;
141static const uchar YenSign = 0x5c;
142static const uchar Tilde = 0x7e;
143static const uchar Overline = 0x7e;
144
145#define IsKana(c) (((c) >= 0xa1) && ((c) <= 0xdf))
146#define IsJisChar(c) (((c) >= 0x21) && ((c) <= 0x7e))
147
148#define QValidChar(u) ((u) ? QChar((ushort)(u)) : QChar::replacement)
149
// Character set currently designated in an ISO 2022 stream.
// MinState..MaxState are the states that have an entry in Esc_SEQ;
// UnknownState lies outside that range.
enum Iso2022State {
    Ascii,
    MinState = Ascii,
    JISX0201_Latin,
    JISX0201_Kana,
    JISX0208_1978,
    JISX0208_1983,
    JISX0212,
    MaxState = JISX0212,
    UnknownState
};
155
// Second/third bytes of an escape sequence that announce a further byte.
static const char Esc_CHARS[] = "()*+-./";

// Designation sequences; each one starts with the escape byte (octal 033,
// i.e. 0x1b) and is NUL-terminated.
static const char Esc_Ascii[]          = "\033(B";
static const char Esc_JISX0201_Latin[] = "\033(J";
static const char Esc_JISX0201_Kana[]  = "\033(I";
static const char Esc_JISX0208_1978[]  = "\033$@";
static const char Esc_JISX0208_1983[]  = "\033$B";
static const char Esc_JISX0212[]       = "\033$(D";

// Designation sequence for each state, indexed by (state - MinState).
static const char * const Esc_SEQ[] = {
    Esc_Ascii,
    Esc_JISX0201_Latin,
    Esc_JISX0201_Kana,
    Esc_JISX0208_1978,
    Esc_JISX0208_1983,
    Esc_JISX0212
};
170
171/*! \internal */
172QJisCodec::QJisCodec() : conv(QJpUnicodeConv::newConverter(QJpUnicodeConv::Default))
173{
174}
175
176
177/*! \internal */
178QJisCodec::~QJisCodec()
179{
180 delete (QJpUnicodeConv*)conv;
181 conv = 0;
182}
183
184
185/*! \internal */
186int QJisCodec::mibEnum() const
187{
188 /*
189 Name: JIS_Encoding
190 MIBenum: 16
191 Source: JIS X 0202-1991. Uses ISO 2022 escape sequences to
192 shift code sets as documented in JIS X 0202-1991.
193 Alias: csJISEncoding
194 */
195 return 16;
196}
197
198/*! \internal */
199QCString QJisCodec::fromUnicode(const QString& uc, int& lenInOut) const
200{
201 int l = QMIN((int)uc.length(),lenInOut);
202 QCString result;
203 Iso2022State state = Ascii;
204 Iso2022State prev = Ascii;
205 for (int i=0; i<l; i++) {
206 QChar ch = uc[i];
207 uint j;
208 if ( ch.row() == 0x00 && ch.cell() < 0x80 ) {
209 // Ascii
210 if (state != JISX0201_Latin ||
211 ch.cell() == ReverseSolidus || ch.cell() == Tilde) {
212 state = Ascii;
213 }
214 j = ch.cell();
215 } else if ((j = conv->unicodeToJisx0201(ch.row(), ch.cell())) != 0) {
216 if (j < 0x80) {
217 // JIS X 0201 Latin
218 if (state != Ascii ||
219 ch.cell() == YenSign || ch.cell() == Overline) {
220 state = JISX0201_Latin;
221 }
222 } else {
223 // JIS X 0201 Kana
224 state = JISX0201_Kana;
225 j &= 0x7f;
226 }
227 } else if ((j = conv->unicodeToJisx0208(ch.row(), ch.cell())) != 0) {
228 // JIS X 0208
229 state = JISX0208_1983;
230 } else if ((j = conv->unicodeToJisx0212(ch.row(), ch.cell())) != 0) {
231 // JIS X 0212
232 state = JISX0212;
233 } else {
234 // Invalid
235 state = UnknownState;
236 j = '?';
237 }
238 if (state != prev) {
239 if (state == UnknownState) {
240 result += Esc_Ascii;
241 } else {
242 result += Esc_SEQ[state - MinState];
243 }
244 prev = state;
245 }
246 if (j < 0x0100) {
247 result += j & 0xff;
248 } else {
249 result += (j >> 8) & 0xff;
250 result += j & 0xff;
251 }
252 }
253 if (prev != Ascii) {
254 result += Esc_Ascii;
255 }
256 lenInOut = result.length();
257 return result;
258}
259
260/*! \internal */
261QString QJisCodec::toUnicode(const char* chars, int len) const
262{
263 QString result;
264 Iso2022State state = Ascii, prev = Ascii;
265 for (int i=0; i<len; i++) {
266 uchar ch = chars[i];
267 if ( ch == Esc ) {
268 // Escape sequence
269 state = UnknownState;
270 if ( i < len-1 ) {
271 uchar c2 = chars[++i];
272 if (c2 == '$') {
273 if ( i < len-1 ) {
274 uchar c3 = chars[++i];
275 if (strchr(Esc_CHARS, c3)) {
276 if ( i < len-1 ) {
277 uchar c4 = chars[++i];
278 if (c4 == '(') {
279 switch (c4) {
280 case 'D':
281 state = JISX0212; // Esc $ ( D
282 break;
283 }
284 }
285 }
286 } else {
287 switch (c3) {
288 case '@':
289 state = JISX0208_1978; // Esc $ @
290 break;
291 case 'B':
292 state = JISX0208_1983; // Esc $ B
293 break;
294 }
295 }
296 }
297 } else {
298 if (strchr(Esc_CHARS, c2)) {
299 if ( i < len-1 ) {
300 uchar c3 = chars[++i];
301 if (c2 == '(') {
302 switch (c3) {
303 case 'B':
304 state = Ascii; // Esc ( B
305 break;
306 case 'I':
307 state = JISX0201_Kana; // Esc ( I
308 break;
309 case 'J':
310 state = JISX0201_Latin; // Esc ( J
311 break;
312 }
313 }
314 }
315 }
316 }
317 }
318 } else if (ch == So) {
319 // Shift out
320 prev = state;
321 state = JISX0201_Kana;
322 } else if (ch == Si) {
323 // Shift in
324 if (prev == Ascii || prev == JISX0201_Latin) {
325 state = prev;
326 } else {
327 state = Ascii;
328 }
329 } else {
330 uint u;
331 switch (state) {
332 case Ascii:
333 if (ch < 0x80) {
334 result += QChar(ch);
335 break;
336 }
337 /* fall throught */
338 case JISX0201_Latin:
339 u = conv->jisx0201ToUnicode(ch);
340 result += QValidChar(u);
341 break;
342 case JISX0201_Kana:
343 u = conv->jisx0201ToUnicode(ch | 0x80);
344 result += QValidChar(u);
345 break;
346 case JISX0208_1978:
347 case JISX0208_1983:
348 if ( i < len-1 ) {
349 uchar c2 = chars[++i];
350 u = conv->jisx0208ToUnicode(ch & 0x7f, c2 & 0x7f);
351 result += QValidChar(u);
352 }
353 break;
354 case JISX0212:
355 if ( i < len-1 ) {
356 uchar c2 = chars[++i];
357 u = conv->jisx0212ToUnicode(ch & 0x7f, c2 & 0x7f);
358 result += QValidChar(u);
359 }
360 break;
361 default:
362 result += QChar::replacement;
363 break;
364 }
365 }
366 }
367 return result;
368}
369
370/*! \internal */
371const char* QJisCodec::name() const
372{
373 return "JIS7";
374}
375
376/*!
377 Returns the codec's mime name.
378*/
379const char* QJisCodec::mimeName() const
380{
381 return "ISO-2022-JP";
382}
383
384/*! \internal */
385int QJisCodec::heuristicNameMatch(const char* hint) const
386{
387 if ( qstrnicmp( hint, "ISO-2022-JP", 11 ) == 0 )
388 return 10000;
389 if ( simpleHeuristicNameMatch( "ISO-2022-JP-2", hint ) > 0 )
390 return 10;
391
392 int score = 0;
393 bool ja = FALSE;
394 if (qstrnicmp(hint, "ja_JP", 5) == 0 || qstrnicmp(hint, "japan", 5) == 0) {
395 score += 3;
396 ja = TRUE;
397 } else if (qstrnicmp(hint, "ja", 2) == 0) {
398 score += 2;
399 ja = TRUE;
400 }
401 const char *p;
402 if (ja) {
403 p = strchr(hint, '.');
404 if (p == 0) {
405 return score - 2;
406 }
407 p++;
408 } else {
409 p = hint;
410 }
411 if (p) {
412 if ((qstricmp(p, "JIS") == 0) ||
413 (qstricmp(p, "JIS7") == 0) ||
414 (simpleHeuristicNameMatch("ISO-2022-JP", p) > 0)) {
415 return score + 4;
416 }
417 }
418 return QTextCodec::heuristicNameMatch(hint);
419}
420
421/*! \internal */
422int QJisCodec::heuristicContentMatch(const char* chars, int len) const
423{
424 int score = 0;
425 Iso2022State state = Ascii, prev = Ascii;
426 for (int i=0; i<len; i++) {
427 uchar ch = chars[i];
428 // No nulls allowed.
429 if ( !ch )
430 return -1;
431 if ( ch == Esc ) {
432 // Escape sequence
433 state = UnknownState;
434 if ( i < len-1 ) {
435 uchar c2 = chars[++i];
436 if (c2 == '$') {
437 if ( i < len-1 ) {
438 uchar c3 = chars[++i];
439 if (strchr(Esc_CHARS, c3)) {
440 if ( i < len-1 ) {
441 uchar c4 = chars[++i];
442 if (c4 == '(') {
443 switch (c4) {
444 case 'D':
445 state = JISX0212; // Esc $ ( D
446 score++;
447 break;
448 }
449 }
450 }
451 score++;
452 } else {
453 switch (c3) {
454 case '@':
455 state = JISX0208_1978; // Esc $ @
456 score++;
457 break;
458 case 'B':
459 state = JISX0208_1983; // Esc $ B
460 score++;
461 break;
462 }
463 }
464 }
465 score++;
466 } else {
467 if (strchr(Esc_CHARS, c2)) {
468 if ( i < len-1 ) {
469 uchar c3 = chars[++i];
470 if (c2 == '(') {
471 switch (c3) {
472 case 'B':
473 state = Ascii; // Esc ( B
474 score++;
475 break;
476 case 'I':
477 state = JISX0201_Kana; // Esc ( I
478 score++;
479 break;
480 case 'J':
481 state = JISX0201_Latin; // Esc ( J
482 score++;
483 break;
484 }
485 }
486 }
487 score++;
488 }
489 }
490 }
491 if ( state == UnknownState ) {
492 return -1;
493 }
494 score++;
495 } else if (ch == So) {
496 // Shift out
497 prev = state;
498 state = JISX0201_Kana;
499 score++;
500 } else if (ch == Si) {
501 // Shift in
502 if (prev == Ascii || prev == JISX0201_Latin) {
503 state = prev;
504 } else {
505 state = Ascii;
506 }
507 score++;
508 } else {
509 switch (state) {
510 case Ascii:
511 case JISX0201_Latin:
512 if ( ch < 32 && ch != '\t' && ch != '\n' && ch != '\r' ) {
513 // Suspicious
514 if ( score )
515 score--;
516 } else {
517 // Inconclusive
518 }
519 break;
520 case JISX0201_Kana:
521 if ( !IsKana(ch | 0x80) ) {
522 return -1;
523 }
524 score++;
525 break;
526 case JISX0208_1978:
527 case JISX0208_1983:
528 case JISX0212:
529 if ( !IsJisChar(ch) ) {
530 // Invalid
531 return -1;
532 }
533 if ( i < len-1 ) {
534 uchar c2 = chars[++i];
535 if ( !IsJisChar(c2) ) {
536 // Invalid
537 return -1;
538 }
539 score++;
540 }
541 score++;
542 break;
543 default:
544 return -1;
545 }
546 }
547 }
548 return score;
549}
550
551class QJisDecoder : public QTextDecoder {
552 uchar buf[4];
553 int nbuf;
554 Iso2022State state, prev;
555 bool esc;
556 const QJpUnicodeConv * const conv;
557public:
558 QJisDecoder(const QJpUnicodeConv *c) : nbuf(0), state(Ascii), prev(Ascii), esc(FALSE), conv(c)
559 {
560 }
561
562 QString toUnicode(const char* chars, int len)
563 {
564 QString result;
565 for (int i=0; i<len; i++) {
566 uchar ch = chars[i];
567 if (esc) {
568 // Escape sequence
569 state = UnknownState;
570 switch (nbuf) {
571 case 0:
572 if (ch == '$' || strchr(Esc_CHARS, ch)) {
573 buf[nbuf++] = ch;
574 } else {
575 nbuf = 0;
576 esc = FALSE;
577 }
578 break;
579 case 1:
580 if (buf[0] == '$') {
581 if (strchr(Esc_CHARS, ch)) {
582 buf[nbuf++] = ch;
583 } else {
584 switch (ch) {
585 case '@':
586 state = JISX0208_1978; // Esc $ @
587 break;
588 case 'B':
589 state = JISX0208_1983; // Esc $ B
590 break;
591 }
592 nbuf = 0;
593 esc = FALSE;
594 }
595 } else {
596 if (buf[0] == '(') {
597 switch (ch) {
598 case 'B':
599 state = Ascii; // Esc ( B
600 break;
601 case 'I':
602 state = JISX0201_Kana; // Esc ( I
603 break;
604 case 'J':
605 state = JISX0201_Latin; // Esc ( J
606 break;
607 }
608 }
609 nbuf = 0;
610 esc = FALSE;
611 }
612 break;
613 case 2:
614 if (buf[1] == '(') {
615 switch (ch) {
616 case 'D':
617 state = JISX0212; // Esc $ ( D
618 break;
619 }
620 }
621 nbuf = 0;
622 esc = FALSE;
623 break;
624 }
625 } else {
626 if (ch == Esc) {
627 // Escape sequence
628 nbuf = 0;
629 esc = TRUE;
630 } else if (ch == So) {
631 // Shift out
632 prev = state;
633 state = JISX0201_Kana;
634 nbuf = 0;
635 } else if (ch == Si) {
636 // Shift in
637 if (prev == Ascii || prev == JISX0201_Latin) {
638 state = prev;
639 } else {
640 state = Ascii;
641 }
642 nbuf = 0;
643 } else {
644 uint u;
645 switch (nbuf) {
646 case 0:
647 switch (state) {
648 case Ascii:
649 if (ch < 0x80) {
650 result += QChar(ch);
651 break;
652 }
653 /* fall throught */
654 case JISX0201_Latin:
655 u = conv->jisx0201ToUnicode(ch);
656 result += QValidChar(u);
657 break;
658 case JISX0201_Kana:
659 u = conv->jisx0201ToUnicode(ch | 0x80);
660 result += QValidChar(u);
661 break;
662 case JISX0208_1978:
663 case JISX0208_1983:
664 case JISX0212:
665 buf[nbuf++] = ch;
666 break;
667 default:
668 result += QChar::replacement;
669 break;
670 }
671 break;
672 case 1:
673 switch (state) {
674 case JISX0208_1978:
675 case JISX0208_1983:
676 u = conv->jisx0208ToUnicode(buf[0] & 0x7f, ch & 0x7f);
677 result += QValidChar(u);
678 break;
679 case JISX0212:
680 u = conv->jisx0212ToUnicode(buf[0] & 0x7f, ch & 0x7f);
681 result += QValidChar(u);
682 break;
683 default:
684 result += QChar::replacement;
685 break;
686 }
687 nbuf = 0;
688 break;
689 }
690 }
691 }
692 }
693 return result;
694 }
695};
696
697/*! \internal */
698QTextDecoder* QJisCodec::makeDecoder() const
699{
700 return new QJisDecoder(conv);
701}
702
703#endif
Note: See TracBrowser for help on using the repository browser.