source: vendor/trolltech/current/src/codecs/qeucjpcodec.cpp

Last change on this file was 2, checked in by dmik, 20 years ago

Imported xplatform parts of the official release 3.3.1 from Trolltech

  • Property svn:keywords set to Id
File size: 13.2 KB
Line 
1/****************************************************************************
2** $Id: qeucjpcodec.cpp 2 2005-11-16 15:49:26Z dmik $
3**
4** Implementation of QEucJpCodec class
5**
6** Created : 990225
7**
8** Copyright (C) 2000-2002 Trolltech AS. All rights reserved.
9**
10** This file is part of the tools module of the Qt GUI Toolkit.
11**
12** This file may be distributed under the terms of the Q Public License
13** as defined by Trolltech AS of Norway and appearing in the file
14** LICENSE.QPL included in the packaging of this file.
15**
16** This file may be distributed and/or modified under the terms of the
17** GNU General Public License version 2 as published by the Free Software
18** Foundation and appearing in the file LICENSE.GPL included in the
19** packaging of this file.
20**
21** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22** licenses may use this file in accordance with the Qt Commercial License
23** Agreement provided with the Software.
24**
25** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27**
28** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29** information about Qt Commercial License Agreements.
30** See http://www.trolltech.com/qpl/ for QPL licensing information.
31** See http://www.trolltech.com/gpl/ for GPL licensing information.
32**
33** Contact info@trolltech.com if any conditions of this licensing are
34** not clear to you.
35**
36**********************************************************************/
37
38// Most of the code here was originally written by Serika Kurusugawa
39// a.k.a. Junji Takagi, and is included in Qt with the author's permission,
40// and the grateful thanks of the Trolltech team.
41
42/*! \class QEucJpCodec qeucjpcodec.h
43 \reentrant
44 \ingroup i18n
45
46 \brief The QEucJpCodec class provides conversion to and from EUC-JP character sets.
47
48 More precisely, the QEucJpCodec class subclasses QTextCodec to
49 provide support for EUC-JP, the main legacy encoding for Unix
50 machines in Japan.
51
52 The environment variable \c UNICODEMAP_JP can be used to fine-tune
53 QJisCodec, QSjisCodec and QEucJpCodec. The \l QJisCodec
54 documentation describes how to use this variable.
55
56 Most of the code here was written by Serika Kurusugawa,
57 a.k.a. Junji Takagi, and is included in Qt with the author's
58 permission and the grateful thanks of the Trolltech team. Here is
59 the copyright statement for that code:
60
61 \legalese
62
63 Copyright (C) 1999 Serika Kurusugawa. All rights reserved.
64
65 Redistribution and use in source and binary forms, with or without
66 modification, are permitted provided that the following conditions
67 are met:
68 \list 1
69 \i Redistributions of source code must retain the above copyright
70 notice, this list of conditions and the following disclaimer.
71 \i Redistributions in binary form must reproduce the above copyright
72 notice, this list of conditions and the following disclaimer in the
73 documentation and/or other materials provided with the distribution.
74 \endlist
75
76 THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS".
77 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
78 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
79 ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
80 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
81 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
82 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
83 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
84 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
85 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
86 SUCH DAMAGE.
87*/
88
89/*
90 * Copyright (C) 1999 Serika Kurusugawa, All rights reserved.
91 *
92 * Redistribution and use in source and binary forms, with or without
93 * modification, are permitted provided that the following conditions
94 * are met:
95 * 1. Redistributions of source code must retain the above copyright
96 * notice, this list of conditions and the following disclaimer.
97 * 2. Redistributions in binary form must reproduce the above copyright
98 * notice, this list of conditions and the following disclaimer in the
99 * documentation and/or other materials provided with the distribution.
100 *
101 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
102 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
103 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
104 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
105 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
106 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
107 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
108 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
109 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
110 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
111 * SUCH DAMAGE.
112 */
113
114#include "qeucjpcodec.h"
115
116#ifndef QT_NO_BIG_CODECS
117
118static const uchar Esc = 0x1b;
119static const uchar Ss2 = 0x8e; // Single Shift 2
120static const uchar Ss3 = 0x8f; // Single Shift 3
121
122#define IsKana(c) (((c) >= 0xa1) && ((c) <= 0xdf))
123#define IsEucChar(c) (((c) >= 0xa1) && ((c) <= 0xfe))
124
125#define QValidChar(u) ((u) ? QChar((ushort)(u)) : QChar::replacement)
126
127/*!
128 Constructs a QEucJpCodec.
129*/
130QEucJpCodec::QEucJpCodec() : conv(QJpUnicodeConv::newConverter( QJpUnicodeConv::Default ))
131{
132}
133
134/*!
135 Destroys the codec.
136*/
137QEucJpCodec::~QEucJpCodec()
138{
139 delete (QJpUnicodeConv*)conv;
140 conv = 0;
141}
142
143/*!
144 Returns 18.
145*/
146int QEucJpCodec::mibEnum() const
147{
148 /*
149 Name: Extended_UNIX_Code_Packed_Format_for_Japanese
150 MIBenum: 18
151 Source: Standardized by OSF, UNIX International, and UNIX Systems
152 Laboratories Pacific. Uses ISO 2022 rules to select
153 code set 0: US-ASCII (a single 7-bit byte set)
154 code set 1: JIS X0208-1990 (a double 8-bit byte set)
155 restricted to A0-FF in both bytes
156 code set 2: Half Width Katakana (a single 7-bit byte set)
157 requiring SS2 as the character prefix
158 code set 3: JIS X0212-1990 (a double 7-bit byte set)
159 restricted to A0-FF in both bytes
160 requiring SS3 as the character prefix
161 Alias: csEUCPkdFmtJapanese
162 Alias: EUC-JP (preferred MIME name)
163 */
164 return 18;
165}
166
167/*!
168 \reimp
169*/
170QCString QEucJpCodec::fromUnicode(const QString& uc, int& lenInOut) const
171{
172 int l = QMIN((int)uc.length(),lenInOut);
173 int rlen = l*3+1;
174 QCString rstr(rlen);
175 uchar* cursor = (uchar*)rstr.data();
176 for (int i=0; i<l; i++) {
177 QChar ch = uc[i];
178 uint j;
179 if ( ch.row() == 0x00 && ch.cell() < 0x80 ) {
180 // ASCII
181 *cursor++ = ch.cell();
182 } else if ((j = conv->unicodeToJisx0201(ch.row(), ch.cell())) != 0) {
183 if (j < 0x80) {
184 // JIS X 0201 Latin ?
185 *cursor++ = j;
186 } else {
187 // JIS X 0201 Kana
188 *cursor++ = Ss2;
189 *cursor++ = j;
190 }
191 } else if ((j = conv->unicodeToJisx0208(ch.row(), ch.cell())) != 0) {
192 // JIS X 0208
193 *cursor++ = (j >> 8) | 0x80;
194 *cursor++ = (j & 0xff) | 0x80;
195 } else if ((j = conv->unicodeToJisx0212(ch.row(), ch.cell())) != 0) {
196 // JIS X 0212
197 *cursor++ = Ss3;
198 *cursor++ = (j >> 8) | 0x80;
199 *cursor++ = (j & 0xff) | 0x80;
200 } else {
201 // Error
202 *cursor++ = '?'; // unknown char
203 }
204 }
205 lenInOut = cursor - (uchar*)rstr.data();
206 rstr.truncate(lenInOut);
207 return rstr;
208}
209
210/*!
211 \reimp
212*/
213QString QEucJpCodec::toUnicode(const char* chars, int len) const
214{
215 QString result;
216 for (int i=0; i<len; i++) {
217 uchar ch = chars[i];
218 if ( ch < 0x80 ) {
219 // ASCII
220 result += QChar(ch);
221 } else if ( ch == Ss2 ) {
222 // JIS X 0201 Kana
223 if ( i < len-1 ) {
224 uchar c2 = chars[++i];
225 if ( IsKana(c2) ) {
226 uint u = conv->jisx0201ToUnicode(c2);
227 result += QValidChar(u);
228 } else {
229 i--;
230 result += QChar::replacement;
231 }
232 }
233 } else if ( ch == Ss3 ) {
234 // JIS X 0212
235 if ( i < len-1 ) {
236 uchar c2 = chars[++i];
237 if ( IsEucChar(c2) ) {
238 if ( i < len-1 ) {
239 uchar c3 = chars[++i];
240 if ( IsEucChar(c3) ) {
241 uint u = conv->jisx0212ToUnicode(c2 & 0x7f, c3 & 0x7f);
242 result += QValidChar(u);
243 } else {
244 i--;
245 result += QChar::replacement;
246 }
247 } else {
248 result += QChar::replacement;
249 }
250 } else {
251 i--;
252 result += QChar::replacement;
253 }
254 } else {
255 result += QChar::replacement;
256 }
257 } else if ( IsEucChar(ch) ) {
258 // JIS X 0208
259 if ( i < len-1 ) {
260 uchar c2 = chars[++i];
261 if ( IsEucChar(c2) ) {
262 uint u = conv->jisx0208ToUnicode(ch & 0x7f, c2 & 0x7f);
263 result += QValidChar(u);
264 } else {
265 i--;
266 result += QChar::replacement;
267 }
268 } else {
269 result += QChar::replacement;
270 }
271 } else {
272 // Invalid
273 result += QChar::replacement;
274 }
275 }
276 return result;
277}
278
279/*!
280 \reimp
281*/
282const char* QEucJpCodec::name() const
283{
284 return "eucJP";
285}
286
287/*!
288 Returns the codec's mime name.
289*/
290const char* QEucJpCodec::mimeName() const
291{
292 return "EUC-JP";
293}
294
295/*!
296 \reimp
297*/
298int QEucJpCodec::heuristicNameMatch(const char* hint) const
299{
300 int score = 0;
301 bool ja = FALSE;
302 if (qstrnicmp(hint, "ja_JP", 5) == 0 || qstrnicmp(hint, "japan", 5) == 0) {
303 score += 3;
304 ja = TRUE;
305 } else if (qstrnicmp(hint, "ja", 2) == 0) {
306 score += 2;
307 ja = TRUE;
308 }
309 const char *p;
310 if (ja) {
311 p = strchr(hint, '.');
312 if (p == 0) {
313 return score;
314 }
315 p++;
316 } else {
317 p = hint;
318 }
319 if (p) {
320 if ((qstricmp(p, "AJEC") == 0) ||
321 (qstricmp(p, "eucJP") == 0) ||
322 (qstricmp(p, "ujis") == 0) ||
323 (simpleHeuristicNameMatch(p, "eucJP") > 0) ||
324 (simpleHeuristicNameMatch(p, "x-euc-jp") > 0)) {
325 return score + 4;
326 }
327 // there exists ja_JP.EUC, ko_KR.EUC, zh_CN.EUC and zh_TW.EUC
328 // so "euc" may or may not be Japanese EUC.
329 if (qstricmp(p, "euc") == 0 && ja) {
330 return score + 4;
331 }
332 }
333 return QTextCodec::heuristicNameMatch(hint);
334}
335
336/*!
337 \reimp
338*/
339int QEucJpCodec::heuristicContentMatch(const char* chars, int len) const
340{
341 int score = 0;
342 for (int i=0; i<len; i++) {
343 uchar ch = chars[i];
344 // No nulls allowed.
345 if ( !ch || ch == Esc )
346 return -1;
347 if ( ch < 32 && ch != '\t' && ch != '\n' && ch != '\r' ) {
348 // Suspicious
349 if ( score )
350 score--;
351 } else if ( ch < 0x80 ) {
352 // Inconclusive
353 score++;
354 } else if ( ch == Ss2 ) {
355 // JIS X 0201 Kana
356 if ( i < len-1 ) {
357 uchar c2 = chars[++i];
358 if ( !IsKana(c2) )
359 return -1;
360 score+=2;
361 }
362 score++;
363 } else if ( ch == Ss3 ) {
364 // JIS X 0212
365 if ( i < len-1 ) {
366 uchar c2 = chars[++i];
367 if ( !IsEucChar(c2) )
368 return -1;
369 if ( i < len-1 ) {
370 uchar c3 = chars[++i];
371 if ( !IsEucChar(c3) )
372 return -1;
373 score++;
374 }
375 score+=2;
376 }
377 score++;
378 } else if ( IsEucChar(ch) ) {
379 // JIS X 0208-1990
380 if ( i < len-1 ) {
381 uchar c2 = chars[++i];
382 if ( !IsEucChar(c2) )
383 return -1;
384 score+=2;
385 }
386 score++;
387 } else {
388 // Invalid
389 return -1;
390 }
391 }
392 return score;
393}
394
395class QEucJpDecoder : public QTextDecoder {
396 uchar buf[2];
397 int nbuf;
398 const QJpUnicodeConv * const conv;
399public:
400 QEucJpDecoder(const QJpUnicodeConv *c) : nbuf(0), conv(c)
401 {
402 }
403
404 QString toUnicode(const char* chars, int len)
405 {
406 QString result;
407 for (int i=0; i<len; i++) {
408 uchar ch = chars[i];
409 switch (nbuf) {
410 case 0:
411 if ( ch < 0x80 ) {
412 // ASCII
413 result += QChar(ch);
414 } else if ( ch == Ss2 || ch == Ss3 ) {
415 // JIS X 0201 Kana or JIS X 0212
416 buf[0] = ch;
417 nbuf = 1;
418 } else if ( IsEucChar(ch) ) {
419 // JIS X 0208
420 buf[0] = ch;
421 nbuf = 1;
422 } else {
423 // Invalid
424 result += QChar::replacement;
425 }
426 break;
427 case 1:
428 if ( buf[0] == Ss2 ) {
429 // JIS X 0201 Kana
430 if ( IsKana(ch) ) {
431 uint u = conv->jisx0201ToUnicode(ch);
432 result += QValidChar(u);
433 } else {
434 result += QChar::replacement;
435 }
436 nbuf = 0;
437 } else if ( buf[0] == Ss3 ) {
438 // JIS X 0212-1990
439 if ( IsEucChar(ch) ) {
440 buf[1] = ch;
441 nbuf = 2;
442 } else {
443 // Error
444 result += QChar::replacement;
445 nbuf = 0;
446 }
447 } else {
448 // JIS X 0208-1990
449 if ( IsEucChar(ch) ) {
450 uint u = conv->jisx0208ToUnicode(buf[0] & 0x7f, ch & 0x7f);
451 result += QValidChar(u);
452 } else {
453 // Error
454 result += QChar::replacement;
455 }
456 nbuf = 0;
457 }
458 break;
459 case 2:
460 // JIS X 0212
461 if ( IsEucChar(ch) ) {
462 uint u = conv->jisx0212ToUnicode(buf[1] & 0x7f, ch & 0x7f);
463 result += QValidChar(u);
464 } else {
465 result += QChar::replacement;
466 }
467 nbuf = 0;
468 }
469 }
470 return result;
471 }
472};
473
474/*!
475 \reimp
476*/
477QTextDecoder* QEucJpCodec::makeDecoder() const
478{
479 return new QEucJpDecoder(conv);
480}
481
482#endif
Note: See TracBrowser for help on using the repository browser.