1 |
|
---|
2 | /*
|
---|
3 | *@@sourcefile bs_string_conv.cpp:
|
---|
4 | * implementation for BSUniCodec.
|
---|
5 | *
|
---|
6 | *@@header "cppbase\bs_string.h"
|
---|
7 | *@@added V0.9.18 (2002-03-08) [umoeller]
|
---|
8 | */
|
---|
9 |
|
---|
10 | /*
|
---|
11 | * This file Copyright (C) 2002-2008 Ulrich Möller.
|
---|
12 | * This program is free software; you can redistribute it and/or modify
|
---|
13 | * it under the terms of the GNU General Public License as published by
|
---|
14 | * the Free Software Foundation, in version 2 as it comes in the COPYING
|
---|
15 | * file of this distribution.
|
---|
16 | * This program is distributed in the hope that it will be useful,
|
---|
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | * GNU General Public License for more details.
|
---|
20 | */
|
---|
21 |
|
---|
22 | #define OS2EMX_PLAIN_CHAR
|
---|
23 | // this is needed for "os2emx.h"; if this is defined,
|
---|
24 | // emx will define PSZ as _signed_ char, otherwise
|
---|
25 | // as unsigned char
|
---|
26 |
|
---|
27 | #define INCL_DOSSEMAPHORES
|
---|
28 | #define INCL_DOSNLS
|
---|
29 | #include <os2.h>
|
---|
30 |
|
---|
31 | #include <stdio.h>
|
---|
32 | #include <stdlib.h>
|
---|
33 | #include <string.h>
|
---|
34 | #include <stdarg.h>
|
---|
35 |
|
---|
36 | #include "setup.h"
|
---|
37 |
|
---|
38 | #include "helpers\stringh.h"
|
---|
39 | #include "helpers\xstring.h"
|
---|
40 |
|
---|
41 | // base includes
|
---|
42 | #include "cppbase\bs_base.h"
|
---|
43 | #include "cppbase\bs_string.h"
|
---|
44 | #include "cppbase\bs_errors.h"
|
---|
45 |
|
---|
46 | #include "encodings\base.h"
|
---|
47 |
|
---|
48 | #pragma hdrstop
|
---|
49 |
|
---|
// Codepage number that this file treats as UTF-8: BSUniCodec creates no
// conversion object for it and hands strings on without a table lookup.
50 | #define CP_UTF8 1208
|
---|
// Unicode value that Uni2Codepage replaces with the text "(C)" when the
// target codepage has no mapping for it (the copyright sign).
51 | #define UNI_COPYRIGHT 0xA9
|
---|
52 |
|
---|
// Class registration for BSUniCodec with BSRoot as its parent; the macro
// itself is declared elsewhere (presumably in cppbase\bs_base.h -- confirm).
53 | DEFINE_CLASS(BSUniCodec, BSRoot);
|
---|
54 |
|
---|
55 | /*
|
---|
56 | *@@ FindEncodingID:
|
---|
57 | *
|
---|
58 | *@@added V0.9.18 (2002-03-08) [umoeller]
|
---|
59 | */
|
---|
60 |
|
---|
61 | static ENCID FindEncodingID(USHORT usCodepage,
|
---|
62 | BOOL *pfDouble) // out: TRUE if double-byte cp
|
---|
63 | {
|
---|
64 | ENCBYTECOUNT bc;
|
---|
65 | ENCID id = encFindIdForCodepage(usCodepage, NULL, &bc);
|
---|
66 | if ( id != UNSUPPORTED
|
---|
67 | && ( (bc == SINGLE)
|
---|
68 | || (bc == DOUBLE)
|
---|
69 | )
|
---|
70 | )
|
---|
71 | {
|
---|
72 | if (pfDouble)
|
---|
73 | *pfDouble = (bc == DOUBLE);
|
---|
74 |
|
---|
75 | return (id);
|
---|
76 | }
|
---|
77 |
|
---|
78 | throw BSUnsupportedCPExcpt(usCodepage);
|
---|
79 | }
|
---|
80 |
|
---|
81 | /* ******************************************************************
|
---|
82 | *
|
---|
83 | * BSUniCodec implementation
|
---|
84 | *
|
---|
85 | ********************************************************************/
|
---|
86 |
|
---|
87 | /*
|
---|
88 | *@@ BSUniCodec:
|
---|
89 | * constructor. Creates the internal conversion
|
---|
90 | * object by calling encCreateCodec.
|
---|
91 | *
|
---|
92 | *@@changed WarpIN V1.0.18 (2008-09-24) [pr]: added codepage 1208 support @@fixes 1127
|
---|
93 | */
|
---|
94 |
|
---|
95 | BSUniCodec::BSUniCodec(unsigned short usCodepage) // in: codepage
|
---|
96 | : BSRoot(tBSUniCodec),
|
---|
97 | _usCodepage(usCodepage)
|
---|
98 | {
|
---|
99 | ENCID id;
|
---|
100 |
|
---|
101 | // WarpIN V1.0.18
|
---|
102 | if (usCodepage == CP_UTF8)
|
---|
103 | {
|
---|
104 | _pCodec = NULL;
|
---|
105 | return;
|
---|
106 | }
|
---|
107 |
|
---|
108 | id = FindEncodingID(usCodepage, &_fDouble);
|
---|
109 | if (_fDouble)
|
---|
110 | {
|
---|
111 | APIRET arc;
|
---|
112 | COUNTRYCODE cc = { 0, usCodepage };
|
---|
113 | if (arc = DosQueryDBCSEnv(sizeof(_achDBCS),
|
---|
114 | &cc,
|
---|
115 | _achDBCS))
|
---|
116 | {
|
---|
117 | CHAR sz[200];
|
---|
118 | sprintf(sz,
|
---|
119 | "DosQueryDBCSEnv returned error %d for codepage %d",
|
---|
120 | arc,
|
---|
121 | usCodepage);
|
---|
122 | throw BSExcptBase(sz);
|
---|
123 | }
|
---|
124 | }
|
---|
125 |
|
---|
126 | _pCodec = encCreateCodec(id);
|
---|
127 | }
|
---|
128 |
|
---|
129 | /*
|
---|
130 | *@@ ~BSUniCodec:
|
---|
131 | * destructor. Frees the internal conversion object
|
---|
132 | * by calling encFreeCodec.
|
---|
133 | *
|
---|
134 | *@@changed WarpIN V1.0.18 (2008-09-24) [pr]: added codepage 1208 support @@fixes 1127
|
---|
135 | */
|
---|
136 |
|
---|
137 | BSUniCodec::~BSUniCodec()
|
---|
138 | {
|
---|
139 | if (_pCodec)
|
---|
140 | {
|
---|
141 | encFreeCodec((PCONVERSION*)&_pCodec);
|
---|
142 | _pCodec = NULL;
|
---|
143 | }
|
---|
144 | }
|
---|
145 |
|
---|
146 | /*
|
---|
147 | *@@ IsLeadByte:
|
---|
148 | * returns TRUE if c is a DBCS lead byte.
|
---|
149 | *
|
---|
150 | *@@added V0.9.19 (2002-04-02) [umoeller]
|
---|
151 | */
|
---|
152 |
|
---|
153 | static BOOL IsLeadByte(CHAR c, // in: character to test
|
---|
154 | PSZ pachDBCS) // in: DBCS array from DosQueryDBCSEnv
|
---|
155 | {
|
---|
156 | while (*pachDBCS)
|
---|
157 | {
|
---|
158 | if ( (c >= *pachDBCS++)
|
---|
159 | && (c <= *pachDBCS++)
|
---|
160 | )
|
---|
161 | return TRUE;
|
---|
162 | }
|
---|
163 |
|
---|
164 | return FALSE;
|
---|
165 | }
|
---|
166 |
|
---|
167 | /*
|
---|
168 | *@@ Codepage2Uni:
|
---|
169 | * converts the given string from codepage-specific
|
---|
170 | * to UTF-8. Used by BSString::assignUtf8 and
|
---|
171 | * others.
|
---|
172 | *
|
---|
173 | *@@changed WarpIN V1.0.18 (2008-09-24) [pr]: added codepage 1208 support @@fixes 1127
|
---|
174 | */
|
---|
175 |
|
---|
176 | void BSUniCodec::Codepage2Uni(BSUString &ustr, // out: target
|
---|
177 | const char *pcszCP, // in: cp string
|
---|
178 | unsigned long ulLength) // in: length of cp string
|
---|
179 | {
|
---|
180 | PCONVERSION pTable = (PCONVERSION)_pCodec;
|
---|
181 |
|
---|
182 | // WarpIN V1.0.18
|
---|
183 | if (QueryCodepage() == CP_UTF8)
|
---|
184 | {
|
---|
185 | ustr.assignUtf8(pcszCP);
|
---|
186 | return;
|
---|
187 | }
|
---|
188 |
|
---|
189 | XSTRING xstrNew;
|
---|
190 | xstrInit(&xstrNew, ulLength + 1);
|
---|
191 |
|
---|
192 | ULONG ul;
|
---|
193 | for (ul = 0;
|
---|
194 | ul < ulLength;
|
---|
195 | ++ul)
|
---|
196 | {
|
---|
197 | unsigned short c = pcszCP[ul];
|
---|
198 |
|
---|
199 | if (_fDouble)
|
---|
200 | {
|
---|
201 | // we're using a double-byte codepage:
|
---|
202 | // check if this is a DBCS char
|
---|
203 | if (IsLeadByte(c, _achDBCS))
|
---|
204 | c = (c << 8) | pcszCP[++ul];
|
---|
205 | }
|
---|
206 |
|
---|
207 | // convert this char to Unicode, using the current codepage
|
---|
208 | unsigned long ulUni = encChar2Uni(pTable, c);
|
---|
209 |
|
---|
210 | if (ulUni == 0xFFFF)
|
---|
211 | // shouldn't happen
|
---|
212 | xstrcatc(&xstrNew, '?');
|
---|
213 | else if (ulUni < 0x80)
|
---|
214 | xstrcatc(&xstrNew, (CHAR)ulUni);
|
---|
215 | else if (ulUni < 0x800)
|
---|
216 | {
|
---|
217 | xstrcatc(&xstrNew, (CHAR)(0xC0 | ulUni>>6));
|
---|
218 | xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni & 0x3F));
|
---|
219 | }
|
---|
220 | else if (ulUni < 0x10000)
|
---|
221 | {
|
---|
222 | xstrcatc(&xstrNew, (CHAR)(0xE0 | ulUni>>12));
|
---|
223 | xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni>>6 & 0x3F));
|
---|
224 | xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni & 0x3F));
|
---|
225 | }
|
---|
226 | /* else if (ulUni < 0x200000)
|
---|
227 | {
|
---|
228 | xstrcatc(&xstrNew, (CHAR)(0xF0 | ulUni>>18));
|
---|
229 | xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni>>12 & 0x3F));
|
---|
230 | xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni>>6 & 0x3F));
|
---|
231 | xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni & 0x3F));
|
---|
232 | } */
|
---|
233 | else
|
---|
234 | {
|
---|
235 | CHAR sz[100];
|
---|
236 | sprintf(sz,
|
---|
237 | "Unsupported Unicode character %u at offset %u in string",
|
---|
238 | ulUni,
|
---|
239 | ul);
|
---|
240 | throw BSExcptBase(sz);
|
---|
241 | }
|
---|
242 | }
|
---|
243 |
|
---|
244 | // copy back
|
---|
245 | ustr._take_from(xstrNew);
|
---|
246 | }
|
---|
247 |
|
---|
248 | /*
|
---|
249 | *@@ Uni2Codepage:
|
---|
250 | * converts the given string from UTF-8 to
|
---|
251 | * codepage-specific. Used by BSUString::assignCP
|
---|
252 | * and others.
|
---|
253 | *
|
---|
254 | * Characters that are not supported with the
|
---|
255 | * given codepage are replaced by '?'.
|
---|
256 | *
|
---|
257 | *@@changed WarpIN V1.0.18 (2008-09-24) [pr]: added codepage 1208 support @@fixes 1127
|
---|
258 | *@@changed WarpIN V1.0.25 (2020-09-05) [pr]: added copyright symbol substitution
|
---|
259 | */
|
---|
260 |
|
---|
261 | void BSUniCodec::Uni2Codepage(BSString &str,
|
---|
262 | const char *pcszUni,
|
---|
263 | unsigned long ulLength)
|
---|
264 | {
|
---|
265 | // WarpIN V1.0.18
|
---|
266 | if (QueryCodepage() == CP_UTF8)
|
---|
267 | {
|
---|
268 | str.assign(pcszUni);
|
---|
269 | return;
|
---|
270 | }
|
---|
271 |
|
---|
272 | PCONVERSION pTable = (PCONVERSION)_pCodec;
|
---|
273 |
|
---|
274 | XSTRING xstrNew;
|
---|
275 | xstrInit(&xstrNew, ulLength + 1);
|
---|
276 |
|
---|
277 | const char *pFirst = pcszUni;
|
---|
278 |
|
---|
279 | while (*pcszUni)
|
---|
280 | {
|
---|
281 | unsigned long ulUni = encDecodeUTF8(&pcszUni);
|
---|
282 | unsigned short us = encUni2Char(pTable, ulUni);
|
---|
283 | if (us > 0xFF)
|
---|
284 | {
|
---|
285 | if (us == 0xFFFF)
|
---|
286 | if (ulUni == UNI_COPYRIGHT)
|
---|
287 | {
|
---|
288 | xstrcat(&xstrNew, "(C)", 3);
|
---|
289 | continue;
|
---|
290 | }
|
---|
291 | else
|
---|
292 | us = '?';
|
---|
293 | else if (_fDouble)
|
---|
294 | {
|
---|
295 | // we're using a double-byte codepage:
|
---|
296 | // store lead byte first
|
---|
297 | xstrcatc(&xstrNew, (CHAR)(us >> 8));
|
---|
298 | }
|
---|
299 | else
|
---|
300 | {
|
---|
301 | // not double-byte codepage: then we can't handle > 0xFF
|
---|
302 | CHAR sz[100];
|
---|
303 | sprintf(sz,
|
---|
304 | "Unsupported Unicode character %u at offset %u in string",
|
---|
305 | ulUni,
|
---|
306 | pcszUni - pFirst);
|
---|
307 | throw BSExcptBase(sz);
|
---|
308 |
|
---|
309 | }
|
---|
310 | }
|
---|
311 | xstrcatc(&xstrNew, (CHAR)us);
|
---|
312 | }
|
---|
313 |
|
---|
314 | // copy back
|
---|
315 | str._take_from(xstrNew);
|
---|
316 | }
|
---|
317 |
|
---|