source: trunk/src/cppbase/bs_string_conv.cpp

Last change on this file was 449, checked in by pr, 5 years ago

Add BSUString copy constructor for codepaged C strings.
Substitute "(C)" for copyright symbol on unsupported codepages.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
Line 
1
2/*
3 *@@sourcefile bs_string_conv.cpp:
4 * implementation for BSUniCodec.
5 *
6 *@@header "cppbase\bs_string.h"
7 *@@added V0.9.18 (2002-03-08) [umoeller]
8 */
9
10/*
11 * This file Copyright (C) 2002-2008 Ulrich Möller.
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation, in version 2 as it comes in the COPYING
15 * file of this distribution.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 */
21
22#define OS2EMX_PLAIN_CHAR
23 // this is needed for "os2emx.h"; if this is defined,
24 // emx will define PSZ as _signed_ char, otherwise
25 // as unsigned char
26
27#define INCL_DOSSEMAPHORES
28#define INCL_DOSNLS
29#include <os2.h>
30
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34#include <stdarg.h>
35
36#include "setup.h"
37
38#include "helpers\stringh.h"
39#include "helpers\xstring.h"
40
41// base includes
42#include "cppbase\bs_base.h"
43#include "cppbase\bs_string.h"
44#include "cppbase\bs_errors.h"
45
46#include "encodings\base.h"
47
48#pragma hdrstop
49
50#define CP_UTF8 1208 // codepage number that this file treats as UTF-8: no codec is created for it and strings are passed through unconverted
51#define UNI_COPYRIGHT 0xA9 // Unicode value of the copyright sign; Uni2Codepage emits "(C)" for it when the target codepage cannot represent it
52
53DEFINE_CLASS(BSUniCodec, BSRoot); // NOTE(review): macro is defined outside this file; presumably sets up the class info (tBSUniCodec) with BSRoot as parent -- confirm
54
55/*
56 *@@ FindEncodingID:
57 *
58 *@@added V0.9.18 (2002-03-08) [umoeller]
59 */
60
61static ENCID FindEncodingID(USHORT usCodepage,
62 BOOL *pfDouble) // out: TRUE if double-byte cp
63{
64 ENCBYTECOUNT bc;
65 ENCID id = encFindIdForCodepage(usCodepage, NULL, &bc);
66 if ( id != UNSUPPORTED
67 && ( (bc == SINGLE)
68 || (bc == DOUBLE)
69 )
70 )
71 {
72 if (pfDouble)
73 *pfDouble = (bc == DOUBLE);
74
75 return (id);
76 }
77
78 throw BSUnsupportedCPExcpt(usCodepage);
79}
80
81/* ******************************************************************
82 *
83 * BSUniCodec implementation
84 *
85 ********************************************************************/
86
87/*
88 *@@ BSUniCodec:
89 * constructor. Creates the internal conversion
90 * object by calling encCreateCodec.
91 *
92 *@@changed WarpIN V1.0.18 (2008-09-24) [pr]: added codepage 1208 support @@fixes 1127
93 */
94
95BSUniCodec::BSUniCodec(unsigned short usCodepage) // in: codepage
96 : BSRoot(tBSUniCodec),
97 _usCodepage(usCodepage)
98{
99 ENCID id;
100
101 // WarpIN V1.0.18
102 if (usCodepage == CP_UTF8)
103 {
104 _pCodec = NULL;
105 return;
106 }
107
108 id = FindEncodingID(usCodepage, &_fDouble);
109 if (_fDouble)
110 {
111 APIRET arc;
112 COUNTRYCODE cc = { 0, usCodepage };
113 if (arc = DosQueryDBCSEnv(sizeof(_achDBCS),
114 &cc,
115 _achDBCS))
116 {
117 CHAR sz[200];
118 sprintf(sz,
119 "DosQueryDBCSEnv returned error %d for codepage %d",
120 arc,
121 usCodepage);
122 throw BSExcptBase(sz);
123 }
124 }
125
126 _pCodec = encCreateCodec(id);
127}
128
129/*
130 *@@ ~BSUniCodec:
131 * destructor. Frees the internal conversion object
132 * by calling encFreeCodec.
133 *
134 *@@changed WarpIN V1.0.18 (2008-09-24) [pr]: added codepage 1208 support @@fixes 1127
135 */
136
137BSUniCodec::~BSUniCodec()
138{
139 if (_pCodec)
140 {
141 encFreeCodec((PCONVERSION*)&_pCodec);
142 _pCodec = NULL;
143 }
144}
145
146/*
147 *@@ IsLeadByte:
148 * returns TRUE if c is a DBCS lead byte.
149 *
150 *@@added V0.9.19 (2002-04-02) [umoeller]
151 */
152
153static BOOL IsLeadByte(CHAR c, // in: character to test
154 PSZ pachDBCS) // in: DBCS array from DosQueryDBCSEnv
155{
156 while (*pachDBCS)
157 {
158 if ( (c >= *pachDBCS++)
159 && (c <= *pachDBCS++)
160 )
161 return TRUE;
162 }
163
164 return FALSE;
165}
166
167/*
168 *@@ Codepage2Uni:
169 * converts the given string from codepage-specific
170 * to UTF-8. Used by BSString::assignUtf8 and
171 * others.
172 *
173 *@@changed WarpIN V1.0.18 (2008-09-24) [pr]: added codepage 1208 support @@fixes 1127
174 */
175
176void BSUniCodec::Codepage2Uni(BSUString &ustr, // out: target
177 const char *pcszCP, // in: cp string
178 unsigned long ulLength) // in: length of cp string
179{
180 PCONVERSION pTable = (PCONVERSION)_pCodec;
181
182 // WarpIN V1.0.18
183 if (QueryCodepage() == CP_UTF8)
184 {
185 ustr.assignUtf8(pcszCP);
186 return;
187 }
188
189 XSTRING xstrNew;
190 xstrInit(&xstrNew, ulLength + 1);
191
192 ULONG ul;
193 for (ul = 0;
194 ul < ulLength;
195 ++ul)
196 {
197 unsigned short c = pcszCP[ul];
198
199 if (_fDouble)
200 {
201 // we're using a double-byte codepage:
202 // check if this is a DBCS char
203 if (IsLeadByte(c, _achDBCS))
204 c = (c << 8) | pcszCP[++ul];
205 }
206
207 // convert this char to Unicode, using the current codepage
208 unsigned long ulUni = encChar2Uni(pTable, c);
209
210 if (ulUni == 0xFFFF)
211 // shouldn't happen
212 xstrcatc(&xstrNew, '?');
213 else if (ulUni < 0x80)
214 xstrcatc(&xstrNew, (CHAR)ulUni);
215 else if (ulUni < 0x800)
216 {
217 xstrcatc(&xstrNew, (CHAR)(0xC0 | ulUni>>6));
218 xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni & 0x3F));
219 }
220 else if (ulUni < 0x10000)
221 {
222 xstrcatc(&xstrNew, (CHAR)(0xE0 | ulUni>>12));
223 xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni>>6 & 0x3F));
224 xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni & 0x3F));
225 }
226 /* else if (ulUni < 0x200000)
227 {
228 xstrcatc(&xstrNew, (CHAR)(0xF0 | ulUni>>18));
229 xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni>>12 & 0x3F));
230 xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni>>6 & 0x3F));
231 xstrcatc(&xstrNew, (CHAR)(0x80 | ulUni & 0x3F));
232 } */
233 else
234 {
235 CHAR sz[100];
236 sprintf(sz,
237 "Unsupported Unicode character %u at offset %u in string",
238 ulUni,
239 ul);
240 throw BSExcptBase(sz);
241 }
242 }
243
244 // copy back
245 ustr._take_from(xstrNew);
246}
247
248/*
249 *@@ Uni2Codepage:
250 * converts the given string from UTF-8 to
251 * codepage-specific. Used by BSUString::assignCP
252 * and others.
253 *
254 * Characters that are not supported with the
255 * given codepage are replaced by '?'.
256 *
257 *@@changed WarpIN V1.0.18 (2008-09-24) [pr]: added codepage 1208 support @@fixes 1127
258 *@@changed WarpIN V1.0.25 (2020-09-05) [pr]: added copyright symbol substitution
259 */
260
261void BSUniCodec::Uni2Codepage(BSString &str,
262 const char *pcszUni,
263 unsigned long ulLength)
264{
265 // WarpIN V1.0.18
266 if (QueryCodepage() == CP_UTF8)
267 {
268 str.assign(pcszUni);
269 return;
270 }
271
272 PCONVERSION pTable = (PCONVERSION)_pCodec;
273
274 XSTRING xstrNew;
275 xstrInit(&xstrNew, ulLength + 1);
276
277 const char *pFirst = pcszUni;
278
279 while (*pcszUni)
280 {
281 unsigned long ulUni = encDecodeUTF8(&pcszUni);
282 unsigned short us = encUni2Char(pTable, ulUni);
283 if (us > 0xFF)
284 {
285 if (us == 0xFFFF)
286 if (ulUni == UNI_COPYRIGHT)
287 {
288 xstrcat(&xstrNew, "(C)", 3);
289 continue;
290 }
291 else
292 us = '?';
293 else if (_fDouble)
294 {
295 // we're using a double-byte codepage:
296 // store lead byte first
297 xstrcatc(&xstrNew, (CHAR)(us >> 8));
298 }
299 else
300 {
301 // not double-byte codepage: then we can't handle > 0xFF
302 CHAR sz[100];
303 sprintf(sz,
304 "Unsupported Unicode character %u at offset %u in string",
305 ulUni,
306 pcszUni - pFirst);
307 throw BSExcptBase(sz);
308
309 }
310 }
311 xstrcatc(&xstrNew, (CHAR)us);
312 }
313
314 // copy back
315 str._take_from(xstrNew);
316}
317
Note: See TracBrowser for help on using the repository browser.