1 | /* ***** BEGIN LICENSE BLOCK *****
|
---|
2 | * Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
---|
3 | *
|
---|
4 | * The contents of this file are subject to the Mozilla Public License Version
|
---|
5 | * 1.1 (the "License"); you may not use this file except in compliance with
|
---|
6 | * the License. You may obtain a copy of the License at
|
---|
7 | * http://www.mozilla.org/MPL/
|
---|
8 | *
|
---|
9 | * Software distributed under the License is distributed on an "AS IS" basis,
|
---|
10 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
---|
11 | * for the specific language governing rights and limitations under the
|
---|
12 | * License.
|
---|
13 | *
|
---|
14 | * The Original Code is Mozilla.
|
---|
15 | *
|
---|
16 | * The Initial Developer of the Original Code is
|
---|
17 | * Netscape Communications Corporation.
|
---|
18 | * Portions created by the Initial Developer are Copyright (C) 2002
|
---|
19 | * the Initial Developer. All Rights Reserved.
|
---|
20 | *
|
---|
21 | * Contributor(s):
|
---|
22 | * Darin Fisher <[email protected]>
|
---|
23 | * Brian Stell <[email protected]>
|
---|
24 | * Frank Tang <[email protected]>
|
---|
25 | * Brendan Eich <[email protected]>
|
---|
26 | * Sergei Dolgov <[email protected]>
|
---|
27 | *
|
---|
28 | * Alternatively, the contents of this file may be used under the terms of
|
---|
29 | * either the GNU General Public License Version 2 or later (the "GPL"), or
|
---|
30 | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
---|
31 | * in which case the provisions of the GPL or the LGPL are applicable instead
|
---|
32 | * of those above. If you wish to allow use of your version of this file only
|
---|
33 | * under the terms of either the GPL or the LGPL, and not to allow others to
|
---|
34 | * use your version of this file under the terms of the MPL, indicate your
|
---|
35 | * decision by deleting the provisions above and replace them with the notice
|
---|
36 | * and other provisions required by the GPL or the LGPL. If you do not delete
|
---|
37 | * the provisions above, a recipient may use your version of this file under
|
---|
38 | * the terms of any one of the MPL, the GPL or the LGPL.
|
---|
39 | *
|
---|
40 | * ***** END LICENSE BLOCK ***** */
|
---|
41 |
|
---|
42 | #include "xpcom-private.h"
|
---|
43 |
|
---|
44 | //-----------------------------------------------------------------------------
|
---|
45 | // XP_UNIX
|
---|
46 | //-----------------------------------------------------------------------------
|
---|
47 | #if defined(XP_UNIX)
|
---|
48 |
|
---|
49 | #include <stdlib.h> // mbtowc, wctomb
|
---|
50 | #include <locale.h> // setlocale
|
---|
51 | #include "nscore.h"
|
---|
52 | #include "nsAString.h"
|
---|
53 | #include "nsReadableUtils.h"
|
---|
54 |
|
---|
55 | #include <iprt/assert.h>
|
---|
56 | #include <iprt/errcore.h>
|
---|
57 | #include <iprt/semaphore.h>
|
---|
58 |
|
---|
59 | //
|
---|
60 | // choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
|
---|
61 | // but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
|
---|
62 | // or not (see bug 206811 and
|
---|
63 | // news://news.mozilla.org:119/[email protected]). we now use
|
---|
// iconv for all platforms where nl_types.h and langinfo.h are present
// along with iconv.
|
---|
66 | //
|
---|
67 | #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
|
---|
68 | #define USE_ICONV 1
|
---|
69 | #else
|
---|
70 | #define USE_STDCONV 1
|
---|
71 | #endif
|
---|
72 |
|
---|
73 | static void
|
---|
74 | isolatin1_to_utf16(const char **input, PRUint32 *inputLeft, PRUnichar **output, PRUint32 *outputLeft)
|
---|
75 | {
|
---|
76 | while (*inputLeft && *outputLeft) {
|
---|
77 | **output = (unsigned char) **input;
|
---|
78 | (*input)++;
|
---|
79 | (*inputLeft)--;
|
---|
80 | (*output)++;
|
---|
81 | (*outputLeft)--;
|
---|
82 | }
|
---|
83 | }
|
---|
84 |
|
---|
85 | static void
|
---|
86 | utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, PRUint32 *outputLeft)
|
---|
87 | {
|
---|
88 | while (*inputLeft && *outputLeft) {
|
---|
89 | **output = (unsigned char) **input;
|
---|
90 | (*input)++;
|
---|
91 | (*inputLeft)--;
|
---|
92 | (*output)++;
|
---|
93 | (*outputLeft)--;
|
---|
94 | }
|
---|
95 | }
|
---|
96 |
|
---|
97 | //-----------------------------------------------------------------------------
|
---|
98 | // conversion using iconv
|
---|
99 | //-----------------------------------------------------------------------------
|
---|
100 | #if defined(USE_ICONV)
|
---|
101 | #include <nl_types.h> // CODESET
|
---|
102 | #include <langinfo.h> // nl_langinfo
|
---|
103 | #include <iconv.h> // iconv_open, iconv, iconv_close
|
---|
104 | #include <errno.h>
|
---|
105 |
|
---|
// ICONV_INPUT adapts our |const char **| input pointer to the type the
// local iconv() prototype expects: it is passed through unchanged when
// HAVE_ICONV_WITH_CONST_INPUT is defined, and cast to |char **| otherwise.
// The argument is parenthesized so that the cast applies to the whole
// expression, whatever the caller passes.
#if defined(HAVE_ICONV_WITH_CONST_INPUT)
#define ICONV_INPUT(x) (x)
#else
#define ICONV_INPUT(x) ((char **)(x))
#endif
|
---|
111 |
|
---|
112 | // solaris definitely needs this, but we'll enable it by default
|
---|
113 | // just in case... but we know for sure that iconv(3) in glibc
|
---|
114 | // doesn't need this.
|
---|
115 | #if !defined(__GLIBC__)
|
---|
116 | #define ENABLE_UTF8_FALLBACK_SUPPORT
|
---|
117 | #endif
|
---|
118 |
|
---|
119 | #define INVALID_ICONV_T ((iconv_t) -1)
|
---|
120 |
|
---|
121 | static inline size_t
|
---|
122 | xp_iconv(iconv_t converter,
|
---|
123 | const char **input,
|
---|
124 | size_t *inputLeft,
|
---|
125 | char **output,
|
---|
126 | size_t *outputLeft)
|
---|
127 | {
|
---|
128 | size_t res, outputAvail = outputLeft ? *outputLeft : 0;
|
---|
129 | res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
|
---|
130 | if (res == (size_t) -1) {
|
---|
131 | // on some platforms (e.g., linux) iconv will fail with
|
---|
132 | // E2BIG if it cannot convert _all_ of its input. it'll
|
---|
133 | // still adjust all of the in/out params correctly, so we
|
---|
134 | // can ignore this error. the assumption is that we will
|
---|
135 | // be called again to complete the conversion.
|
---|
136 | if ((errno == E2BIG) && (*outputLeft < outputAvail))
|
---|
137 | res = 0;
|
---|
138 | }
|
---|
139 | return res;
|
---|
140 | }
|
---|
141 |
|
---|
142 | static inline void
|
---|
143 | xp_iconv_reset(iconv_t converter)
|
---|
144 | {
|
---|
145 | // NOTE: the man pages on Solaris claim that you can pass NULL
|
---|
146 | // for all parameter to reset the converter, but beware the
|
---|
147 | // evil Solaris crash if you go down this route >:-)
|
---|
148 |
|
---|
149 | const char *zero_char_in_ptr = NULL;
|
---|
150 | char *zero_char_out_ptr = NULL;
|
---|
151 | size_t zero_size_in = 0,
|
---|
152 | zero_size_out = 0;
|
---|
153 |
|
---|
154 | xp_iconv(converter, &zero_char_in_ptr,
|
---|
155 | &zero_size_in,
|
---|
156 | &zero_char_out_ptr,
|
---|
157 | &zero_size_out);
|
---|
158 | }
|
---|
159 |
|
---|
160 | static inline iconv_t
|
---|
161 | xp_iconv_open(const char **to_list, const char **from_list)
|
---|
162 | {
|
---|
163 | iconv_t res;
|
---|
164 | const char **from_name;
|
---|
165 | const char **to_name;
|
---|
166 |
|
---|
167 | // try all possible combinations to locate a converter.
|
---|
168 | to_name = to_list;
|
---|
169 | while (*to_name) {
|
---|
170 | if (**to_name) {
|
---|
171 | from_name = from_list;
|
---|
172 | while (*from_name) {
|
---|
173 | if (**from_name) {
|
---|
174 | res = iconv_open(*to_name, *from_name);
|
---|
175 | if (res != INVALID_ICONV_T)
|
---|
176 | return res;
|
---|
177 | }
|
---|
178 | from_name++;
|
---|
179 | }
|
---|
180 | }
|
---|
181 | to_name++;
|
---|
182 | }
|
---|
183 |
|
---|
184 | return INVALID_ICONV_T;
|
---|
185 | }
|
---|
186 |
|
---|
187 | /*
|
---|
188 | * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
|
---|
189 | * have to use UTF-16 with iconv(3) on platforms where it's supported.
|
---|
190 | * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
|
---|
191 | * and implementations of iconv(3). On Tru64, it also depends on the environment
|
---|
192 | * variable. To avoid the trouble arising from byte-swapping
|
---|
193 | * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
|
---|
194 | * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
|
---|
195 | * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
|
---|
196 | * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
|
---|
197 | * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
|
---|
198 | * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
|
---|
199 | * can be done other than adding a note in the release notes. (bug 206811)
|
---|
200 | */
|
---|
// Candidate iconv names for UTF-16, in order of preference: the names
// with explicit byte order matching the build's endianness come first,
// followed by the unqualified UTF-16 / UCS-2 spellings.  NULL-terminated.
static const char *UTF_16_NAMES[] = {
#ifdef IS_LITTLE_ENDIAN
    "UTF-16LE",
# ifdef __GLIBC__
    "UNICODELITTLE",
# endif
    "UCS-2LE",
#else
    "UTF-16BE",
# ifdef __GLIBC__
    "UNICODEBIG",
# endif
    "UCS-2BE",
#endif
    "UTF-16",
    "UCS-2",
    "UCS2",
    "UCS_2",
    "ucs-2",
    "ucs2",
    "ucs_2",
    NULL
};
|
---|
224 |
|
---|
#ifdef ENABLE_UTF8_FALLBACK_SUPPORT
// Candidate iconv names for UTF-8, tried in order.  Only needed when the
// two-step (via UTF-8) fallback conversion is compiled in.  NULL-terminated.
static const char *UTF_8_NAMES[] = {
    "UTF-8",
    "UTF8",
    "UTF_8",
    "utf-8",
    "utf8",
    "utf_8",
    NULL
};
#endif
|
---|
236 |
|
---|
// Candidate iconv names for ISO-8859-1, tried in order.  With glibc only
// the canonical spelling is listed; elsewhere a number of common variants
// are tried as well.  NULL-terminated.
static const char *ISO_8859_1_NAMES[] = {
    "ISO-8859-1",
#ifndef __GLIBC__
    "ISO8859-1",
    "ISO88591",
    "ISO_8859_1",
    "ISO8859_1",
    "iso-8859-1",
    "iso8859-1",
    "iso88591",
    "iso_8859_1",
    "iso8859_1",
#endif
    NULL
};
|
---|
252 |
|
---|
253 | class nsNativeCharsetConverter
|
---|
254 | {
|
---|
255 | public:
|
---|
256 | nsNativeCharsetConverter();
|
---|
257 | ~nsNativeCharsetConverter();
|
---|
258 |
|
---|
259 | nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
|
---|
260 | PRUnichar **output, PRUint32 *outputLeft);
|
---|
261 | nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
|
---|
262 | char **output, PRUint32 *outputLeft);
|
---|
263 |
|
---|
264 | static void GlobalInit();
|
---|
265 | static void GlobalShutdown();
|
---|
266 |
|
---|
267 | private:
|
---|
268 | static iconv_t gNativeToUnicode;
|
---|
269 | static iconv_t gUnicodeToNative;
|
---|
270 | #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
|
---|
271 | static iconv_t gNativeToUTF8;
|
---|
272 | static iconv_t gUTF8ToNative;
|
---|
273 | static iconv_t gUnicodeToUTF8;
|
---|
274 | static iconv_t gUTF8ToUnicode;
|
---|
275 | #endif
|
---|
276 | static RTSEMFASTMUTEX gLock;
|
---|
277 | static PRBool gInitialized;
|
---|
278 |
|
---|
279 | static void LazyInit();
|
---|
280 |
|
---|
281 | static void Lock() { if (gLock != NILRTSEMFASTMUTEX) RTSemFastMutexRequest(gLock); }
|
---|
282 | static void Unlock() { if (gLock != NILRTSEMFASTMUTEX) RTSemFastMutexRelease(gLock); }
|
---|
283 | };
|
---|
284 |
|
---|
285 | iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
|
---|
286 | iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
|
---|
287 | #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
|
---|
288 | iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
|
---|
289 | iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
|
---|
290 | iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
|
---|
291 | iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
|
---|
292 | #endif
|
---|
293 | RTSEMFASTMUTEX nsNativeCharsetConverter::gLock = NIL_RTSEMFASTMUTEX;
|
---|
294 | PRBool nsNativeCharsetConverter::gInitialized = PR_FALSE;
|
---|
295 |
|
---|
296 | void
|
---|
297 | nsNativeCharsetConverter::LazyInit()
|
---|
298 | {
|
---|
299 | const char *blank_list[] = { "", NULL };
|
---|
300 | const char **native_charset_list = blank_list;
|
---|
301 | const char *native_charset = nl_langinfo(CODESET);
|
---|
302 | if (native_charset == nsnull) {
|
---|
303 | NS_ERROR("native charset is unknown");
|
---|
304 | // fallback to ISO-8859-1
|
---|
305 | native_charset_list = ISO_8859_1_NAMES;
|
---|
306 | }
|
---|
307 | else
|
---|
308 | native_charset_list[0] = native_charset;
|
---|
309 |
|
---|
310 | gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
|
---|
311 | gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
|
---|
312 |
|
---|
313 | #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
|
---|
314 | if (gNativeToUnicode == INVALID_ICONV_T) {
|
---|
315 | gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
|
---|
316 | gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
|
---|
317 | NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
|
---|
318 | NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
|
---|
319 | }
|
---|
320 | if (gUnicodeToNative == INVALID_ICONV_T) {
|
---|
321 | gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
|
---|
322 | gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
|
---|
323 | NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
|
---|
324 | NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
|
---|
325 | }
|
---|
326 | #else
|
---|
327 | NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
|
---|
328 | NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
|
---|
329 | #endif
|
---|
330 |
|
---|
331 | /*
|
---|
332 | * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
|
---|
333 | * prepend a byte order mark unicode character (BOM, u+FEFF) during
|
---|
334 | * the first use of the iconv converter. The same is the case of
|
---|
335 | * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
|
---|
336 | * However, we use 'UTF-16LE/BE' in both cases, instead so that we
|
---|
337 | * should be safe. But just in case...
|
---|
338 | *
|
---|
339 | * This dummy conversion gets rid of the BOMs and fixes bug 153562.
|
---|
340 | */
|
---|
341 | char dummy_input[1] = { ' ' };
|
---|
342 | char dummy_output[4];
|
---|
343 |
|
---|
344 | if (gNativeToUnicode != INVALID_ICONV_T) {
|
---|
345 | const char *input = dummy_input;
|
---|
346 | size_t input_left = sizeof(dummy_input);
|
---|
347 | char *output = dummy_output;
|
---|
348 | size_t output_left = sizeof(dummy_output);
|
---|
349 |
|
---|
350 | xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
|
---|
351 | }
|
---|
352 | #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
|
---|
353 | if (gUTF8ToUnicode != INVALID_ICONV_T) {
|
---|
354 | const char *input = dummy_input;
|
---|
355 | size_t input_left = sizeof(dummy_input);
|
---|
356 | char *output = dummy_output;
|
---|
357 | size_t output_left = sizeof(dummy_output);
|
---|
358 |
|
---|
359 | xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
|
---|
360 | }
|
---|
361 | #endif
|
---|
362 |
|
---|
363 | gInitialized = PR_TRUE;
|
---|
364 | }
|
---|
365 |
|
---|
366 | void
|
---|
367 | nsNativeCharsetConverter::GlobalInit()
|
---|
368 | {
|
---|
369 | int vrc = RTSemFastMutexCreate(&gLock);
|
---|
370 | NS_ASSERTION(RT_SUCCESS(vrc), "lock creation failed");
|
---|
371 | }
|
---|
372 |
|
---|
373 | void
|
---|
374 | nsNativeCharsetConverter::GlobalShutdown()
|
---|
375 | {
|
---|
376 | if (gLock != NIL_RTSEMFASTMUTEX) {
|
---|
377 | RTSemFastMutexDestroy(gLock);
|
---|
378 | gLock = NIL_RTSEMFASTMUTEX;
|
---|
379 | }
|
---|
380 |
|
---|
381 | if (gNativeToUnicode != INVALID_ICONV_T) {
|
---|
382 | iconv_close(gNativeToUnicode);
|
---|
383 | gNativeToUnicode = INVALID_ICONV_T;
|
---|
384 | }
|
---|
385 |
|
---|
386 | if (gUnicodeToNative != INVALID_ICONV_T) {
|
---|
387 | iconv_close(gUnicodeToNative);
|
---|
388 | gUnicodeToNative = INVALID_ICONV_T;
|
---|
389 | }
|
---|
390 |
|
---|
391 | #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
|
---|
392 | if (gNativeToUTF8 != INVALID_ICONV_T) {
|
---|
393 | iconv_close(gNativeToUTF8);
|
---|
394 | gNativeToUTF8 = INVALID_ICONV_T;
|
---|
395 | }
|
---|
396 | if (gUTF8ToNative != INVALID_ICONV_T) {
|
---|
397 | iconv_close(gUTF8ToNative);
|
---|
398 | gUTF8ToNative = INVALID_ICONV_T;
|
---|
399 | }
|
---|
400 | if (gUnicodeToUTF8 != INVALID_ICONV_T) {
|
---|
401 | iconv_close(gUnicodeToUTF8);
|
---|
402 | gUnicodeToUTF8 = INVALID_ICONV_T;
|
---|
403 | }
|
---|
404 | if (gUTF8ToUnicode != INVALID_ICONV_T) {
|
---|
405 | iconv_close(gUTF8ToUnicode);
|
---|
406 | gUTF8ToUnicode = INVALID_ICONV_T;
|
---|
407 | }
|
---|
408 | #endif
|
---|
409 |
|
---|
410 | gInitialized = PR_FALSE;
|
---|
411 | }
|
---|
412 |
|
---|
413 | nsNativeCharsetConverter::nsNativeCharsetConverter()
|
---|
414 | {
|
---|
415 | Lock();
|
---|
416 | if (!gInitialized)
|
---|
417 | LazyInit();
|
---|
418 | }
|
---|
419 |
|
---|
420 | nsNativeCharsetConverter::~nsNativeCharsetConverter()
|
---|
421 | {
|
---|
422 | // reset converters for next time
|
---|
423 | if (gNativeToUnicode != INVALID_ICONV_T)
|
---|
424 | xp_iconv_reset(gNativeToUnicode);
|
---|
425 | if (gUnicodeToNative != INVALID_ICONV_T)
|
---|
426 | xp_iconv_reset(gUnicodeToNative);
|
---|
427 | #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
|
---|
428 | if (gNativeToUTF8 != INVALID_ICONV_T)
|
---|
429 | xp_iconv_reset(gNativeToUTF8);
|
---|
430 | if (gUTF8ToNative != INVALID_ICONV_T)
|
---|
431 | xp_iconv_reset(gUTF8ToNative);
|
---|
432 | if (gUnicodeToUTF8 != INVALID_ICONV_T)
|
---|
433 | xp_iconv_reset(gUnicodeToUTF8);
|
---|
434 | if (gUTF8ToUnicode != INVALID_ICONV_T)
|
---|
435 | xp_iconv_reset(gUTF8ToUnicode);
|
---|
436 | #endif
|
---|
437 | Unlock();
|
---|
438 | }
|
---|
439 |
|
---|
440 | nsresult
|
---|
441 | nsNativeCharsetConverter::NativeToUnicode(const char **input,
|
---|
442 | PRUint32 *inputLeft,
|
---|
443 | PRUnichar **output,
|
---|
444 | PRUint32 *outputLeft)
|
---|
445 | {
|
---|
446 | size_t res = 0;
|
---|
447 | size_t inLeft = (size_t) *inputLeft;
|
---|
448 | size_t outLeft = (size_t) *outputLeft * 2;
|
---|
449 |
|
---|
450 | if (gNativeToUnicode != INVALID_ICONV_T) {
|
---|
451 |
|
---|
452 | res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
|
---|
453 |
|
---|
454 | *inputLeft = inLeft;
|
---|
455 | *outputLeft = outLeft / 2;
|
---|
456 | if (res != (size_t) -1)
|
---|
457 | return NS_OK;
|
---|
458 |
|
---|
459 | NS_WARNING("conversion from native to utf-16 failed");
|
---|
460 |
|
---|
461 | // reset converter
|
---|
462 | xp_iconv_reset(gNativeToUnicode);
|
---|
463 | }
|
---|
464 | #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
|
---|
465 | else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
|
---|
466 | (gUTF8ToUnicode != INVALID_ICONV_T)) {
|
---|
467 | // convert first to UTF8, then from UTF8 to UCS2
|
---|
468 | const char *in = *input;
|
---|
469 |
|
---|
470 | char ubuf[1024];
|
---|
471 |
|
---|
472 | // we assume we're always called with enough space in |output|,
|
---|
473 | // so convert many chars at a time...
|
---|
474 | while (inLeft) {
|
---|
475 | char *p = ubuf;
|
---|
476 | size_t n = sizeof(ubuf);
|
---|
477 | res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
|
---|
478 | if (res == (size_t) -1) {
|
---|
479 | NS_ERROR("conversion from native to utf-8 failed");
|
---|
480 | break;
|
---|
481 | }
|
---|
482 | NS_ASSERTION(outLeft > 0, "bad assumption");
|
---|
483 | p = ubuf;
|
---|
484 | n = sizeof(ubuf) - n;
|
---|
485 | res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
|
---|
486 | if (res == (size_t) -1) {
|
---|
487 | NS_ERROR("conversion from utf-8 to utf-16 failed");
|
---|
488 | break;
|
---|
489 | }
|
---|
490 | }
|
---|
491 |
|
---|
492 | (*input) += (*inputLeft - inLeft);
|
---|
493 | *inputLeft = inLeft;
|
---|
494 | *outputLeft = outLeft / 2;
|
---|
495 |
|
---|
496 | if (res != (size_t) -1)
|
---|
497 | return NS_OK;
|
---|
498 |
|
---|
499 | // reset converters
|
---|
500 | xp_iconv_reset(gNativeToUTF8);
|
---|
501 | xp_iconv_reset(gUTF8ToUnicode);
|
---|
502 | }
|
---|
503 | #endif
|
---|
504 |
|
---|
505 | // fallback: zero-pad and hope for the best
|
---|
506 | // XXX This is lame and we have to do better.
|
---|
507 | isolatin1_to_utf16(input, inputLeft, output, outputLeft);
|
---|
508 |
|
---|
509 | return NS_OK;
|
---|
510 | }
|
---|
511 |
|
---|
512 | nsresult
|
---|
513 | nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
|
---|
514 | PRUint32 *inputLeft,
|
---|
515 | char **output,
|
---|
516 | PRUint32 *outputLeft)
|
---|
517 | {
|
---|
518 | size_t res = 0;
|
---|
519 | size_t inLeft = (size_t) *inputLeft * 2;
|
---|
520 | size_t outLeft = (size_t) *outputLeft;
|
---|
521 |
|
---|
522 | if (gUnicodeToNative != INVALID_ICONV_T) {
|
---|
523 | res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
|
---|
524 |
|
---|
525 | if (res != (size_t) -1) {
|
---|
526 | *inputLeft = inLeft / 2;
|
---|
527 | *outputLeft = outLeft;
|
---|
528 | return NS_OK;
|
---|
529 | }
|
---|
530 |
|
---|
531 | NS_ERROR("iconv failed");
|
---|
532 |
|
---|
533 | // reset converter
|
---|
534 | xp_iconv_reset(gUnicodeToNative);
|
---|
535 | }
|
---|
536 | #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
|
---|
537 | else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
|
---|
538 | (gUTF8ToNative != INVALID_ICONV_T)) {
|
---|
539 | const char *in = (const char *) *input;
|
---|
540 |
|
---|
541 | char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
|
---|
542 |
|
---|
543 | // convert one uchar at a time...
|
---|
544 | while (inLeft && outLeft) {
|
---|
545 | char *p = ubuf;
|
---|
546 | size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
|
---|
547 | res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
|
---|
548 | if (res == (size_t) -1) {
|
---|
549 | NS_ERROR("conversion from utf-16 to utf-8 failed");
|
---|
550 | break;
|
---|
551 | }
|
---|
552 | p = ubuf;
|
---|
553 | n = sizeof(ubuf) - n;
|
---|
554 | res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
|
---|
555 | if (res == (size_t) -1) {
|
---|
556 | if (errno == E2BIG) {
|
---|
557 | // not enough room for last uchar... back up and return.
|
---|
558 | in -= sizeof(PRUnichar);
|
---|
559 | res = 0;
|
---|
560 | }
|
---|
561 | else
|
---|
562 | NS_ERROR("conversion from utf-8 to native failed");
|
---|
563 | break;
|
---|
564 | }
|
---|
565 | inLeft -= sizeof(PRUnichar);
|
---|
566 | }
|
---|
567 |
|
---|
568 | if (res != (size_t) -1) {
|
---|
569 | (*input) += (*inputLeft - inLeft/2);
|
---|
570 | *inputLeft = inLeft/2;
|
---|
571 | *outputLeft = outLeft;
|
---|
572 | return NS_OK;
|
---|
573 | }
|
---|
574 |
|
---|
575 | // reset converters
|
---|
576 | xp_iconv_reset(gUnicodeToUTF8);
|
---|
577 | xp_iconv_reset(gUTF8ToNative);
|
---|
578 | }
|
---|
579 | #endif
|
---|
580 |
|
---|
581 | // fallback: truncate and hope for the best
|
---|
582 | utf16_to_isolatin1(input, inputLeft, output, outputLeft);
|
---|
583 |
|
---|
584 | return NS_OK;
|
---|
585 | }
|
---|
586 |
|
---|
587 | #endif // USE_ICONV
|
---|
588 |
|
---|
589 | //-----------------------------------------------------------------------------
|
---|
590 | // conversion using mb[r]towc/wc[r]tomb
|
---|
591 | //-----------------------------------------------------------------------------
|
---|
592 | #if defined(USE_STDCONV)
|
---|
593 | #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
|
---|
594 | #include <wchar.h> // mbrtowc, wcrtomb
|
---|
595 | #endif
|
---|
596 |
|
---|
597 | class nsNativeCharsetConverter
|
---|
598 | {
|
---|
599 | public:
|
---|
600 | nsNativeCharsetConverter();
|
---|
601 |
|
---|
602 | nsresult NativeToUnicode(const char **input , PRUint32 *inputLeft,
|
---|
603 | PRUnichar **output, PRUint32 *outputLeft);
|
---|
604 | nsresult UnicodeToNative(const PRUnichar **input , PRUint32 *inputLeft,
|
---|
605 | char **output, PRUint32 *outputLeft);
|
---|
606 |
|
---|
607 | static void GlobalInit();
|
---|
608 | static void GlobalShutdown() { }
|
---|
609 |
|
---|
610 | private:
|
---|
611 | static PRBool gWCharIsUnicode;
|
---|
612 |
|
---|
613 | #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
|
---|
614 | mbstate_t ps;
|
---|
615 | #endif
|
---|
616 | };
|
---|
617 |
|
---|
618 | PRBool nsNativeCharsetConverter::gWCharIsUnicode = PR_FALSE;
|
---|
619 |
|
---|
620 | nsNativeCharsetConverter::nsNativeCharsetConverter()
|
---|
621 | {
|
---|
622 | #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
|
---|
623 | memset(&ps, 0, sizeof(ps));
|
---|
624 | #endif
|
---|
625 | }
|
---|
626 |
|
---|
627 | void
|
---|
628 | nsNativeCharsetConverter::GlobalInit()
|
---|
629 | {
|
---|
630 | // verify that wchar_t for the current locale is actually unicode.
|
---|
631 | // if it is not, then we should avoid calling mbtowc/wctomb and
|
---|
632 | // just fallback on zero-pad/truncation conversion.
|
---|
633 | //
|
---|
634 | // this test cannot be done at build time because the encoding of
|
---|
635 | // wchar_t may depend on the runtime locale. sad, but true!!
|
---|
636 | //
|
---|
637 | // so, if wchar_t is unicode then converting an ASCII character
|
---|
638 | // to wchar_t should not change its numeric value. we'll just
|
---|
639 | // check what happens with the ASCII 'a' character.
|
---|
640 | //
|
---|
641 | // this test is not perfect... obviously, it could yield false
|
---|
642 | // positives, but then at least ASCII text would be converted
|
---|
643 | // properly (or maybe just the 'a' character) -- oh well :(
|
---|
644 |
|
---|
645 | char a = 'a';
|
---|
646 | unsigned int w = 0;
|
---|
647 |
|
---|
648 | int res = mbtowc((wchar_t *) &w, &a, 1);
|
---|
649 |
|
---|
650 | gWCharIsUnicode = (res != -1 && w == 'a');
|
---|
651 |
|
---|
652 | #ifdef DEBUG
|
---|
653 | if (!gWCharIsUnicode)
|
---|
654 | NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
|
---|
655 | #endif
|
---|
656 | }
|
---|
657 |
|
---|
658 | nsresult
|
---|
659 | nsNativeCharsetConverter::NativeToUnicode(const char **input,
|
---|
660 | PRUint32 *inputLeft,
|
---|
661 | PRUnichar **output,
|
---|
662 | PRUint32 *outputLeft)
|
---|
663 | {
|
---|
664 | if (gWCharIsUnicode) {
|
---|
665 | int incr;
|
---|
666 |
|
---|
667 | // cannot use wchar_t here since it may have been redefined (e.g.,
|
---|
668 | // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
|
---|
669 | unsigned int tmp = 0;
|
---|
670 | while (*inputLeft && *outputLeft) {
|
---|
671 | #ifdef HAVE_MBRTOWC
|
---|
672 | incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
|
---|
673 | #else
|
---|
674 | // XXX is this thread-safe?
|
---|
675 | incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
|
---|
676 | #endif
|
---|
677 | if (incr < 0) {
|
---|
678 | NS_WARNING("mbtowc failed: possible charset mismatch");
|
---|
679 | // zero-pad and hope for the best
|
---|
680 | tmp = (unsigned char) **input;
|
---|
681 | incr = 1;
|
---|
682 | }
|
---|
683 | **output = (PRUnichar) tmp;
|
---|
684 | (*input) += incr;
|
---|
685 | (*inputLeft) -= incr;
|
---|
686 | (*output)++;
|
---|
687 | (*outputLeft)--;
|
---|
688 | }
|
---|
689 | }
|
---|
690 | else {
|
---|
691 | // wchar_t isn't unicode, so the best we can do is treat the
|
---|
692 | // input as if it is isolatin1 :(
|
---|
693 | isolatin1_to_utf16(input, inputLeft, output, outputLeft);
|
---|
694 | }
|
---|
695 |
|
---|
696 | return NS_OK;
|
---|
697 | }
|
---|
698 |
|
---|
699 | nsresult
|
---|
700 | nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
|
---|
701 | PRUint32 *inputLeft,
|
---|
702 | char **output,
|
---|
703 | PRUint32 *outputLeft)
|
---|
704 | {
|
---|
705 | if (gWCharIsUnicode) {
|
---|
706 | int incr;
|
---|
707 |
|
---|
708 | /* MB_CUR_MAX better be positive. */
|
---|
709 | while (*inputLeft && *outputLeft >= (PRUint32)MB_CUR_MAX) {
|
---|
710 | #ifdef HAVE_WCRTOMB
|
---|
711 | incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
|
---|
712 | #else
|
---|
713 | // XXX is this thread-safe?
|
---|
714 | incr = (int) wctomb(*output, (wchar_t) **input);
|
---|
715 | #endif
|
---|
716 | if (incr < 0) {
|
---|
717 | NS_WARNING("mbtowc failed: possible charset mismatch");
|
---|
718 | **output = (unsigned char) **input; // truncate
|
---|
719 | incr = 1;
|
---|
720 | }
|
---|
721 | // most likely we're dead anyways if this assertion should fire
|
---|
722 | NS_ASSERTION(PRUint32(incr) <= *outputLeft, "wrote beyond end of string");
|
---|
723 | (*output) += incr;
|
---|
724 | (*outputLeft) -= incr;
|
---|
725 | (*input)++;
|
---|
726 | (*inputLeft)--;
|
---|
727 | }
|
---|
728 | }
|
---|
729 | else {
|
---|
730 | // wchar_t isn't unicode, so the best we can do is treat the
|
---|
731 | // input as if it is isolatin1 :(
|
---|
732 | utf16_to_isolatin1(input, inputLeft, output, outputLeft);
|
---|
733 | }
|
---|
734 |
|
---|
735 | return NS_OK;
|
---|
736 | }
|
---|
737 |
|
---|
738 | #endif // USE_STDCONV
|
---|
739 |
|
---|
740 | //-----------------------------------------------------------------------------
|
---|
741 | // API implementation
|
---|
742 | //-----------------------------------------------------------------------------
|
---|
743 |
|
---|
744 | NS_COM nsresult
|
---|
745 | NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
|
---|
746 | {
|
---|
747 | output.Truncate();
|
---|
748 |
|
---|
749 | PRUint32 inputLen = input.Length();
|
---|
750 |
|
---|
751 | nsACString::const_iterator iter;
|
---|
752 | input.BeginReading(iter);
|
---|
753 |
|
---|
754 | //
|
---|
755 | // OPTIMIZATION: preallocate space for largest possible result; convert
|
---|
756 | // directly into the result buffer to avoid intermediate buffer copy.
|
---|
757 | //
|
---|
758 | // this will generally result in a larger allocation, but that seems
|
---|
759 | // better than an extra buffer copy.
|
---|
760 | //
|
---|
761 | output.SetLength(inputLen);
|
---|
762 | nsAString::iterator out_iter;
|
---|
763 | output.BeginWriting(out_iter);
|
---|
764 |
|
---|
765 | PRUnichar *result = out_iter.get();
|
---|
766 | PRUint32 resultLeft = inputLen;
|
---|
767 |
|
---|
768 | const char *buf = iter.get();
|
---|
769 | PRUint32 bufLeft = inputLen;
|
---|
770 |
|
---|
771 | nsNativeCharsetConverter conv;
|
---|
772 | nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
|
---|
773 | if (NS_SUCCEEDED(rv)) {
|
---|
774 | NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
|
---|
775 | output.SetLength(inputLen - resultLeft);
|
---|
776 | }
|
---|
777 | return rv;
|
---|
778 | }
|
---|
779 |
|
---|
780 | NS_COM nsresult
|
---|
781 | NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
|
---|
782 | {
|
---|
783 | output.Truncate();
|
---|
784 |
|
---|
785 | nsAString::const_iterator iter, end;
|
---|
786 | input.BeginReading(iter);
|
---|
787 | input.EndReading(end);
|
---|
788 |
|
---|
789 | // cannot easily avoid intermediate buffer copy.
|
---|
790 | char temp[4096];
|
---|
791 |
|
---|
792 | nsNativeCharsetConverter conv;
|
---|
793 |
|
---|
794 | const PRUnichar *buf = iter.get();
|
---|
795 | PRUint32 bufLeft = Distance(iter, end);
|
---|
796 | while (bufLeft) {
|
---|
797 | char *p = temp;
|
---|
798 | PRUint32 tempLeft = sizeof(temp);
|
---|
799 |
|
---|
800 | nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
|
---|
801 | if (NS_FAILED(rv)) return rv;
|
---|
802 |
|
---|
803 | if (tempLeft < sizeof(temp))
|
---|
804 | output.Append(temp, sizeof(temp) - tempLeft);
|
---|
805 | }
|
---|
806 | return NS_OK;
|
---|
807 | }
|
---|
808 |
|
---|
809 | void
|
---|
810 | NS_StartupNativeCharsetUtils()
|
---|
811 | {
|
---|
812 | //
|
---|
813 | // need to initialize the locale or else charset conversion will fail.
|
---|
814 | // better not delay this in case some other component alters the locale
|
---|
815 | // settings.
|
---|
816 | //
|
---|
817 | // XXX we assume that we are called early enough that we should
|
---|
818 | // always be the first to care about the locale's charset.
|
---|
819 | //
|
---|
820 | setlocale(LC_CTYPE, "");
|
---|
821 |
|
---|
822 | nsNativeCharsetConverter::GlobalInit();
|
---|
823 | }
|
---|
824 |
|
---|
825 | void
|
---|
826 | NS_ShutdownNativeCharsetUtils()
|
---|
827 | {
|
---|
828 | nsNativeCharsetConverter::GlobalShutdown();
|
---|
829 | }
|
---|
830 |
|
---|
831 | //-----------------------------------------------------------------------------
|
---|
832 | // default : truncate/zeropad
|
---|
833 | //-----------------------------------------------------------------------------
|
---|
834 | #else
|
---|
835 |
|
---|
836 | #include "nsReadableUtils.h"
|
---|
837 |
|
---|
838 | NS_COM nsresult
|
---|
839 | NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
|
---|
840 | {
|
---|
841 | CopyASCIItoUCS2(input, output);
|
---|
842 | return NS_OK;
|
---|
843 | }
|
---|
844 |
|
---|
845 | NS_COM nsresult
|
---|
846 | NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
|
---|
847 | {
|
---|
848 | CopyUCS2toASCII(input, output);
|
---|
849 | return NS_OK;
|
---|
850 | }
|
---|
851 |
|
---|
// Non-XP_UNIX default: there is no global state to set up.
void
NS_StartupNativeCharsetUtils()
{
    // intentionally empty
}
|
---|
856 |
|
---|
// Non-XP_UNIX default: there is no global state to tear down.
void
NS_ShutdownNativeCharsetUtils()
{
    // intentionally empty
}
|
---|
861 |
|
---|
862 | #endif
|
---|
863 |
|
---|