VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 40073

最後變更 在這個檔案從40073是 40073,由 vboxsync 提交於 13 年 前

Runtime/strings: add Utf-8 and Utf-16 sanitising to a white list of characters. Fix a burn.

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Id
檔案大小: 30.8 KB
 
1/* $Id: utf-16.cpp 40073 2012-02-10 22:08:19Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
43{
44 if (pwszString)
45 RTMemTmpFree(pwszString);
46}
47RT_EXPORT_SYMBOL(RTUtf16Free);
48
49
50RTDECL(PRTUTF16) RTUtf16DupTag(PCRTUTF16 pwszString, const char *pszTag)
51{
52 Assert(pwszString);
53 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
54 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb, pszTag);
55 if (pwsz)
56 memcpy(pwsz, pwszString, cb);
57 return pwsz;
58}
59RT_EXPORT_SYMBOL(RTUtf16DupTag);
60
61
62RTDECL(int) RTUtf16DupExTag(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra, const char *pszTag)
63{
64 Assert(pwszString);
65 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
66 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag(cb + cwcExtra * sizeof(RTUTF16), pszTag);
67 if (pwsz)
68 {
69 memcpy(pwsz, pwszString, cb);
70 *ppwszString = pwsz;
71 return VINF_SUCCESS;
72 }
73 return VERR_NO_MEMORY;
74}
75RT_EXPORT_SYMBOL(RTUtf16DupExTag);
76
77
78RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
79{
80 if (!pwszString)
81 return 0;
82
83 PCRTUTF16 pwsz = pwszString;
84 while (*pwsz)
85 pwsz++;
86 return pwsz - pwszString;
87}
88RT_EXPORT_SYMBOL(RTUtf16Len);
89
90
91RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
92{
93 if (pwsz1 == pwsz2)
94 return 0;
95 if (!pwsz1)
96 return -1;
97 if (!pwsz2)
98 return 1;
99
100 for (;;)
101 {
102 register RTUTF16 wcs = *pwsz1;
103 register int iDiff = wcs - *pwsz2;
104 if (iDiff || !wcs)
105 return iDiff;
106 pwsz1++;
107 pwsz2++;
108 }
109}
110RT_EXPORT_SYMBOL(RTUtf16Cmp);
111
112
113RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
114{
115 if (pwsz1 == pwsz2)
116 return 0;
117 if (!pwsz1)
118 return -1;
119 if (!pwsz2)
120 return 1;
121
122 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
123 for (;;)
124 {
125 register RTUTF16 wc1 = *pwsz1;
126 register RTUTF16 wc2 = *pwsz2;
127 register int iDiff = wc1 - wc2;
128 if (iDiff)
129 {
130 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
131 if ( wc1 < 0xd800
132 || wc2 < 0xd800
133 || wc1 > 0xdfff
134 || wc2 > 0xdfff)
135 {
136 /* simple UCS-2 char */
137 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
138 if (iDiff)
139 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
140 }
141 else
142 {
143 /* a damned pair */
144 RTUNICP uc1;
145 RTUNICP uc2;
146 if (wc1 >= 0xdc00)
147 {
148 if (pwsz1Start == pwsz1)
149 return iDiff;
150 uc1 = pwsz1[-1];
151 if (uc1 < 0xd800 || uc1 >= 0xdc00)
152 return iDiff;
153 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
154 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
155 }
156 else
157 {
158 uc1 = *++pwsz1;
159 if (uc1 < 0xdc00 || uc1 >= 0xe000)
160 return iDiff;
161 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
162 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
163 }
164 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
165 if (iDiff)
166 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
167 }
168 if (iDiff)
169 return iDiff;
170 }
171 if (!wc1)
172 return 0;
173 pwsz1++;
174 pwsz2++;
175 }
176}
177RT_EXPORT_SYMBOL(RTUtf16ICmp);
178
179
180RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
181{
182 PRTUTF16 pwc = pwsz;
183 for (;;)
184 {
185 RTUTF16 wc = *pwc;
186 if (!wc)
187 break;
188 if (wc < 0xd800 || wc >= 0xdc00)
189 {
190 RTUNICP ucFolded = RTUniCpToLower(wc);
191 if (ucFolded < 0x10000)
192 *pwc++ = RTUniCpToLower(wc);
193 }
194 else
195 {
196 /* surrogate */
197 RTUTF16 wc2 = pwc[1];
198 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
199 {
200 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
201 RTUNICP ucFolded = RTUniCpToLower(uc);
202 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
203 {
204 uc -= 0x10000;
205 *pwc++ = 0xd800 | (uc >> 10);
206 *pwc++ = 0xdc00 | (uc & 0x3ff);
207 }
208 }
209 else /* invalid encoding. */
210 pwc++;
211 }
212 }
213 return pwsz;
214}
215RT_EXPORT_SYMBOL(RTUtf16ToLower);
216
217
218RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
219{
220 PRTUTF16 pwc = pwsz;
221 for (;;)
222 {
223 RTUTF16 wc = *pwc;
224 if (!wc)
225 break;
226 if (wc < 0xd800 || wc >= 0xdc00)
227 *pwc++ = RTUniCpToUpper(wc);
228 else
229 {
230 /* surrogate */
231 RTUTF16 wc2 = pwc[1];
232 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
233 {
234 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
235 RTUNICP ucFolded = RTUniCpToUpper(uc);
236 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
237 {
238 uc -= 0x10000;
239 *pwc++ = 0xd800 | (uc >> 10);
240 *pwc++ = 0xdc00 | (uc & 0x3ff);
241 }
242 }
243 else /* invalid encoding. */
244 pwc++;
245 }
246 }
247 return pwsz;
248}
249RT_EXPORT_SYMBOL(RTUtf16ToUpper);
250
251
252RTDECL(ssize_t) RTUtf16PurgeComplementSet(PRTUTF16 pwsz, PCRTUNICP puszValidSet, char chReplacement)
253{
254 size_t cReplacements = 0;
255 AssertReturn(chReplacement && (unsigned)chReplacement < 128, -1);
256 /* Validate the encoding. */
257 if (RT_FAILURE(RTUtf16CalcUtf8LenEx(pwsz, RTSTR_MAX, NULL)))
258 return -1;
259 for (;;)
260 {
261 RTUNICP Cp;
262 PCRTUNICP pCp;
263 PRTUTF16 pwszOld = pwsz;
264 RTUtf16GetCpEx((PCRTUTF16 *)&pwsz, &Cp);
265 if (!Cp)
266 break;
267 for (pCp = puszValidSet; ; ++pCp)
268 if (!*pCp || *pCp == Cp)
269 break;
270 if (!*pCp)
271 {
272 for (; pwszOld != pwsz; ++pwszOld)
273 *pwszOld = chReplacement;
274 ++cReplacements;
275 }
276 }
277 return cReplacements;
278}
279RT_EXPORT_SYMBOL(RTUtf16PurgeComplementSet);
280
281
282/**
283 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
284 *
285 * @returns iprt status code.
286 * @param pwsz The UTF-16 string.
287 * @param cwc The max length of the UTF-16 string to consider.
288 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
289 */
290static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
291{
292 int rc = VINF_SUCCESS;
293 size_t cch = 0;
294 while (cwc > 0)
295 {
296 RTUTF16 wc = *pwsz++; cwc--;
297 if (!wc)
298 break;
299 else if (wc < 0xd800 || wc > 0xdfff)
300 {
301 if (wc < 0x80)
302 cch++;
303 else if (wc < 0x800)
304 cch += 2;
305 else if (wc < 0xfffe)
306 cch += 3;
307 else
308 {
309 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
310 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
311 break;
312 }
313 }
314 else
315 {
316 if (wc >= 0xdc00)
317 {
318 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
319 rc = VERR_INVALID_UTF16_ENCODING;
320 break;
321 }
322 if (cwc <= 0)
323 {
324 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
325 rc = VERR_INVALID_UTF16_ENCODING;
326 break;
327 }
328 wc = *pwsz++; cwc--;
329 if (wc < 0xdc00 || wc > 0xdfff)
330 {
331 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
332 rc = VERR_INVALID_UTF16_ENCODING;
333 break;
334 }
335 cch += 4;
336 }
337 }
338
339
340 /* done */
341 *pcch = cch;
342 return rc;
343}
344
345
346/**
347 * Recodes an valid UTF-16 string as UTF-8.
348 *
349 * @returns iprt status code.
350 * @param pwsz The UTF-16 string.
351 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
352 * will stop when cwc or '\\0' is reached.
353 * @param psz Where to store the UTF-8 string.
354 * @param cch The size of the UTF-8 buffer, excluding the terminator.
355 * @param pcch Where to store the number of octets actually encoded.
356 */
357static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
358{
359 unsigned char *pwch = (unsigned char *)psz;
360 int rc = VINF_SUCCESS;
361 while (cwc > 0)
362 {
363 RTUTF16 wc = *pwsz++; cwc--;
364 if (!wc)
365 break;
366 else if (wc < 0xd800 || wc > 0xdfff)
367 {
368 if (wc < 0x80)
369 {
370 if (RT_UNLIKELY(cch < 1))
371 {
372 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
373 rc = VERR_BUFFER_OVERFLOW;
374 break;
375 }
376 cch--;
377 *pwch++ = (unsigned char)wc;
378 }
379 else if (wc < 0x800)
380 {
381 if (RT_UNLIKELY(cch < 2))
382 {
383 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
384 rc = VERR_BUFFER_OVERFLOW;
385 break;
386 }
387 cch -= 2;
388 *pwch++ = 0xc0 | (wc >> 6);
389 *pwch++ = 0x80 | (wc & 0x3f);
390 }
391 else if (wc < 0xfffe)
392 {
393 if (RT_UNLIKELY(cch < 3))
394 {
395 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
396 rc = VERR_BUFFER_OVERFLOW;
397 break;
398 }
399 cch -= 3;
400 *pwch++ = 0xe0 | (wc >> 12);
401 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
402 *pwch++ = 0x80 | (wc & 0x3f);
403 }
404 else
405 {
406 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
407 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
408 break;
409 }
410 }
411 else
412 {
413 if (wc >= 0xdc00)
414 {
415 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
416 rc = VERR_INVALID_UTF16_ENCODING;
417 break;
418 }
419 if (cwc <= 0)
420 {
421 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
422 rc = VERR_INVALID_UTF16_ENCODING;
423 break;
424 }
425 RTUTF16 wc2 = *pwsz++; cwc--;
426 if (wc2 < 0xdc00 || wc2 > 0xdfff)
427 {
428 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
429 rc = VERR_INVALID_UTF16_ENCODING;
430 break;
431 }
432 uint32_t CodePoint = 0x10000
433 + ( ((wc & 0x3ff) << 10)
434 | (wc2 & 0x3ff));
435 if (RT_UNLIKELY(cch < 4))
436 {
437 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
438 rc = VERR_BUFFER_OVERFLOW;
439 break;
440 }
441 cch -= 4;
442 *pwch++ = 0xf0 | (CodePoint >> 18);
443 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
444 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
445 *pwch++ = 0x80 | (CodePoint & 0x3f);
446 }
447 }
448
449 /* done */
450 *pwch = '\0';
451 *pcch = (char *)pwch - psz;
452 return rc;
453}
454
455
456
457RTDECL(int) RTUtf16ToUtf8Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
458{
459 /*
460 * Validate input.
461 */
462 Assert(VALID_PTR(ppszString));
463 Assert(VALID_PTR(pwszString));
464 *ppszString = NULL;
465
466 /*
467 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
468 */
469 size_t cch;
470 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
471 if (RT_SUCCESS(rc))
472 {
473 /*
474 * Allocate buffer and recode it.
475 */
476 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
477 if (pszResult)
478 {
479 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
480 if (RT_SUCCESS(rc))
481 {
482 *ppszString = pszResult;
483 return rc;
484 }
485
486 RTMemFree(pszResult);
487 }
488 else
489 rc = VERR_NO_STR_MEMORY;
490 }
491 return rc;
492}
493RT_EXPORT_SYMBOL(RTUtf16ToUtf8Tag);
494
495
496RTDECL(int) RTUtf16ToUtf8ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
497{
498 /*
499 * Validate input.
500 */
501 Assert(VALID_PTR(pwszString));
502 Assert(VALID_PTR(ppsz));
503 Assert(!pcch || VALID_PTR(pcch));
504
505 /*
506 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
507 */
508 size_t cchResult;
509 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
510 if (RT_SUCCESS(rc))
511 {
512 if (pcch)
513 *pcch = cchResult;
514
515 /*
516 * Check buffer size / Allocate buffer and recode it.
517 */
518 bool fShouldFree;
519 char *pszResult;
520 if (cch > 0 && *ppsz)
521 {
522 fShouldFree = false;
523 if (RT_UNLIKELY(cch <= cchResult))
524 return VERR_BUFFER_OVERFLOW;
525 pszResult = *ppsz;
526 }
527 else
528 {
529 *ppsz = NULL;
530 fShouldFree = true;
531 cch = RT_MAX(cch, cchResult + 1);
532 pszResult = (char *)RTStrAllocTag(cch, pszTag);
533 }
534 if (pszResult)
535 {
536 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
537 if (RT_SUCCESS(rc))
538 {
539 *ppsz = pszResult;
540 return rc;
541 }
542
543 if (fShouldFree)
544 RTStrFree(pszResult);
545 }
546 else
547 rc = VERR_NO_STR_MEMORY;
548 }
549 return rc;
550}
551RT_EXPORT_SYMBOL(RTUtf16ToUtf8ExTag);
552
553
554RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
555{
556 size_t cch;
557 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
558 return RT_SUCCESS(rc) ? cch : 0;
559}
560RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
561
562
563RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
564{
565 size_t cch;
566 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
567 if (pcch)
568 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
569 return rc;
570}
571RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
572
573
574RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
575{
576 const RTUTF16 wc = *pwsz;
577
578 /* simple */
579 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
580 return wc;
581 if (wc < 0xfffe)
582 {
583 /* surrogate pair */
584 if (wc < 0xdc00)
585 {
586 const RTUTF16 wc2 = pwsz[1];
587 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
588 {
589 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
590 return uc;
591 }
592
593 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
594 }
595 else
596 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
597 }
598 else
599 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
600 return RTUNICP_INVALID;
601}
602RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
603
604
605RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
606{
607 const RTUTF16 wc = **ppwsz;
608
609 /* simple */
610 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
611 {
612 (*ppwsz)++;
613 *pCp = wc;
614 return VINF_SUCCESS;
615 }
616
617 int rc;
618 if (wc < 0xfffe)
619 {
620 /* surrogate pair */
621 if (wc < 0xdc00)
622 {
623 const RTUTF16 wc2 = (*ppwsz)[1];
624 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
625 {
626 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
627 *pCp = uc;
628 (*ppwsz) += 2;
629 return VINF_SUCCESS;
630 }
631
632 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
633 }
634 else
635 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
636 rc = VERR_INVALID_UTF16_ENCODING;
637 }
638 else
639 {
640 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
641 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
642 }
643 *pCp = RTUNICP_INVALID;
644 (*ppwsz)++;
645 return rc;
646}
647RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
648
649
650RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
651{
652 /* simple */
653 if ( CodePoint < 0xd800
654 || ( CodePoint > 0xdfff
655 && CodePoint < 0xfffe))
656 {
657 *pwsz++ = (RTUTF16)CodePoint;
658 return pwsz;
659 }
660
661 /* surrogate pair */
662 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
663 {
664 CodePoint -= 0x10000;
665 *pwsz++ = 0xd800 | (CodePoint >> 10);
666 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
667 return pwsz;
668 }
669
670 /* invalid code point. */
671 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
672 *pwsz++ = 0x7f;
673 return pwsz;
674}
675RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
676
677
678/**
679 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
680 *
681 * @returns iprt status code.
682 * @param pwsz The UTF-16 string.
683 * @param cwc The max length of the UTF-16 string to consider.
684 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
685 */
686static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
687{
688 int rc = VINF_SUCCESS;
689 size_t cch = 0;
690 while (cwc > 0)
691 {
692 RTUTF16 wc = *pwsz++; cwc--;
693 if (!wc)
694 break;
695 else if (RT_LIKELY(wc < 0x100))
696 ++cch;
697 else
698 {
699 if (wc < 0xd800 || wc > 0xdfff)
700 {
701 if (wc >= 0xfffe)
702 {
703 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
704 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
705 break;
706 }
707 }
708 else
709 {
710 if (wc >= 0xdc00)
711 {
712 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
713 rc = VERR_INVALID_UTF16_ENCODING;
714 break;
715 }
716 if (cwc <= 0)
717 {
718 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
719 rc = VERR_INVALID_UTF16_ENCODING;
720 break;
721 }
722 wc = *pwsz++; cwc--;
723 if (wc < 0xdc00 || wc > 0xdfff)
724 {
725 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
726 rc = VERR_INVALID_UTF16_ENCODING;
727 break;
728 }
729 }
730
731 rc = VERR_NO_TRANSLATION;
732 break;
733 }
734 }
735
736 /* done */
737 *pcch = cch;
738 return rc;
739}
740
741
742/**
743 * Recodes an valid UTF-16 string as Latin1.
744 *
745 * @returns iprt status code.
746 * @param pwsz The UTF-16 string.
747 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
748 * will stop when cwc or '\\0' is reached.
749 * @param psz Where to store the Latin1 string.
750 * @param cch The size of the Latin1 buffer, excluding the terminator.
751 */
752static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch)
753{
754 unsigned char *pch = (unsigned char *)psz;
755 int rc = VINF_SUCCESS;
756 while (cwc > 0)
757 {
758 RTUTF16 wc = *pwsz++; cwc--;
759 if (!wc)
760 break;
761 if (RT_LIKELY(wc < 0x100))
762 {
763 if (RT_UNLIKELY(cch < 1))
764 {
765 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
766 rc = VERR_BUFFER_OVERFLOW;
767 break;
768 }
769 cch--;
770 *pch++ = (unsigned char)wc;
771 }
772 else
773 {
774 if (wc < 0xd800 || wc > 0xdfff)
775 {
776 if (wc >= 0xfffe)
777 {
778 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
779 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
780 break;
781 }
782 }
783 else
784 {
785 if (wc >= 0xdc00)
786 {
787 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
788 rc = VERR_INVALID_UTF16_ENCODING;
789 break;
790 }
791 if (cwc <= 0)
792 {
793 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
794 rc = VERR_INVALID_UTF16_ENCODING;
795 break;
796 }
797 RTUTF16 wc2 = *pwsz++; cwc--;
798 if (wc2 < 0xdc00 || wc2 > 0xdfff)
799 {
800 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
801 rc = VERR_INVALID_UTF16_ENCODING;
802 break;
803 }
804 }
805
806 rc = VERR_NO_TRANSLATION;
807 break;
808 }
809 }
810
811 /* done */
812 *pch = '\0';
813 return rc;
814}
815
816
817RTDECL(int) RTUtf16ToLatin1Tag(PCRTUTF16 pwszString, char **ppszString, const char *pszTag)
818{
819 /*
820 * Validate input.
821 */
822 Assert(VALID_PTR(ppszString));
823 Assert(VALID_PTR(pwszString));
824 *ppszString = NULL;
825
826 /*
827 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
828 */
829 size_t cch;
830 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
831 if (RT_SUCCESS(rc))
832 {
833 /*
834 * Allocate buffer and recode it.
835 */
836 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
837 if (pszResult)
838 {
839 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch);
840 if (RT_SUCCESS(rc))
841 {
842 *ppszString = pszResult;
843 return rc;
844 }
845
846 RTMemFree(pszResult);
847 }
848 else
849 rc = VERR_NO_STR_MEMORY;
850 }
851 return rc;
852}
853RT_EXPORT_SYMBOL(RTUtf16ToLatin1Tag);
854
855
856RTDECL(int) RTUtf16ToLatin1ExTag(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
857{
858 /*
859 * Validate input.
860 */
861 AssertPtr(pwszString);
862 AssertPtr(ppsz);
863 AssertPtrNull(pcch);
864
865 /*
866 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
867 */
868 size_t cchResult;
869 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
870 if (RT_SUCCESS(rc))
871 {
872 if (pcch)
873 *pcch = cchResult;
874
875 /*
876 * Check buffer size / Allocate buffer and recode it.
877 */
878 bool fShouldFree;
879 char *pszResult;
880 if (cch > 0 && *ppsz)
881 {
882 fShouldFree = false;
883 if (cch <= cchResult)
884 return VERR_BUFFER_OVERFLOW;
885 pszResult = *ppsz;
886 }
887 else
888 {
889 *ppsz = NULL;
890 fShouldFree = true;
891 cch = RT_MAX(cch, cchResult + 1);
892 pszResult = (char *)RTMemAllocTag(cch, pszTag);
893 }
894 if (pszResult)
895 {
896 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1);
897 if (RT_SUCCESS(rc))
898 {
899 *ppsz = pszResult;
900 return rc;
901 }
902
903 if (fShouldFree)
904 RTMemFree(pszResult);
905 }
906 else
907 rc = VERR_NO_STR_MEMORY;
908 }
909 return rc;
910}
911RT_EXPORT_SYMBOL(RTUtf16ToLatin1ExTag);
912
913
914RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
915{
916 size_t cch;
917 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
918 return RT_SUCCESS(rc) ? cch : 0;
919}
920RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
921
922
923RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
924{
925 size_t cch;
926 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
927 if (pcch)
928 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
929 return rc;
930}
931RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
932
933
934/**
935 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
936 * original length, but the function saves us nasty comments to that effect
937 * all over the place.
938 *
939 * @returns IPRT status code.
940 * @param psz Pointer to the Latin1 string.
941 * @param cch The max length of the string. (btw cch = cb)
942 * Use RTSTR_MAX if all of the string is to be examined.s
943 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
944 */
945static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
946{
947 *pcwc = RTStrNLen(psz, cch);
948 return VINF_SUCCESS;
949}
950
951
952/**
953 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
954 * sixteen bits, as Unicode is a superset of Latin1.
955 *
956 * Since we know the input is valid, we do *not* perform length checks.
957 *
958 * @returns iprt status code.
959 * @param psz The Latin1 string to recode.
960 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
961 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
962 * @param pwsz Where to store the UTF-16 string.
963 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
964 */
965static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
966{
967 int rc = VINF_SUCCESS;
968 const unsigned char *puch = (const unsigned char *)psz;
969 PRTUTF16 pwc = pwsz;
970 while (cch-- > 0)
971 {
972 /* read the next char and check for terminator. */
973 const unsigned char uch = *puch;
974 if (!uch)
975 break;
976
977 /* check for output overflow */
978 if (RT_UNLIKELY(cwc < 1))
979 {
980 rc = VERR_BUFFER_OVERFLOW;
981 break;
982 }
983
984 /* expand the code point */
985 *pwc++ = uch;
986 cwc--;
987 puch++;
988 }
989
990 /* done */
991 *pwc = '\0';
992 return rc;
993}
994
995
996RTDECL(int) RTLatin1ToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
997{
998 /*
999 * Validate input.
1000 */
1001 Assert(VALID_PTR(ppwszString));
1002 Assert(VALID_PTR(pszString));
1003 *ppwszString = NULL;
1004
1005 /*
1006 * Validate the input and calculate the length of the UTF-16 string.
1007 */
1008 size_t cwc;
1009 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
1010 if (RT_SUCCESS(rc))
1011 {
1012 /*
1013 * Allocate buffer.
1014 */
1015 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
1016 if (pwsz)
1017 {
1018 /*
1019 * Encode the UTF-16 string.
1020 */
1021 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
1022 if (RT_SUCCESS(rc))
1023 {
1024 *ppwszString = pwsz;
1025 return rc;
1026 }
1027 RTMemFree(pwsz);
1028 }
1029 else
1030 rc = VERR_NO_UTF16_MEMORY;
1031 }
1032 return rc;
1033}
1034RT_EXPORT_SYMBOL(RTLatin1ToUtf16Tag);
1035
1036
1037RTDECL(int) RTLatin1ToUtf16ExTag(const char *pszString, size_t cchString,
1038 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
1039{
1040 /*
1041 * Validate input.
1042 */
1043 Assert(VALID_PTR(pszString));
1044 Assert(VALID_PTR(ppwsz));
1045 Assert(!pcwc || VALID_PTR(pcwc));
1046
1047 /*
1048 * Validate the input and calculate the length of the UTF-16 string.
1049 */
1050 size_t cwcResult;
1051 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
1052 if (RT_SUCCESS(rc))
1053 {
1054 if (pcwc)
1055 *pcwc = cwcResult;
1056
1057 /*
1058 * Check buffer size / Allocate buffer.
1059 */
1060 bool fShouldFree;
1061 PRTUTF16 pwszResult;
1062 if (cwc > 0 && *ppwsz)
1063 {
1064 fShouldFree = false;
1065 if (cwc <= cwcResult)
1066 return VERR_BUFFER_OVERFLOW;
1067 pwszResult = *ppwsz;
1068 }
1069 else
1070 {
1071 *ppwsz = NULL;
1072 fShouldFree = true;
1073 cwc = RT_MAX(cwcResult + 1, cwc);
1074 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
1075 }
1076 if (pwszResult)
1077 {
1078 /*
1079 * Encode the UTF-16 string.
1080 */
1081 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
1082 if (RT_SUCCESS(rc))
1083 {
1084 *ppwsz = pwszResult;
1085 return rc;
1086 }
1087 if (fShouldFree)
1088 RTMemFree(pwszResult);
1089 }
1090 else
1091 rc = VERR_NO_UTF16_MEMORY;
1092 }
1093 return rc;
1094}
1095RT_EXPORT_SYMBOL(RTLatin1ToUtf16ExTag);
1096
1097
1098RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
1099{
1100 size_t cwc;
1101 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
1102 return RT_SUCCESS(rc) ? cwc : 0;
1103}
1104RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
1105
1106
1107RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1108{
1109 size_t cwc;
1110 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
1111 if (pcwc)
1112 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1113 return rc;
1114}
1115RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette