VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 31221

最後變更 在這個檔案從31221是 31221,由 vboxsync 提交於 15 年 前

Runtime/string: more Utf-8 <-> Latin-1 fixes

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Id
檔案大小: 60.1 KB
 
1/* $Id: utf-8.cpp 31221 2010-07-29 16:03:53Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66 unsigned cb;
67 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
68 cb = 2;
69 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
70 cb = 3;
71 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
72 cb = 4;
73 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
74 cb = 5;
75 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
76 cb = 6;
77 else
78 {
79 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
80 return VERR_INVALID_UTF8_ENCODING;
81 }
82
83 /* check length */
84 if (cb > cch)
85 {
86 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
87 return VERR_INVALID_UTF8_ENCODING;
88 }
89
90 /* validate the rest */
91 switch (cb)
92 {
93 case 6:
94 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
95 case 5:
96 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97 case 4:
98 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99 case 3:
100 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101 case 2:
102 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103 break;
104 }
105
106 /* validate the code point. */
107 RTUNICP uc;
108 switch (cb)
109 {
110 case 6:
111 uc = (puch[5] & 0x3f)
112 | ((RTUNICP)(puch[4] & 0x3f) << 6)
113 | ((RTUNICP)(puch[3] & 0x3f) << 12)
114 | ((RTUNICP)(puch[2] & 0x3f) << 18)
115 | ((RTUNICP)(puch[1] & 0x3f) << 24)
116 | ((RTUNICP)(uch & 0x01) << 30);
117 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
118 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
119 break;
120 case 5:
121 uc = (puch[4] & 0x3f)
122 | ((RTUNICP)(puch[3] & 0x3f) << 6)
123 | ((RTUNICP)(puch[2] & 0x3f) << 12)
124 | ((RTUNICP)(puch[1] & 0x3f) << 18)
125 | ((RTUNICP)(uch & 0x03) << 24);
126 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
127 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
128 break;
129 case 4:
130 uc = (puch[3] & 0x3f)
131 | ((RTUNICP)(puch[2] & 0x3f) << 6)
132 | ((RTUNICP)(puch[1] & 0x3f) << 12)
133 | ((RTUNICP)(uch & 0x07) << 18);
134 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
135 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136 break;
137 case 3:
138 uc = (puch[2] & 0x3f)
139 | ((RTUNICP)(puch[1] & 0x3f) << 6)
140 | ((RTUNICP)(uch & 0x0f) << 12);
141 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
142 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
143 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
144 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
145 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
146 break;
147 case 2:
148 uc = (puch[1] & 0x3f)
149 | ((RTUNICP)(uch & 0x1f) << 6);
150 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
151 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
152 break;
153 }
154
155 /* advance */
156 cch -= cb;
157 puch += cb;
158 }
159 else
160 {
161 /* one ASCII byte */
162 puch++;
163 cch--;
164 }
165 cCodePoints++;
166 }
167
168 /* done */
169 *pcuc = cCodePoints;
170 if (pcchActual)
171 *pcchActual = puch - (unsigned char const *)psz;
172 return VINF_SUCCESS;
173}
174
175
176/**
177 * Decodes and UTF-8 string into an array of unicode code point.
178 *
179 * Since we know the input is valid, we do *not* perform encoding or length checks.
180 *
181 * @returns iprt status code.
182 * @param psz The UTF-8 string to recode. This is a valid encoding.
183 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
184 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
185 * @param paCps Where to store the code points array.
186 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
187 */
188static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
189{
190 int rc = VINF_SUCCESS;
191 const unsigned char *puch = (const unsigned char *)psz;
192 PRTUNICP pCp = paCps;
193 while (cch > 0)
194 {
195 /* read the next char and check for terminator. */
196 const unsigned char uch = *puch;
197 if (!uch)
198 break;
199
200 /* check for output overflow */
201 if (RT_UNLIKELY(cCps < 1))
202 {
203 rc = VERR_BUFFER_OVERFLOW;
204 break;
205 }
206 cCps--;
207
208 /* decode and recode the code point */
209 if (!(uch & RT_BIT(7)))
210 {
211 *pCp++ = uch;
212 puch++;
213 cch--;
214 }
215#ifdef RT_STRICT
216 else if (!(uch & RT_BIT(6)))
217 AssertMsgFailed(("Internal error!\n"));
218#endif
219 else if (!(uch & RT_BIT(5)))
220 {
221 *pCp++ = (puch[1] & 0x3f)
222 | ((uint16_t)(uch & 0x1f) << 6);
223 puch += 2;
224 cch -= 2;
225 }
226 else if (!(uch & RT_BIT(4)))
227 {
228 *pCp++ = (puch[2] & 0x3f)
229 | ((uint16_t)(puch[1] & 0x3f) << 6)
230 | ((uint16_t)(uch & 0x0f) << 12);
231 puch += 3;
232 cch -= 3;
233 }
234 else if (!(uch & RT_BIT(3)))
235 {
236 *pCp++ = (puch[3] & 0x3f)
237 | ((RTUNICP)(puch[2] & 0x3f) << 6)
238 | ((RTUNICP)(puch[1] & 0x3f) << 12)
239 | ((RTUNICP)(uch & 0x07) << 18);
240 puch += 4;
241 cch -= 4;
242 }
243 else if (!(uch & RT_BIT(2)))
244 {
245 *pCp++ = (puch[4] & 0x3f)
246 | ((RTUNICP)(puch[3] & 0x3f) << 6)
247 | ((RTUNICP)(puch[2] & 0x3f) << 12)
248 | ((RTUNICP)(puch[1] & 0x3f) << 18)
249 | ((RTUNICP)(uch & 0x03) << 24);
250 puch += 5;
251 cch -= 6;
252 }
253 else
254 {
255 Assert(!(uch & RT_BIT(1)));
256 *pCp++ = (puch[5] & 0x3f)
257 | ((RTUNICP)(puch[4] & 0x3f) << 6)
258 | ((RTUNICP)(puch[3] & 0x3f) << 12)
259 | ((RTUNICP)(puch[2] & 0x3f) << 18)
260 | ((RTUNICP)(puch[1] & 0x3f) << 24)
261 | ((RTUNICP)(uch & 0x01) << 30);
262 puch += 6;
263 cch -= 6;
264 }
265 }
266
267 /* done */
268 *pCp = 0;
269 return rc;
270}
271
272
273RTDECL(size_t) RTStrUniLen(const char *psz)
274{
275 size_t cCodePoints;
276 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
277 return RT_SUCCESS(rc) ? cCodePoints : 0;
278}
279RT_EXPORT_SYMBOL(RTStrUniLen);
280
281
282RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
283{
284 size_t cCodePoints;
285 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
286 if (pcCps)
287 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
288 return rc;
289}
290RT_EXPORT_SYMBOL(RTStrUniLenEx);
291
292
293RTDECL(int) RTStrValidateEncoding(const char *psz)
294{
295 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
296}
297RT_EXPORT_SYMBOL(RTStrValidateEncoding);
298
299
300RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
301{
302 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
303 AssertPtr(psz);
304
305 /*
306 * Use rtUtf8Length for the job.
307 */
308 size_t cchActual;
309 size_t cCpsIgnored;
310 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
311 if (RT_SUCCESS(rc))
312 {
313 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
314 && cchActual >= cch)
315 rc = VERR_BUFFER_OVERFLOW;
316 }
317 return rc;
318}
319RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
320
321
322RTDECL(bool) RTStrIsValidEncoding(const char *psz)
323{
324 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
325 return RT_SUCCESS(rc);
326}
327RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
328
329
330RTDECL(size_t) RTStrPurgeEncoding(char *psz)
331{
332 size_t cErrors = 0;
333 for (;;)
334 {
335 RTUNICP Cp;
336 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
337 if (RT_SUCCESS(rc))
338 {
339 if (!Cp)
340 break;
341 }
342 else
343 {
344 psz[-1] = '?';
345 cErrors++;
346 }
347 }
348 return cErrors;
349}
350RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
351
352
353RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
354{
355 /*
356 * Validate input.
357 */
358 Assert(VALID_PTR(pszString));
359 Assert(VALID_PTR(ppaCps));
360 *ppaCps = NULL;
361
362 /*
363 * Validate the UTF-8 input and count its code points.
364 */
365 size_t cCps;
366 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
367 if (RT_SUCCESS(rc))
368 {
369 /*
370 * Allocate buffer.
371 */
372 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
373 if (paCps)
374 {
375 /*
376 * Decode the string.
377 */
378 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
379 if (RT_SUCCESS(rc))
380 {
381 *ppaCps = paCps;
382 return rc;
383 }
384 RTMemFree(paCps);
385 }
386 else
387 rc = VERR_NO_CODE_POINT_MEMORY;
388 }
389 return rc;
390}
391RT_EXPORT_SYMBOL(RTStrToUni);
392
393
394RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
395{
396 /*
397 * Validate input.
398 */
399 Assert(VALID_PTR(pszString));
400 Assert(VALID_PTR(ppaCps));
401 Assert(!pcCps || VALID_PTR(pcCps));
402
403 /*
404 * Validate the UTF-8 input and count the code points.
405 */
406 size_t cCpsResult;
407 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
408 if (RT_SUCCESS(rc))
409 {
410 if (pcCps)
411 *pcCps = cCpsResult;
412
413 /*
414 * Check buffer size / Allocate buffer.
415 */
416 bool fShouldFree;
417 PRTUNICP paCpsResult;
418 if (cCps > 0 && *ppaCps)
419 {
420 fShouldFree = false;
421 if (cCps <= cCpsResult)
422 return VERR_BUFFER_OVERFLOW;
423 paCpsResult = *ppaCps;
424 }
425 else
426 {
427 *ppaCps = NULL;
428 fShouldFree = true;
429 cCps = RT_MAX(cCpsResult + 1, cCps);
430 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
431 }
432 if (paCpsResult)
433 {
434 /*
435 * Encode the UTF-16 string.
436 */
437 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
438 if (RT_SUCCESS(rc))
439 {
440 *ppaCps = paCpsResult;
441 return rc;
442 }
443 if (fShouldFree)
444 RTMemFree(paCpsResult);
445 }
446 else
447 rc = VERR_NO_CODE_POINT_MEMORY;
448 }
449 return rc;
450}
451RT_EXPORT_SYMBOL(RTStrToUniEx);
452
453
454/**
455 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
456 *
457 * @returns IPRT status code.
458 * @param psz Pointer to the UTF-8 string.
459 * @param cch The max length of the string. (btw cch = cb)
460 * Use RTSTR_MAX if all of the string is to be examined.
461 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
462 */
463static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
464{
465 const unsigned char *puch = (const unsigned char *)psz;
466 size_t cwc = 0;
467 while (cch > 0)
468 {
469 const unsigned char uch = *puch;
470 if (!uch)
471 break;
472 if (!(uch & RT_BIT(7)))
473 {
474 /* one ASCII byte */
475 cwc++;
476 puch++;
477 cch--;
478 }
479 else
480 {
481 /* figure sequence length and validate the first byte */
482 unsigned cb;
483 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
484 cb = 2;
485 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
486 cb = 3;
487 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
488 cb = 4;
489 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
490 cb = 5;
491 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
492 cb = 6;
493 else
494 {
495 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
496 return VERR_INVALID_UTF8_ENCODING;
497 }
498
499 /* check length */
500 if (cb > cch)
501 {
502 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
503 return VERR_INVALID_UTF8_ENCODING;
504 }
505
506 /* validate the rest */
507 switch (cb)
508 {
509 case 6:
510 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
511 case 5:
512 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
513 case 4:
514 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
515 case 3:
516 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
517 case 2:
518 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
519 break;
520 }
521
522 /* validate the code point. */
523 RTUNICP uc;
524 switch (cb)
525 {
526 case 6:
527 uc = (puch[5] & 0x3f)
528 | ((RTUNICP)(puch[4] & 0x3f) << 6)
529 | ((RTUNICP)(puch[3] & 0x3f) << 12)
530 | ((RTUNICP)(puch[2] & 0x3f) << 18)
531 | ((RTUNICP)(puch[1] & 0x3f) << 24)
532 | ((RTUNICP)(uch & 0x01) << 30);
533 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
534 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
535 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
536 return VERR_CANT_RECODE_AS_UTF16;
537 case 5:
538 uc = (puch[4] & 0x3f)
539 | ((RTUNICP)(puch[3] & 0x3f) << 6)
540 | ((RTUNICP)(puch[2] & 0x3f) << 12)
541 | ((RTUNICP)(puch[1] & 0x3f) << 18)
542 | ((RTUNICP)(uch & 0x03) << 24);
543 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
544 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
545 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
546 return VERR_CANT_RECODE_AS_UTF16;
547 case 4:
548 uc = (puch[3] & 0x3f)
549 | ((RTUNICP)(puch[2] & 0x3f) << 6)
550 | ((RTUNICP)(puch[1] & 0x3f) << 12)
551 | ((RTUNICP)(uch & 0x07) << 18);
552 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
553 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
554 RTStrAssertMsgReturn(uc <= 0x0010ffff,
555 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
556 cwc++;
557 break;
558 case 3:
559 uc = (puch[2] & 0x3f)
560 | ((RTUNICP)(puch[1] & 0x3f) << 6)
561 | ((RTUNICP)(uch & 0x0f) << 12);
562 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
563 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
564 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
565 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
566 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
567 break;
568 case 2:
569 uc = (puch[1] & 0x3f)
570 | ((RTUNICP)(uch & 0x1f) << 6);
571 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
572 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
573 break;
574 }
575
576 /* advance */
577 cch -= cb;
578 puch += cb;
579 cwc++;
580 }
581 }
582
583 /* done */
584 *pcwc = cwc;
585 return VINF_SUCCESS;
586}
587
588
589/**
590 * Recodes a valid UTF-8 string as UTF-16.
591 *
592 * Since we know the input is valid, we do *not* perform encoding or length checks.
593 *
594 * @returns iprt status code.
595 * @param psz The UTF-8 string to recode. This is a valid encoding.
596 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
597 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
598 * @param pwsz Where to store the UTF-16 string.
599 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
600 */
601static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
602{
603 int rc = VINF_SUCCESS;
604 const unsigned char *puch = (const unsigned char *)psz;
605 PRTUTF16 pwc = pwsz;
606 while (cch > 0)
607 {
608 /* read the next char and check for terminator. */
609 const unsigned char uch = *puch;
610 if (!uch)
611 break;
612
613 /* check for output overflow */
614 if (RT_UNLIKELY(cwc < 1))
615 {
616 rc = VERR_BUFFER_OVERFLOW;
617 break;
618 }
619 cwc--;
620
621 /* decode and recode the code point */
622 if (!(uch & RT_BIT(7)))
623 {
624 *pwc++ = uch;
625 puch++;
626 cch--;
627 }
628 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
629 {
630 uint16_t uc = (puch[1] & 0x3f)
631 | ((uint16_t)(uch & 0x1f) << 6);
632 *pwc++ = uc;
633 puch += 2;
634 cch -= 2;
635 }
636 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
637 {
638 uint16_t uc = (puch[2] & 0x3f)
639 | ((uint16_t)(puch[1] & 0x3f) << 6)
640 | ((uint16_t)(uch & 0x0f) << 12);
641 *pwc++ = uc;
642 puch += 3;
643 cch -= 3;
644 }
645 else
646 {
647 /* generate surrugate pair */
648 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
649 RTUNICP uc = (puch[3] & 0x3f)
650 | ((RTUNICP)(puch[2] & 0x3f) << 6)
651 | ((RTUNICP)(puch[1] & 0x3f) << 12)
652 | ((RTUNICP)(uch & 0x07) << 18);
653 if (RT_UNLIKELY(cwc < 1))
654 {
655 rc = VERR_BUFFER_OVERFLOW;
656 break;
657 }
658 cwc--;
659
660 uc -= 0x10000;
661 *pwc++ = 0xd800 | (uc >> 10);
662 *pwc++ = 0xdc00 | (uc & 0x3ff);
663 puch += 4;
664 cch -= 4;
665 }
666 }
667
668 /* done */
669 *pwc = '\0';
670 return rc;
671}
672
673
674RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
675{
676 /*
677 * Validate input.
678 */
679 Assert(VALID_PTR(ppwszString));
680 Assert(VALID_PTR(pszString));
681 *ppwszString = NULL;
682
683 /*
684 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
685 */
686 size_t cwc;
687 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
688 if (RT_SUCCESS(rc))
689 {
690 /*
691 * Allocate buffer.
692 */
693 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
694 if (pwsz)
695 {
696 /*
697 * Encode the UTF-16 string.
698 */
699 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
700 if (RT_SUCCESS(rc))
701 {
702 *ppwszString = pwsz;
703 return rc;
704 }
705 RTMemFree(pwsz);
706 }
707 else
708 rc = VERR_NO_UTF16_MEMORY;
709 }
710 return rc;
711}
712RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
713
714
715RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
716 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
717{
718 /*
719 * Validate input.
720 */
721 Assert(VALID_PTR(pszString));
722 Assert(VALID_PTR(ppwsz));
723 Assert(!pcwc || VALID_PTR(pcwc));
724
725 /*
726 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
727 */
728 size_t cwcResult;
729 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
730 if (RT_SUCCESS(rc))
731 {
732 if (pcwc)
733 *pcwc = cwcResult;
734
735 /*
736 * Check buffer size / Allocate buffer.
737 */
738 bool fShouldFree;
739 PRTUTF16 pwszResult;
740 if (cwc > 0 && *ppwsz)
741 {
742 fShouldFree = false;
743 if (cwc <= cwcResult)
744 return VERR_BUFFER_OVERFLOW;
745 pwszResult = *ppwsz;
746 }
747 else
748 {
749 *ppwsz = NULL;
750 fShouldFree = true;
751 cwc = RT_MAX(cwcResult + 1, cwc);
752 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
753 }
754 if (pwszResult)
755 {
756 /*
757 * Encode the UTF-16 string.
758 */
759 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
760 if (RT_SUCCESS(rc))
761 {
762 *ppwsz = pwszResult;
763 return rc;
764 }
765 if (fShouldFree)
766 RTMemFree(pwszResult);
767 }
768 else
769 rc = VERR_NO_UTF16_MEMORY;
770 }
771 return rc;
772}
773RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
774
775
776RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
777{
778 size_t cwc;
779 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
780 return RT_SUCCESS(rc) ? cwc : 0;
781}
782RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
783
784
785RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
786{
787 size_t cwc;
788 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
789 if (pcwc)
790 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
791 return rc;
792}
793RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
794
795
796/**
797 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
798 *
799 * @returns iprt status code.
800 * @param psz The Latin-1 string.
801 * @param cchIn The max length of the Latin-1 string to consider.
802 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
803 */
804static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
805{
806 size_t cch = 0;
807 while (cchIn > 0)
808 {
809 char ch = *psz++; cchIn--;
810 if (!ch)
811 break;
812 if (!(ch & 0x80))
813 cch++;
814 else
815 cch += 2;
816 }
817
818
819 /* done */
820 *pcch = cch;
821 return VINF_SUCCESS;
822}
823
824
825/**
826 * Recodes a Latin-1 string as UTF-8.
827 *
828 * @returns iprt status code.
829 * @param psz The Latin-1 string.
830 * @param cchIn The number of characters to process from psz. The recoding
831 * will stop when cch or '\\0' is reached.
832 * @param psz Where to store the UTF-8 string.
833 * @param cch The size of the UTF-8 buffer, excluding the terminator.
834 * @param pcch Where to store the number of octets actually encoded.
835 */
836static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch, size_t *pcch)
837{
838 unsigned char *puch = (unsigned char *)psz;
839 int rc = VINF_SUCCESS;
840 while (cchIn > 0)
841 {
842 unsigned char ch = (unsigned char) *pszIn++; cchIn--;
843 if (!ch)
844 break;
845 if (!(ch & 0x80))
846 {
847 if (RT_UNLIKELY(cch < 1))
848 {
849 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
850 rc = VERR_BUFFER_OVERFLOW;
851 break;
852 }
853 cch--;
854 *puch++ = (unsigned char)ch;
855 }
856 else
857 {
858 if (RT_UNLIKELY(cch < 2))
859 {
860 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
861 rc = VERR_BUFFER_OVERFLOW;
862 break;
863 }
864 cch -= 2;
865 *puch++ = 0xc0 | (ch >> 6);
866 *puch++ = 0x80 | (ch & 0x3f);
867 }
868 }
869
870 /* done */
871 *puch = '\0';
872 *pcch = (char *)puch - psz;
873 return rc;
874}
875
876
877
878RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
879{
880 /*
881 * Validate input.
882 */
883 Assert(VALID_PTR(ppszString));
884 Assert(VALID_PTR(pszString));
885 *ppszString = NULL;
886
887 /*
888 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
889 */
890 size_t cch;
891 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
892 if (RT_SUCCESS(rc))
893 {
894 /*
895 * Allocate buffer and recode it.
896 */
897 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
898 if (pszResult)
899 {
900 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch, &cch);
901 if (RT_SUCCESS(rc))
902 {
903 *ppszString = pszResult;
904 return rc;
905 }
906
907 RTMemFree(pszResult);
908 }
909 else
910 rc = VERR_NO_STR_MEMORY;
911 }
912 return rc;
913}
914RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
915
916
917RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
918{
919 /*
920 * Validate input.
921 */
922 Assert(VALID_PTR(pszString));
923 Assert(VALID_PTR(ppsz));
924 Assert(!pcch || VALID_PTR(pcch));
925
926 /*
927 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
928 */
929 size_t cchResult;
930 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
931 if (RT_SUCCESS(rc))
932 {
933 if (pcch)
934 *pcch = cchResult;
935
936 /*
937 * Check buffer size / Allocate buffer and recode it.
938 */
939 bool fShouldFree;
940 char *pszResult;
941 if (cch > 0 && *ppsz)
942 {
943 fShouldFree = false;
944 if (RT_UNLIKELY(cch <= cchResult))
945 return VERR_BUFFER_OVERFLOW;
946 pszResult = *ppsz;
947 }
948 else
949 {
950 *ppsz = NULL;
951 fShouldFree = true;
952 cch = RT_MAX(cch, cchResult + 1);
953 pszResult = (char *)RTStrAllocTag(cch, pszTag);
954 }
955 if (pszResult)
956 {
957 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1, &cch);
958 if (RT_SUCCESS(rc))
959 {
960 *ppsz = pszResult;
961 return rc;
962 }
963
964 if (fShouldFree)
965 RTStrFree(pszResult);
966 }
967 else
968 rc = VERR_NO_STR_MEMORY;
969 }
970 return rc;
971}
972RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
973
974
975RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
976{
977 size_t cch;
978 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
979 return RT_SUCCESS(rc) ? cch : 0;
980}
981RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
982
983
984RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
985{
986 size_t cch;
987 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
988 if (pcch)
989 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
990 return rc;
991}
992RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
993
994
995/**
996 * Calculates the Latin-1 length of a string, validating the encoding while doing so.
997 *
998 * @returns IPRT status code.
999 * @param psz Pointer to the UTF-8 string.
1000 * @param cch The max length of the string. (btw cch = cb)
1001 * Use RTSTR_MAX if all of the string is to be examined.
1002 * @param pcch Where to store the length of the Latin-1 string in bytes.
1003 */
1004static int rtUtf8CalcLatin1Length(const char *psz, size_t cch, size_t *pcch)
1005{
1006 const unsigned char *puch = (const unsigned char *)psz;
1007 size_t cchOut = 0;
1008 while (cch > 0)
1009 {
1010 const unsigned char uch = *puch;
1011 if (!uch)
1012 break;
1013 if (!(uch & RT_BIT(7)))
1014 {
1015 /* one ASCII byte */
1016 cchOut++;
1017 puch++;
1018 cch--;
1019 }
1020 else
1021 {
1022 /* figure sequence length and validate the first byte */
1023 unsigned cb;
1024 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
1025 cb = 2;
1026 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
1027 cb = 3;
1028 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
1029 cb = 4;
1030 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
1031 cb = 5;
1032 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
1033 cb = 6;
1034 else
1035 {
1036 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
1037 return VERR_INVALID_UTF8_ENCODING;
1038 }
1039
1040 /* check length */
1041 if (cb > cch)
1042 {
1043 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
1044 return VERR_INVALID_UTF8_ENCODING;
1045 }
1046
1047 /* validate the rest */
1048 switch (cb)
1049 {
1050 case 6:
1051 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1052 case 5:
1053 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1054 case 4:
1055 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1056 case 3:
1057 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1058 case 2:
1059 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1060 break;
1061 }
1062
1063 /* validate the code point. */
1064 RTUNICP uc;
1065 switch (cb)
1066 {
1067 case 6:
1068 uc = (puch[5] & 0x3f)
1069 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1070 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1071 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1072 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1073 | ((RTUNICP)(uch & 0x01) << 30);
1074 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1075 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1076 break;
1077 case 5:
1078 uc = (puch[4] & 0x3f)
1079 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1080 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1081 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1082 | ((RTUNICP)(uch & 0x03) << 24);
1083 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1084 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1085 break;
1086 case 4:
1087 uc = (puch[3] & 0x3f)
1088 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1089 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1090 | ((RTUNICP)(uch & 0x07) << 18);
1091 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1092 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1093 break;
1094 case 3:
1095 uc = (puch[2] & 0x3f)
1096 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1097 | ((RTUNICP)(uch & 0x0f) << 12);
1098 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1099 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
1100 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
1101 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1102 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
1103 break;
1104 case 2:
1105 uc = (puch[1] & 0x3f)
1106 | ((RTUNICP)(uch & 0x1f) << 6);
1107 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1108 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1109 break;
1110 }
1111
1112 /* does this code point have a Latin-1 translation? */
1113 if (cb > 2 || uch > 0xC3)
1114 return VERR_NO_TRANSLATION;
1115
1116 /* advance */
1117 cch -= cb;
1118 puch += cb;
1119 cchOut++;
1120 }
1121 }
1122
1123 /* done */
1124 *pcch = cchOut;
1125 return VINF_SUCCESS;
1126}
1127
1128
1129/**
1130 * Recodes a valid UTF-8 string as Latin-1.
1131 *
1132 * Since we know the input is valid, we do *not* perform encoding or length checks.
1133 *
1134 * @returns iprt status code.
1135 * @param psz The UTF-8 string to recode. This is a valid encoding.
1136 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1137 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1138 * @param pszOut Where to store the Latin-1 string.
1139 * @param cchOut The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1140 */
1141static int rtUtf8RecodeAsLatin1(const char *psz, size_t cch, char *pszOut, size_t cchOut)
1142{
1143 int rc = VINF_SUCCESS;
1144 const unsigned char *puch = (const unsigned char *)psz;
1145 unsigned char *puchOut = (unsigned char *)pszOut;
1146 while (cch > 0)
1147 {
1148 /* read the next char and check for terminator. */
1149 const unsigned char uch = *puch;
1150 if (!uch)
1151 break;
1152
1153 /* check for output overflow */
1154 if (RT_UNLIKELY(cchOut < 1))
1155 {
1156 rc = VERR_BUFFER_OVERFLOW;
1157 break;
1158 }
1159 cchOut--;
1160
1161 /* decode and recode the code point */
1162 if (!(uch & RT_BIT(7)))
1163 {
1164 *puchOut++ = uch;
1165 puch++;
1166 cch--;
1167 }
1168 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
1169 {
1170 uint16_t uc = (puch[1] & 0x3f)
1171 | ((uint16_t)(uch & 0x1f) << 6);
1172 if (uc >= 0x100)
1173 {
1174 rc = VERR_NO_TRANSLATION;
1175 break;
1176 }
1177 *puchOut++ = uc;
1178 puch += 2;
1179 cch -= 2;
1180 }
1181 else
1182 {
1183 rc = VERR_NO_TRANSLATION;
1184 break;
1185 }
1186 }
1187
1188 /* done */
1189 *puchOut = '\0';
1190 return rc;
1191}
1192
1193
1194RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1195{
1196 /*
1197 * Validate input.
1198 */
1199 Assert(VALID_PTR(ppszString));
1200 Assert(VALID_PTR(pszString));
1201 *ppszString = NULL;
1202
1203 /*
1204 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1205 */
1206 size_t cch;
1207 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1208 if (RT_SUCCESS(rc))
1209 {
1210 /*
1211 * Allocate buffer.
1212 */
1213 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1214 if (psz)
1215 {
1216 /*
1217 * Encode the UTF-16 string.
1218 */
1219 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1220 if (RT_SUCCESS(rc))
1221 {
1222 *ppszString = psz;
1223 return rc;
1224 }
1225 RTMemFree(psz);
1226 }
1227 else
1228 rc = VERR_NO_STR_MEMORY;
1229 }
1230 return rc;
1231}
1232RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1233
1234
1235RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1236 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1237{
1238 /*
1239 * Validate input.
1240 */
1241 Assert(VALID_PTR(pszString));
1242 Assert(VALID_PTR(ppsz));
1243 Assert(!pcch || VALID_PTR(pcch));
1244
1245 /*
1246 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1247 */
1248 size_t cchResult;
1249 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1250 if (RT_SUCCESS(rc))
1251 {
1252 if (pcch)
1253 *pcch = cchResult;
1254
1255 /*
1256 * Check buffer size / Allocate buffer.
1257 */
1258 bool fShouldFree;
1259 char *pszResult;
1260 if (cch > 0 && *ppsz)
1261 {
1262 fShouldFree = false;
1263 if (cch <= cchResult)
1264 return VERR_BUFFER_OVERFLOW;
1265 pszResult = *ppsz;
1266 }
1267 else
1268 {
1269 *ppsz = NULL;
1270 fShouldFree = true;
1271 cch = RT_MAX(cchResult + 1, cch);
1272 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1273 }
1274 if (pszResult)
1275 {
1276 /*
1277 * Encode the Latin-1 string.
1278 */
1279 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1280 if (RT_SUCCESS(rc))
1281 {
1282 *ppsz = pszResult;
1283 return rc;
1284 }
1285 if (fShouldFree)
1286 RTMemFree(pszResult);
1287 }
1288 else
1289 rc = VERR_NO_STR_MEMORY;
1290 }
1291 return rc;
1292}
1293RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1294
1295
1296RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1297{
1298 size_t cch;
1299 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1300 return RT_SUCCESS(rc) ? cch : 0;
1301}
1302RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1303
1304
1305RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1306{
1307 size_t cch;
1308 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1309 if (pcch)
1310 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1311 return rc;
1312}
1313RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1314
1315
1316/**
1317 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1318 * @returns rc
1319 * @param ppsz The pointer to the string position point.
1320 * @param pCp Where to store RTUNICP_INVALID.
1321 * @param rc The iprt error code.
1322 */
1323static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1324{
1325 /*
1326 * Try find a valid encoding.
1327 */
1328 (*ppsz)++; /** @todo code this! */
1329 *pCp = RTUNICP_INVALID;
1330 return rc;
1331}
1332
1333
1334RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1335{
1336 RTUNICP Cp;
1337 RTStrGetCpExInternal(&psz, &Cp);
1338 return Cp;
1339}
1340RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1341
1342
1343RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1344{
1345 const unsigned char *puch = (const unsigned char *)*ppsz;
1346 const unsigned char uch = *puch;
1347 RTUNICP uc;
1348
1349 /* ASCII ? */
1350 if (!(uch & RT_BIT(7)))
1351 {
1352 uc = uch;
1353 puch++;
1354 }
1355 else if (uch & RT_BIT(6))
1356 {
1357 /* figure the length and validate the first octet. */
1358 unsigned cb;
1359 if (!(uch & RT_BIT(5)))
1360 cb = 2;
1361 else if (!(uch & RT_BIT(4)))
1362 cb = 3;
1363 else if (!(uch & RT_BIT(3)))
1364 cb = 4;
1365 else if (!(uch & RT_BIT(2)))
1366 cb = 5;
1367 else if (!(uch & RT_BIT(1)))
1368 cb = 6;
1369 else
1370 {
1371 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1372 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1373 }
1374
1375 /* validate the rest */
1376 switch (cb)
1377 {
1378 case 6:
1379 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1380 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1381 case 5:
1382 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1383 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1384 case 4:
1385 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1386 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1387 case 3:
1388 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1389 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1390 case 2:
1391 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1392 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1393 break;
1394 }
1395
1396 /* get and validate the code point. */
1397 switch (cb)
1398 {
1399 case 6:
1400 uc = (puch[5] & 0x3f)
1401 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1402 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1403 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1404 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1405 | ((RTUNICP)(uch & 0x01) << 30);
1406 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1407 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1408 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1409 break;
1410 case 5:
1411 uc = (puch[4] & 0x3f)
1412 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1413 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1414 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1415 | ((RTUNICP)(uch & 0x03) << 24);
1416 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1417 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1418 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1419 break;
1420 case 4:
1421 uc = (puch[3] & 0x3f)
1422 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1423 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1424 | ((RTUNICP)(uch & 0x07) << 18);
1425 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1426 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1427 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1428 break;
1429 case 3:
1430 uc = (puch[2] & 0x3f)
1431 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1432 | ((RTUNICP)(uch & 0x0f) << 12);
1433 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1434 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1435 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1436 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1437 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1438 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1439 break;
1440 case 2:
1441 uc = (puch[1] & 0x3f)
1442 | ((RTUNICP)(uch & 0x1f) << 6);
1443 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1444 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1445 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1446 break;
1447 default: /* impossible, but GCC is bitching. */
1448 uc = RTUNICP_INVALID;
1449 break;
1450 }
1451 puch += cb;
1452 }
1453 else
1454 {
1455 /* 6th bit is always set. */
1456 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1457 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1458 }
1459 *pCp = uc;
1460 *ppsz = (const char *)puch;
1461 return VINF_SUCCESS;
1462}
1463RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1464
1465
1466/**
1467 * Handle invalid encodings passed to RTStrGetCpNEx().
1468 * @returns rc
1469 * @param ppsz The pointer to the string position point.
1470 * @param pcch Pointer to the string length.
1471 * @param pCp Where to store RTUNICP_INVALID.
1472 * @param rc The iprt error code.
1473 */
1474static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1475{
1476 /*
1477 * Try find a valid encoding.
1478 */
1479 (*ppsz)++; /** @todo code this! */
1480 (*pcch)--;
1481 *pCp = RTUNICP_INVALID;
1482 return rc;
1483}
1484
1485
1486RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1487{
1488 const unsigned char *puch = (const unsigned char *)*ppsz;
1489 const unsigned char uch = *puch;
1490 size_t cch = *pcch;
1491 RTUNICP uc;
1492
1493 if (cch == 0)
1494 {
1495 *pCp = RTUNICP_INVALID;
1496 return VERR_END_OF_STRING;
1497 }
1498
1499 /* ASCII ? */
1500 if (!(uch & RT_BIT(7)))
1501 {
1502 uc = uch;
1503 puch++;
1504 cch--;
1505 }
1506 else if (uch & RT_BIT(6))
1507 {
1508 /* figure the length and validate the first octet. */
1509 unsigned cb;
1510 if (!(uch & RT_BIT(5)))
1511 cb = 2;
1512 else if (!(uch & RT_BIT(4)))
1513 cb = 3;
1514 else if (!(uch & RT_BIT(3)))
1515 cb = 4;
1516 else if (!(uch & RT_BIT(2)))
1517 cb = 5;
1518 else if (!(uch & RT_BIT(1)))
1519 cb = 6;
1520 else
1521 {
1522 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1523 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1524 }
1525
1526 if (cb > cch)
1527 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1528
1529 /* validate the rest */
1530 switch (cb)
1531 {
1532 case 6:
1533 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1534 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1535 case 5:
1536 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1537 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1538 case 4:
1539 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1540 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1541 case 3:
1542 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1543 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1544 case 2:
1545 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1546 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1547 break;
1548 }
1549
1550 /* get and validate the code point. */
1551 switch (cb)
1552 {
1553 case 6:
1554 uc = (puch[5] & 0x3f)
1555 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1556 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1557 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1558 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1559 | ((RTUNICP)(uch & 0x01) << 30);
1560 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1561 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1562 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1563 break;
1564 case 5:
1565 uc = (puch[4] & 0x3f)
1566 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1567 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1568 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1569 | ((RTUNICP)(uch & 0x03) << 24);
1570 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1571 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1572 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1573 break;
1574 case 4:
1575 uc = (puch[3] & 0x3f)
1576 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1577 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1578 | ((RTUNICP)(uch & 0x07) << 18);
1579 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1580 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1581 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1582 break;
1583 case 3:
1584 uc = (puch[2] & 0x3f)
1585 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1586 | ((RTUNICP)(uch & 0x0f) << 12);
1587 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1588 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1589 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1590 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1591 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1592 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1593 break;
1594 case 2:
1595 uc = (puch[1] & 0x3f)
1596 | ((RTUNICP)(uch & 0x1f) << 6);
1597 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1598 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1599 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1600 break;
1601 default: /* impossible, but GCC is bitching. */
1602 uc = RTUNICP_INVALID;
1603 break;
1604 }
1605 puch += cb;
1606 cch -= cb;
1607 }
1608 else
1609 {
1610 /* 6th bit is always set. */
1611 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1612 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1613 }
1614 *pCp = uc;
1615 *ppsz = (const char *)puch;
1616 (*pcch) = cch;
1617 return VINF_SUCCESS;
1618}
1619RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1620
1621
1622RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1623{
1624 unsigned char *puch = (unsigned char *)psz;
1625 if (uc < 0x80)
1626 *puch++ = (unsigned char )uc;
1627 else if (uc < 0x00000800)
1628 {
1629 *puch++ = 0xc0 | (uc >> 6);
1630 *puch++ = 0x80 | (uc & 0x3f);
1631 }
1632 else if (uc < 0x00010000)
1633 {
1634 if ( uc < 0x0000d8000
1635 || ( uc > 0x0000dfff
1636 && uc < 0x0000fffe))
1637 {
1638 *puch++ = 0xe0 | (uc >> 12);
1639 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1640 *puch++ = 0x80 | (uc & 0x3f);
1641 }
1642 else
1643 {
1644 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1645 *puch++ = 0x7f;
1646 }
1647 }
1648 else if (uc < 0x00200000)
1649 {
1650 *puch++ = 0xf0 | (uc >> 18);
1651 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1652 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1653 *puch++ = 0x80 | (uc & 0x3f);
1654 }
1655 else if (uc < 0x04000000)
1656 {
1657 *puch++ = 0xf8 | (uc >> 24);
1658 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1659 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1660 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1661 *puch++ = 0x80 | (uc & 0x3f);
1662 }
1663 else if (uc <= 0x7fffffff)
1664 {
1665 *puch++ = 0xfc | (uc >> 30);
1666 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1667 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1668 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1669 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1670 *puch++ = 0x80 | (uc & 0x3f);
1671 }
1672 else
1673 {
1674 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1675 *puch++ = 0x7f;
1676 }
1677
1678 return (char *)puch;
1679}
1680RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1681
1682
1683RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1684{
1685 if (pszStart < psz)
1686 {
1687 /* simple char? */
1688 const unsigned char *puch = (const unsigned char *)psz;
1689 unsigned uch = *--puch;
1690 if (!(uch & RT_BIT(7)))
1691 return (char *)puch;
1692 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1693
1694 /* two or more. */
1695 uint32_t uMask = 0xffffffc0;
1696 while ( (const unsigned char *)pszStart < puch
1697 && !(uMask & 1))
1698 {
1699 uch = *--puch;
1700 if ((uch & 0xc0) != 0x80)
1701 {
1702 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1703 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1704 (char *)pszStart);
1705 return (char *)puch;
1706 }
1707 uMask >>= 1;
1708 }
1709 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1710 }
1711 return (char *)pszStart;
1712}
1713RT_EXPORT_SYMBOL(RTStrPrevCp);
1714
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette