utf-8.cpp@ 31221

最後變更在這個檔案從31221是 31221,由 vboxsync 提交於 15 年前
Runtime/string: more Utf-8 <-> Latin-1 fixes
屬性 svn:eol-style 設為 `native` 屬性 svn:keywords 設為 `Id`
檔案大小: 60.1 KB

行
1	/* $Id: utf-8.cpp 31221 2010-07-29 16:03:53Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2010 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/alloc.h>
36	#include <iprt/assert.h>
37	#include <iprt/err.h>
38	#include "internal/string.h"
39
40
41
42	/**
43	* Get get length in code points of a UTF-8 encoded string.
44	* The string is validated while doing this.
45	*
46	* @returns IPRT status code.
47	* @param psz Pointer to the UTF-8 string.
48	* @param cch The max length of the string. (btw cch = cb)
49	* Use RTSTR_MAX if all of the string is to be examined.
50	* @param pcuc Where to store the length in unicode code points.
51	* @param pcchActual Where to store the actual size of the UTF-8 string
52	* on success (cch = cb again). Optional.
53	*/
54	int rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
55	{
56	const unsigned char puch = (const unsigned char )psz;
57	size_t cCodePoints = 0;
58	while (cch > 0)
59	{
60	const unsigned char uch = *puch;
61	if (!uch)
62	break;
63	if (uch & RT_BIT(7))
64	{
65	/* figure sequence length and validate the first byte */
66	unsigned cb;
67	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
68	cb = 2;
69	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
70	cb = 3;
71	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
72	cb = 4;
73	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
74	cb = 5;
75	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
76	cb = 6;
77	else
78	{
79	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
80	return VERR_INVALID_UTF8_ENCODING;
81	}
82
83	/* check length */
84	if (cb > cch)
85	{
86	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
87	return VERR_INVALID_UTF8_ENCODING;
88	}
89
90	/* validate the rest */
91	switch (cb)
92	{
93	case 6:
94	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
95	case 5:
96	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
97	case 4:
98	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
99	case 3:
100	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
101	case 2:
102	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
103	break;
104	}
105
106	/* validate the code point. */
107	RTUNICP uc;
108	switch (cb)
109	{
110	case 6:
111	uc = (puch[5] & 0x3f)
112	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
113	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
114	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
115	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
116	\| ((RTUNICP)(uch & 0x01) << 30);
117	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
118	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
119	break;
120	case 5:
121	uc = (puch[4] & 0x3f)
122	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
123	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
124	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
125	\| ((RTUNICP)(uch & 0x03) << 24);
126	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
127	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
128	break;
129	case 4:
130	uc = (puch[3] & 0x3f)
131	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
132	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
133	\| ((RTUNICP)(uch & 0x07) << 18);
134	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
135	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
136	break;
137	case 3:
138	uc = (puch[2] & 0x3f)
139	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
140	\| ((RTUNICP)(uch & 0x0f) << 12);
141	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
142	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
143	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
144	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
145	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
146	break;
147	case 2:
148	uc = (puch[1] & 0x3f)
149	\| ((RTUNICP)(uch & 0x1f) << 6);
150	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
151	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
152	break;
153	}
154
155	/* advance */
156	cch -= cb;
157	puch += cb;
158	}
159	else
160	{
161	/* one ASCII byte */
162	puch++;
163	cch--;
164	}
165	cCodePoints++;
166	}
167
168	/* done */
169	*pcuc = cCodePoints;
170	if (pcchActual)
171	pcchActual = puch - (unsigned char const )psz;
172	return VINF_SUCCESS;
173	}
174
175
176	/**
177	* Decodes and UTF-8 string into an array of unicode code point.
178	*
179	* Since we know the input is valid, we do not perform encoding or length checks.
180	*
181	* @returns iprt status code.
182	* @param psz The UTF-8 string to recode. This is a valid encoding.
183	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
184	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
185	* @param paCps Where to store the code points array.
186	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
187	*/
188	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
189	{
190	int rc = VINF_SUCCESS;
191	const unsigned char puch = (const unsigned char )psz;
192	PRTUNICP pCp = paCps;
193	while (cch > 0)
194	{
195	/* read the next char and check for terminator. */
196	const unsigned char uch = *puch;
197	if (!uch)
198	break;
199
200	/* check for output overflow */
201	if (RT_UNLIKELY(cCps < 1))
202	{
203	rc = VERR_BUFFER_OVERFLOW;
204	break;
205	}
206	cCps--;
207
208	/* decode and recode the code point */
209	if (!(uch & RT_BIT(7)))
210	{
211	*pCp++ = uch;
212	puch++;
213	cch--;
214	}
215	#ifdef RT_STRICT
216	else if (!(uch & RT_BIT(6)))
217	AssertMsgFailed(("Internal error!\n"));
218	#endif
219	else if (!(uch & RT_BIT(5)))
220	{
221	*pCp++ = (puch[1] & 0x3f)
222	\| ((uint16_t)(uch & 0x1f) << 6);
223	puch += 2;
224	cch -= 2;
225	}
226	else if (!(uch & RT_BIT(4)))
227	{
228	*pCp++ = (puch[2] & 0x3f)
229	\| ((uint16_t)(puch[1] & 0x3f) << 6)
230	\| ((uint16_t)(uch & 0x0f) << 12);
231	puch += 3;
232	cch -= 3;
233	}
234	else if (!(uch & RT_BIT(3)))
235	{
236	*pCp++ = (puch[3] & 0x3f)
237	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
238	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
239	\| ((RTUNICP)(uch & 0x07) << 18);
240	puch += 4;
241	cch -= 4;
242	}
243	else if (!(uch & RT_BIT(2)))
244	{
245	*pCp++ = (puch[4] & 0x3f)
246	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
247	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
248	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
249	\| ((RTUNICP)(uch & 0x03) << 24);
250	puch += 5;
251	cch -= 6;
252	}
253	else
254	{
255	Assert(!(uch & RT_BIT(1)));
256	*pCp++ = (puch[5] & 0x3f)
257	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
258	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
259	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
260	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
261	\| ((RTUNICP)(uch & 0x01) << 30);
262	puch += 6;
263	cch -= 6;
264	}
265	}
266
267	/* done */
268	*pCp = 0;
269	return rc;
270	}
271
272
273	RTDECL(size_t) RTStrUniLen(const char *psz)
274	{
275	size_t cCodePoints;
276	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
277	return RT_SUCCESS(rc) ? cCodePoints : 0;
278	}
279	RT_EXPORT_SYMBOL(RTStrUniLen);
280
281
282	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
283	{
284	size_t cCodePoints;
285	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
286	if (pcCps)
287	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
288	return rc;
289	}
290	RT_EXPORT_SYMBOL(RTStrUniLenEx);
291
292
293	RTDECL(int) RTStrValidateEncoding(const char *psz)
294	{
295	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
296	}
297	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
298
299
300	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
301	{
302	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
303	AssertPtr(psz);
304
305	/*
306	* Use rtUtf8Length for the job.
307	*/
308	size_t cchActual;
309	size_t cCpsIgnored;
310	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
311	if (RT_SUCCESS(rc))
312	{
313	if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
314	&& cchActual >= cch)
315	rc = VERR_BUFFER_OVERFLOW;
316	}
317	return rc;
318	}
319	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
320
321
322	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
323	{
324	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
325	return RT_SUCCESS(rc);
326	}
327	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
328
329
330	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
331	{
332	size_t cErrors = 0;
333	for (;;)
334	{
335	RTUNICP Cp;
336	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
337	if (RT_SUCCESS(rc))
338	{
339	if (!Cp)
340	break;
341	}
342	else
343	{
344	psz[-1] = '?';
345	cErrors++;
346	}
347	}
348	return cErrors;
349	}
350	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
351
352
353	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
354	{
355	/*
356	* Validate input.
357	*/
358	Assert(VALID_PTR(pszString));
359	Assert(VALID_PTR(ppaCps));
360	*ppaCps = NULL;
361
362	/*
363	* Validate the UTF-8 input and count its code points.
364	*/
365	size_t cCps;
366	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
367	if (RT_SUCCESS(rc))
368	{
369	/*
370	* Allocate buffer.
371	*/
372	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
373	if (paCps)
374	{
375	/*
376	* Decode the string.
377	*/
378	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
379	if (RT_SUCCESS(rc))
380	{
381	*ppaCps = paCps;
382	return rc;
383	}
384	RTMemFree(paCps);
385	}
386	else
387	rc = VERR_NO_CODE_POINT_MEMORY;
388	}
389	return rc;
390	}
391	RT_EXPORT_SYMBOL(RTStrToUni);
392
393
394	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
395	{
396	/*
397	* Validate input.
398	*/
399	Assert(VALID_PTR(pszString));
400	Assert(VALID_PTR(ppaCps));
401	Assert(!pcCps \|\| VALID_PTR(pcCps));
402
403	/*
404	* Validate the UTF-8 input and count the code points.
405	*/
406	size_t cCpsResult;
407	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
408	if (RT_SUCCESS(rc))
409	{
410	if (pcCps)
411	*pcCps = cCpsResult;
412
413	/*
414	* Check buffer size / Allocate buffer.
415	*/
416	bool fShouldFree;
417	PRTUNICP paCpsResult;
418	if (cCps > 0 && *ppaCps)
419	{
420	fShouldFree = false;
421	if (cCps <= cCpsResult)
422	return VERR_BUFFER_OVERFLOW;
423	paCpsResult = *ppaCps;
424	}
425	else
426	{
427	*ppaCps = NULL;
428	fShouldFree = true;
429	cCps = RT_MAX(cCpsResult + 1, cCps);
430	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
431	}
432	if (paCpsResult)
433	{
434	/*
435	* Encode the UTF-16 string.
436	*/
437	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
438	if (RT_SUCCESS(rc))
439	{
440	*ppaCps = paCpsResult;
441	return rc;
442	}
443	if (fShouldFree)
444	RTMemFree(paCpsResult);
445	}
446	else
447	rc = VERR_NO_CODE_POINT_MEMORY;
448	}
449	return rc;
450	}
451	RT_EXPORT_SYMBOL(RTStrToUniEx);
452
453
454	/**
455	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
456	*
457	* @returns IPRT status code.
458	* @param psz Pointer to the UTF-8 string.
459	* @param cch The max length of the string. (btw cch = cb)
460	* Use RTSTR_MAX if all of the string is to be examined.
461	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
462	*/
463	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
464	{
465	const unsigned char puch = (const unsigned char )psz;
466	size_t cwc = 0;
467	while (cch > 0)
468	{
469	const unsigned char uch = *puch;
470	if (!uch)
471	break;
472	if (!(uch & RT_BIT(7)))
473	{
474	/* one ASCII byte */
475	cwc++;
476	puch++;
477	cch--;
478	}
479	else
480	{
481	/* figure sequence length and validate the first byte */
482	unsigned cb;
483	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
484	cb = 2;
485	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
486	cb = 3;
487	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
488	cb = 4;
489	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
490	cb = 5;
491	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
492	cb = 6;
493	else
494	{
495	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
496	return VERR_INVALID_UTF8_ENCODING;
497	}
498
499	/* check length */
500	if (cb > cch)
501	{
502	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
503	return VERR_INVALID_UTF8_ENCODING;
504	}
505
506	/* validate the rest */
507	switch (cb)
508	{
509	case 6:
510	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
511	case 5:
512	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
513	case 4:
514	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
515	case 3:
516	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
517	case 2:
518	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
519	break;
520	}
521
522	/* validate the code point. */
523	RTUNICP uc;
524	switch (cb)
525	{
526	case 6:
527	uc = (puch[5] & 0x3f)
528	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
529	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
530	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
531	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
532	\| ((RTUNICP)(uch & 0x01) << 30);
533	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
534	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
535	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
536	return VERR_CANT_RECODE_AS_UTF16;
537	case 5:
538	uc = (puch[4] & 0x3f)
539	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
540	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
541	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
542	\| ((RTUNICP)(uch & 0x03) << 24);
543	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
544	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
545	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
546	return VERR_CANT_RECODE_AS_UTF16;
547	case 4:
548	uc = (puch[3] & 0x3f)
549	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
550	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
551	\| ((RTUNICP)(uch & 0x07) << 18);
552	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
553	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
554	RTStrAssertMsgReturn(uc <= 0x0010ffff,
555	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
556	cwc++;
557	break;
558	case 3:
559	uc = (puch[2] & 0x3f)
560	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
561	\| ((RTUNICP)(uch & 0x0f) << 12);
562	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
563	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
564	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
565	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
566	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
567	break;
568	case 2:
569	uc = (puch[1] & 0x3f)
570	\| ((RTUNICP)(uch & 0x1f) << 6);
571	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
572	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
573	break;
574	}
575
576	/* advance */
577	cch -= cb;
578	puch += cb;
579	cwc++;
580	}
581	}
582
583	/* done */
584	*pcwc = cwc;
585	return VINF_SUCCESS;
586	}
587
588
589	/**
590	* Recodes a valid UTF-8 string as UTF-16.
591	*
592	* Since we know the input is valid, we do not perform encoding or length checks.
593	*
594	* @returns iprt status code.
595	* @param psz The UTF-8 string to recode. This is a valid encoding.
596	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
597	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
598	* @param pwsz Where to store the UTF-16 string.
599	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
600	*/
601	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
602	{
603	int rc = VINF_SUCCESS;
604	const unsigned char puch = (const unsigned char )psz;
605	PRTUTF16 pwc = pwsz;
606	while (cch > 0)
607	{
608	/* read the next char and check for terminator. */
609	const unsigned char uch = *puch;
610	if (!uch)
611	break;
612
613	/* check for output overflow */
614	if (RT_UNLIKELY(cwc < 1))
615	{
616	rc = VERR_BUFFER_OVERFLOW;
617	break;
618	}
619	cwc--;
620
621	/* decode and recode the code point */
622	if (!(uch & RT_BIT(7)))
623	{
624	*pwc++ = uch;
625	puch++;
626	cch--;
627	}
628	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
629	{
630	uint16_t uc = (puch[1] & 0x3f)
631	\| ((uint16_t)(uch & 0x1f) << 6);
632	*pwc++ = uc;
633	puch += 2;
634	cch -= 2;
635	}
636	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
637	{
638	uint16_t uc = (puch[2] & 0x3f)
639	\| ((uint16_t)(puch[1] & 0x3f) << 6)
640	\| ((uint16_t)(uch & 0x0f) << 12);
641	*pwc++ = uc;
642	puch += 3;
643	cch -= 3;
644	}
645	else
646	{
647	/* generate surrugate pair */
648	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
649	RTUNICP uc = (puch[3] & 0x3f)
650	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
651	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
652	\| ((RTUNICP)(uch & 0x07) << 18);
653	if (RT_UNLIKELY(cwc < 1))
654	{
655	rc = VERR_BUFFER_OVERFLOW;
656	break;
657	}
658	cwc--;
659
660	uc -= 0x10000;
661	*pwc++ = 0xd800 \| (uc >> 10);
662	*pwc++ = 0xdc00 \| (uc & 0x3ff);
663	puch += 4;
664	cch -= 4;
665	}
666	}
667
668	/* done */
669	*pwc = '\0';
670	return rc;
671	}
672
673
674	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
675	{
676	/*
677	* Validate input.
678	*/
679	Assert(VALID_PTR(ppwszString));
680	Assert(VALID_PTR(pszString));
681	*ppwszString = NULL;
682
683	/*
684	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
685	*/
686	size_t cwc;
687	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
688	if (RT_SUCCESS(rc))
689	{
690	/*
691	* Allocate buffer.
692	*/
693	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
694	if (pwsz)
695	{
696	/*
697	* Encode the UTF-16 string.
698	*/
699	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
700	if (RT_SUCCESS(rc))
701	{
702	*ppwszString = pwsz;
703	return rc;
704	}
705	RTMemFree(pwsz);
706	}
707	else
708	rc = VERR_NO_UTF16_MEMORY;
709	}
710	return rc;
711	}
712	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
713
714
715	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
716	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
717	{
718	/*
719	* Validate input.
720	*/
721	Assert(VALID_PTR(pszString));
722	Assert(VALID_PTR(ppwsz));
723	Assert(!pcwc \|\| VALID_PTR(pcwc));
724
725	/*
726	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
727	*/
728	size_t cwcResult;
729	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
730	if (RT_SUCCESS(rc))
731	{
732	if (pcwc)
733	*pcwc = cwcResult;
734
735	/*
736	* Check buffer size / Allocate buffer.
737	*/
738	bool fShouldFree;
739	PRTUTF16 pwszResult;
740	if (cwc > 0 && *ppwsz)
741	{
742	fShouldFree = false;
743	if (cwc <= cwcResult)
744	return VERR_BUFFER_OVERFLOW;
745	pwszResult = *ppwsz;
746	}
747	else
748	{
749	*ppwsz = NULL;
750	fShouldFree = true;
751	cwc = RT_MAX(cwcResult + 1, cwc);
752	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
753	}
754	if (pwszResult)
755	{
756	/*
757	* Encode the UTF-16 string.
758	*/
759	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
760	if (RT_SUCCESS(rc))
761	{
762	*ppwsz = pwszResult;
763	return rc;
764	}
765	if (fShouldFree)
766	RTMemFree(pwszResult);
767	}
768	else
769	rc = VERR_NO_UTF16_MEMORY;
770	}
771	return rc;
772	}
773	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
774
775
776	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
777	{
778	size_t cwc;
779	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
780	return RT_SUCCESS(rc) ? cwc : 0;
781	}
782	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
783
784
785	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
786	{
787	size_t cwc;
788	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
789	if (pcwc)
790	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
791	return rc;
792	}
793	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
794
795
796	/**
797	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
798	*
799	* @returns iprt status code.
800	* @param psz The Latin-1 string.
801	* @param cchIn The max length of the Latin-1 string to consider.
802	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
803	*/
804	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
805	{
806	size_t cch = 0;
807	while (cchIn > 0)
808	{
809	char ch = *psz++; cchIn--;
810	if (!ch)
811	break;
812	if (!(ch & 0x80))
813	cch++;
814	else
815	cch += 2;
816	}
817
818
819	/* done */
820	*pcch = cch;
821	return VINF_SUCCESS;
822	}
823
824
825	/**
826	* Recodes a Latin-1 string as UTF-8.
827	*
828	* @returns iprt status code.
829	* @param psz The Latin-1 string.
830	* @param cchIn The number of characters to process from psz. The recoding
831	* will stop when cch or '\\0' is reached.
832	* @param psz Where to store the UTF-8 string.
833	* @param cch The size of the UTF-8 buffer, excluding the terminator.
834	* @param pcch Where to store the number of octets actually encoded.
835	*/
836	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch, size_t *pcch)
837	{
838	unsigned char puch = (unsigned char )psz;
839	int rc = VINF_SUCCESS;
840	while (cchIn > 0)
841	{
842	unsigned char ch = (unsigned char) *pszIn++; cchIn--;
843	if (!ch)
844	break;
845	if (!(ch & 0x80))
846	{
847	if (RT_UNLIKELY(cch < 1))
848	{
849	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
850	rc = VERR_BUFFER_OVERFLOW;
851	break;
852	}
853	cch--;
854	*puch++ = (unsigned char)ch;
855	}
856	else
857	{
858	if (RT_UNLIKELY(cch < 2))
859	{
860	RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
861	rc = VERR_BUFFER_OVERFLOW;
862	break;
863	}
864	cch -= 2;
865	*puch++ = 0xc0 \| (ch >> 6);
866	*puch++ = 0x80 \| (ch & 0x3f);
867	}
868	}
869
870	/* done */
871	*puch = '\0';
872	pcch = (char )puch - psz;
873	return rc;
874	}
875
876
877
878	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
879	{
880	/*
881	* Validate input.
882	*/
883	Assert(VALID_PTR(ppszString));
884	Assert(VALID_PTR(pszString));
885	*ppszString = NULL;
886
887	/*
888	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
889	*/
890	size_t cch;
891	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
892	if (RT_SUCCESS(rc))
893	{
894	/*
895	* Allocate buffer and recode it.
896	*/
897	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
898	if (pszResult)
899	{
900	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch, &cch);
901	if (RT_SUCCESS(rc))
902	{
903	*ppszString = pszResult;
904	return rc;
905	}
906
907	RTMemFree(pszResult);
908	}
909	else
910	rc = VERR_NO_STR_MEMORY;
911	}
912	return rc;
913	}
914	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
915
916
917	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
918	{
919	/*
920	* Validate input.
921	*/
922	Assert(VALID_PTR(pszString));
923	Assert(VALID_PTR(ppsz));
924	Assert(!pcch \|\| VALID_PTR(pcch));
925
926	/*
927	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
928	*/
929	size_t cchResult;
930	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
931	if (RT_SUCCESS(rc))
932	{
933	if (pcch)
934	*pcch = cchResult;
935
936	/*
937	* Check buffer size / Allocate buffer and recode it.
938	*/
939	bool fShouldFree;
940	char *pszResult;
941	if (cch > 0 && *ppsz)
942	{
943	fShouldFree = false;
944	if (RT_UNLIKELY(cch <= cchResult))
945	return VERR_BUFFER_OVERFLOW;
946	pszResult = *ppsz;
947	}
948	else
949	{
950	*ppsz = NULL;
951	fShouldFree = true;
952	cch = RT_MAX(cch, cchResult + 1);
953	pszResult = (char *)RTStrAllocTag(cch, pszTag);
954	}
955	if (pszResult)
956	{
957	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1, &cch);
958	if (RT_SUCCESS(rc))
959	{
960	*ppsz = pszResult;
961	return rc;
962	}
963
964	if (fShouldFree)
965	RTStrFree(pszResult);
966	}
967	else
968	rc = VERR_NO_STR_MEMORY;
969	}
970	return rc;
971	}
972	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
973
974
975	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
976	{
977	size_t cch;
978	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
979	return RT_SUCCESS(rc) ? cch : 0;
980	}
981	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
982
983
984	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
985	{
986	size_t cch;
987	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
988	if (pcch)
989	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
990	return rc;
991	}
992	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
993
994
995	/**
996	* Calculates the Latin-1 length of a string, validating the encoding while doing so.
997	*
998	* @returns IPRT status code.
999	* @param psz Pointer to the UTF-8 string.
1000	* @param cch The max length of the string. (btw cch = cb)
1001	* Use RTSTR_MAX if all of the string is to be examined.
1002	* @param pcch Where to store the length of the Latin-1 string in bytes.
1003	*/
1004	static int rtUtf8CalcLatin1Length(const char psz, size_t cch, size_t pcch)
1005	{
1006	const unsigned char puch = (const unsigned char )psz;
1007	size_t cchOut = 0;
1008	while (cch > 0)
1009	{
1010	const unsigned char uch = *puch;
1011	if (!uch)
1012	break;
1013	if (!(uch & RT_BIT(7)))
1014	{
1015	/* one ASCII byte */
1016	cchOut++;
1017	puch++;
1018	cch--;
1019	}
1020	else
1021	{
1022	/* figure sequence length and validate the first byte */
1023	unsigned cb;
1024	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
1025	cb = 2;
1026	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
1027	cb = 3;
1028	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
1029	cb = 4;
1030	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
1031	cb = 5;
1032	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
1033	cb = 6;
1034	else
1035	{
1036	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
1037	return VERR_INVALID_UTF8_ENCODING;
1038	}
1039
1040	/* check length */
1041	if (cb > cch)
1042	{
1043	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
1044	return VERR_INVALID_UTF8_ENCODING;
1045	}
1046
1047	/* validate the rest */
1048	switch (cb)
1049	{
1050	case 6:
1051	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1052	case 5:
1053	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1054	case 4:
1055	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1056	case 3:
1057	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1058	case 2:
1059	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1060	break;
1061	}
1062
1063	/* validate the code point. */
1064	RTUNICP uc;
1065	switch (cb)
1066	{
1067	case 6:
1068	uc = (puch[5] & 0x3f)
1069	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1070	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1071	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1072	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1073	\| ((RTUNICP)(uch & 0x01) << 30);
1074	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1075	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1076	break;
1077	case 5:
1078	uc = (puch[4] & 0x3f)
1079	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1080	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1081	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1082	\| ((RTUNICP)(uch & 0x03) << 24);
1083	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1084	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1085	break;
1086	case 4:
1087	uc = (puch[3] & 0x3f)
1088	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1089	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1090	\| ((RTUNICP)(uch & 0x07) << 18);
1091	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1092	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1093	break;
1094	case 3:
1095	uc = (puch[2] & 0x3f)
1096	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1097	\| ((RTUNICP)(uch & 0x0f) << 12);
1098	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1099	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
1100	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
1101	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1102	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
1103	break;
1104	case 2:
1105	uc = (puch[1] & 0x3f)
1106	\| ((RTUNICP)(uch & 0x1f) << 6);
1107	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1108	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
1109	break;
1110	}
1111
1112	/* does this code point have a Latin-1 translation? */
1113	if (cb > 2 \|\| uch > 0xC3)
1114	return VERR_NO_TRANSLATION;
1115
1116	/* advance */
1117	cch -= cb;
1118	puch += cb;
1119	cchOut++;
1120	}
1121	}
1122
1123	/* done */
1124	*pcch = cchOut;
1125	return VINF_SUCCESS;
1126	}
1127
1128
1129	/**
1130	* Recodes a valid UTF-8 string as Latin-1.
1131	*
1132	* Since we know the input is valid, we do not perform encoding or length checks.
1133	*
1134	* @returns iprt status code.
1135	* @param psz The UTF-8 string to recode. This is a valid encoding.
1136	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1137	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1138	* @param pszOut Where to store the Latin-1 string.
1139	* @param cchOut The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1140	*/
1141	static int rtUtf8RecodeAsLatin1(const char psz, size_t cch, char pszOut, size_t cchOut)
1142	{
1143	int rc = VINF_SUCCESS;
1144	const unsigned char puch = (const unsigned char )psz;
1145	unsigned char puchOut = (unsigned char )pszOut;
1146	while (cch > 0)
1147	{
1148	/* read the next char and check for terminator. */
1149	const unsigned char uch = *puch;
1150	if (!uch)
1151	break;
1152
1153	/* check for output overflow */
1154	if (RT_UNLIKELY(cchOut < 1))
1155	{
1156	rc = VERR_BUFFER_OVERFLOW;
1157	break;
1158	}
1159	cchOut--;
1160
1161	/* decode and recode the code point */
1162	if (!(uch & RT_BIT(7)))
1163	{
1164	*puchOut++ = uch;
1165	puch++;
1166	cch--;
1167	}
1168	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
1169	{
1170	uint16_t uc = (puch[1] & 0x3f)
1171	\| ((uint16_t)(uch & 0x1f) << 6);
1172	if (uc >= 0x100)
1173	{
1174	rc = VERR_NO_TRANSLATION;
1175	break;
1176	}
1177	*puchOut++ = uc;
1178	puch += 2;
1179	cch -= 2;
1180	}
1181	else
1182	{
1183	rc = VERR_NO_TRANSLATION;
1184	break;
1185	}
1186	}
1187
1188	/* done */
1189	*puchOut = '\0';
1190	return rc;
1191	}
1192
1193
1194	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1195	{
1196	/*
1197	* Validate input.
1198	*/
1199	Assert(VALID_PTR(ppszString));
1200	Assert(VALID_PTR(pszString));
1201	*ppszString = NULL;
1202
1203	/*
1204	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1205	*/
1206	size_t cch;
1207	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1208	if (RT_SUCCESS(rc))
1209	{
1210	/*
1211	* Allocate buffer.
1212	*/
1213	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1214	if (psz)
1215	{
1216	/*
1217	* Encode the UTF-16 string.
1218	*/
1219	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1220	if (RT_SUCCESS(rc))
1221	{
1222	*ppszString = psz;
1223	return rc;
1224	}
1225	RTMemFree(psz);
1226	}
1227	else
1228	rc = VERR_NO_STR_MEMORY;
1229	}
1230	return rc;
1231	}
1232	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1233
1234
1235	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1236	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1237	{
1238	/*
1239	* Validate input.
1240	*/
1241	Assert(VALID_PTR(pszString));
1242	Assert(VALID_PTR(ppsz));
1243	Assert(!pcch \|\| VALID_PTR(pcch));
1244
1245	/*
1246	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1247	*/
1248	size_t cchResult;
1249	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1250	if (RT_SUCCESS(rc))
1251	{
1252	if (pcch)
1253	*pcch = cchResult;
1254
1255	/*
1256	* Check buffer size / Allocate buffer.
1257	*/
1258	bool fShouldFree;
1259	char *pszResult;
1260	if (cch > 0 && *ppsz)
1261	{
1262	fShouldFree = false;
1263	if (cch <= cchResult)
1264	return VERR_BUFFER_OVERFLOW;
1265	pszResult = *ppsz;
1266	}
1267	else
1268	{
1269	*ppsz = NULL;
1270	fShouldFree = true;
1271	cch = RT_MAX(cchResult + 1, cch);
1272	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1273	}
1274	if (pszResult)
1275	{
1276	/*
1277	* Encode the Latin-1 string.
1278	*/
1279	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1280	if (RT_SUCCESS(rc))
1281	{
1282	*ppsz = pszResult;
1283	return rc;
1284	}
1285	if (fShouldFree)
1286	RTMemFree(pszResult);
1287	}
1288	else
1289	rc = VERR_NO_STR_MEMORY;
1290	}
1291	return rc;
1292	}
1293	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1294
1295
1296	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1297	{
1298	size_t cch;
1299	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1300	return RT_SUCCESS(rc) ? cch : 0;
1301	}
1302	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1303
1304
1305	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1306	{
1307	size_t cch;
1308	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1309	if (pcch)
1310	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1311	return rc;
1312	}
1313	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1314
1315
1316	/**
1317	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1318	* @returns rc
1319	* @param ppsz The pointer to the string position point.
1320	* @param pCp Where to store RTUNICP_INVALID.
1321	* @param rc The iprt error code.
1322	*/
1323	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1324	{
1325	/*
1326	* Try find a valid encoding.
1327	*/
1328	(ppsz)++; /* @todo code this! */
1329	*pCp = RTUNICP_INVALID;
1330	return rc;
1331	}
1332
1333
1334	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1335	{
1336	RTUNICP Cp;
1337	RTStrGetCpExInternal(&psz, &Cp);
1338	return Cp;
1339	}
1340	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1341
1342
1343	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1344	{
1345	const unsigned char puch = (const unsigned char )*ppsz;
1346	const unsigned char uch = *puch;
1347	RTUNICP uc;
1348
1349	/* ASCII ? */
1350	if (!(uch & RT_BIT(7)))
1351	{
1352	uc = uch;
1353	puch++;
1354	}
1355	else if (uch & RT_BIT(6))
1356	{
1357	/* figure the length and validate the first octet. */
1358	unsigned cb;
1359	if (!(uch & RT_BIT(5)))
1360	cb = 2;
1361	else if (!(uch & RT_BIT(4)))
1362	cb = 3;
1363	else if (!(uch & RT_BIT(3)))
1364	cb = 4;
1365	else if (!(uch & RT_BIT(2)))
1366	cb = 5;
1367	else if (!(uch & RT_BIT(1)))
1368	cb = 6;
1369	else
1370	{
1371	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1372	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1373	}
1374
1375	/* validate the rest */
1376	switch (cb)
1377	{
1378	case 6:
1379	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1380	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1381	case 5:
1382	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1383	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1384	case 4:
1385	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1386	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1387	case 3:
1388	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1389	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1390	case 2:
1391	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1392	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1393	break;
1394	}
1395
1396	/* get and validate the code point. */
1397	switch (cb)
1398	{
1399	case 6:
1400	uc = (puch[5] & 0x3f)
1401	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1402	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1403	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1404	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1405	\| ((RTUNICP)(uch & 0x01) << 30);
1406	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1407	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1408	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1409	break;
1410	case 5:
1411	uc = (puch[4] & 0x3f)
1412	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1413	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1414	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1415	\| ((RTUNICP)(uch & 0x03) << 24);
1416	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1417	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1418	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1419	break;
1420	case 4:
1421	uc = (puch[3] & 0x3f)
1422	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1423	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1424	\| ((RTUNICP)(uch & 0x07) << 18);
1425	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1426	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1427	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1428	break;
1429	case 3:
1430	uc = (puch[2] & 0x3f)
1431	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1432	\| ((RTUNICP)(uch & 0x0f) << 12);
1433	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1434	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1435	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1436	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1437	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1438	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1439	break;
1440	case 2:
1441	uc = (puch[1] & 0x3f)
1442	\| ((RTUNICP)(uch & 0x1f) << 6);
1443	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1444	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1445	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1446	break;
1447	default: /* impossible, but GCC is bitching. */
1448	uc = RTUNICP_INVALID;
1449	break;
1450	}
1451	puch += cb;
1452	}
1453	else
1454	{
1455	/* 6th bit is always set. */
1456	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1457	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1458	}
1459	*pCp = uc;
1460	ppsz = (const char )puch;
1461	return VINF_SUCCESS;
1462	}
1463	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1464
1465
1466	/**
1467	* Handle invalid encodings passed to RTStrGetCpNEx().
1468	* @returns rc
1469	* @param ppsz The pointer to the string position point.
1470	* @param pcch Pointer to the string length.
1471	* @param pCp Where to store RTUNICP_INVALID.
1472	* @param rc The iprt error code.
1473	*/
1474	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1475	{
1476	/*
1477	* Try find a valid encoding.
1478	*/
1479	(ppsz)++; /* @todo code this! */
1480	(*pcch)--;
1481	*pCp = RTUNICP_INVALID;
1482	return rc;
1483	}
1484
1485
1486	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1487	{
1488	const unsigned char puch = (const unsigned char )*ppsz;
1489	const unsigned char uch = *puch;
1490	size_t cch = *pcch;
1491	RTUNICP uc;
1492
1493	if (cch == 0)
1494	{
1495	*pCp = RTUNICP_INVALID;
1496	return VERR_END_OF_STRING;
1497	}
1498
1499	/* ASCII ? */
1500	if (!(uch & RT_BIT(7)))
1501	{
1502	uc = uch;
1503	puch++;
1504	cch--;
1505	}
1506	else if (uch & RT_BIT(6))
1507	{
1508	/* figure the length and validate the first octet. */
1509	unsigned cb;
1510	if (!(uch & RT_BIT(5)))
1511	cb = 2;
1512	else if (!(uch & RT_BIT(4)))
1513	cb = 3;
1514	else if (!(uch & RT_BIT(3)))
1515	cb = 4;
1516	else if (!(uch & RT_BIT(2)))
1517	cb = 5;
1518	else if (!(uch & RT_BIT(1)))
1519	cb = 6;
1520	else
1521	{
1522	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1523	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1524	}
1525
1526	if (cb > cch)
1527	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1528
1529	/* validate the rest */
1530	switch (cb)
1531	{
1532	case 6:
1533	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1534	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1535	case 5:
1536	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1537	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1538	case 4:
1539	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1540	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1541	case 3:
1542	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1543	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1544	case 2:
1545	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1546	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1547	break;
1548	}
1549
1550	/* get and validate the code point. */
1551	switch (cb)
1552	{
1553	case 6:
1554	uc = (puch[5] & 0x3f)
1555	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1556	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1557	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1558	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1559	\| ((RTUNICP)(uch & 0x01) << 30);
1560	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1561	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1562	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1563	break;
1564	case 5:
1565	uc = (puch[4] & 0x3f)
1566	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1567	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1568	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1569	\| ((RTUNICP)(uch & 0x03) << 24);
1570	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1571	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1572	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1573	break;
1574	case 4:
1575	uc = (puch[3] & 0x3f)
1576	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1577	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1578	\| ((RTUNICP)(uch & 0x07) << 18);
1579	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1580	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1581	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1582	break;
1583	case 3:
1584	uc = (puch[2] & 0x3f)
1585	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1586	\| ((RTUNICP)(uch & 0x0f) << 12);
1587	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1588	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1589	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1590	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1591	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1592	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1593	break;
1594	case 2:
1595	uc = (puch[1] & 0x3f)
1596	\| ((RTUNICP)(uch & 0x1f) << 6);
1597	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1598	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1599	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1600	break;
1601	default: /* impossible, but GCC is bitching. */
1602	uc = RTUNICP_INVALID;
1603	break;
1604	}
1605	puch += cb;
1606	cch -= cb;
1607	}
1608	else
1609	{
1610	/* 6th bit is always set. */
1611	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1612	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1613	}
1614	*pCp = uc;
1615	ppsz = (const char )puch;
1616	(*pcch) = cch;
1617	return VINF_SUCCESS;
1618	}
1619	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1620
1621
1622	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1623	{
1624	unsigned char puch = (unsigned char )psz;
1625	if (uc < 0x80)
1626	*puch++ = (unsigned char )uc;
1627	else if (uc < 0x00000800)
1628	{
1629	*puch++ = 0xc0 \| (uc >> 6);
1630	*puch++ = 0x80 \| (uc & 0x3f);
1631	}
1632	else if (uc < 0x00010000)
1633	{
1634	if ( uc < 0x0000d8000
1635	\|\| ( uc > 0x0000dfff
1636	&& uc < 0x0000fffe))
1637	{
1638	*puch++ = 0xe0 \| (uc >> 12);
1639	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1640	*puch++ = 0x80 \| (uc & 0x3f);
1641	}
1642	else
1643	{
1644	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1645	*puch++ = 0x7f;
1646	}
1647	}
1648	else if (uc < 0x00200000)
1649	{
1650	*puch++ = 0xf0 \| (uc >> 18);
1651	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1652	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1653	*puch++ = 0x80 \| (uc & 0x3f);
1654	}
1655	else if (uc < 0x04000000)
1656	{
1657	*puch++ = 0xf8 \| (uc >> 24);
1658	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1659	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1660	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1661	*puch++ = 0x80 \| (uc & 0x3f);
1662	}
1663	else if (uc <= 0x7fffffff)
1664	{
1665	*puch++ = 0xfc \| (uc >> 30);
1666	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1667	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1668	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1669	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1670	*puch++ = 0x80 \| (uc & 0x3f);
1671	}
1672	else
1673	{
1674	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1675	*puch++ = 0x7f;
1676	}
1677
1678	return (char *)puch;
1679	}
1680	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1681
1682
1683	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1684	{
1685	if (pszStart < psz)
1686	{
1687	/* simple char? */
1688	const unsigned char puch = (const unsigned char )psz;
1689	unsigned uch = *--puch;
1690	if (!(uch & RT_BIT(7)))
1691	return (char *)puch;
1692	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1693
1694	/* two or more. */
1695	uint32_t uMask = 0xffffffc0;
1696	while ( (const unsigned char *)pszStart < puch
1697	&& !(uMask & 1))
1698	{
1699	uch = *--puch;
1700	if ((uch & 0xc0) != 0x80)
1701	{
1702	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1703	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1704	(char *)pszStart);
1705	return (char *)puch;
1706	}
1707	uMask >>= 1;
1708	}
1709	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1710	}
1711	return (char *)pszStart;
1712	}
1713	RT_EXPORT_SYMBOL(RTStrPrevCp);
1714

注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 31221

以其他格式下載: