1/*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7 *===------------------------------------------------------------------------=*/ 9 * Copyright © 1991-2015 Unicode, Inc. All rights reserved. 10 * Distributed under the Terms of Use in 11 * http://www.unicode.org/copyright.html. 13 * Permission is hereby granted, free of charge, to any person obtaining 14 * a copy of the Unicode data files and any associated documentation 15 * (the "Data Files") or Unicode software and any associated documentation 16 * (the "Software") to deal in the Data Files or Software 17 * without restriction, including without limitation the rights to use, 18 * copy, modify, merge, publish, distribute, and/or sell copies of 19 * the Data Files or Software, and to permit persons to whom the Data Files 20 * or Software are furnished to do so, provided that 21 * (a) this copyright and permission notice appear with all copies 22 * of the Data Files or Software, 23 * (b) this copyright and permission notice appear in associated 25 * (c) there is clear notice in each modified Data File or in the Software 26 * as well as in the documentation associated with the Data File(s) or 27 * Software that the data or software has been modified. 29 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 30 * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 31 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 32 * NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
33 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 34 * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 35 * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 36 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 37 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 38 * PERFORMANCE OF THE DATA FILES OR SOFTWARE. 40 * Except as contained in this notice, the name of a copyright holder 41 * shall not be used in advertising or otherwise to promote the sale, 42 * use or other dealings in these Data Files or Software without prior 43 * written authorization of the copyright holder. 46/* --------------------------------------------------------------------- 48 Conversions between UTF32, UTF-16, and UTF-8. Source code file. 49 Author: Mark E. Davis, 1994. 50 Rev History: Rick McGowan, fixes & updates May 2001. 51 Sept 2001: fixed const & error conditions per 52 mods suggested by S. Parent & A. Lillich. 53 June 2002: Tim Dodd added detection and handling of incomplete 54 source sequences, enhanced error detection, added casts 55 to eliminate compiler warnings. 56 July 2003: slight mods to back out aggressive FFFE detection. 57 Jan 2004: updated switches in from-UTF8 conversions. 58 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 60 See the header file "ConvertUTF.h" for complete documentation. 62------------------------------------------------------------------------ */ 71 * This code extensively uses fall-through switches. 72 * Keep the compiler from warning about that. 74#if defined(__clang__) && defined(__has_warning) 75# if __has_warning("-Wimplicit-fallthrough")
76# define ConvertUTF_DISABLE_WARNINGS \ 77 _Pragma("clang diagnostic push") \
78 _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
79# define ConvertUTF_RESTORE_WARNINGS \ 80 _Pragma("clang diagnostic pop")
82#elif defined(__GNUC__) && __GNUC__ > 6 83# define ConvertUTF_DISABLE_WARNINGS \ 84 _Pragma("GCC diagnostic push") \
85 _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
86# define ConvertUTF_RESTORE_WARNINGS \ 87 _Pragma("GCC diagnostic pop")
89#ifndef ConvertUTF_DISABLE_WARNINGS 90# define ConvertUTF_DISABLE_WARNINGS 92#ifndef ConvertUTF_RESTORE_WARNINGS 93# define ConvertUTF_RESTORE_WARNINGS 100staticconstinthalfShift = 10;
/* used for shifting by 10 bits */ 105#define UNI_SUR_HIGH_START (UTF32)0xD800 106#define UNI_SUR_HIGH_END (UTF32)0xDBFF 107#define UNI_SUR_LOW_START (UTF32)0xDC00 108#define UNI_SUR_LOW_END (UTF32)0xDFFF 110/* --------------------------------------------------------------------- */ 113 * Index into the table below with the first byte of a UTF-8 sequence to 114 * get the number of trailing bytes that are supposed to follow it. 115 * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is 116 * left as-is for anyone who may want to do such conversion, which was 117 * allowed in earlier algorithms. 120 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
121 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
122 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
123 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
124 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
125 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
126 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
127 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
131 * Magic values subtracted from a buffer value during UTF8 conversion. 132 * This table contains as many values as there might be trailing bytes 133 * in a UTF-8 sequence. 136 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
139 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 140 * into the first byte, depending on how many bytes follow. There are 141 * as many entries in this table as there are UTF-8 sequence types. 142 * (I.e., one byte sequence, two byte... etc.). Remember that sequences 143 * for *legal* UTF-8 will be 4 or fewer bytes total. 147/* --------------------------------------------------------------------- */ 149/* The interface converts a whole buffer to avoid function-call overhead. 150 * Constants have been gathered. Loops & conditionals have been removed as 151 * much as possible for efficiency, in favor of drop-through switches. 152 * (See "Note A" at the bottom of the file for equivalent code.) 153 * If your compiler supports it, the "isLegalUTF8" call can be turned 154 * into an inline function. 158/* --------------------------------------------------------------------- */ 161constUTF32** sourceStart,
constUTF32* sourceEnd,
164constUTF32* source = *sourceStart;
165UTF16* target = *targetStart;
166while (source < sourceEnd) {
168if (target >= targetEnd) {
172if (ch <=
UNI_MAX_BMP) {
/* Target is a character <= 0xFFFF */ 173/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 176 --source;
/* return to the illegal value itself */ 183 *target++ = (
UTF16)ch;
/* normal case */ 192/* target is a character in range 0xFFFF - 0x10FFFF. */ 193if (target + 1 >= targetEnd) {
194 --source;
/* Back up source pointer! */ 202 *sourceStart = source;
203 *targetStart = target;
207/* --------------------------------------------------------------------- */ 210constUTF16** sourceStart,
constUTF16* sourceEnd,
213constUTF16* source = *sourceStart;
214UTF32* target = *targetStart;
216while (source < sourceEnd) {
217constUTF16* oldSource = source;
/* In case we have to back up because of target overflow. */ 219/* If we have a surrogate pair, convert to UTF32 first. */ 221/* If the 16 bits following the high surrogate are in the source buffer... */ 222if (source < sourceEnd) {
224/* If it's a low surrogate, convert to UTF32. */ 230 --source;
/* return to the illegal value itself */ 234 }
else {
/* We don't have the 16 bits following the high surrogate. */ 235 --source;
/* return to the high surrogate */ 240/* UTF-16 surrogate values are illegal in UTF-32 */ 242 --source;
/* return to the illegal value itself */ 247if (target >= targetEnd) {
248 source = oldSource;
/* Back up source pointer! */ 253 *sourceStart = source;
254 *targetStart = target;
257 fprintf(stderr,
"ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
264constUTF16** sourceStart,
constUTF16* sourceEnd,
267constUTF16* source = *sourceStart;
268UTF8* target = *targetStart;
269while (source < sourceEnd) {
271unsignedshort bytesToWrite = 0;
272constUTF32 byteMask = 0xBF;
273constUTF32 byteMark = 0x80;
274constUTF16* oldSource = source;
/* In case we have to back up because of target overflow. */ 276/* If we have a surrogate pair, convert to UTF32 first. */ 278/* If the 16 bits following the high surrogate are in the source buffer... */ 279if (source < sourceEnd) {
281/* If it's a low surrogate, convert to UTF32. */ 287 --source;
/* return to the illegal value itself */ 291 }
else {
/* We don't have the 16 bits following the high surrogate. */ 292 --source;
/* return to the high surrogate */ 297/* UTF-16 surrogate values are illegal in UTF-32 */ 299 --source;
/* return to the illegal value itself */ 304/* Figure out how many bytes the result will require */ 305if (ch < (
UTF32)0x80) { bytesToWrite = 1;
306 }
elseif (ch < (
UTF32)0x800) { bytesToWrite = 2;
307 }
elseif (ch < (
UTF32)0x10000) { bytesToWrite = 3;
308 }
elseif (ch < (
UTF32)0x110000) { bytesToWrite = 4;
309 }
else { bytesToWrite = 3;
313 target += bytesToWrite;
314if (target > targetEnd) {
315 source = oldSource;
/* Back up source pointer! */ 318switch (bytesToWrite) {
/* note: everything falls through. */ 319case 4: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
320case 3: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
321case 2: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
324 target += bytesToWrite;
326 *sourceStart = source;
327 *targetStart = target;
331/* --------------------------------------------------------------------- */ 334constUTF32** sourceStart,
constUTF32* sourceEnd,
337constUTF32* source = *sourceStart;
338UTF8* target = *targetStart;
339while (source < sourceEnd) {
341unsignedshort bytesToWrite = 0;
342constUTF32 byteMask = 0xBF;
343constUTF32 byteMark = 0x80;
346/* UTF-16 surrogate values are illegal in UTF-32 */ 348 --source;
/* return to the illegal value itself */ 354 * Figure out how many bytes the result will require. Turn any 355 * illegally large UTF32 things (> Plane 17) into replacement chars. 357if (ch < (
UTF32)0x80) { bytesToWrite = 1;
358 }
elseif (ch < (
UTF32)0x800) { bytesToWrite = 2;
359 }
elseif (ch < (
UTF32)0x10000) { bytesToWrite = 3;
361 }
else { bytesToWrite = 3;
366 target += bytesToWrite;
367if (target > targetEnd) {
368 --source;
/* Back up source pointer! */ 371switch (bytesToWrite) {
/* note: everything falls through. */ 372case 4: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
373case 3: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
374case 2: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
377 target += bytesToWrite;
379 *sourceStart = source;
380 *targetStart = target;
384/* --------------------------------------------------------------------- */ 387 * Utility routine to tell whether a sequence of bytes is legal UTF-8. 388 * This must be called with the length pre-determined by the first byte. 389 * If not calling this from ConvertUTF8to*, then the length can be set by: 390 * length = trailingBytesForUTF8[*source]+1; 391 * and the sequence is illegal right away if there aren't that many bytes 392 * available. 393 * If presented with a length > 4, this returns false. The Unicode 394 * definition of UTF-8 goes up to 4-byte sequences. 399constUTF8 *srcptr = source+length;
402/* Everything else falls through when "true"... */ 403case 4:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
returnfalse;
404case 3:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
returnfalse;
405case 2:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
returnfalse;
408/* no fall-through in this inner switch */ 409case 0xE0:
if (a < 0xA0)
returnfalse;
break;
410case 0xED:
if (a > 0x9F)
returnfalse;
break;
411case 0xF0:
if (a < 0x90)
returnfalse;
break;
412case 0xF4:
if (a > 0x8F)
returnfalse;
break;
413default:
if (a < 0x80)
returnfalse;
416case 1:
if (*source >= 0x80 && *source < 0xC2)
returnfalse;
418if (*source > 0xF4)
returnfalse;
422/* --------------------------------------------------------------------- */ 425 * Exported function to return whether a UTF-8 sequence is legal or not. 426 * This is not used here; it's just exported. 430if (length > sourceEnd - source) {
437 * Exported function to return the size of the first UTF-8 code unit sequence, 438 * or 0 if the sequence is not valid. 442return (length <= sourceEnd - source &&
isLegalUTF8(source, length)) ? length
446/* --------------------------------------------------------------------- */ 450constUTF8 *sourceEnd) {
456 * Unicode 6.3.0, D93b: 458 * Maximal subpart of an ill-formed subsequence: The longest code unit 459 * subsequence starting at an unconvertible offset that is either: 460 * a. the initial subsequence of a well-formed code unit sequence, or 461 * b. a subsequence of length one. 464if (source == sourceEnd)
468 * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 474if (b1 >= 0xC2 && b1 <= 0xDF) {
476 * First byte is valid, but we know that this code unit sequence is 477 * invalid, so the maximal subpart has to end after the first byte. 482if (source == sourceEnd)
489return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
491if (b1 >= 0xE1 && b1 <= 0xEC) {
492return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
495return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
497if (b1 >= 0xEE && b1 <= 0xEF) {
498return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
501if (b2 >= 0x90 && b2 <= 0xBF) {
502if (source == sourceEnd)
506return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
510if (b1 >= 0xF1 && b1 <= 0xF3) {
511if (b2 >= 0x80 && b2 <= 0xBF) {
512if (source == sourceEnd)
516return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
521if (b2 >= 0x80 && b2 <= 0x8F) {
522if (source == sourceEnd)
526return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
531assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
533 * There are no valid sequences that start with these bytes. Maximal subpart 534 * is defined to have length 1 in these cases. 539/* --------------------------------------------------------------------- */ 542 * Exported function to return the total number of bytes in a codepoint 543 * represented in UTF-8, given the value of the first byte. 549/* --------------------------------------------------------------------- */ 552 * Exported function to return whether a UTF-8 string is legal or not. 553 * This is not used here; it's just exported. 556while (*source != sourceEnd) {
558if (length > sourceEnd - *source || !
isLegalUTF8(*source, length))
565/* --------------------------------------------------------------------- */ 568constUTF8** sourceStart,
constUTF8* sourceEnd,
571constUTF8* source = *sourceStart;
572UTF16* target = *targetStart;
573while (source < sourceEnd) {
576if (extraBytesToRead >= sourceEnd - source) {
579/* Do this check whether lenient or strict */ 585 * The cases all fall through. See "Note A" below. 587switch (extraBytesToRead) {
588case 5: ch += *source++; ch <<= 6;
/* remember, illegal UTF-8 */ 589case 4: ch += *source++; ch <<= 6;
/* remember, illegal UTF-8 */ 590case 3: ch += *source++; ch <<= 6;
591case 2: ch += *source++; ch <<= 6;
592case 1: ch += *source++; ch <<= 6;
593case 0: ch += *source++;
597if (target >= targetEnd) {
598 source -= (extraBytesToRead+1);
/* Back up source pointer! */ 601if (ch <=
UNI_MAX_BMP) {
/* Target is a character <= 0xFFFF */ 602/* UTF-16 surrogate values are illegal in UTF-32 */ 605 source -= (extraBytesToRead+1);
/* return to the illegal value itself */ 612 *target++ = (
UTF16)ch;
/* normal case */ 617 source -= (extraBytesToRead+1);
/* return to the start */ 618break;
/* Bail out; shouldn't continue */ 623/* target is a character in range 0xFFFF - 0x10FFFF. */ 624if (target + 1 >= targetEnd) {
625 source -= (extraBytesToRead+1);
/* Back up source pointer! */ 633 *sourceStart = source;
634 *targetStart = target;
638/* --------------------------------------------------------------------- */ 641constUTF8** sourceStart,
constUTF8* sourceEnd,
645constUTF8* source = *sourceStart;
646UTF32* target = *targetStart;
647while (source < sourceEnd) {
650if (extraBytesToRead >= sourceEnd - source) {
658 * Replace the maximal subpart of ill-formed sequence with 659 * replacement character. 667if (target >= targetEnd) {
671/* Do this check whether lenient or strict */ 675/* Abort conversion. */ 679 * Replace the maximal subpart of ill-formed sequence with 680 * replacement character. 689 * The cases all fall through. See "Note A" below. 691switch (extraBytesToRead) {
692case 5: ch += *source++; ch <<= 6;
693case 4: ch += *source++; ch <<= 6;
694case 3: ch += *source++; ch <<= 6;
695case 2: ch += *source++; ch <<= 6;
696case 1: ch += *source++; ch <<= 6;
697case 0: ch += *source++;
703 * UTF-16 surrogate values are illegal in UTF-32, and anything 704 * over Plane 17 (> 0x10FFFF) is illegal. 708 source -= (extraBytesToRead+1);
/* return to the illegal value itself */ 717 }
else {
/* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 722 *sourceStart = source;
723 *targetStart = target;
733 flags,
/*InputIsPartial=*/true);
740 flags,
/*InputIsPartial=*/false);
743/* --------------------------------------------------------------------- 746 The fall-through switches in UTF-8 reading code save a 747 temp variable, some decrements & conditionals. The switches 748 are equivalent to the following loop: 750 int tmpBytesToRead = extraBytesToRead+1; 751 do { 752 ch += *source++; 753 --tmpBytesToRead; 754 if (tmpBytesToRead) ch <<= 6; 755 } while (tmpBytesToRead > 0); 757 In UTF-8 writing code, the switches on "bytesToWrite" are 758 similarly unrolled loops. 760 --------------------------------------------------------------------- */ #define UNI_SUR_LOW_START
#define UNI_SUR_HIGH_START
#define ConvertUTF_DISABLE_WARNINGS
#define ConvertUTF_RESTORE_WARNINGS
#define UNI_REPLACEMENT_CHAR
#define UNI_MAX_LEGAL_UTF32
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This is an optimization pass for GlobalISel generic memory operations.
static ConversionResult ConvertUTF8toUTF32Impl(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags, Boolean InputIsPartial)
static const UTF32 offsetsFromUTF8[6]
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
static const int halfShift
unsigned getNumBytesForUTF8(UTF8 firstByte)
static const UTF32 halfBase
static Boolean isLegalUTF8(const UTF8 *source, int length)
static const char trailingBytesForUTF8[256]
ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
Convert a partial UTF8 sequence to UTF32.
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
static const UTF32 halfMask
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd)
static unsigned findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd)
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
static const UTF8 firstByteMark[7]
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)