Movatterモバイル変換

Go to the documentation of this file.

1/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===

2 *

3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

4 * See https://llvm.org/LICENSE.txt for license information.

5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

6 *

7 *===------------------------------------------------------------------------=*/

8/*

10 * Distributed under the Terms of Use in

11 * http://www.unicode.org/copyright.html.

12 *

13 * Permission is hereby granted, free of charge, to any person obtaining

14 * a copy of the Unicode data files and any associated documentation

15 * (the "Data Files") or Unicode software and any associated documentation

16 * (the "Software") to deal in the Data Files or Software

17 * without restriction, including without limitation the rights to use,

18 * copy, modify, merge, publish, distribute, and/or sell copies of

19 * the Data Files or Software, and to permit persons to whom the Data Files

20 * or Software are furnished to do so, provided that

21 * (a) this copyright and permission notice appear with all copies

22 * of the Data Files or Software,

23 * (b) this copyright and permission notice appear in associated

24 * documentation, and

25 * (c) there is clear notice in each modified Data File or in the Software

26 * as well as in the documentation associated with the Data File(s) or

27 * Software that the data or software has been modified.

28 *

29 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF

30 * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE

31 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

32 * NONINFRINGEMENT OF THIRD PARTY RIGHTS.

33 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS

34 * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL

35 * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,

36 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER

37 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR

38 * PERFORMANCE OF THE DATA FILES OR SOFTWARE.

39 *

40 * Except as contained in this notice, the name of a copyright holder

41 * shall not be used in advertising or otherwise to promote the sale,

42 * use or other dealings in these Data Files or Software without prior

43 * written authorization of the copyright holder.

44 */

46/* ---------------------------------------------------------------------

48 Conversions between UTF32, UTF-16, and UTF-8. Source code file.

49 Author: Mark E. Davis, 1994.

50 Rev History: Rick McGowan, fixes & updates May 2001.

51 Sept 2001: fixed const & error conditions per

52 mods suggested by S. Parent & A. Lillich.

53 June 2002: Tim Dodd added detection and handling of incomplete

54 source sequences, enhanced error detection, added casts

55 to eliminate compiler warnings.

56 July 2003: slight mods to back out aggressive FFFE detection.

57 Jan 2004: updated switches in from-UTF8 conversions.

58 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

60 See the header file "ConvertUTF.h" for complete documentation.

62------------------------------------------------------------------------ */

64#include "llvm/Support/ConvertUTF.h"

65#ifdef CVTUTF_DEBUG

66#include <stdio.h>

67#endif

68#include <assert.h>

/*
 * The conversion routines below use switch statements whose cases
 * deliberately fall through. ConvertUTF_DISABLE_WARNINGS and
 * ConvertUTF_RESTORE_WARNINGS bracket that code so that the
 * -Wimplicit-fallthrough diagnostic stays quiet on compilers that have it.
 */
#if defined(__clang__) && defined(__has_warning)
/* __has_warning is only evaluated once we know it exists. */
# if __has_warning("-Wimplicit-fallthrough")
#  define ConvertUTF_DISABLE_WARNINGS \
    _Pragma("clang diagnostic push") \
    _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#  define ConvertUTF_RESTORE_WARNINGS \
    _Pragma("clang diagnostic pop")
# endif
#elif defined(__GNUC__) && __GNUC__ > 6
# define ConvertUTF_DISABLE_WARNINGS \
   _Pragma("GCC diagnostic push") \
   _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
# define ConvertUTF_RESTORE_WARNINGS \
   _Pragma("GCC diagnostic pop")
#endif

/* Anything not defined above expands to nothing. */
#if !defined(ConvertUTF_DISABLE_WARNINGS)
# define ConvertUTF_DISABLE_WARNINGS
#endif
#if !defined(ConvertUTF_RESTORE_WARNINGS)
# define ConvertUTF_RESTORE_WARNINGS
#endif

96ConvertUTF_DISABLE_WARNINGS

98namespacellvm {

100staticconstinthalfShift = 10;/* used for shifting by 10 bits */

101

102staticconstUTF32 halfBase = 0x0010000UL;

103staticconstUTF32 halfMask = 0x3FFUL;

104

105#define UNI_SUR_HIGH_START (UTF32)0xD800

106#define UNI_SUR_HIGH_END (UTF32)0xDBFF

107#define UNI_SUR_LOW_START (UTF32)0xDC00

108#define UNI_SUR_LOW_END (UTF32)0xDFFF

109

110/* --------------------------------------------------------------------- */

111

/*
 * Lookup table keyed by the first byte of a UTF-8 sequence; the entry is the
 * number of trailing bytes expected to follow that byte.
 * Entries of 4 and 5 describe sequences that are not *legal* UTF-8. They are
 * kept for anyone who wants to handle the longer forms allowed by earlier
 * algorithms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0xBF: no trailing bytes */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: one trailing byte */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF: two trailing bytes */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xF7: three; 0xF8 - 0xFB: four; 0xFC - 0xFF: five */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

129

130/*

131 * Magic values subtracted from a buffer value during UTF8 conversion.

132 * This table contains as many values as there might be trailing bytes

133 * in a UTF-8 sequence.

134 */

135staticconstUTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,

136 0x03C82080UL, 0xFA082080UL, 0x82082080UL };

137

138/*

139 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed

140 * into the first byte, depending on how many bytes follow. There are

141 * as many entries in this table as there are UTF-8 sequence types.

142 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs

143 * for *legal* UTF-8 will be 4 or fewer bytes total.

144 */

145staticconstUTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

146

147/* --------------------------------------------------------------------- */

148

149/* The interface converts a whole buffer to avoid function-call overhead.

150 * Constants have been gathered. Loops & conditionals have been removed as

151 * much as possible for efficiency, in favor of drop-through switches.

152 * (See "Note A" at the bottom of the file for equivalent code.)

153 * If your compiler supports it, the "isLegalUTF8" call can be turned

154 * into an inline function.

155 */

156

157

158/* --------------------------------------------------------------------- */

159

160ConversionResult ConvertUTF32toUTF16 (

161constUTF32** sourceStart,constUTF32* sourceEnd,

162UTF16** targetStart,UTF16* targetEnd,ConversionFlags flags) {

163ConversionResult result =conversionOK;

164constUTF32* source = *sourceStart;

165UTF16* target = *targetStart;

166while (source < sourceEnd) {

167UTF32 ch;

168if (target >= targetEnd) {

169 result =targetExhausted;break;

170 }

171 ch = *source++;

172if (ch <=UNI_MAX_BMP) {/* Target is a character <= 0xFFFF */

173/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */

174if (ch >=UNI_SUR_HIGH_START && ch <=UNI_SUR_LOW_END) {

175if (flags ==strictConversion) {

176 --source;/* return to the illegal value itself */

177 result =sourceIllegal;

178break;

179 }else {

180 *target++ =UNI_REPLACEMENT_CHAR;

181 }

182 }else {

183 *target++ = (UTF16)ch;/* normal case */

184 }

185 }elseif (ch >UNI_MAX_LEGAL_UTF32) {

186if (flags ==strictConversion) {

187 result =sourceIllegal;

188 }else {

189 *target++ =UNI_REPLACEMENT_CHAR;

190 }

191 }else {

192/* target is a character in range 0xFFFF - 0x10FFFF. */

193if (target + 1 >= targetEnd) {

194 --source;/* Back up source pointer! */

195 result =targetExhausted;break;

196 }

197 ch -=halfBase;

198 *target++ = (UTF16)((ch >>halfShift) +UNI_SUR_HIGH_START);

199 *target++ = (UTF16)((ch &halfMask) +UNI_SUR_LOW_START);

200 }

201 }

202 *sourceStart = source;

203 *targetStart = target;

204return result;

205}

206

207/* --------------------------------------------------------------------- */

208

209ConversionResult ConvertUTF16toUTF32 (

210constUTF16** sourceStart,constUTF16* sourceEnd,

211UTF32** targetStart,UTF32* targetEnd,ConversionFlags flags) {

212ConversionResult result =conversionOK;

213constUTF16* source = *sourceStart;

214UTF32* target = *targetStart;

215UTF32 ch, ch2;

216while (source < sourceEnd) {

217constUTF16* oldSource = source;/* In case we have to back up because of target overflow. */

218 ch = *source++;

219/* If we have a surrogate pair, convert to UTF32 first. */

220if (ch >=UNI_SUR_HIGH_START && ch <=UNI_SUR_HIGH_END) {

221/* If the 16 bits following the high surrogate are in the source buffer... */

222if (source < sourceEnd) {

223 ch2 = *source;

224/* If it's a low surrogate, convert to UTF32. */

225if (ch2 >=UNI_SUR_LOW_START && ch2 <=UNI_SUR_LOW_END) {

226 ch = ((ch -UNI_SUR_HIGH_START) <<halfShift)

227 + (ch2 -UNI_SUR_LOW_START) +halfBase;

228 ++source;

229 }elseif (flags ==strictConversion) {/* it's an unpaired high surrogate */

230 --source;/* return to the illegal value itself */

231 result =sourceIllegal;

232break;

233 }

234 }else {/* We don't have the 16 bits following the high surrogate. */

235 --source;/* return to the high surrogate */

236 result =sourceExhausted;

237break;

238 }

239 }elseif (flags ==strictConversion) {

240/* UTF-16 surrogate values are illegal in UTF-32 */

241if (ch >=UNI_SUR_LOW_START && ch <=UNI_SUR_LOW_END) {

242 --source;/* return to the illegal value itself */

243 result =sourceIllegal;

244break;

245 }

246 }

247if (target >= targetEnd) {

248 source = oldSource;/* Back up source pointer! */

249 result =targetExhausted;break;

250 }

251 *target++ = ch;

252 }

253 *sourceStart = source;

254 *targetStart = target;

255#ifdef CVTUTF_DEBUG

256if (result ==sourceIllegal) {

257 fprintf(stderr,"ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);

258 fflush(stderr);

259}

260#endif

261return result;

262}

263ConversionResult ConvertUTF16toUTF8 (

264constUTF16** sourceStart,constUTF16* sourceEnd,

265UTF8** targetStart,UTF8* targetEnd,ConversionFlags flags) {

266ConversionResult result =conversionOK;

267constUTF16* source = *sourceStart;

268UTF8* target = *targetStart;

269while (source < sourceEnd) {

270UTF32 ch;

271unsignedshort bytesToWrite = 0;

272constUTF32 byteMask = 0xBF;

273constUTF32 byteMark = 0x80;

274constUTF16* oldSource = source;/* In case we have to back up because of target overflow. */

275 ch = *source++;

276/* If we have a surrogate pair, convert to UTF32 first. */

277if (ch >=UNI_SUR_HIGH_START && ch <=UNI_SUR_HIGH_END) {

278/* If the 16 bits following the high surrogate are in the source buffer... */

279if (source < sourceEnd) {

280UTF32 ch2 = *source;

281/* If it's a low surrogate, convert to UTF32. */

282if (ch2 >=UNI_SUR_LOW_START && ch2 <=UNI_SUR_LOW_END) {

283 ch = ((ch -UNI_SUR_HIGH_START) <<halfShift)

284 + (ch2 -UNI_SUR_LOW_START) +halfBase;

285 ++source;

286 }elseif (flags ==strictConversion) {/* it's an unpaired high surrogate */

287 --source;/* return to the illegal value itself */

288 result =sourceIllegal;

289break;

290 }

291 }else {/* We don't have the 16 bits following the high surrogate. */

292 --source;/* return to the high surrogate */

293 result =sourceExhausted;

294break;

295 }

296 }elseif (flags ==strictConversion) {

297/* UTF-16 surrogate values are illegal in UTF-32 */

298if (ch >=UNI_SUR_LOW_START && ch <=UNI_SUR_LOW_END) {

299 --source;/* return to the illegal value itself */

300 result =sourceIllegal;

301break;

302 }

303 }

304/* Figure out how many bytes the result will require */

305if (ch < (UTF32)0x80) { bytesToWrite = 1;

306 }elseif (ch < (UTF32)0x800) { bytesToWrite = 2;

307 }elseif (ch < (UTF32)0x10000) { bytesToWrite = 3;

308 }elseif (ch < (UTF32)0x110000) { bytesToWrite = 4;

309 }else { bytesToWrite = 3;

310 ch =UNI_REPLACEMENT_CHAR;

311 }

312

313 target += bytesToWrite;

314if (target > targetEnd) {

315 source = oldSource;/* Back up source pointer! */

316 target -= bytesToWrite; result =targetExhausted;break;

317 }

318switch (bytesToWrite) {/* note: everything falls through. */

319case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

320case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

321case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

322case 1: *--target = (UTF8)(ch |firstByteMark[bytesToWrite]);

323 }

324 target += bytesToWrite;

325 }

326 *sourceStart = source;

327 *targetStart = target;

328return result;

329}

330

331/* --------------------------------------------------------------------- */

332

333ConversionResult ConvertUTF32toUTF8 (

334constUTF32** sourceStart,constUTF32* sourceEnd,

335UTF8** targetStart,UTF8* targetEnd,ConversionFlags flags) {

336ConversionResult result =conversionOK;

337constUTF32* source = *sourceStart;

338UTF8* target = *targetStart;

339while (source < sourceEnd) {

340UTF32 ch;

341unsignedshort bytesToWrite = 0;

342constUTF32 byteMask = 0xBF;

343constUTF32 byteMark = 0x80;

344 ch = *source++;

345if (flags ==strictConversion ) {

346/* UTF-16 surrogate values are illegal in UTF-32 */

347if (ch >=UNI_SUR_HIGH_START && ch <=UNI_SUR_LOW_END) {

348 --source;/* return to the illegal value itself */

349 result =sourceIllegal;

350break;

351 }

352 }

353/*

354 * Figure out how many bytes the result will require. Turn any

355 * illegally large UTF32 things (> Plane 17) into replacement chars.

356 */

357if (ch < (UTF32)0x80) { bytesToWrite = 1;

358 }elseif (ch < (UTF32)0x800) { bytesToWrite = 2;

359 }elseif (ch < (UTF32)0x10000) { bytesToWrite = 3;

360 }elseif (ch <=UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;

361 }else { bytesToWrite = 3;

362 ch =UNI_REPLACEMENT_CHAR;

363 result =sourceIllegal;

364 }

365

366 target += bytesToWrite;

367if (target > targetEnd) {

368 --source;/* Back up source pointer! */

369 target -= bytesToWrite; result =targetExhausted;break;

370 }

371switch (bytesToWrite) {/* note: everything falls through. */

372case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

373case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

374case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;

375case 1: *--target = (UTF8) (ch |firstByteMark[bytesToWrite]);

376 }

377 target += bytesToWrite;

378 }

379 *sourceStart = source;

380 *targetStart = target;

381return result;

382}

383

384/* --------------------------------------------------------------------- */

385

386/*

387 * Utility routine to tell whether a sequence of bytes is legal UTF-8.

388 * This must be called with the length pre-determined by the first byte.

389 * If not calling this from ConvertUTF8to*, then the length can be set by:

390 * length = trailingBytesForUTF8[*source]+1;

391 * and the sequence is illegal right away if there aren't that many bytes

392 * available.

393 * If presented with a length > 4, this returns false. The Unicode

394 * definition of UTF-8 goes up to 4-byte sequences.

395 */

396

397staticBoolean isLegalUTF8(constUTF8 *source,int length) {

398UTF8 a;

399constUTF8 *srcptr = source+length;

400switch (length) {

401default:returnfalse;

402/* Everything else falls through when "true"... */

403case 4:if ((a = (*--srcptr)) < 0x80 || a > 0xBF)returnfalse;

404case 3:if ((a = (*--srcptr)) < 0x80 || a > 0xBF)returnfalse;

405case 2:if ((a = (*--srcptr)) < 0x80 || a > 0xBF)returnfalse;

406

407switch (*source) {

408/* no fall-through in this inner switch */

409case 0xE0:if (a < 0xA0)returnfalse;break;

410case 0xED:if (a > 0x9F)returnfalse;break;

411case 0xF0:if (a < 0x90)returnfalse;break;

412case 0xF4:if (a > 0x8F)returnfalse;break;

413default:if (a < 0x80)returnfalse;

414 }

415

416case 1:if (*source >= 0x80 && *source < 0xC2)returnfalse;

417 }

418if (*source > 0xF4)returnfalse;

419returntrue;

420}

421

422/* --------------------------------------------------------------------- */

423

424/*

425 * Exported function to return whether a UTF-8 sequence is legal or not.

426 * This is not used here; it's just exported.

427 */

428Boolean isLegalUTF8Sequence(constUTF8 *source,constUTF8 *sourceEnd) {

429int length =trailingBytesForUTF8[*source]+1;

430if (length > sourceEnd - source) {

431returnfalse;

432 }

433returnisLegalUTF8(source, length);

434}

435

436/*

437 * Exported function to return the size of the first utf-8 code unit sequence,

438 * Or 0 if the sequence is not valid;

439 */

440unsignedgetUTF8SequenceSize(constUTF8 *source,constUTF8 *sourceEnd) {

441int length =trailingBytesForUTF8[*source] + 1;

442return (length <= sourceEnd - source &&isLegalUTF8(source, length)) ? length

443 : 0;

444}

445

446/* --------------------------------------------------------------------- */

447

448staticunsigned

449findMaximalSubpartOfIllFormedUTF8Sequence(constUTF8 *source,

450constUTF8 *sourceEnd) {

451UTF8 b1, b2, b3;

452

453assert(!isLegalUTF8Sequence(source, sourceEnd));

454

455/*

456 * Unicode 6.3.0, D93b:

457 *

458 * Maximal subpart of an ill-formed subsequence: The longest code unit

459 * subsequence starting at an unconvertible offset that is either:

460 * a. the initial subsequence of a well-formed code unit sequence, or

461 * b. a subsequence of length one.

462 */

463

464if (source == sourceEnd)

465return 0;

466

467/*

468 * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8

469 * Byte Sequences.

470 */

471

472 b1 = *source;

473 ++source;

474if (b1 >= 0xC2 && b1 <= 0xDF) {

475/*

476 * First byte is valid, but we know that this code unit sequence is

477 * invalid, so the maximal subpart has to end after the first byte.

478 */

479return 1;

480 }

481

482if (source == sourceEnd)

483return 1;

484

485 b2 = *source;

486 ++source;

487

488if (b1 == 0xE0) {

489return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;

490 }

491if (b1 >= 0xE1 && b1 <= 0xEC) {

492return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;

493 }

494if (b1 == 0xED) {

495return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;

496 }

497if (b1 >= 0xEE && b1 <= 0xEF) {

498return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;

499 }

500if (b1 == 0xF0) {

501if (b2 >= 0x90 && b2 <= 0xBF) {

502if (source == sourceEnd)

503return 2;

504

505 b3 = *source;

506return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;

507 }

508return 1;

509 }

510if (b1 >= 0xF1 && b1 <= 0xF3) {

511if (b2 >= 0x80 && b2 <= 0xBF) {

512if (source == sourceEnd)

513return 2;

514

515 b3 = *source;

516return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;

517 }

518return 1;

519 }

520if (b1 == 0xF4) {

521if (b2 >= 0x80 && b2 <= 0x8F) {

522if (source == sourceEnd)

523return 2;

524

525 b3 = *source;

526return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;

527 }

528return 1;

529 }

530

531assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);

532/*

533 * There are no valid sequences that start with these bytes. Maximal subpart

534 * is defined to have length 1 in these cases.

535 */

536return 1;

537}

538

539/* --------------------------------------------------------------------- */

540

541/*

542 * Exported function to return the total number of bytes in a codepoint

543 * represented in UTF-8, given the value of the first byte.

544 */

545unsignedgetNumBytesForUTF8(UTF8 first) {

546returntrailingBytesForUTF8[first] + 1;

547}

548

549/* --------------------------------------------------------------------- */

550

551/*

552 * Exported function to return whether a UTF-8 string is legal or not.

553 * This is not used here; it's just exported.

554 */

555Boolean isLegalUTF8String(constUTF8 **source,constUTF8 *sourceEnd) {

556while (*source != sourceEnd) {

557int length =trailingBytesForUTF8[**source] + 1;

558if (length > sourceEnd - *source || !isLegalUTF8(*source, length))

559returnfalse;

560 *source += length;

561 }

562returntrue;

563}

564

565/* --------------------------------------------------------------------- */

566

567ConversionResult ConvertUTF8toUTF16 (

568constUTF8** sourceStart,constUTF8* sourceEnd,

569UTF16** targetStart,UTF16* targetEnd,ConversionFlags flags) {

570ConversionResult result =conversionOK;

571constUTF8* source = *sourceStart;

572UTF16* target = *targetStart;

573while (source < sourceEnd) {

574UTF32 ch = 0;

575unsignedshort extraBytesToRead =trailingBytesForUTF8[*source];

576if (extraBytesToRead >= sourceEnd - source) {

577 result =sourceExhausted;break;

578 }

579/* Do this check whether lenient or strict */

580if (!isLegalUTF8(source, extraBytesToRead+1)) {

581 result =sourceIllegal;

582break;

583 }

584/*

585 * The cases all fall through. See "Note A" below.

586 */

587switch (extraBytesToRead) {

588case 5: ch += *source++; ch <<= 6;/* remember, illegal UTF-8 */

589case 4: ch += *source++; ch <<= 6;/* remember, illegal UTF-8 */

590case 3: ch += *source++; ch <<= 6;

591case 2: ch += *source++; ch <<= 6;

592case 1: ch += *source++; ch <<= 6;

593case 0: ch += *source++;

594 }

595 ch -=offsetsFromUTF8[extraBytesToRead];

596

597if (target >= targetEnd) {

598 source -= (extraBytesToRead+1);/* Back up source pointer! */

599 result =targetExhausted;break;

600 }

601if (ch <=UNI_MAX_BMP) {/* Target is a character <= 0xFFFF */

602/* UTF-16 surrogate values are illegal in UTF-32 */

603if (ch >=UNI_SUR_HIGH_START && ch <=UNI_SUR_LOW_END) {

604if (flags ==strictConversion) {

605 source -= (extraBytesToRead+1);/* return to the illegal value itself */

606 result =sourceIllegal;

607break;

608 }else {

609 *target++ =UNI_REPLACEMENT_CHAR;

610 }

611 }else {

612 *target++ = (UTF16)ch;/* normal case */

613 }

614 }elseif (ch >UNI_MAX_UTF16) {

615if (flags ==strictConversion) {

616 result =sourceIllegal;

617 source -= (extraBytesToRead+1);/* return to the start */

618break;/* Bail out; shouldn't continue */

619 }else {

620 *target++ =UNI_REPLACEMENT_CHAR;

621 }

622 }else {

623/* target is a character in range 0xFFFF - 0x10FFFF. */

624if (target + 1 >= targetEnd) {

625 source -= (extraBytesToRead+1);/* Back up source pointer! */

626 result =targetExhausted;break;

627 }

628 ch -=halfBase;

629 *target++ = (UTF16)((ch >>halfShift) +UNI_SUR_HIGH_START);

630 *target++ = (UTF16)((ch &halfMask) +UNI_SUR_LOW_START);

631 }

632 }

633 *sourceStart = source;

634 *targetStart = target;

635return result;

636}

637

638/* --------------------------------------------------------------------- */

639

640staticConversionResult ConvertUTF8toUTF32Impl(

641constUTF8** sourceStart,constUTF8* sourceEnd,

642UTF32** targetStart,UTF32* targetEnd,ConversionFlags flags,

643Boolean InputIsPartial) {

644ConversionResult result =conversionOK;

645constUTF8* source = *sourceStart;

646UTF32* target = *targetStart;

647while (source < sourceEnd) {

648UTF32 ch = 0;

649unsignedshort extraBytesToRead =trailingBytesForUTF8[*source];

650if (extraBytesToRead >= sourceEnd - source) {

651if (flags ==strictConversion || InputIsPartial) {

652 result =sourceExhausted;

653break;

654 }else {

655 result =sourceIllegal;

656

657/*

658 * Replace the maximal subpart of ill-formed sequence with

659 * replacement character.

660 */

661 source +=findMaximalSubpartOfIllFormedUTF8Sequence(source,

662 sourceEnd);

663 *target++ =UNI_REPLACEMENT_CHAR;

664continue;

665 }

666 }

667if (target >= targetEnd) {

668 result =targetExhausted;break;

669 }

670

671/* Do this check whether lenient or strict */

672if (!isLegalUTF8(source, extraBytesToRead+1)) {

673 result =sourceIllegal;

674if (flags ==strictConversion) {

675/* Abort conversion. */

676break;

677 }else {

678/*

679 * Replace the maximal subpart of ill-formed sequence with

680 * replacement character.

681 */

682 source +=findMaximalSubpartOfIllFormedUTF8Sequence(source,

683 sourceEnd);

684 *target++ =UNI_REPLACEMENT_CHAR;

685continue;

686 }

687 }

688/*

689 * The cases all fall through. See "Note A" below.

690 */

691switch (extraBytesToRead) {

692case 5: ch += *source++; ch <<= 6;

693case 4: ch += *source++; ch <<= 6;

694case 3: ch += *source++; ch <<= 6;

695case 2: ch += *source++; ch <<= 6;

696case 1: ch += *source++; ch <<= 6;

697case 0: ch += *source++;

698 }

699 ch -=offsetsFromUTF8[extraBytesToRead];

700

701if (ch <=UNI_MAX_LEGAL_UTF32) {

702/*

703 * UTF-16 surrogate values are illegal in UTF-32, and anything

704 * over Plane 17 (> 0x10FFFF) is illegal.

705 */

706if (ch >=UNI_SUR_HIGH_START && ch <=UNI_SUR_LOW_END) {

707if (flags ==strictConversion) {

708 source -= (extraBytesToRead+1);/* return to the illegal value itself */

709 result =sourceIllegal;

710break;

711 }else {

712 *target++ =UNI_REPLACEMENT_CHAR;

713 }

714 }else {

715 *target++ = ch;

716 }

717 }else {/* i.e., ch > UNI_MAX_LEGAL_UTF32 */

718 result =sourceIllegal;

719 *target++ =UNI_REPLACEMENT_CHAR;

720 }

721 }

722 *sourceStart = source;

723 *targetStart = target;

724return result;

725}

726

727ConversionResult ConvertUTF8toUTF32Partial(constUTF8 **sourceStart,

728constUTF8 *sourceEnd,

729UTF32 **targetStart,

730UTF32 *targetEnd,

731ConversionFlags flags) {

732returnConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,

733 flags,/*InputIsPartial=*/true);

734}

735

736ConversionResult ConvertUTF8toUTF32(constUTF8 **sourceStart,

737constUTF8 *sourceEnd,UTF32 **targetStart,

738UTF32 *targetEnd,ConversionFlags flags) {

739returnConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,

740 flags,/*InputIsPartial=*/false);

741}

742

743/* ---------------------------------------------------------------------

744

745 Note A.

746 The fall-through switches in UTF-8 reading code save a

747 temp variable, some decrements & conditionals. The switches

748 are equivalent to the following loop:

749 {

750 int tmpBytesToRead = extraBytesToRead+1;

751 do {

752 ch += *source++;

753 --tmpBytesToRead;

754 if (tmpBytesToRead) ch <<= 6;

755 } while (tmpBytesToRead > 0);

756 }

757 In UTF-8 writing code, the switches on "bytesToWrite" are

758 similarly unrolled loops.

759

760 --------------------------------------------------------------------- */

761

762}// namespace llvm

763

764ConvertUTF_RESTORE_WARNINGS

UNI_SUR_LOW_START

#define UNI_SUR_LOW_START

Definition:ConvertUTF.cpp:107

UNI_SUR_HIGH_START

#define UNI_SUR_HIGH_START

Definition:ConvertUTF.cpp:105

ConvertUTF_DISABLE_WARNINGS

#define ConvertUTF_DISABLE_WARNINGS

Definition:ConvertUTF.cpp:90

UNI_SUR_LOW_END

#define UNI_SUR_LOW_END

Definition:ConvertUTF.cpp:108

UNI_SUR_HIGH_END

#define UNI_SUR_HIGH_END

Definition:ConvertUTF.cpp:106

ConvertUTF_RESTORE_WARNINGS

#define ConvertUTF_RESTORE_WARNINGS

Definition:ConvertUTF.cpp:93

ConvertUTF.h

UNI_REPLACEMENT_CHAR

#define UNI_REPLACEMENT_CHAR

Definition:ConvertUTF.h:134

UNI_MAX_UTF16

#define UNI_MAX_UTF16

Definition:ConvertUTF.h:136

UNI_MAX_LEGAL_UTF32

#define UNI_MAX_LEGAL_UTF32

Definition:ConvertUTF.h:138

UNI_MAX_BMP

#define UNI_MAX_BMP

Definition:ConvertUTF.h:135

assert

assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())

llvm

This is an optimization pass for GlobalISel generic memory operations.

Definition:AddressRanges.h:18

llvm::ConvertUTF8toUTF32Impl

static ConversionResult ConvertUTF8toUTF32Impl(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags, Boolean InputIsPartial)

Definition:ConvertUTF.cpp:640

llvm::offsetsFromUTF8

static const UTF32 offsetsFromUTF8[6]

Definition:ConvertUTF.cpp:135

llvm::UTF16

unsigned short UTF16

Definition:ConvertUTF.h:129

llvm::ConvertUTF8toUTF32

ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)

Convert a partial UTF8 sequence to UTF32.

Definition:ConvertUTF.cpp:736

llvm::halfShift

static const int halfShift

Definition:ConvertUTF.cpp:100

llvm::getNumBytesForUTF8

unsigned getNumBytesForUTF8(UTF8 firstByte)

Definition:ConvertUTF.cpp:545

llvm::halfBase

static const UTF32 halfBase

Definition:ConvertUTF.cpp:102

llvm::isLegalUTF8

static Boolean isLegalUTF8(const UTF8 *source, int length)

Definition:ConvertUTF.cpp:397

llvm::trailingBytesForUTF8

static const char trailingBytesForUTF8[256]

Definition:ConvertUTF.cpp:119

llvm::ConvertUTF8toUTF32Partial

ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)

Convert a partial UTF8 sequence to UTF32.

Definition:ConvertUTF.cpp:727

llvm::ConversionFlags

ConversionFlags

Definition:ConvertUTF.h:155

llvm::strictConversion

@ strictConversion

Definition:ConvertUTF.h:156

llvm::ConversionResult

ConversionResult

Definition:ConvertUTF.h:148

llvm::targetExhausted