1//===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties 4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5// See https://llvm.org/LICENSE.txt for license information. 6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 8//===----------------------------------------------------------------------===// 10// This file implements functions to map the name or alias of a unicode 11// character to its codepoint. 13//===----------------------------------------------------------------------===// 47// Reserve enough space for most unicode code points. 48// The chosen value represents the 99th percentile of name size as of 53 std::reverse_copy(
N->Name.begin(),
N->Name.end(), std::back_inserter(S));
56 std::reverse(S.begin(), S.end());
80bool LongName = NameInfo & 0x40;
82 std::size_t
Size = NameInfo & ~0xC0;
94N.Value = ((
H << 16) | (M << 8) | L) >> 3;
96bool HasChildren = L & 0x02;
97N.HasSibling = L & 0x01;
106N.HasSibling =
H & 0x80;
107bool HasChildren =
H & 0x40;
110N.ChildrenOffset = (
H << 16);
121 std::size_t &Consummed,
char &PreviousCharInName,
122bool IsPrefix =
false) {
126if (!
Name.starts_with(Needle))
128 Consummed = Needle.
size();
134auto NamePos =
Name.begin();
135auto NeedlePos = Needle.
begin();
137char PreviousCharInNameOrigin = PreviousCharInName;
138char PreviousCharInNeedle = *Needle.
begin();
139auto IgnoreSpaces = [](
auto It,
autoEnd,
char &PreviousChar,
140bool IsPrefix =
false) {
142constauto Next = std::next(It);
143// Ignore spaces, underscores, medial hyphens 144// The generator ensures a needle never ends (or starts) with a medial 145// hyphen https://unicode.org/reports/tr44/#UAX44-LM2. 147 *It ==
' ' || *It ==
'_' ||
148 (*It ==
'-' && isAlnum(PreviousChar) &&
149 ((Next !=
End && isAlnum(*Next)) || (Next ==
End && IsPrefix)));
159 NamePos = IgnoreSpaces(NamePos,
Name.end(), PreviousCharInName);
161 IgnoreSpaces(NeedlePos, Needle.
end(), PreviousCharInNeedle, IsPrefix);
162if (NeedlePos == Needle.
end())
164if (NamePos ==
Name.end())
166if (toUpper(*NeedlePos) != toUpper(*NamePos))
171 Consummed = std::distance(
Name.begin(), NamePos);
172if (NeedlePos != Needle.
end()) {
173 PreviousCharInName = PreviousCharInNameOrigin;
175return NeedlePos == Needle.
end();
178static std::tuple<Node, bool, uint32_t>
181constNode *Parent =
nullptr) {
183 std::size_t Consummed = 0;
187return std::make_tuple(
N,
false, 0);
189if (
Name.size() - Consummed == 0 &&
N.Value != 0xFFFFFFFF)
190return std::make_tuple(
N,
true,
N.Value);
192if (
N.hasChildren()) {
198 std::tie(
C, Matches,
Value) =
200 PreviousCharInName, Buffer, &
N);
202 std::reverse_copy(
C.Name.begin(),
C.Name.end(),
203 std::back_inserter(Buffer));
204return std::make_tuple(
N,
true,
Value);
206 ChildOffset +=
C.Size;
211return std::make_tuple(
N,
false, 0);
214static std::tuple<Node, bool, uint32_t>
253// 3.12 Conjoining Jamo Behavior Common constants 254constexprconstchar32_tSBase = 0xAC00;
260char &PreviousInName,
int &Pos,
int Column) {
261assert(Column == 0 || Column == 1 || Column == 2);
264int Prev = PreviousInName;
265for (std::size_t
I = 0;
I < CountPerColumn[Column];
I++) {
267if (
int(Syllable.
size()) <= Len)
269 std::size_t Consummed = 0;
270char PreviousInNameCopy = PreviousInName;
277 Prev = PreviousInNameCopy;
281 PreviousInName = Prev;
285static std::optional<char32_t>
288// Hangul Syllable Decomposition 289 std::size_t Consummed = 0;
296int L = -1, V = -1,
T = -1;
300if (L != -1 && V != -1 &&
T != -1 &&
Name.empty()) {
302 Buffer.
append(
"HANGUL SYLLABLE ");
313// Otherwise, it's an illegal syllable name. 323// Unicode 15.1 Table 4-8. Name Derivation Rule Prefix Strings 325 {
"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
326 {
"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
327 {
"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
328 {
"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
329 {
"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
330 {
"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
331 {
"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
332 {
"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D},
333 {
"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
334 {
"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
335 {
"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
336 {
"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
337 {
"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
338 {
"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
339 {
"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
340 {
"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
341 {
"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
344static std::optional<char32_t>
348 std::size_t Consummed = 0;
350bool DoesStartWith =
startsWith(
Name, Item.Prefix, Strict, Consummed,
351 NameStart,
/*IsPrefix=*/true);
355unsignedlonglong V = 0;
356// Be consistent about mandating upper casing. 363 Buffer.
append(Item.Prefix);
364 Buffer.
append(utohexstr(V,
true));
388 std::reverse(Buffer.
begin(), Buffer.
end());
389// UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial 390// hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E. 391if (!Strict &&
Value == 0x116c &&
Name.contains_insensitive(
"O-E")) {
392 Buffer =
"HANGUL JUNGSEONG O-E";
407std::optional<LooseMatchingResult>
416// Find the unicode character whose edit distance to Pattern 417// is shortest, using the Wagner–Fischer algorithm. 420// We maintain a fixed size vector of matches, 422// The worst matches (with the biggest distance) are discarded when new elements 424 std::size_t LargestEditDistance = 0;
426 Matches.
reserve(MaxMatchesCount + 1);
429char32_tValue) ->
bool {
430if (Distance > LargestEditDistance) {
431if (Matches.
size() == MaxMatchesCount)
433 LargestEditDistance = Distance;
435// To avoid allocations, the creation of the name is delayed 436// as much as possible. 448return a.
Name < GetName();
451if (It == Matches.
end() && Matches.
size() == MaxMatchesCount)
455 Matches.
insert(It, std::move(M));
456if (Matches.
size() > MaxMatchesCount)
461// We ignore case, space, hyphens, etc, 462// in both the search pattern and the prospective names. 465 Out.reserve(
Name.size());
468 Out.push_back(toUpper(
C));
474// Allocate a matrix big enough for longest names. 475const std::size_t Columns =
482 std::vector<char> Distances(
485auto Get = [&Distances, Columns](
size_t Column, std::size_t Row) ->
char & {
488return Distances[Row * Columns + Column];
491for (std::size_t
I = 0;
I < Columns;
I++)
494// Visit the children, 495// Filling (and overriding) the matrix for the name fragment of each node 496// iteratively. CompleteName is used to collect the actual name of a potential 497// match, respecting case and spacing. 498auto VisitNode = [&](
constNode &
N, std::size_t Row,
499auto &VisitNode) ->
void {
501for (; J <
N.Name.size(); J++) {
502if (!isAlnum(
N.Name[J]))
507for (std::size_t
I = 1;
I < Columns;
I++) {
508constint Delete = Get(
I - 1, Row) + 1;
509constint Insert = Get(
I, Row - 1) + 1;
512 Get(
I - 1, Row - 1) + (NormalizedName[
I - 1] !=
N.Name[J] ? 1 : 0);
514 Get(
I, Row) = std::min(Insert, std::min(Delete, Replace));
520unsignedCost = Get(Columns - 1, Row - 1);
521if (
N.Value != 0xFFFFFFFF) {
525if (
N.hasChildren()) {
526auto ChildOffset =
N.ChildrenOffset;
529 ChildOffset +=
C.Size;
532 VisitNode(
C, Row, VisitNode);
540 VisitNode(Root, 1, VisitNode);
ReachingDefAnalysis InstSet InstSet & Ignore
#define LLVM_ATTRIBUTE_UNUSED
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
@ Normalize
Normalize - Normalize according to the given loops.
This file contains some functions that are useful when dealing with strings.
void append(StringRef RHS)
Append from a StringRef.
void reserve(size_type N)
iterator insert(iterator I, T &&Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
LLVM Value Representation.
@ C
The default llvm calling convention, compatible with C.
static Node readNode(uint32_t Offset, const Node *Parent=nullptr)
constexpr const uint32_t TCount
const std::size_t UnicodeNameToCodepointLargestNameSize
static bool startsWith(StringRef Name, StringRef Needle, bool Strict, std::size_t &Consummed, char &PreviousCharInName, bool IsPrefix=false)
std::optional< char32_t > nameToCodepointStrict(StringRef Name)
Maps the name or the alias of a Unicode character to its associated codepoints.
SmallVector< MatchForCodepointName > nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount)
const std::size_t UnicodeNameToCodepointIndexSize
constexpr const char *const HangulSyllables[][3]
std::optional< LooseMatchingResult > nameToCodepointLooseMatching(StringRef Name)
constexpr const uint32_t LCount
static std::optional< char32_t > nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer)
constexpr const uint32_t VCount
const uint8_t * UnicodeNameToCodepointIndex
static const GeneratedNamesData GeneratedNamesDataTable[]
static std::size_t findSyllable(StringRef Name, bool Strict, char &PreviousInName, int &Pos, int Column)
static std::tuple< Node, bool, uint32_t > compareNode(uint32_t Offset, StringRef Name, bool Strict, char PreviousCharInName, BufferType &Buffer, const Node *Parent=nullptr)
static std::optional< char32_t > nameToCodepoint(StringRef Name, bool Strict, BufferType &Buffer)
static std::optional< char32_t > nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer)
constexpr const char32_t SBase
const char * UnicodeNameToCodepointDict
This is an optimization pass for GlobalISel generic memory operations.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
detail::ValueMatchesPoly< M > HasValue(M Matcher)
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
bool getAsUnsignedInteger(StringRef Str, unsigned Radix, unsigned long long &Result)
Helper functions for StringRef::getAsInteger.
constexpr bool isValid() const
std::string fullName() const
constexpr bool hasChildren() const