Movatterモバイル変換


[0]ホーム

URL:


ICU 78.2  78.2
ucharstrie.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucharstrie.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010nov14
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __UCHARSTRIE_H__
18 #define __UCHARSTRIE_H__
19 
26 #include "unicode/utypes.h"
27 
28 #if U_SHOW_CPLUSPLUS_API
29 
30 #include "unicode/unistr.h"
31 #include "unicode/uobject.h"
32 #include "unicode/ustringtrie.h"
33 
34 U_NAMESPACE_BEGIN
35 
36 classAppendable;
37 classUCharsTrieBuilder;
38 classUVector32;
39 
53 classU_COMMON_APIUCharsTrie :publicUMemory {
54 public:
69 UCharsTrie(ConstChar16Ptr trieUChars)
70  : ownedArray_(nullptr), uchars_(trieUChars),
71  pos_(uchars_), remainingMatchLength_(-1) {}
72 
77 ~UCharsTrie();
78 
85 UCharsTrie(constUCharsTrie &other)
86  : ownedArray_(nullptr), uchars_(other.uchars_),
87  pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
88 
94 UCharsTrie &reset() {
95  pos_=uchars_;
96  remainingMatchLength_=-1;
97 return *this;
98  }
99 
108  uint64_tgetState64() const{
109 return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
110 static_cast<uint64_t>(pos_ - uchars_);
111  }
112 
127 UCharsTrie &resetToState64(uint64_t state) {
128  remainingMatchLength_ =static_cast<int32_t>(state >> kState64RemainingShift) - 2;
129  pos_ = uchars_ + (state & kState64PosMask);
130 return *this;
131  }
132 
138 classState :publicUMemory {
139 public:
144 State() { uchars=nullptr; }
145 private:
146 friendclassUCharsTrie;
147 
148 const char16_t *uchars;
149 const char16_t *pos;
150  int32_t remainingMatchLength;
151  };
152 
160 constUCharsTrie &saveState(State &state) const{
161  state.uchars=uchars_;
162  state.pos=pos_;
163  state.remainingMatchLength=remainingMatchLength_;
164 return *this;
165  }
166 
177 UCharsTrie &resetToState(constState &state) {
178 if(uchars_==state.uchars && uchars_!=nullptr) {
179  pos_=state.pos;
180  remainingMatchLength_=state.remainingMatchLength;
181  }
182 return *this;
183  }
184 
191 UStringTrieResultcurrent()const;
192 
200 inlineUStringTrieResultfirst(int32_t uchar) {
201  remainingMatchLength_=-1;
202 return nextImpl(uchars_, uchar);
203  }
204 
213 UStringTrieResultfirstForCodePoint(UChar32 cp);
214 
221 UStringTrieResultnext(int32_t uchar);
222 
230 UStringTrieResultnextForCodePoint(UChar32 cp);
231 
247 UStringTrieResultnext(ConstChar16Ptr s, int32_t length);
248 
258 inline int32_tgetValue() const{
259 const char16_t *pos=pos_;
260  int32_t leadUnit=*pos++;
261 // U_ASSERT(leadUnit>=kMinValueLead);
262 return leadUnit&kValueIsFinal ?
263  readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
264  }
265 
275 inlineUBoolhasUniqueValue(int32_t &uniqueValue) const{
276 const char16_t *pos=pos_;
277 // Skip the rest of a pending linear-match node.
278 return pos!=nullptr && findUniqueValue(pos+remainingMatchLength_+1,false, uniqueValue);
279  }
280 
288  int32_tgetNextUChars(Appendable &out)const;
289 
294 classU_COMMON_APIIterator :publicUMemory {
295 public:
307 Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength,UErrorCode &errorCode);
308 
320 Iterator(constUCharsTrie &trie, int32_t maxStringLength,UErrorCode &errorCode);
321 
326 ~Iterator();
327 
333 Iterator &reset();
334 
339 UBoolhasNext()const;
340 
355 UBoolnext(UErrorCode &errorCode);
356 
361 constUnicodeString &getString() const{return str_; }
366  int32_tgetValue() const{return value_; }
367 
368 private:
369 UBool truncateAndStop() {
370  pos_=nullptr;
371  value_=-1;// no real value for str
372 returntrue;
373  }
374 
375 const char16_t *branchNext(const char16_t *pos, int32_t length,UErrorCode &errorCode);
376 
377 const char16_t *uchars_;
378 const char16_t *pos_;
379 const char16_t *initialPos_;
380  int32_t remainingMatchLength_;
381  int32_t initialRemainingMatchLength_;
382 UBool skipValue_;// Skip intermediate value which was already delivered.
383 
384 UnicodeString str_;
385  int32_t maxLength_;
386  int32_t value_;
387 
388 // The stack stores pairs of integers for backtracking to another
389 // outbound edge of a branch node.
390 // The first integer is an offset from uchars_.
391 // The second integer has the str_.length() from before the node in bits 15..0,
392 // and the remaining branch length in bits 31..16.
393 // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
394 // but the code looks more confusing that way.)
395  UVector32 *stack_;
396  };
397 
398 private:
399 friendclassUCharsTrieBuilder;
400 
407  UCharsTrie(char16_t *adoptUChars,const char16_t *trieUChars)
408  : ownedArray_(adoptUChars), uchars_(trieUChars),
409  pos_(uchars_), remainingMatchLength_(-1) {}
410 
411 // No assignment operator.
412  UCharsTrie &operator=(const UCharsTrie &other) =delete;
413 
414 inlinevoid stop() {
415  pos_=nullptr;
416  }
417 
418 // Reads a compact 32-bit integer.
419 // pos is already after the leadUnit, and the lead unit has bit 15 reset.
420 staticinline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
421  int32_t value;
422 if(leadUnit<kMinTwoUnitValueLead) {
423  value=leadUnit;
424  }elseif(leadUnit<kThreeUnitValueLead) {
425  value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
426  }else {
427  value=(pos[0]<<16)|pos[1];
428  }
429 return value;
430  }
431 staticinlineconst char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
432 if(leadUnit>=kMinTwoUnitValueLead) {
433 if(leadUnit<kThreeUnitValueLead) {
434  ++pos;
435  }else {
436  pos+=2;
437  }
438  }
439 return pos;
440  }
441 staticinlineconst char16_t *skipValue(const char16_t *pos) {
442  int32_t leadUnit=*pos++;
443 return skipValue(pos, leadUnit&0x7fff);
444  }
445 
446 staticinline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
447 // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
448  int32_t value;
449 if(leadUnit<kMinTwoUnitNodeValueLead) {
450  value=(leadUnit>>6)-1;
451  }elseif(leadUnit<kThreeUnitNodeValueLead) {
452  value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
453  }else {
454  value=(pos[0]<<16)|pos[1];
455  }
456 return value;
457  }
458 staticinlineconst char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
459 // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
460 if(leadUnit>=kMinTwoUnitNodeValueLead) {
461 if(leadUnit<kThreeUnitNodeValueLead) {
462  ++pos;
463  }else {
464  pos+=2;
465  }
466  }
467 return pos;
468  }
469 
470 staticinlineconst char16_t *jumpByDelta(const char16_t *pos) {
471  int32_t delta=*pos++;
472 if(delta>=kMinTwoUnitDeltaLead) {
473 if(delta==kThreeUnitDeltaLead) {
474  delta=(pos[0]<<16)|pos[1];
475  pos+=2;
476  }else {
477  delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
478  }
479  }
480 return pos+delta;
481  }
482 
483 staticconst char16_t *skipDelta(const char16_t *pos) {
484  int32_t delta=*pos++;
485 if(delta>=kMinTwoUnitDeltaLead) {
486 if(delta==kThreeUnitDeltaLead) {
487  pos+=2;
488  }else {
489  ++pos;
490  }
491  }
492 return pos;
493  }
494 
495 staticinlineUStringTrieResult valueResult(int32_t node) {
496 returnstatic_cast<UStringTrieResult>(USTRINGTRIE_INTERMEDIATE_VALUE - (node >> 15));
497  }
498 
499 // Handles a branch node for both next(uchar) and next(string).
500 UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
501 
502 // Requires remainingLength_<0.
503 UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
504 
505 // Helper functions for hasUniqueValue().
506 // Recursively finds a unique value (or whether there is not a unique one)
507 // from a branch.
508 staticconst char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
509 UBool haveUniqueValue, int32_t &uniqueValue);
510 // Recursively finds a unique value (or whether there is not a unique one)
511 // starting from a position on a node lead unit.
512 staticUBool findUniqueValue(const char16_t *pos,UBool haveUniqueValue, int32_t &uniqueValue);
513 
514 // Helper functions for getNextUChars().
515 // getNextUChars() when pos is on a branch node.
516 staticvoid getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
517 
518 // UCharsTrie data structure
519 //
520 // The trie consists of a series of char16_t-serialized nodes for incremental
521 // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
522 // The root node is at the beginning of the trie data.
523 //
524 // Types of nodes are distinguished by their node lead unit ranges.
525 // After each node, except a final-value node, another node follows to
526 // encode match values or continue matching further units.
527 //
528 // Node types:
529 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
530 // The value is for the string/char16_t sequence so far.
531 // - Match node, optionally with an intermediate value in a different compact format.
532 // The value, if present, is for the string/char16_t sequence so far.
533 //
534 // Aside from the value, which uses the node lead unit's high bits:
535 //
536 // - Linear-match node: Matches a number of units.
537 // - Branch node: Branches to other nodes according to the current input unit.
538 // The node unit is the length of the branch (number of units to select from)
539 // minus 1. It is followed by a sub-node:
540 // - If the length is at most kMaxBranchLinearSubNodeLength, then
541 // there are length-1 (key, value) pairs and then one more comparison unit.
542 // If one of the key units matches, then the value is either a final value for
543 // the string so far, or a "jump" delta to the next node.
544 // If the last unit matches, then matching continues with the next node.
545 // (Values have the same encoding as final-value nodes.)
546 // - If the length is greater than kMaxBranchLinearSubNodeLength, then
547 // there is one unit and one "jump" delta.
548 // If the input unit is less than the sub-node unit, then "jump" by delta to
549 // the next sub-node which will have a length of length/2.
550 // (The delta has its own compact encoding.)
551 // Otherwise, skip the "jump" delta to the next sub-node
552 // which will have a length of length-length/2.
553 
554 // Match-node lead unit values, after masking off intermediate-value bits:
555 
556 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
557 // the length is one more than the next unit.
558 
559 // For a branch sub-node with at most this many entries, we drop down
560 // to a linear search.
561 staticconst int32_t kMaxBranchLinearSubNodeLength=5;
562 
563 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
564 staticconst int32_t kMinLinearMatch=0x30;
565 staticconst int32_t kMaxLinearMatchLength=0x10;
566 
567 // Match-node lead unit bits 14..6 for the optional intermediate value.
568 // If these bits are 0, then there is no intermediate value.
569 // Otherwise, see the *NodeValue* constants below.
570 staticconst int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;// 0x0040
571 staticconst int32_t kNodeTypeMask=kMinValueLead-1;// 0x003f
572 
573 // A final-value node has bit 15 set.
574 staticconst int32_t kValueIsFinal=0x8000;
575 
576 // Compact value: After testing and masking off bit 15, use the following thresholds.
577 staticconst int32_t kMaxOneUnitValue=0x3fff;
578 
579 staticconst int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1;// 0x4000
580 staticconst int32_t kThreeUnitValueLead=0x7fff;
581 
582 staticconst int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1;// 0x3ffeffff
583 
584 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
585 staticconst int32_t kMaxOneUnitNodeValue=0xff;
586 staticconst int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6);// 0x4040
587 staticconst int32_t kThreeUnitNodeValueLead=0x7fc0;
588 
589 staticconst int32_t kMaxTwoUnitNodeValue=
590  ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1;// 0xfdffff
591 
592 // Compact delta integers.
593 staticconst int32_t kMaxOneUnitDelta=0xfbff;
594 staticconst int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1;// 0xfc00
595 staticconst int32_t kThreeUnitDeltaLead=0xffff;
596 
597 staticconst int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1;// 0x03feffff
598 
599 // For getState64():
600 // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
601 // so we need at least 5 bits for that.
602 // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
603 static constexpr int32_t kState64RemainingShift = 59;
604 static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
605 
606  char16_t *ownedArray_;
607 
608 // Fixed value referencing the UCharsTrie words.
609 const char16_t *uchars_;
610 
611 // Iterator variables.
612 
613 // Pointer to next trie unit to read. nullptr if no more matches.
614 const char16_t *pos_;
615 // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
616  int32_t remainingMatchLength_;
617 };
618 
619 U_NAMESPACE_END
620 
621 #endif/* U_SHOW_CPLUSPLUS_API */
622 
623 #endif// __UCHARSTRIE_H__
icu::Appendable
Base class for objects to which Unicode characters and strings can be appended.
Definition:appendable.h:54
icu::ConstChar16Ptr
const char16_t * wrapper with implicit conversion from distinct but bit-compatible pointer types.
Definition:char16ptr.h:156
icu::UCharsTrie::Iterator
Iterator for all of the (string, value) pairs in a UCharsTrie.
Definition:ucharstrie.h:294
icu::UCharsTrie::Iterator::Iterator
Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode)
Iterates from the root of a char16_t-serialized UCharsTrie.
icu::UCharsTrie::Iterator::reset
Iterator & reset()
Resets this iterator to its initial state.
icu::UCharsTrie::Iterator::hasNext
UBool hasNext() const
icu::UCharsTrie::Iterator::~Iterator
~Iterator()
Destructor.
icu::UCharsTrie::Iterator::getValue
int32_t getValue() const
Definition:ucharstrie.h:366
icu::UCharsTrie::Iterator::next
UBool next(UErrorCode &errorCode)
Finds the next (string, value) pair if there is one.
icu::UCharsTrie::Iterator::getString
const UnicodeString & getString() const
Definition:ucharstrie.h:361
icu::UCharsTrie::Iterator::Iterator
Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode)
Iterates from the current state of the specified UCharsTrie.
icu::UCharsTrie::State
UCharsTrie state object, for saving a trie's current state and resetting the trie back to this state ...
Definition:ucharstrie.h:138
icu::UCharsTrie::State::State
State()
Constructs an empty State.
Definition:ucharstrie.h:144
icu::UCharsTrie
Light-weight, non-const reader class for a UCharsTrie.
Definition:ucharstrie.h:53
icu::UCharsTrie::UCharsTrie
UCharsTrie(ConstChar16Ptr trieUChars)
Constructs a UCharsTrie reader instance.
Definition:ucharstrie.h:69
icu::UCharsTrie::UCharsTrie
UCharsTrie(const UCharsTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the char16_t array which...
Definition:ucharstrie.h:85
icu::UCharsTrie::hasUniqueValue
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all strings reachable from the current state map to the same value.
Definition:ucharstrie.h:275
icu::UCharsTrie::getState64
uint64_t getState64() const
Returns the state of this trie as a 64-bit integer.
Definition:ucharstrie.h:108
icu::UCharsTrie::reset
UCharsTrie & reset()
Resets this trie to its initial state.
Definition:ucharstrie.h:94
icu::UCharsTrie::resetToState64
UCharsTrie & resetToState64(uint64_t state)
Resets this trie to the saved state.
Definition:ucharstrie.h:127
icu::UCharsTrie::next
UStringTrieResult next(int32_t uchar)
Traverses the trie from the current state for this input char16_t.
icu::UCharsTrie::nextForCodePoint
UStringTrieResult nextForCodePoint(UChar32 cp)
Traverses the trie from the current state for the one or two UTF-16 code units for this input code po...
icu::UCharsTrie::getValue
int32_t getValue() const
Returns a matching string's value if called immediately after current()/first()/next() returned USTRI...
Definition:ucharstrie.h:258
icu::UCharsTrie::firstForCodePoint
UStringTrieResult firstForCodePoint(UChar32 cp)
Traverses the trie from the initial state for the one or two UTF-16 code units for this input code po...
icu::UCharsTrie::current
UStringTrieResult current() const
Determines whether the string so far matches, whether it has a value, and whether another input char1...
icu::UCharsTrie::resetToState
UCharsTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition:ucharstrie.h:177
icu::UCharsTrie::first
UStringTrieResult first(int32_t uchar)
Traverses the trie from the initial state for this input char16_t.
Definition:ucharstrie.h:200
icu::UCharsTrie::~UCharsTrie
~UCharsTrie()
Destructor.
icu::UCharsTrie::next
UStringTrieResult next(ConstChar16Ptr s, int32_t length)
Traverses the trie from the current state for this string.
icu::UCharsTrie::saveState
const UCharsTrie & saveState(State &state) const
Saves the state of this trie.
Definition:ucharstrie.h:160
icu::UCharsTrie::getNextUChars
int32_t getNextUChars(Appendable &out) const
Finds each char16_t which continues the string from the current state.
icu::UMemory
UMemory is the common ICU base class.
Definition:uobject.h:115
icu::UnicodeString
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition:unistr.h:303
UChar32
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition:umachine.h:449
UINT64_C
#define UINT64_C(c)
Provides a platform independent way to specify an unsigned 64-bit integer constant.
Definition:umachine.h:241
UBool
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition:umachine.h:269
unistr.h
C++ API: Unicode String.
uobject.h
C++ API: Common ICU base class UObject.
ustringtrie.h
C API: Helper definitions for dictionary trie APIs.
UStringTrieResult
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition:ustringtrie.h:35
USTRINGTRIE_INTERMEDIATE_VALUE
@ USTRINGTRIE_INTERMEDIATE_VALUE
The input unit(s) continued a matching string and there is a value for the string so far.
Definition:ustringtrie.h:66
utypes.h
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition:utypes.h:509
U_COMMON_API
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition:utypes.h:315

Generated by doxygen 1.9.1
[8]ページ先頭

©2009-2026 Movatter.jp