Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit2f09413

Browse files
gnpricebenjaminp
authored andcommitted
closesbpo-37966: Fully implement the UAX#15 quick-check algorithm. (GH-15558)
The purpose of the `unicodedata.is_normalized` function is to answerthe question `str == unicodedata.normalized(form, str)` moreefficiently than writing just that, by using the "quick check"optimization described in the Unicode standard in UAX#15.However, it turns out the code doesn't implement the full algorithmfrom the standard, and as a result we often miss the optimization andend up having to compute the whole normalized string after all.Implement the standard's algorithm. This greatly speeds up`unicodedata.is_normalized` in many cases where our partial variantof quick-check had been returning MAYBE and the standard algorithmreturns NO.At a quick test on my desktop, the existing code takes about 4.4 ms/MB(so 4.4 ns per byte) when the partial quick-check returns MAYBE and ithas to do the slow normalize-and-compare: $ build.base/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \ -- 'unicodedata.is_normalized("NFD", s)' 50 loops, best of 5: 4.39 msec per loopWith this patch, it gets the answer instantly (58 ns) on the same 1 MBstring: $ build.dev/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \ -- 'unicodedata.is_normalized("NFD", s)'5000000 loops, best of 5: 58.2 nsec per loopThis restores a small optimization that the original version of thiscode had for the `unicodedata.normalize` use case.With this, that case is actually faster than in master!$ build.base/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \ -- 'unicodedata.normalize("NFD", s)'500 loops, best of 5: 561 usec per loop$ build.dev/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \ -- 'unicodedata.normalize("NFD", s)'500 loops, best of 5: 512 usec per loop
1 parent580bdb0 commit2f09413

File tree

4 files changed

+59
-26
lines changed

4 files changed

+59
-26
lines changed

‎Doc/whatsnew/3.8.rst‎

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1090,8 +1090,9 @@ unicodedata
10901090
<http://blog.unicode.org/2019/05/unicode-12-1-en.html>`_ release.
10911091

10921092
* New function:func:`~unicodedata.is_normalized` can be used to verify a string
1093-
is in a specific normal form. (Contributed by Max Belanger and David Euresti in
1094-
:issue:`32285`).
1093+
is in a specific normal form, often much faster than by actually normalizing
1094+
the string. (Contributed by Max Belanger, David Euresti, and Greg Price in
1095+
:issue:`32285` and:issue:`37966`).
10951096

10961097

10971098
unittest

‎Lib/test/test_unicodedata.py‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ def test_issue29456(self):
208208
self.assertEqual(self.db.normalize('NFC',u11a7_str_a),u11a7_str_b)
209209
self.assertEqual(self.db.normalize('NFC',u11c3_str_a),u11c3_str_b)
210210

211+
# For tests of unicodedata.is_normalized / self.db.is_normalized ,
212+
# see test_normalization.py .
211213

212214
deftest_east_asian_width(self):
213215
eaw=self.db.east_asian_width
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The implementation of:func:`~unicodedata.is_normalized` has been greatly
2+
sped up on strings that aren't normalized, by implementing the full
3+
normalization-quick-check algorithm from the Unicode standard.

‎Modules/unicodedata.c‎

Lines changed: 51 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include"ucnhash.h"
2020
#include"structmember.h"
2121

22+
#include<stdbool.h>
23+
2224
_Py_IDENTIFIER(NFC);
2325
_Py_IDENTIFIER(NFD);
2426
_Py_IDENTIFIER(NFKC);
@@ -775,25 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
775777
returnresult;
776778
}
777779

778-
typedefenum {YES,NO,MAYBE}NormalMode;
779-
780-
/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
781-
staticNormalMode
782-
is_normalized(PyObject*self,PyObject*input,intnfc,intk)
780+
// This needs to match the logic in makeunicodedata.py
781+
// which constructs the quickcheck data.
782+
typedefenum {YES=0,MAYBE=1,NO=2}QuickcheckResult;
783+
784+
/* Run the Unicode normalization "quickcheck" algorithm.
785+
*
786+
* Return YES or NO if quickcheck determines the input is certainly
787+
* normalized or certainly not, and MAYBE if quickcheck is unable to
788+
* tell.
789+
*
790+
* If `yes_only` is true, then return MAYBE as soon as we determine
791+
* the answer is not YES.
792+
*
793+
* For background and details on the algorithm, see UAX #15:
794+
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
795+
*/
796+
staticQuickcheckResult
797+
is_normalized_quickcheck(PyObject*self,PyObject*input,
798+
intnfc,intk,boolyes_only)
783799
{
784-
Py_ssize_ti,len;
785-
intkind;
786-
void*data;
787-
unsignedcharprev_combining=0,quickcheck_mask;
788-
789800
/* An older version of the database is requested, quickchecks must be
790801
disabled. */
791802
if (self&&UCD_Check(self))
792803
returnNO;
793804

794-
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
795-
as described in http://unicode.org/reports/tr15/#Annex8. */
796-
quickcheck_mask=3 << ((nfc ?4 :0)+ (k ?2 :0));
805+
Py_ssize_ti,len;
806+
intkind;
807+
void*data;
808+
unsignedcharprev_combining=0;
809+
810+
/* The two quickcheck bits at this shift have type QuickcheckResult. */
811+
intquickcheck_shift= (nfc ?4 :0)+ (k ?2 :0);
812+
813+
QuickcheckResultresult=YES;/* certainly normalized, unless we find something */
797814

798815
i=0;
799816
kind=PyUnicode_KIND(input);
@@ -802,16 +819,26 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
802819
while (i<len) {
803820
Py_UCS4ch=PyUnicode_READ(kind,data,i++);
804821
const_PyUnicode_DatabaseRecord*record=_getrecord_ex(ch);
805-
unsignedcharcombining=record->combining;
806-
unsignedcharquickcheck=record->normalization_quick_check;
807822

808-
if (quickcheck&quickcheck_mask)
809-
returnMAYBE;/* this string might need normalization */
823+
unsignedcharcombining=record->combining;
810824
if (combining&&prev_combining>combining)
811825
returnNO;/* non-canonical sort order, not normalized */
812826
prev_combining=combining;
827+
828+
unsignedcharquickcheck_whole=record->normalization_quick_check;
829+
if (yes_only) {
830+
if (quickcheck_whole& (3 <<quickcheck_shift))
831+
returnMAYBE;
832+
}else {
833+
switch ((quickcheck_whole >>quickcheck_shift)&3) {
834+
caseNO:
835+
returnNO;
836+
caseMAYBE:
837+
result=MAYBE;/* this string might need normalization */
838+
}
839+
}
813840
}
814-
returnYES;/* certainly normalized */
841+
returnresult;
815842
}
816843

817844
/*[clinic input]
@@ -844,7 +871,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
844871
PyObject*result;
845872
intnfc=0;
846873
intk=0;
847-
NormalModem;
874+
QuickcheckResultm;
848875

849876
PyObject*cmp;
850877
intmatch=0;
@@ -867,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
867894
returnNULL;
868895
}
869896

870-
m=is_normalized(self,input,nfc,k);
897+
m=is_normalized_quickcheck(self,input,nfc,k, false);
871898

872899
if (m==MAYBE) {
873900
cmp= (nfc ?nfc_nfkc :nfd_nfkd)(self,input,k);
@@ -913,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
913940
}
914941

915942
if (_PyUnicode_EqualToASCIIId(form,&PyId_NFC)) {
916-
if (is_normalized(self,input,1,0)==YES) {
943+
if (is_normalized_quickcheck(self,input,1,0, true)==YES) {
917944
Py_INCREF(input);
918945
returninput;
919946
}
920947
returnnfc_nfkc(self,input,0);
921948
}
922949
if (_PyUnicode_EqualToASCIIId(form,&PyId_NFKC)) {
923-
if (is_normalized(self,input,1,1)==YES) {
950+
if (is_normalized_quickcheck(self,input,1,1, true)==YES) {
924951
Py_INCREF(input);
925952
returninput;
926953
}
927954
returnnfc_nfkc(self,input,1);
928955
}
929956
if (_PyUnicode_EqualToASCIIId(form,&PyId_NFD)) {
930-
if (is_normalized(self,input,0,0)==YES) {
957+
if (is_normalized_quickcheck(self,input,0,0, true)==YES) {
931958
Py_INCREF(input);
932959
returninput;
933960
}
934961
returnnfd_nfkd(self,input,0);
935962
}
936963
if (_PyUnicode_EqualToASCIIId(form,&PyId_NFKD)) {
937-
if (is_normalized(self,input,0,1)==YES) {
964+
if (is_normalized_quickcheck(self,input,0,1, true)==YES) {
938965
Py_INCREF(input);
939966
returninput;
940967
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2026 Movatter.jp