Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c3cec78

Browse files
author
Victor Stinner
committed
Add asciilib: similar to ucs1, ucs2 and ucs4 library, but specialized to ASCII
The ucs1, ucs2 and ucs4 libraries have to scan each created substring to find the maximum character, whereas this is not needed for ASCII strings. Because ASCII strings are common, it is useful to optimize for ASCII.
1 parent 14f8f02 commit c3cec78

File tree

4 files changed

+153
-49
lines changed

4 files changed

+153
-49
lines changed

‎Include/unicodeobject.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1851,6 +1851,7 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buff
18511851
see Objects/stringlib/localeutil.h*/
18521852
#ifndef Py_LIMITED_API
18531853
PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1854+
PyObject *unicode,
18541855
int kind,
18551856
void *buffer,
18561857
Py_ssize_t n_buffer,

‎Objects/stringlib/asciilib.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/* this is sort of a hack. there's at least one place (formatting
2+
floats) where some stringlib code takes a different path if it's
3+
compiled as unicode. */
4+
#defineSTRINGLIB_IS_UNICODE 1
5+
6+
#defineFASTSEARCH asciilib_fastsearch
7+
#defineSTRINGLIB(F) asciilib_##F
8+
#defineSTRINGLIB_OBJECT PyUnicodeObject
9+
#defineSTRINGLIB_CHAR Py_UCS1
10+
#defineSTRINGLIB_TYPE_NAME "unicode"
11+
#defineSTRINGLIB_PARSE_CODE "U"
12+
#defineSTRINGLIB_EMPTY unicode_empty
13+
#defineSTRINGLIB_ISSPACE Py_UNICODE_ISSPACE
14+
#defineSTRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
15+
#defineSTRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
16+
#defineSTRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
17+
#defineSTRINGLIB_TOUPPER Py_UNICODE_TOUPPER
18+
#defineSTRINGLIB_TOLOWER Py_UNICODE_TOLOWER
19+
#defineSTRINGLIB_FILL Py_UNICODE_FILL
20+
#defineSTRINGLIB_STR PyUnicode_1BYTE_DATA
21+
#defineSTRINGLIB_LEN PyUnicode_GET_LENGTH
22+
#defineSTRINGLIB_NEW unicode_fromascii
23+
#defineSTRINGLIB_RESIZE not_supported
24+
#defineSTRINGLIB_CHECK PyUnicode_Check
25+
#defineSTRINGLIB_CHECK_EXACT PyUnicode_CheckExact
26+
#defineSTRINGLIB_GROUPING _PyUnicode_InsertThousandsGrouping
27+
#defineSTRINGLIB_GROUPING_LOCALE _PyUnicode_InsertThousandsGroupingLocale
28+
29+
#defineSTRINGLIB_TOSTR PyObject_Str
30+
#defineSTRINGLIB_TOASCII PyObject_ASCII
31+
32+
#define_Py_InsertThousandsGrouping _PyUnicode_ascii_InsertThousandsGrouping
33+
#define_Py_InsertThousandsGroupingLocale _PyUnicode_ascii_InsertThousandsGroupingLocale
34+

‎Objects/unicodeobject.c

Lines changed: 116 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -8331,6 +8331,15 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
83318331

83328332
/* --- Helpers ------------------------------------------------------------ */
83338333

8334+
#include"stringlib/asciilib.h"
8335+
#include"stringlib/fastsearch.h"
8336+
#include"stringlib/partition.h"
8337+
#include"stringlib/split.h"
8338+
#include"stringlib/count.h"
8339+
#include"stringlib/find.h"
8340+
#include"stringlib/localeutil.h"
8341+
#include"stringlib/undef.h"
8342+
83348343
#include"stringlib/ucs1lib.h"
83358344
#include"stringlib/fastsearch.h"
83368345
#include"stringlib/partition.h"
@@ -8359,7 +8368,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
83598368
#include"stringlib/undef.h"
83608369

83618370
staticPy_ssize_t
8362-
any_find_slice(Py_ssize_tPy_LOCAL_CALLBACK(ucs1)(constPy_UCS1*,Py_ssize_t,
8371+
any_find_slice(Py_ssize_tPy_LOCAL_CALLBACK(ascii)(constPy_UCS1*,Py_ssize_t,
8372+
constPy_UCS1*,Py_ssize_t,
8373+
Py_ssize_t,Py_ssize_t),
8374+
Py_ssize_tPy_LOCAL_CALLBACK(ucs1)(constPy_UCS1*,Py_ssize_t,
83638375
constPy_UCS1*,Py_ssize_t,
83648376
Py_ssize_t,Py_ssize_t),
83658377
Py_ssize_tPy_LOCAL_CALLBACK(ucs2)(constPy_UCS2*,Py_ssize_t,
@@ -8396,7 +8408,10 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
83968408

83978409
switch(kind) {
83988410
casePyUnicode_1BYTE_KIND:
8399-
result=ucs1(buf1,len1,buf2,len2,start,end);
8411+
if (PyUnicode_IS_ASCII(s1)&&PyUnicode_IS_ASCII(s2))
8412+
result=ascii(buf1,len1,buf2,len2,start,end);
8413+
else
8414+
result=ucs1(buf1,len1,buf2,len2,start,end);
84008415
break;
84018416
casePyUnicode_2BYTE_KIND:
84028417
result=ucs2(buf1,len1,buf2,len2,start,end);
@@ -8417,7 +8432,7 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
84178432
}
84188433

84198434
Py_ssize_t
8420-
_PyUnicode_InsertThousandsGrouping(intkind,void*data,
8435+
_PyUnicode_InsertThousandsGrouping(PyObject*unicode,intkind,void*data,
84218436
Py_ssize_tn_buffer,
84228437
void*digits,Py_ssize_tn_digits,
84238438
Py_ssize_tmin_width,
@@ -8426,9 +8441,14 @@ _PyUnicode_InsertThousandsGrouping(int kind, void *data,
84268441
{
84278442
switch(kind) {
84288443
casePyUnicode_1BYTE_KIND:
8429-
return_PyUnicode_ucs1_InsertThousandsGrouping(
8430-
(Py_UCS1*)data,n_buffer, (Py_UCS1*)digits,n_digits,
8431-
min_width,grouping,thousands_sep);
8444+
if (unicode!=NULL&&PyUnicode_IS_ASCII(unicode))
8445+
return_PyUnicode_ascii_InsertThousandsGrouping(
8446+
(Py_UCS1*)data,n_buffer, (Py_UCS1*)digits,n_digits,
8447+
min_width,grouping,thousands_sep);
8448+
else
8449+
return_PyUnicode_ucs1_InsertThousandsGrouping(
8450+
(Py_UCS1*)data,n_buffer, (Py_UCS1*)digits,n_digits,
8451+
min_width,grouping,thousands_sep);
84328452
casePyUnicode_2BYTE_KIND:
84338453
return_PyUnicode_ucs2_InsertThousandsGrouping(
84348454
(Py_UCS2*)data,n_buffer, (Py_UCS2*)digits,n_digits,
@@ -8505,10 +8525,16 @@ PyUnicode_Count(PyObject *str,
85058525
ADJUST_INDICES(start,end,len1);
85068526
switch(kind) {
85078527
casePyUnicode_1BYTE_KIND:
8508-
result=ucs1lib_count(
8509-
((Py_UCS1*)buf1)+start,end-start,
8510-
buf2,len2,PY_SSIZE_T_MAX
8511-
);
8528+
if (PyUnicode_IS_ASCII(str_obj)&&PyUnicode_IS_ASCII(sub_obj))
8529+
result=asciilib_count(
8530+
((Py_UCS1*)buf1)+start,end-start,
8531+
buf2,len2,PY_SSIZE_T_MAX
8532+
);
8533+
else
8534+
result=ucs1lib_count(
8535+
((Py_UCS1*)buf1)+start,end-start,
8536+
buf2,len2,PY_SSIZE_T_MAX
8537+
);
85128538
break;
85138539
casePyUnicode_2BYTE_KIND:
85148540
result=ucs2lib_count(
@@ -8565,12 +8591,14 @@ PyUnicode_Find(PyObject *str,
85658591

85668592
if (direction>0)
85678593
result=any_find_slice(
8568-
ucs1lib_find_slice,ucs2lib_find_slice,ucs4lib_find_slice,
8594+
asciilib_find_slice,ucs1lib_find_slice,
8595+
ucs2lib_find_slice,ucs4lib_find_slice,
85698596
str,sub,start,end
85708597
);
85718598
else
85728599
result=any_find_slice(
8573-
ucs1lib_rfind_slice,ucs2lib_rfind_slice,ucs4lib_rfind_slice,
8600+
asciilib_find_slice,ucs1lib_rfind_slice,
8601+
ucs2lib_rfind_slice,ucs4lib_rfind_slice,
85748602
str,sub,start,end
85758603
);
85768604

@@ -9200,9 +9228,14 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
92009228

92019229
switch(PyUnicode_KIND(string)) {
92029230
casePyUnicode_1BYTE_KIND:
9203-
list=ucs1lib_splitlines(
9204-
(PyObject*)string,PyUnicode_1BYTE_DATA(string),
9205-
PyUnicode_GET_LENGTH(string),keepends);
9231+
if (PyUnicode_IS_ASCII(string))
9232+
list=asciilib_splitlines(
9233+
(PyObject*)string,PyUnicode_1BYTE_DATA(string),
9234+
PyUnicode_GET_LENGTH(string),keepends);
9235+
else
9236+
list=ucs1lib_splitlines(
9237+
(PyObject*)string,PyUnicode_1BYTE_DATA(string),
9238+
PyUnicode_GET_LENGTH(string),keepends);
92069239
break;
92079240
casePyUnicode_2BYTE_KIND:
92089241
list=ucs2lib_splitlines(
@@ -9241,10 +9274,16 @@ split(PyObject *self,
92419274
if (substring==NULL)
92429275
switch(PyUnicode_KIND(self)) {
92439276
casePyUnicode_1BYTE_KIND:
9244-
returnucs1lib_split_whitespace(
9245-
(PyObject*)self,PyUnicode_1BYTE_DATA(self),
9246-
PyUnicode_GET_LENGTH(self),maxcount
9247-
);
9277+
if (PyUnicode_IS_ASCII(self))
9278+
returnasciilib_split_whitespace(
9279+
(PyObject*)self,PyUnicode_1BYTE_DATA(self),
9280+
PyUnicode_GET_LENGTH(self),maxcount
9281+
);
9282+
else
9283+
returnucs1lib_split_whitespace(
9284+
(PyObject*)self,PyUnicode_1BYTE_DATA(self),
9285+
PyUnicode_GET_LENGTH(self),maxcount
9286+
);
92489287
casePyUnicode_2BYTE_KIND:
92499288
returnucs2lib_split_whitespace(
92509289
(PyObject*)self,PyUnicode_2BYTE_DATA(self),
@@ -9283,8 +9322,12 @@ split(PyObject *self,
92839322

92849323
switch(kind) {
92859324
casePyUnicode_1BYTE_KIND:
9286-
out=ucs1lib_split(
9287-
(PyObject*)self,buf1,len1,buf2,len2,maxcount);
9325+
if (PyUnicode_IS_ASCII(self)&&PyUnicode_IS_ASCII(substring))
9326+
out=asciilib_split(
9327+
(PyObject*)self,buf1,len1,buf2,len2,maxcount);
9328+
else
9329+
out=ucs1lib_split(
9330+
(PyObject*)self,buf1,len1,buf2,len2,maxcount);
92889331
break;
92899332
casePyUnicode_2BYTE_KIND:
92909333
out=ucs2lib_split(
@@ -9323,10 +9366,16 @@ rsplit(PyObject *self,
93239366
if (substring==NULL)
93249367
switch(PyUnicode_KIND(self)) {
93259368
casePyUnicode_1BYTE_KIND:
9326-
returnucs1lib_rsplit_whitespace(
9327-
(PyObject*)self,PyUnicode_1BYTE_DATA(self),
9328-
PyUnicode_GET_LENGTH(self),maxcount
9329-
);
9369+
if (PyUnicode_IS_ASCII(self))
9370+
returnasciilib_rsplit_whitespace(
9371+
(PyObject*)self,PyUnicode_1BYTE_DATA(self),
9372+
PyUnicode_GET_LENGTH(self),maxcount
9373+
);
9374+
else
9375+
returnucs1lib_rsplit_whitespace(
9376+
(PyObject*)self,PyUnicode_1BYTE_DATA(self),
9377+
PyUnicode_GET_LENGTH(self),maxcount
9378+
);
93309379
casePyUnicode_2BYTE_KIND:
93319380
returnucs2lib_rsplit_whitespace(
93329381
(PyObject*)self,PyUnicode_2BYTE_DATA(self),
@@ -9365,8 +9414,12 @@ rsplit(PyObject *self,
93659414

93669415
switch(kind) {
93679416
casePyUnicode_1BYTE_KIND:
9368-
out=ucs1lib_rsplit(
9369-
(PyObject*)self,buf1,len1,buf2,len2,maxcount);
9417+
if (PyUnicode_IS_ASCII(self)&&PyUnicode_IS_ASCII(substring))
9418+
out=asciilib_rsplit(
9419+
(PyObject*)self,buf1,len1,buf2,len2,maxcount);
9420+
else
9421+
out=ucs1lib_rsplit(
9422+
(PyObject*)self,buf1,len1,buf2,len2,maxcount);
93709423
break;
93719424
casePyUnicode_2BYTE_KIND:
93729425
out=ucs2lib_rsplit(
@@ -9387,12 +9440,15 @@ rsplit(PyObject *self,
93879440
}
93889441

93899442
staticPy_ssize_t
9390-
anylib_find(intkind,void*buf1,Py_ssize_tlen1,
9391-
void*buf2,Py_ssize_tlen2,Py_ssize_toffset)
9443+
anylib_find(intkind,PyObject*str1,void*buf1,Py_ssize_tlen1,
9444+
PyObject*str2,void*buf2,Py_ssize_tlen2,Py_ssize_toffset)
93929445
{
93939446
switch(kind) {
93949447
casePyUnicode_1BYTE_KIND:
9395-
returnucs1lib_find(buf1,len1,buf2,len2,offset);
9448+
if (PyUnicode_IS_ASCII(str1)&&PyUnicode_IS_ASCII(str2))
9449+
returnasciilib_find(buf1,len1,buf2,len2,offset);
9450+
else
9451+
returnucs1lib_find(buf1,len1,buf2,len2,offset);
93969452
casePyUnicode_2BYTE_KIND:
93979453
returnucs2lib_find(buf1,len1,buf2,len2,offset);
93989454
casePyUnicode_4BYTE_KIND:
@@ -9403,12 +9459,15 @@ anylib_find(int kind, void *buf1, Py_ssize_t len1,
94039459
}
94049460

94059461
staticPy_ssize_t
9406-
anylib_count(intkind,void*sbuf,Py_ssize_tslen,
9407-
void*buf1,Py_ssize_tlen1,Py_ssize_tmaxcount)
9462+
anylib_count(intkind,PyObject*sstr,void*sbuf,Py_ssize_tslen,
9463+
PyObject*str1,void*buf1,Py_ssize_tlen1,Py_ssize_tmaxcount)
94089464
{
94099465
switch(kind) {
94109466
casePyUnicode_1BYTE_KIND:
9411-
returnucs1lib_count(sbuf,slen,buf1,len1,maxcount);
9467+
if (PyUnicode_IS_ASCII(sstr)&&PyUnicode_IS_ASCII(str1))
9468+
returnasciilib_count(sbuf,slen,buf1,len1,maxcount);
9469+
else
9470+
returnucs1lib_count(sbuf,slen,buf1,len1,maxcount);
94129471
casePyUnicode_2BYTE_KIND:
94139472
returnucs2lib_count(sbuf,slen,buf1,len1,maxcount);
94149473
casePyUnicode_4BYTE_KIND:
@@ -9497,7 +9556,7 @@ replace(PyObject *self, PyObject *str1,
94979556
if (!buf1) gotoerror;
94989557
release1=1;
94999558
}
9500-
i=anylib_find(rkind,sbuf,slen,buf1,len1,0);
9559+
i=anylib_find(rkind,self,sbuf,slen,str1,buf1,len1,0);
95019560
if (i<0)
95029561
gotonothing;
95039562
if (rkind>kind2) {
@@ -9530,9 +9589,9 @@ replace(PyObject *self, PyObject *str1,
95309589
i+=len1;
95319590

95329591
while (--maxcount>0) {
9533-
i=anylib_find(rkind,sbuf+PyUnicode_KIND_SIZE(rkind,i),
9534-
slen-i,
9535-
buf1,len1,i);
9592+
i=anylib_find(rkind,self,
9593+
sbuf+PyUnicode_KIND_SIZE(rkind,i),slen-i,
9594+
str1,buf1,len1,i);
95369595
if (i==-1)
95379596
break;
95389597
memcpy(res+PyUnicode_KIND_SIZE(rkind,i),
@@ -9557,7 +9616,7 @@ replace(PyObject *self, PyObject *str1,
95579616
if (!buf1) gotoerror;
95589617
release1=1;
95599618
}
9560-
n=anylib_count(rkind,sbuf,slen,buf1,len1,maxcount);
9619+
n=anylib_count(rkind,self,sbuf,slen,str1,buf1,len1,maxcount);
95619620
if (n==0)
95629621
gotonothing;
95639622
if (kind2<rkind) {
@@ -9596,9 +9655,9 @@ replace(PyObject *self, PyObject *str1,
95969655
if (len1>0) {
95979656
while (n-->0) {
95989657
/* look for next match */
9599-
j=anylib_find(rkind,
9600-
sbuf+PyUnicode_KIND_SIZE(rkind,i),
9601-
slen-i,buf1,len1,i);
9658+
j=anylib_find(rkind,self,
9659+
sbuf+PyUnicode_KIND_SIZE(rkind,i),slen-i,
9660+
str1,buf1,len1,i);
96029661
if (j==-1)
96039662
break;
96049663
elseif (j>i) {
@@ -10443,7 +10502,8 @@ unicode_find(PyObject *self, PyObject *args)
1044310502
returnNULL;
1044410503

1044510504
result=any_find_slice(
10446-
ucs1lib_find_slice,ucs2lib_find_slice,ucs4lib_find_slice,
10505+
asciilib_find_slice,ucs1lib_find_slice,
10506+
ucs2lib_find_slice,ucs4lib_find_slice,
1044710507
self, (PyObject*)substring,start,end
1044810508
);
1044910509

@@ -10536,7 +10596,8 @@ unicode_index(PyObject *self, PyObject *args)
1053610596
returnNULL;
1053710597

1053810598
result=any_find_slice(
10539-
ucs1lib_find_slice,ucs2lib_find_slice,ucs4lib_find_slice,
10599+
asciilib_find_slice,ucs1lib_find_slice,
10600+
ucs2lib_find_slice,ucs4lib_find_slice,
1054010601
self, (PyObject*)substring,start,end
1054110602
);
1054210603

@@ -11548,7 +11609,8 @@ unicode_rfind(PyObject *self, PyObject *args)
1154811609
returnNULL;
1154911610

1155011611
result=any_find_slice(
11551-
ucs1lib_rfind_slice,ucs2lib_rfind_slice,ucs4lib_rfind_slice,
11612+
asciilib_rfind_slice,ucs1lib_rfind_slice,
11613+
ucs2lib_rfind_slice,ucs4lib_rfind_slice,
1155211614
self, (PyObject*)substring,start,end
1155311615
);
1155411616

@@ -11583,7 +11645,8 @@ unicode_rindex(PyObject *self, PyObject *args)
1158311645
returnNULL;
1158411646

1158511647
result=any_find_slice(
11586-
ucs1lib_rfind_slice,ucs2lib_rfind_slice,ucs4lib_rfind_slice,
11648+
asciilib_rfind_slice,ucs1lib_rfind_slice,
11649+
ucs2lib_rfind_slice,ucs4lib_rfind_slice,
1158711650
self, (PyObject*)substring,start,end
1158811651
);
1158911652

@@ -11712,7 +11775,10 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
1171211775

1171311776
switch(PyUnicode_KIND(str_obj)) {
1171411777
casePyUnicode_1BYTE_KIND:
11715-
out=ucs1lib_partition(str_obj,buf1,len1,sep_obj,buf2,len2);
11778+
if (PyUnicode_IS_ASCII(str_obj)&&PyUnicode_IS_ASCII(sep_obj))
11779+
out=asciilib_partition(str_obj,buf1,len1,sep_obj,buf2,len2);
11780+
else
11781+
out=ucs1lib_partition(str_obj,buf1,len1,sep_obj,buf2,len2);
1171611782
break;
1171711783
casePyUnicode_2BYTE_KIND:
1171811784
out=ucs2lib_partition(str_obj,buf1,len1,sep_obj,buf2,len2);
@@ -11781,7 +11847,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
1178111847

1178211848
switch(PyUnicode_KIND(str_in)) {
1178311849
casePyUnicode_1BYTE_KIND:
11784-
out=ucs1lib_rpartition(str_obj,buf1,len1,sep_obj,buf2,len2);
11850+
if (PyUnicode_IS_ASCII(str_obj)&&PyUnicode_IS_ASCII(sep_obj))
11851+
out=asciilib_rpartition(str_obj,buf1,len1,sep_obj,buf2,len2);
11852+
else
11853+
out=ucs1lib_rpartition(str_obj,buf1,len1,sep_obj,buf2,len2);
1178511854
break;
1178611855
casePyUnicode_2BYTE_KIND:
1178711856
out=ucs2lib_rpartition(str_obj,buf1,len1,sep_obj,buf2,len2);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp