Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

gh-94808: CoverPyUnicode_Count in CAPI#96929

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Merged
encukou merged 1 commit intopython:mainfromsobolevn:cover-py-unicode-count
Oct 6, 2022

Conversation

sobolevn
Copy link
Member

@sobolevnsobolevn commentedSep 19, 2022
edited by bedevere-bot
Loading

It is heavily inspired by

deftest_count(self):
self.checkequal(3,'aaa','count','a')
self.checkequal(0,'aaa','count','b')
self.checkequal(3,'aaa','count','a')
self.checkequal(0,'aaa','count','b')
self.checkequal(3,'aaa','count','a')
self.checkequal(0,'aaa','count','b')
self.checkequal(0,'aaa','count','b')
self.checkequal(2,'aaa','count','a',1)
self.checkequal(0,'aaa','count','a',10)
self.checkequal(1,'aaa','count','a',-1)
self.checkequal(3,'aaa','count','a',-10)
self.checkequal(1,'aaa','count','a',0,1)
self.checkequal(3,'aaa','count','a',0,10)
self.checkequal(2,'aaa','count','a',0,-1)
self.checkequal(0,'aaa','count','a',0,-10)
self.checkequal(3,'aaa','count','',1)
self.checkequal(1,'aaa','count','',3)
self.checkequal(0,'aaa','count','',10)
self.checkequal(2,'aaa','count','',-1)
self.checkequal(4,'aaa','count','',-10)
self.checkequal(1,'','count','')
self.checkequal(0,'','count','',1,1)
self.checkequal(0,'','count','',sys.maxsize,0)
self.checkequal(0,'','count','xx')
self.checkequal(0,'','count','xx',1,1)
self.checkequal(0,'','count','xx',sys.maxsize,0)
self.checkraises(TypeError,'hello','count')
ifself.contains_bytes:
self.checkequal(0,'hello','count',42)
else:
self.checkraises(TypeError,'hello','count',42)
# For a variety of combinations,
# verify that str.count() matches an equivalent function
# replacing all occurrences and then differencing the string lengths
charset= ['','a','b']
digits=7
base=len(charset)
teststrings=set()
foriinrange(base**digits):
entry= []
forjinrange(digits):
i,m=divmod(i,base)
entry.append(charset[m])
teststrings.add(''.join(entry))
teststrings= [self.fixtype(ts)fortsinteststrings]
foriinteststrings:
n=len(i)
forjinteststrings:
r1=i.count(j)
ifj:
r2,rem=divmod(n-len(i.replace(j,self.fixtype(''))),
len(j))
else:
r2,rem=len(i)+1,0
ifremorr1!=r2:
self.assertEqual(rem,0,'%s != 0 for %s'% (rem,i))
self.assertEqual(r1,r2,'%s != %s for %s'% (r1,r2,i))

Question: what is the historical context on whyPyUnicode_Count is not reused inunicode_count? They look pretty similar:

Py_ssize_t
PyUnicode_Count(PyObject*str,
PyObject*substr,
Py_ssize_tstart,
Py_ssize_tend)
{
Py_ssize_tresult;
intkind1,kind2;
constvoid*buf1=NULL,*buf2=NULL;
Py_ssize_tlen1,len2;
if (ensure_unicode(str)<0||ensure_unicode(substr)<0)
return-1;
kind1=PyUnicode_KIND(str);
kind2=PyUnicode_KIND(substr);
if (kind1<kind2)
return0;
len1=PyUnicode_GET_LENGTH(str);
len2=PyUnicode_GET_LENGTH(substr);
ADJUST_INDICES(start,end,len1);
if (end-start<len2)
return0;
buf1=PyUnicode_DATA(str);
buf2=PyUnicode_DATA(substr);
if (kind2!=kind1) {
buf2=unicode_askind(kind2,buf2,len2,kind1);
if (!buf2)
gotoonError;
}
switch (kind1) {
casePyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(str)&&PyUnicode_IS_ASCII(substr))
result=asciilib_count(
((constPy_UCS1*)buf1)+start,end-start,
buf2,len2,PY_SSIZE_T_MAX
);
else
result=ucs1lib_count(
((constPy_UCS1*)buf1)+start,end-start,
buf2,len2,PY_SSIZE_T_MAX
);
break;
casePyUnicode_2BYTE_KIND:
result=ucs2lib_count(
((constPy_UCS2*)buf1)+start,end-start,
buf2,len2,PY_SSIZE_T_MAX
);
break;
casePyUnicode_4BYTE_KIND:
result=ucs4lib_count(
((constPy_UCS4*)buf1)+start,end-start,
buf2,len2,PY_SSIZE_T_MAX
);
break;
default:
Py_UNREACHABLE();
}
assert((kind2!=kind1)== (buf2!=PyUnicode_DATA(substr)));
if (kind2!=kind1)
PyMem_Free((void*)buf2);
returnresult;
onError:
assert((kind2!=kind1)== (buf2!=PyUnicode_DATA(substr)));
if (kind2!=kind1)
PyMem_Free((void*)buf2);
return-1;
}

And

staticPyObject*
unicode_count(PyObject*self,PyObject*args)
{
PyObject*substring=NULL;/* initialize to fix a compiler warning */
Py_ssize_tstart=0;
Py_ssize_tend=PY_SSIZE_T_MAX;
PyObject*result;
intkind1,kind2;
constvoid*buf1,*buf2;
Py_ssize_tlen1,len2,iresult;
if (!parse_args_finds_unicode("count",args,&substring,&start,&end))
returnNULL;
kind1=PyUnicode_KIND(self);
kind2=PyUnicode_KIND(substring);
if (kind1<kind2)
returnPyLong_FromLong(0);
len1=PyUnicode_GET_LENGTH(self);
len2=PyUnicode_GET_LENGTH(substring);
ADJUST_INDICES(start,end,len1);
if (end-start<len2)
returnPyLong_FromLong(0);
buf1=PyUnicode_DATA(self);
buf2=PyUnicode_DATA(substring);
if (kind2!=kind1) {
buf2=unicode_askind(kind2,buf2,len2,kind1);
if (!buf2)
returnNULL;
}
switch (kind1) {
casePyUnicode_1BYTE_KIND:
iresult=ucs1lib_count(
((constPy_UCS1*)buf1)+start,end-start,
buf2,len2,PY_SSIZE_T_MAX
);
break;
casePyUnicode_2BYTE_KIND:
iresult=ucs2lib_count(
((constPy_UCS2*)buf1)+start,end-start,
buf2,len2,PY_SSIZE_T_MAX
);
break;
casePyUnicode_4BYTE_KIND:
iresult=ucs4lib_count(
((constPy_UCS4*)buf1)+start,end-start,
buf2,len2,PY_SSIZE_T_MAX
);
break;
default:
Py_UNREACHABLE();
}
result=PyLong_FromSsize_t(iresult);
assert((kind2==kind1)== (buf2==PyUnicode_DATA(substring)));
if (kind2!=kind1)
PyMem_Free((void*)buf2);
returnresult;
}

@sobolevnsobolevn added testsTests in the Lib/test dir skip news labelsSep 19, 2022
@mdboom
Copy link
Contributor

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count?

It looks like these both date to the same commit d57fd91 from 2000-03-10. They were pretty different then, but are almost the same now. I see some benefit in makingunicode_count callPyUnicode_Count to make sure they remain consistent, but I could also see someone seeing this as "churn for churn's sake".

Note there is alsoanylib_count which is a subset ofunicode_count andPyUnicode_Count.

There are a few other instances of this kind of thing I've come across looking at coverage -- it would be good to get a core developer's take on whether merging internal and external functions where they are clearly wrappable like this would be welcome.

@encukou
Copy link
Member

Apparentlyunicode_count missedan optimization in 2011, otherwise they're equivalent (except arg parsing & converting the return value). Merging them could add the optimization tounicode_count.
If you want to work on that, note that there's alsoanylib_count that duplicates the mainswitch.

@sobolevn
Copy link
MemberAuthor

Thanks! Yes, I would like to do that! I will open a new issue for it.

carljm added a commit to carljm/cpython that referenced this pull requestOct 6, 2022
* main:pythonGH-88050: fix race in closing subprocess pipe in asyncio  (python#97951)pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962)pythongh-95986: Fix the example using match keyword (python#95989)pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944)pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929)pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610)pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703)pythonGH-88968: Add notes about socket ownership transfers (python#97936)pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528)
carljm added a commit to carljm/cpython that referenced this pull requestOct 8, 2022
* main: (53 commits)pythongh-94808: Coverage: Test that maximum indentation level is handled (python#95926)pythonGH-88050: fix race in closing subprocess pipe in asyncio  (python#97951)pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962)pythongh-95986: Fix the example using match keyword (python#95989)pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944)pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929)pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610)pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703)pythonGH-88968: Add notes about socket ownership transfers (python#97936)pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528)pythongh-65961: Raise `DeprecationWarning` when `__package__` differs from `__spec__.parent` (python#97879)  docs(typing): add "see PEP 675" to LiteralString (python#97926)pythongh-97850: Remove all known instances of module_repr() (python#97876)  I changed my surname early this year (python#96671)pythongh-93738: Documentation C syntax (:c:type:<C type> -> :c:expr:<C type>) (python#97768)pythongh-91539: improve performance of get_proxies_environment  (python#91566)  build(deps): bump actions/stale from 5 to 6 (python#97701)pythonGH-95172 Make the same version `versionadded` oneline (python#95172)pythongh-88050: Fix asyncio subprocess to kill process cleanly when process is blocked (python#32073)pythongh-93738: Documentation C syntax (Function glob patterns -> literal markup) (python#97774)  ...
mpage pushed a commit to mpage/cpython that referenced this pull requestOct 11, 2022
Sign up for freeto join this conversation on GitHub. Already have an account?Sign in to comment
Reviewers

@encukouencukouencukou approved these changes

Assignees
No one assigned
Labels
skip newstestsTests in the Lib/test dir
Projects
None yet
Milestone
No milestone
Development

Successfully merging this pull request may close these issues.

4 participants
@sobolevn@mdboom@encukou@bedevere-bot

[8]ページ先頭

©2009-2025 Movatter.jp