Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 8909d14

Browse files
gh-91760: More strict rules for numerical group references and group names in RE
Only sequence of ASCII digits not starting with 0 (except group 0) is now accepted as a numerical reference. The group name in bytes patterns and replacement strings can now only contain ASCII letters and digits and underscore.
1 parent 944fffe commit 8909d14

File tree

5 files changed

+84
-33
lines changed

5 files changed

+84
-33
lines changed

‎Doc/library/re.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,8 @@ The special characters are:
395395
``(?P<name>...)``
396396
Similar to regular parentheses, but the substring matched by the group is
397397
accessible via the symbolic group name *name*. Group names must be valid
398-
Python identifiers, and each group name must be defined only once within a
398+
Python identifiers, and in bytes patterns they must contain only characters
399+
in the ASCII range. Each group name must be defined only once within a
399400
regular expression. A symbolic group is also a numbered group, just as if
400401
the group were not named.
401402

@@ -417,6 +418,10 @@ The special characters are:
417418
|| * ``\1``|
418419
+---------------------------------------+----------------------------------+
419420

421+
.. versionchanged:: 3.11
422+
In bytes patterns group names must contain only characters in
423+
the ASCII range.
424+
420425
.. index:: single: (?P=; in regular expressions
421426

422427
``(?P=name)``
@@ -486,6 +491,9 @@ The special characters are:
486491
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
487492
not with ``'<user@host.com'`` nor ``'user@host.com>'``.
488493

494+
.. versionchanged:: 3.11
495+
Group *id* can only contain ASCII digits and cannot start with ``0``.
496+
489497

490498
The special sequences consist of ``'\'`` and a character from the list below.
491499
If the ordinary character is not an ASCII digit or an ASCII letter, then the
@@ -995,6 +1003,12 @@ form.
9951003
Empty matches for the pattern are replaced when adjacent to a previous
9961004
non-empty match.
9971005

1006+
.. versionchanged:: 3.11
1007+
Group *id* can only contain ASCII digits and cannot start with ``0``
1008+
(except group 0).
1009+
In bytes replacement strings group names must contain only characters
1010+
in the ASCII range.
1011+
9981012

9991013
.. function:: subn(pattern, repl, string, count=0, flags=0)
10001014

‎Doc/whatsnew/3.11.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,6 +1060,14 @@ Changes in the Python API
10601060
before.
10611061
(Contributed by Ma Lin in :issue:`35859`.)
10621062

1063+
* More strict rules are now applied for numerical group references and
1064+
group names in regular expressions.
1065+
Only sequence of ASCII digits not starting with ``0`` (except group 0) is
1066+
now accepted as a numerical reference.
1067+
The group name in bytes patterns and replacement strings can now only
1068+
contain ASCII letters and digits and underscore.
1069+
(Contributed by Serhiy Storchaka in :issue:`91760`.)
1070+
10631071
* The *population* parameter of :func:`random.sample` must be a sequence.
10641072
Automatic conversion of sets to lists is no longer supported. If the sample size
10651073
is larger than the population size, a :exc:`ValueError` is raised.

‎Lib/re/_parser.py

Lines changed: 27 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,14 @@ def seek(self, index):
295295
deferror(self,msg,offset=0):
296296
returnerror(msg,self.string,self.tell()-offset)
297297

298+
defcheckgroupname(self,name,offset):
299+
ifnot (self.istextorname.isascii()):
300+
msg="bad character in group name %a"%name
301+
raiseself.error(msg,len(name)+offset)
302+
ifnotname.isidentifier():
303+
msg="bad character in group name %r"%name
304+
raiseself.error(msg,len(name)+offset)
305+
298306
def_class_escape(source,escape):
299307
# handle escape code inside character class
300308
code=ESCAPES.get(escape)
@@ -707,15 +715,11 @@ def _parse(source, state, verbose, nested, first=False):
707715
ifsourcematch("<"):
708716
# named group: skip forward to end of name
709717
name=source.getuntil(">","group name")
710-
ifnotname.isidentifier():
711-
msg="bad character in group name %r"%name
712-
raisesource.error(msg,len(name)+1)
718+
source.checkgroupname(name,1)
713719
elifsourcematch("="):
714720
# named backreference
715721
name=source.getuntil(")","group name")
716-
ifnotname.isidentifier():
717-
msg="bad character in group name %r"%name
718-
raisesource.error(msg,len(name)+1)
722+
source.checkgroupname(name,1)
719723
gid=state.groupdict.get(name)
720724
ifgidisNone:
721725
msg="unknown group name %r"%name
@@ -776,25 +780,21 @@ def _parse(source, state, verbose, nested, first=False):
776780
elifchar=="(":
777781
# conditional backreference group
778782
condname=source.getuntil(")","group name")
779-
ifcondname.isidentifier():
780-
condgroup=state.groupdict.get(condname)
781-
ifcondgroupisNone:
782-
msg="unknown group name %r"%condname
783-
raisesource.error(msg,len(condname)+1)
784-
else:
785-
try:
786-
condgroup=int(condname)
787-
ifcondgroup<0:
788-
raiseValueError
789-
exceptValueError:
790-
msg="bad character in group name %r"%condname
791-
raisesource.error(msg,len(condname)+1)fromNone
783+
if (condname.isdecimal()andcondname.isascii()and
784+
(condname[0]!="0"orcondname=="0")):
785+
condgroup=int(condname)
792786
ifnotcondgroup:
793787
raisesource.error("bad group number",
794788
len(condname)+1)
795789
ifcondgroup>=MAXGROUPS:
796790
msg="invalid group reference %d"%condgroup
797791
raisesource.error(msg,len(condname)+1)
792+
else:
793+
source.checkgroupname(condname,1)
794+
condgroup=state.groupdict.get(condname)
795+
ifcondgroupisNone:
796+
msg="unknown group name %r"%condname
797+
raisesource.error(msg,len(condname)+1)
798798
state.checklookbehindgroup(condgroup,source)
799799
item_yes=_parse(source,state,verbose,nested+1)
800800
ifsource.match("|"):
@@ -1006,26 +1006,21 @@ def addgroup(index, pos):
10061006
# group
10071007
c=this[1]
10081008
ifc=="g":
1009-
name=""
10101009
ifnots.match("<"):
10111010
raises.error("missing <")
10121011
name=s.getuntil(">","group name")
1013-
ifname.isidentifier():
1012+
if (name.isdecimal()andname.isascii()and
1013+
(name[0]!="0"orname=="0")):
1014+
index=int(name)
1015+
ifindex>=MAXGROUPS:
1016+
raises.error("invalid group reference %d"%index,
1017+
len(name)+1)
1018+
else:
1019+
s.checkgroupname(name,1)
10141020
try:
10151021
index=groupindex[name]
10161022
exceptKeyError:
10171023
raiseIndexError("unknown group name %r"%name)fromNone
1018-
else:
1019-
try:
1020-
index=int(name)
1021-
ifindex<0:
1022-
raiseValueError
1023-
exceptValueError:
1024-
raises.error("bad character in group name %r"%name,
1025-
len(name)+1)fromNone
1026-
ifindex>=MAXGROUPS:
1027-
raises.error("invalid group reference %d"%index,
1028-
len(name)+1)
10291024
addgroup(index,len(name)+1)
10301025
elifc=="0":
10311026
ifs.nextinOCTDIGITS:

‎Lib/test/test_re.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ def test_basic_re_sub(self):
135135
self.assertEqual(re.sub('(?P<a>x)',r'\g<a>\g<1>','xx'),'xxxx')
136136
self.assertEqual(re.sub('(?P<unk>x)',r'\g<unk>\g<unk>','xx'),'xxxx')
137137
self.assertEqual(re.sub('(?P<unk>x)',r'\g<1>\g<1>','xx'),'xxxx')
138+
self.assertEqual(re.sub('()x',r'\g<0>\g<0>','xx'),'xxxx')
138139

139140
self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b','a'),'\t\n\v\r\f\a\b')
140141
self.assertEqual(re.sub('a','\t\n\v\r\f\a\b','a'),'\t\n\v\r\f\a\b')
@@ -274,6 +275,12 @@ def test_symbolic_groups_errors(self):
274275
self.checkPatternError('(?P<©>x)',"bad character in group name '©'",4)
275276
self.checkPatternError('(?P=©)',"bad character in group name '©'",4)
276277
self.checkPatternError('(?(©)y)',"bad character in group name '©'",3)
278+
self.checkPatternError(b'(?P<\xc2\xb5>x)',
279+
r"bad character in group name '\xc2\xb5'",4)
280+
self.checkPatternError(b'(?P=\xc2\xb5)',
281+
r"bad character in group name '\xc2\xb5'",4)
282+
self.checkPatternError(b'(?(\xc2\xb5)y)',
283+
r"bad character in group name '\xc2\xb5'",3)
277284

278285
deftest_symbolic_refs(self):
279286
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)',r'\g<b>','xx'),'')
@@ -306,12 +313,24 @@ def test_symbolic_refs_errors(self):
306313
re.sub('(?P<a>x)',r'\g<ab>','xx')
307314
self.checkTemplateError('(?P<a>x)',r'\g<-1>','xx',
308315
"bad character in group name '-1'",3)
316+
self.checkTemplateError('(?P<a>x)',r'\g<+1>','xx',
317+
"bad character in group name '+1'",3)
318+
self.checkTemplateError('(?P<a>x)',r'\g<01>','xx',
319+
"bad character in group name '01'",3)
320+
self.checkTemplateError('()'*10,r'\g<1_0>','xx',
321+
"bad character in group name '1_0'",3)
322+
self.checkTemplateError('(?P<a>x)',r'\g< 1 >','xx',
323+
"bad character in group name ' 1 '",3)
309324
self.checkTemplateError('(?P<a>x)',r'\g<©>','xx',
310325
"bad character in group name '©'",3)
326+
self.checkTemplateError(b'(?P<a>x)',b'\\g<\xc2\xb5>',b'xx',
327+
r"bad character in group name '\xc2\xb5'",3)
311328
self.checkTemplateError('(?P<a>x)',r'\g<㊀>','xx',
312329
"bad character in group name '㊀'",3)
313330
self.checkTemplateError('(?P<a>x)',r'\g<¹>','xx',
314331
"bad character in group name '¹'",3)
332+
self.checkTemplateError('(?P<a>x)',r'\g<१>','xx',
333+
"bad character in group name '१'",3)
315334

316335
deftest_re_subn(self):
317336
self.assertEqual(re.subn("(?i)b+","x","bbbb BBBB"), ('x x',2))
@@ -577,10 +596,20 @@ def test_re_groupref_exists_errors(self):
577596
self.checkPatternError(r'(?P<a>)(?(0)a|b)','bad group number',10)
578597
self.checkPatternError(r'()(?(-1)a|b)',
579598
"bad character in group name '-1'",5)
599+
self.checkPatternError(r'()(?(+1)a|b)',
600+
"bad character in group name '+1'",5)
601+
self.checkPatternError(r'()(?(01)a|b)',
602+
"bad character in group name '01'",5)
603+
self.checkPatternError(r'()'*10+r'(?(1_0)a|b)',
604+
"bad character in group name '1_0'",23)
605+
self.checkPatternError(r'()(?( 1 )a|b)',
606+
"bad character in group name ' 1 '",5)
580607
self.checkPatternError(r'()(?(㊀)a|b)',
581608
"bad character in group name '㊀'",5)
582609
self.checkPatternError(r'()(?(¹)a|b)',
583610
"bad character in group name '¹'",5)
611+
self.checkPatternError(r'()(?(१)a|b)',
612+
"bad character in group name '१'",5)
584613
self.checkPatternError(r'()(?(1',
585614
"missing ), unterminated name",5)
586615
self.checkPatternError(r'()(?(1)a',
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Apply more strict rules for numerical group references and group names in
2+
regular expressions. Only sequence of ASCII digits not starting with 0
3+
(except group 0) is now accepted as a numerical reference. The group name in
4+
bytes patterns and replacement strings can now only contain ASCII letters
5+
and digits and underscore.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp