Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a84a56d

Browse files
gh-91760: More strict rules for numerical group references and group names in RE (GH-91792)
Only sequence of ASCII digits is now accepted as a numerical reference. The group name in bytes patterns and replacement strings can now only contain ASCII letters and digits and underscore.
1 parent 7b024e3 commit a84a56d

File tree

5 files changed

+62
-91
lines changed

5 files changed

+62
-91
lines changed

‎Doc/library/re.rst

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -395,7 +395,8 @@ The special characters are:
395395
``(?P<name>...)``
396396
Similar to regular parentheses, but the substring matched by the group is
397397
accessible via the symbolic group name *name*. Group names must be valid
398-
Python identifiers, and each group name must be defined only once within a
398+
Python identifiers, and in bytes patterns they must contain only characters
399+
in the ASCII range. Each group name must be defined only once within a
399400
regular expression. A symbolic group is also a numbered group, just as if
400401
the group were not named.
401402

@@ -417,8 +418,9 @@ The special characters are:
417418
|                                       | * ``\1``                         |
418419
+---------------------------------------+----------------------------------+
419420

420-
.. deprecated:: 3.11
421-
Group names containing non-ASCII characters in bytes patterns.
421+
.. versionchanged:: 3.12
422+
In bytes patterns group names must contain only characters in
423+
the ASCII range.
422424

423425
.. index:: single: (?P=; in regular expressions
424426

@@ -489,8 +491,8 @@ The special characters are:
489491
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
490492
not with ``'<user@host.com'`` nor ``'user@host.com>'``.
491493

492-
.. deprecated:: 3.11
493-
Group *id* containing anything except ASCII digits.
494+
.. versionchanged:: 3.12
495+
Group *id* can only contain ASCII digits.
494496

495497

496498
The special sequences consist of ``'\'`` and a character from the list below.
@@ -1001,9 +1003,10 @@ form.
10011003
Empty matches for the pattern are replaced when adjacent to a previous
10021004
non-empty match.
10031005

1004-
.. deprecated:: 3.11
1005-
Group *id* containing anything except ASCII digits.
1006-
Group names containing non-ASCII characters in bytes replacement strings.
1006+
.. versionchanged:: 3.12
1007+
Group *id* can only contain ASCII digits.
1008+
In bytes replacement strings group names must contain only characters
1009+
in the ASCII range.
10071010

10081011

10091012
.. function:: subn(pattern, repl, string, count=0, flags=0)

‎Doc/whatsnew/3.12.rst

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,3 +114,13 @@ Porting to Python 3.12
114114

115115
This section lists previously described changes and other bugfixes
116116
that may require changes to your code.
117+
118+
Changes in the Python API
119+
-------------------------
120+
121+
* More strict rules are now applied for numerical group references and
122+
group names in regular expressions.
123+
Only sequence of ASCII digits is now accepted as a numerical reference.
124+
The group name in bytes patterns and replacement strings can now only
125+
contain ASCII letters and digits and underscore.
126+
(Contributed by Serhiy Storchaka in :gh:`91760`.)

‎Lib/re/_parser.py

Lines changed: 12 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -291,17 +291,13 @@ def error(self, msg, offset=0):
291291
msg=msg.encode('ascii','backslashreplace').decode('ascii')
292292
returnerror(msg,self.string,self.tell()-offset)
293293

294-
defcheckgroupname(self,name,offset,nested):
294+
defcheckgroupname(self,name,offset):
295+
ifnot (self.istextorname.isascii()):
296+
msg="bad character in group name %a"%name
297+
raiseself.error(msg,len(name)+offset)
295298
ifnotname.isidentifier():
296299
msg="bad character in group name %r"%name
297300
raiseself.error(msg,len(name)+offset)
298-
ifnot (self.istextorname.isascii()):
299-
importwarnings
300-
warnings.warn(
301-
"bad character in group name %a at position %d"%
302-
(name,self.tell()-len(name)-offset),
303-
DeprecationWarning,stacklevel=nested+7
304-
)
305301

306302
def_class_escape(source,escape):
307303
# handle escape code inside character class
@@ -717,11 +713,11 @@ def _parse(source, state, verbose, nested, first=False):
717713
ifsourcematch("<"):
718714
# named group: skip forward to end of name
719715
name=source.getuntil(">","group name")
720-
source.checkgroupname(name,1,nested)
716+
source.checkgroupname(name,1)
721717
elifsourcematch("="):
722718
# named backreference
723719
name=source.getuntil(")","group name")
724-
source.checkgroupname(name,1,nested)
720+
source.checkgroupname(name,1)
725721
gid=state.groupdict.get(name)
726722
ifgidisNone:
727723
msg="unknown group name %r"%name
@@ -782,20 +778,14 @@ def _parse(source, state, verbose, nested, first=False):
782778
elifchar=="(":
783779
# conditional backreference group
784780
condname=source.getuntil(")","group name")
785-
ifcondname.isidentifier():
786-
source.checkgroupname(condname,1,nested)
781+
ifnot (condname.isdecimal()andcondname.isascii()):
782+
source.checkgroupname(condname,1)
787783
condgroup=state.groupdict.get(condname)
788784
ifcondgroupisNone:
789785
msg="unknown group name %r"%condname
790786
raisesource.error(msg,len(condname)+1)
791787
else:
792-
try:
793-
condgroup=int(condname)
794-
ifcondgroup<0:
795-
raiseValueError
796-
exceptValueError:
797-
msg="bad character in group name %r"%condname
798-
raisesource.error(msg,len(condname)+1)fromNone
788+
condgroup=int(condname)
799789
ifnotcondgroup:
800790
raisesource.error("bad group number",
801791
len(condname)+1)
@@ -1022,20 +1012,14 @@ def addgroup(index, pos):
10221012
ifnots.match("<"):
10231013
raises.error("missing <")
10241014
name=s.getuntil(">","group name")
1025-
ifname.isidentifier():
1026-
s.checkgroupname(name,1,-1)
1015+
ifnot (name.isdecimal()andname.isascii()):
1016+
s.checkgroupname(name,1)
10271017
try:
10281018
index=groupindex[name]
10291019
exceptKeyError:
10301020
raiseIndexError("unknown group name %r"%name)fromNone
10311021
else:
1032-
try:
1033-
index=int(name)
1034-
ifindex<0:
1035-
raiseValueError
1036-
exceptValueError:
1037-
raises.error("bad character in group name %r"%name,
1038-
len(name)+1)fromNone
1022+
index=int(name)
10391023
ifindex>=MAXGROUPS:
10401024
raises.error("invalid group reference %d"%index,
10411025
len(name)+1)

‎Lib/test/test_re.py

Lines changed: 24 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -275,21 +275,12 @@ def test_symbolic_groups_errors(self):
275275
self.checkPatternError('(?P<©>x)',"bad character in group name '©'",4)
276276
self.checkPatternError('(?P=©)',"bad character in group name '©'",4)
277277
self.checkPatternError('(?(©)y)',"bad character in group name '©'",3)
278-
withself.assertWarnsRegex(DeprecationWarning,
279-
r"bad character in group name '\\xc2\\xb5' "
280-
r"at position 4")asw:
281-
re.compile(b'(?P<\xc2\xb5>x)')
282-
self.assertEqual(w.filename,__file__)
283-
withself.assertWarnsRegex(DeprecationWarning,
284-
r"bad character in group name '\\xc2\\xb5' "
285-
r"at position 4"):
286-
self.checkPatternError(b'(?P=\xc2\xb5)',
287-
r"unknown group name '\xc2\xb5'",4)
288-
withself.assertWarnsRegex(DeprecationWarning,
289-
r"bad character in group name '\\xc2\\xb5' "
290-
r"at position 3"):
291-
self.checkPatternError(b'(?(\xc2\xb5)y)',
292-
r"unknown group name '\xc2\xb5'",3)
278+
self.checkPatternError(b'(?P<\xc2\xb5>x)',
279+
r"bad character in group name '\xc2\xb5'",4)
280+
self.checkPatternError(b'(?P=\xc2\xb5)',
281+
r"bad character in group name '\xc2\xb5'",4)
282+
self.checkPatternError(b'(?(\xc2\xb5)y)',
283+
r"bad character in group name '\xc2\xb5'",3)
293284

294285
deftest_symbolic_refs(self):
295286
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)',r'\g<b>','xx'),'')
@@ -322,35 +313,22 @@ def test_symbolic_refs_errors(self):
322313
re.sub('(?P<a>x)',r'\g<ab>','xx')
323314
self.checkTemplateError('(?P<a>x)',r'\g<-1>','xx',
324315
"bad character in group name '-1'",3)
325-
withself.assertWarnsRegex(DeprecationWarning,
326-
r"bad character in group name '\+1' "
327-
r"at position 3")asw:
328-
re.sub('(?P<a>x)',r'\g<+1>','xx')
329-
self.assertEqual(w.filename,__file__)
330-
withself.assertWarnsRegex(DeprecationWarning,
331-
r"bad character in group name '1_0' "
332-
r"at position 3"):
333-
re.sub('()'*10,r'\g<1_0>','xx')
334-
withself.assertWarnsRegex(DeprecationWarning,
335-
r"bad character in group name ' 1 ' "
336-
r"at position 3"):
337-
re.sub('(?P<a>x)',r'\g< 1 >','xx')
316+
self.checkTemplateError('(?P<a>x)',r'\g<+1>','xx',
317+
"bad character in group name '+1'",3)
318+
self.checkTemplateError('()'*10,r'\g<1_0>','xx',
319+
"bad character in group name '1_0'",3)
320+
self.checkTemplateError('(?P<a>x)',r'\g< 1 >','xx',
321+
"bad character in group name ' 1 '",3)
338322
self.checkTemplateError('(?P<a>x)',r'\g<©>','xx',
339323
"bad character in group name '©'",3)
340-
withself.assertWarnsRegex(DeprecationWarning,
341-
r"bad character in group name '\\xc2\\xb5' "
342-
r"at position 3")asw:
343-
withself.assertRaisesRegex(IndexError,"unknown group name '\xc2\xb5'"):
344-
re.sub(b'(?P<a>x)',b'\\g<\xc2\xb5>',b'xx')
345-
self.assertEqual(w.filename,__file__)
324+
self.checkTemplateError(b'(?P<a>x)',b'\\g<\xc2\xb5>',b'xx',
325+
r"bad character in group name '\xc2\xb5'",3)
346326
self.checkTemplateError('(?P<a>x)',r'\g<㊀>','xx',
347327
"bad character in group name '㊀'",3)
348328
self.checkTemplateError('(?P<a>x)',r'\g<¹>','xx',
349329
"bad character in group name '¹'",3)
350-
withself.assertWarnsRegex(DeprecationWarning,
351-
r"bad character in group name '१' "
352-
r"at position 3"):
353-
re.sub('(?P<a>x)',r'\g<१>','xx')
330+
self.checkTemplateError('(?P<a>x)',r'\g<१>','xx',
331+
"bad character in group name '१'",3)
354332

355333
deftest_re_subn(self):
356334
self.assertEqual(re.subn("(?i)b+","x","bbbb BBBB"), ('x x',2))
@@ -616,27 +594,18 @@ def test_re_groupref_exists_errors(self):
616594
self.checkPatternError(r'(?P<a>)(?(0)a|b)','bad group number',10)
617595
self.checkPatternError(r'()(?(-1)a|b)',
618596
"bad character in group name '-1'",5)
619-
withself.assertWarnsRegex(DeprecationWarning,
620-
r"bad character in group name '\+1' "
621-
r"at position 5")asw:
622-
re.compile(r'()(?(+1)a|b)')
623-
self.assertEqual(w.filename,__file__)
624-
withself.assertWarnsRegex(DeprecationWarning,
625-
r"bad character in group name '1_0' "
626-
r"at position 23"):
627-
re.compile(r'()'*10+r'(?(1_0)a|b)')
628-
withself.assertWarnsRegex(DeprecationWarning,
629-
r"bad character in group name ' 1 ' "
630-
r"at position 5"):
631-
re.compile(r'()(?( 1 )a|b)')
597+
self.checkPatternError(r'()(?(+1)a|b)',
598+
"bad character in group name '+1'",5)
599+
self.checkPatternError(r'()'*10+r'(?(1_0)a|b)',
600+
"bad character in group name '1_0'",23)
601+
self.checkPatternError(r'()(?( 1 )a|b)',
602+
"bad character in group name ' 1 '",5)
632603
self.checkPatternError(r'()(?(㊀)a|b)',
633604
"bad character in group name '㊀'",5)
634605
self.checkPatternError(r'()(?(¹)a|b)',
635606
"bad character in group name '¹'",5)
636-
withself.assertWarnsRegex(DeprecationWarning,
637-
r"bad character in group name '१' "
638-
r"at position 5"):
639-
re.compile(r'()(?(१)a|b)')
607+
self.checkPatternError(r'()(?(१)a|b)',
608+
"bad character in group name '१'",5)
640609
self.checkPatternError(r'()(?(1',
641610
"missing ), unterminated name",5)
642611
self.checkPatternError(r'()(?(1)a',
Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
Apply more strict rules for numerical group references and group names in
2+
regular expressions. Only sequence of ASCII digits is now accepted as
3+
a numerical reference. The group name in
4+
bytes patterns and replacement strings can now only contain ASCII letters
5+
and digits and underscore.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp