Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 7d1f50c

Browse files
sethmlarson, Eclips4, and gpshead
authored
[3.8] gh-121285: Remove backtracking when parsing tarfile headers (GH-121286) (#123642)

* Remove backtracking when parsing tarfile headers
* Rewrite PAX header parsing to be stricter
* Optimize parsing of GNU extended sparse headers v0.0

(cherry picked from commit 34ddb64)

Co-authored-by: Seth Michael Larson <seth@python.org>
Co-authored-by: Kirill Podoprigora <kirill.bast9@mail.ru>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
1 parent 7bc367e, commit 7d1f50c

File tree

3 files changed

+111
-38
lines changed

3 files changed

+111
-38
lines changed

‎Lib/tarfile.py‎

Lines changed: 67 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -840,6 +840,9 @@ def data_filter(member, dest_path):
840840
# Sentinel for replace() defaults, meaning "don't change the attribute"
841841
_KEEP=object()
842842

843+
# Header length is digits followed by a space.
844+
_header_length_prefix_re=re.compile(br"([0-9]{1,20}) ")
845+
843846
classTarInfo(object):
844847
"""Informational class which holds the details about an
845848
archive member given by a tar header block.
@@ -1390,59 +1393,76 @@ def _proc_pax(self, tarfile):
13901393
else:
13911394
pax_headers=tarfile.pax_headers.copy()
13921395

1393-
# Check if the pax header contains a hdrcharset field. This tells us
1394-
# the encoding of the path, linkpath, uname and gname fields. Normally,
1395-
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
1396-
# implementations are allowed to store them as raw binary strings if
1397-
# the translation to UTF-8 fails.
1398-
match=re.search(br"\d+ hdrcharset=([^\n]+)\n",buf)
1399-
ifmatchisnotNone:
1400-
pax_headers["hdrcharset"]=match.group(1).decode("utf-8")
1401-
1402-
# For the time being, we don't care about anything other than "BINARY".
1403-
# The only other value that is currently allowed by the standard is
1404-
# "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1405-
hdrcharset=pax_headers.get("hdrcharset")
1406-
ifhdrcharset=="BINARY":
1407-
encoding=tarfile.encoding
1408-
else:
1409-
encoding="utf-8"
1410-
14111396
# Parse pax header information. A record looks like that:
14121397
# "%d %s=%s\n" % (length, keyword, value). length is the size
14131398
# of the complete record including the length field itself and
1414-
# the newline. keyword and value are both UTF-8 encoded strings.
1415-
regex=re.compile(br"(\d+) ([^=]+)=")
1399+
# the newline.
14161400
pos=0
1417-
whileTrue:
1418-
match=regex.match(buf,pos)
1419-
ifnotmatch:
1420-
break
1401+
encoding=None
1402+
raw_headers= []
1403+
whilelen(buf)>posandbuf[pos]!=0x00:
1404+
ifnot (match:=_header_length_prefix_re.match(buf,pos)):
1405+
raiseInvalidHeaderError("invalid header")
1406+
try:
1407+
length=int(match.group(1))
1408+
exceptValueError:
1409+
raiseInvalidHeaderError("invalid header")
1410+
# Headers must be at least 5 bytes, shortest being '5 x=\n'.
1411+
# Value is allowed to be empty.
1412+
iflength<5:
1413+
raiseInvalidHeaderError("invalid header")
1414+
ifpos+length>len(buf):
1415+
raiseInvalidHeaderError("invalid header")
14211416

1422-
length,keyword=match.groups()
1423-
length=int(length)
1424-
iflength==0:
1417+
header_value_end_offset=match.start(1)+length-1# Last byte of the header
1418+
keyword_and_value=buf[match.end(1)+1:header_value_end_offset]
1419+
raw_keyword,equals,raw_value=keyword_and_value.partition(b"=")
1420+
1421+
# Check the framing of the header. The last character must be '\n' (0x0A)
1422+
ifnotraw_keywordorequals!=b"="orbuf[header_value_end_offset]!=0x0A:
14251423
raiseInvalidHeaderError("invalid header")
1426-
value=buf[match.end(2)+1:match.start(1)+length-1]
1424+
raw_headers.append((length,raw_keyword,raw_value))
1425+
1426+
# Check if the pax header contains a hdrcharset field. This tells us
1427+
# the encoding of the path, linkpath, uname and gname fields. Normally,
1428+
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
1429+
# implementations are allowed to store them as raw binary strings if
1430+
# the translation to UTF-8 fails. For the time being, we don't care about
1431+
# anything other than "BINARY". The only other value that is currently
1432+
# allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1433+
# Note that we only follow the initial 'hdrcharset' setting to preserve
1434+
# the initial behavior of the 'tarfile' module.
1435+
ifraw_keyword==b"hdrcharset"andencodingisNone:
1436+
ifraw_value==b"BINARY":
1437+
encoding=tarfile.encoding
1438+
else:# This branch ensures only the first 'hdrcharset' header is used.
1439+
encoding="utf-8"
1440+
1441+
pos+=length
14271442

1443+
# If no explicit hdrcharset is set, we use UTF-8 as a default.
1444+
ifencodingisNone:
1445+
encoding="utf-8"
1446+
1447+
# After parsing the raw headers we can decode them to text.
1448+
forlength,raw_keyword,raw_valueinraw_headers:
14281449
# Normally, we could just use "utf-8" as the encoding and "strict"
14291450
# as the error handler, but we better not take the risk. For
14301451
# example, GNU tar <= 1.23 is known to store filenames it cannot
14311452
# translate to UTF-8 as raw strings (unfortunately without a
14321453
# hdrcharset=BINARY header).
14331454
# We first try the strict standard encoding, and if that fails we
14341455
# fall back on the user's encoding and error handler.
1435-
keyword=self._decode_pax_field(keyword,"utf-8","utf-8",
1456+
keyword=self._decode_pax_field(raw_keyword,"utf-8","utf-8",
14361457
tarfile.errors)
14371458
ifkeywordinPAX_NAME_FIELDS:
1438-
value=self._decode_pax_field(value,encoding,tarfile.encoding,
1459+
value=self._decode_pax_field(raw_value,encoding,tarfile.encoding,
14391460
tarfile.errors)
14401461
else:
1441-
value=self._decode_pax_field(value,"utf-8","utf-8",
1462+
value=self._decode_pax_field(raw_value,"utf-8","utf-8",
14421463
tarfile.errors)
14431464

14441465
pax_headers[keyword]=value
1445-
pos+=length
14461466

14471467
# Fetch the next header.
14481468
try:
@@ -1457,7 +1477,7 @@ def _proc_pax(self, tarfile):
14571477

14581478
elif"GNU.sparse.size"inpax_headers:
14591479
# GNU extended sparse format version 0.0.
1460-
self._proc_gnusparse_00(next,pax_headers,buf)
1480+
self._proc_gnusparse_00(next,raw_headers)
14611481

14621482
elifpax_headers.get("GNU.sparse.major")=="1"andpax_headers.get("GNU.sparse.minor")=="0":
14631483
# GNU extended sparse format version 1.0.
@@ -1479,15 +1499,24 @@ def _proc_pax(self, tarfile):
14791499

14801500
returnnext
14811501

1482-
def_proc_gnusparse_00(self,next,pax_headers,buf):
1502+
def_proc_gnusparse_00(self,next,raw_headers):
14831503
"""Process a GNU tar extended sparse header, version 0.0.
14841504
"""
14851505
offsets= []
1486-
formatchinre.finditer(br"\d+ GNU.sparse.offset=(\d+)\n",buf):
1487-
offsets.append(int(match.group(1)))
14881506
numbytes= []
1489-
formatchinre.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n",buf):
1490-
numbytes.append(int(match.group(1)))
1507+
for_,keyword,valueinraw_headers:
1508+
ifkeyword==b"GNU.sparse.offset":
1509+
try:
1510+
offsets.append(int(value.decode()))
1511+
exceptValueError:
1512+
raiseInvalidHeaderError("invalid header")
1513+
1514+
elifkeyword==b"GNU.sparse.numbytes":
1515+
try:
1516+
numbytes.append(int(value.decode()))
1517+
exceptValueError:
1518+
raiseInvalidHeaderError("invalid header")
1519+
14911520
next.sparse=list(zip(offsets,numbytes))
14921521

14931522
def_proc_gnusparse_01(self,next,pax_headers):

‎Lib/test/test_tarfile.py‎

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1047,6 +1047,48 @@ def test_pax_number_fields(self):
10471047
finally:
10481048
tar.close()
10491049

1050+
deftest_pax_header_bad_formats(self):
1051+
# The fields from the pax header have priority over the
1052+
# TarInfo.
1053+
pax_header_replacements= (
1054+
b" foo=bar\n",
1055+
b"0\n",
1056+
b"1\n",
1057+
b"2\n",
1058+
b"3 =\n",
1059+
b"4 =a\n",
1060+
b"1000000 foo=bar\n",
1061+
b"0 foo=bar\n",
1062+
b"-12 foo=bar\n",
1063+
b"000000000000000000000000036 foo=bar\n",
1064+
)
1065+
pax_headers= {"foo":"bar"}
1066+
1067+
forreplacementinpax_header_replacements:
1068+
withself.subTest(header=replacement):
1069+
tar=tarfile.open(tmpname,"w",format=tarfile.PAX_FORMAT,
1070+
encoding="iso8859-1")
1071+
try:
1072+
t=tarfile.TarInfo()
1073+
t.name="pax"# non-ASCII
1074+
t.uid=1
1075+
t.pax_headers=pax_headers
1076+
tar.addfile(t)
1077+
finally:
1078+
tar.close()
1079+
1080+
withopen(tmpname,"rb")asf:
1081+
data=f.read()
1082+
self.assertIn(b"11 foo=bar\n",data)
1083+
data=data.replace(b"11 foo=bar\n",replacement)
1084+
1085+
withopen(tmpname,"wb")asf:
1086+
f.truncate()
1087+
f.write(data)
1088+
1089+
withself.assertRaisesRegex(tarfile.ReadError,r"file could not be opened successfully"):
1090+
tarfile.open(tmpname,encoding="iso8859-1")
1091+
10501092

10511093
classWriteTestBase(TarTest):
10521094
# Put all write tests in here that are supposed to be tested
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Remove backtracking from tarfile header parsing for ``hdrcharset``, PAX, and
2+
GNU sparse headers.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp