NotificationsYou must be signed in to change notification settings
Fork33.7k
Star70.4k

Commit7d1f50c

authored

[3.8] gh-121285: Remove backtracking when parsing tarfile headers (GH-121286) (#123642)

* Remove backtracking when parsing tarfile headers
* Rewrite PAX header parsing to be stricter
* Optimize parsing of GNU extended sparse headers v0.0

(cherry picked from commit 34ddb64)

Co-authored-by: Seth Michael Larson <seth@python.org>
Co-authored-by: Kirill Podoprigora <kirill.bast9@mail.ru>
Co-authored-by: Gregory P. Smith <greg@krypto.org>

1 parent 7bc367e, commit 7d1f50c (Copy full SHA for 7d1f50c)

File tree

3 files changed

+111

-38

lines changed

Lib
- tarfile.py
- test
  - test_tarfile.py
Misc/NEWS.d/next/Security
- 2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst

3 files changed

+111

-38

lines changed

`‎Lib/tarfile.py‎`

Lines changed: 67 additions & 38 deletions

Original file line number	Diff line number	Diff line change
`@@ -840,6 +840,9 @@ def data_filter(member, dest_path):`
`840`	`840`	`# Sentinel for replace() defaults, meaning "don't change the attribute"`
`841`	`841`	`_KEEP=object()`
`842`	`842`
	`843`	`+# Header length is digits followed by a space.`
	`844`	`+_header_length_prefix_re=re.compile(br"([0-9]{1,20}) ")`
	`845`	`+`
`843`	`846`	`classTarInfo(object):`
`844`	`847`	`"""Informational class which holds the details about an`
`845`	`848`	`archive member given by a tar header block.`
`@@ -1390,59 +1393,76 @@ def _proc_pax(self, tarfile):`
`1390`	`1393`	`else:`
`1391`	`1394`	`pax_headers=tarfile.pax_headers.copy()`
`1392`	`1395`
`1393`		`-# Check if the pax header contains a hdrcharset field. This tells us`
`1394`		`-# the encoding of the path, linkpath, uname and gname fields. Normally,`
`1395`		`-# these fields are UTF-8 encoded but since POSIX.1-2008 tar`
`1396`		`-# implementations are allowed to store them as raw binary strings if`
`1397`		`-# the translation to UTF-8 fails.`
`1398`		`-match=re.search(br"\d+ hdrcharset=([^\n]+)\n",buf)`
`1399`		`-ifmatchisnotNone:`
`1400`		`-pax_headers["hdrcharset"]=match.group(1).decode("utf-8")`
`1401`		`-`
`1402`		`-# For the time being, we don't care about anything other than "BINARY".`
`1403`		`-# The only other value that is currently allowed by the standard is`
`1404`		`-# "ISO-IR 10646 2000 UTF-8" in other words UTF-8.`
`1405`		`-hdrcharset=pax_headers.get("hdrcharset")`
`1406`		`-ifhdrcharset=="BINARY":`
`1407`		`-encoding=tarfile.encoding`
`1408`		`-else:`
`1409`		`-encoding="utf-8"`
`1410`		`-`
`1411`	`1396`	`# Parse pax header information. A record looks like that:`
`1412`	`1397`	`# "%d %s=%s\n" % (length, keyword, value). length is the size`
`1413`	`1398`	`# of the complete record including the length field itself and`
`1414`		`-# the newline. keyword and value are both UTF-8 encoded strings.`
`1415`		`-regex=re.compile(br"(\d+) ([^=]+)=")`
	`1399`	`+# the newline.`
`1416`	`1400`	`pos=0`
`1417`		`-whileTrue:`
`1418`		`-match=regex.match(buf,pos)`
`1419`		`-ifnotmatch:`
`1420`		`-break`
	`1401`	`+encoding=None`
	`1402`	`+raw_headers= []`
	`1403`	`+whilelen(buf)>posandbuf[pos]!=0x00:`
	`1404`	`+ifnot (match:=_header_length_prefix_re.match(buf,pos)):`
	`1405`	`+raiseInvalidHeaderError("invalid header")`
	`1406`	`+try:`
	`1407`	`+length=int(match.group(1))`
	`1408`	`+exceptValueError:`
	`1409`	`+raiseInvalidHeaderError("invalid header")`
	`1410`	`+# Headers must be at least 5 bytes, shortest being '5 x=\n'.`
	`1411`	`+# Value is allowed to be empty.`
	`1412`	`+iflength<5:`
	`1413`	`+raiseInvalidHeaderError("invalid header")`
	`1414`	`+ifpos+length>len(buf):`
	`1415`	`+raiseInvalidHeaderError("invalid header")`
`1421`	`1416`
`1422`		`-length,keyword=match.groups()`
`1423`		`-length=int(length)`
`1424`		`-iflength==0:`
	`1417`	`+header_value_end_offset=match.start(1)+length-1# Last byte of the header`
	`1418`	`+keyword_and_value=buf[match.end(1)+1:header_value_end_offset]`
	`1419`	`+raw_keyword,equals,raw_value=keyword_and_value.partition(b"=")`
	`1420`	`+`
	`1421`	`+# Check the framing of the header. The last character must be '\n' (0x0A)`
	`1422`	`+ifnotraw_keywordorequals!=b"="orbuf[header_value_end_offset]!=0x0A:`
`1425`	`1423`	`raiseInvalidHeaderError("invalid header")`
`1426`		`-value=buf[match.end(2)+1:match.start(1)+length-1]`
	`1424`	`+raw_headers.append((length,raw_keyword,raw_value))`
	`1425`	`+`
	`1426`	`+# Check if the pax header contains a hdrcharset field. This tells us`
	`1427`	`+# the encoding of the path, linkpath, uname and gname fields. Normally,`
	`1428`	`+# these fields are UTF-8 encoded but since POSIX.1-2008 tar`
	`1429`	`+# implementations are allowed to store them as raw binary strings if`
	`1430`	`+# the translation to UTF-8 fails. For the time being, we don't care about`
	`1431`	`+# anything other than "BINARY". The only other value that is currently`
	`1432`	`+# allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.`
	`1433`	`+# Note that we only follow the initial 'hdrcharset' setting to preserve`
	`1434`	`+# the initial behavior of the 'tarfile' module.`
	`1435`	`+ifraw_keyword==b"hdrcharset"andencodingisNone:`
	`1436`	`+ifraw_value==b"BINARY":`
	`1437`	`+encoding=tarfile.encoding`
	`1438`	`+else:# This branch ensures only the first 'hdrcharset' header is used.`
	`1439`	`+encoding="utf-8"`
	`1440`	`+`
	`1441`	`+pos+=length`
`1427`	`1442`
	`1443`	`+# If no explicit hdrcharset is set, we use UTF-8 as a default.`
	`1444`	`+ifencodingisNone:`
	`1445`	`+encoding="utf-8"`
	`1446`	`+`
	`1447`	`+# After parsing the raw headers we can decode them to text.`
	`1448`	`+forlength,raw_keyword,raw_valueinraw_headers:`
`1428`	`1449`	`# Normally, we could just use "utf-8" as the encoding and "strict"`
`1429`	`1450`	`# as the error handler, but we better not take the risk. For`
`1430`	`1451`	`# example, GNU tar <= 1.23 is known to store filenames it cannot`
`1431`	`1452`	`# translate to UTF-8 as raw strings (unfortunately without a`
`1432`	`1453`	`# hdrcharset=BINARY header).`
`1433`	`1454`	`# We first try the strict standard encoding, and if that fails we`
`1434`	`1455`	`# fall back on the user's encoding and error handler.`
`1435`		`-keyword=self._decode_pax_field(keyword,"utf-8","utf-8",`
	`1456`	`+keyword=self._decode_pax_field(raw_keyword,"utf-8","utf-8",`
`1436`	`1457`	`tarfile.errors)`
`1437`	`1458`	`ifkeywordinPAX_NAME_FIELDS:`
`1438`		`-value=self._decode_pax_field(value,encoding,tarfile.encoding,`
	`1459`	`+value=self._decode_pax_field(raw_value,encoding,tarfile.encoding,`
`1439`	`1460`	`tarfile.errors)`
`1440`	`1461`	`else:`
`1441`		`-value=self._decode_pax_field(value,"utf-8","utf-8",`
	`1462`	`+value=self._decode_pax_field(raw_value,"utf-8","utf-8",`
`1442`	`1463`	`tarfile.errors)`
`1443`	`1464`
`1444`	`1465`	`pax_headers[keyword]=value`
`1445`		`-pos+=length`
`1446`	`1466`
`1447`	`1467`	`# Fetch the next header.`
`1448`	`1468`	`try:`
`@@ -1457,7 +1477,7 @@ def _proc_pax(self, tarfile):`
`1457`	`1477`
`1458`	`1478`	`elif"GNU.sparse.size"inpax_headers:`
`1459`	`1479`	`# GNU extended sparse format version 0.0.`
`1460`		`-self._proc_gnusparse_00(next,pax_headers,buf)`
	`1480`	`+self._proc_gnusparse_00(next,raw_headers)`
`1461`	`1481`
`1462`	`1482`	`elifpax_headers.get("GNU.sparse.major")=="1"andpax_headers.get("GNU.sparse.minor")=="0":`
`1463`	`1483`	`# GNU extended sparse format version 1.0.`
`@@ -1479,15 +1499,24 @@ def _proc_pax(self, tarfile):`
`1479`	`1499`
`1480`	`1500`	`returnnext`
`1481`	`1501`
`1482`		`-def_proc_gnusparse_00(self,next,pax_headers,buf):`
	`1502`	`+def_proc_gnusparse_00(self,next,raw_headers):`
`1483`	`1503`	`"""Process a GNU tar extended sparse header, version 0.0.`
`1484`	`1504`	`"""`
`1485`	`1505`	`offsets= []`
`1486`		`-formatchinre.finditer(br"\d+ GNU.sparse.offset=(\d+)\n",buf):`
`1487`		`-offsets.append(int(match.group(1)))`
`1488`	`1506`	`numbytes= []`
`1489`		`-formatchinre.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n",buf):`
`1490`		`-numbytes.append(int(match.group(1)))`
	`1507`	`+for_,keyword,valueinraw_headers:`
	`1508`	`+ifkeyword==b"GNU.sparse.offset":`
	`1509`	`+try:`
	`1510`	`+offsets.append(int(value.decode()))`
	`1511`	`+exceptValueError:`
	`1512`	`+raiseInvalidHeaderError("invalid header")`
	`1513`	`+`
	`1514`	`+elifkeyword==b"GNU.sparse.numbytes":`
	`1515`	`+try:`
	`1516`	`+numbytes.append(int(value.decode()))`
	`1517`	`+exceptValueError:`
	`1518`	`+raiseInvalidHeaderError("invalid header")`
	`1519`	`+`
`1491`	`1520`	`next.sparse=list(zip(offsets,numbytes))`
`1492`	`1521`
`1493`	`1522`	`def_proc_gnusparse_01(self,next,pax_headers):`

`‎Lib/test/test_tarfile.py‎`

Lines changed: 42 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -1047,6 +1047,48 @@ def test_pax_number_fields(self):`
`1047`	`1047`	`finally:`
`1048`	`1048`	`tar.close()`
`1049`	`1049`
	`1050`	`+deftest_pax_header_bad_formats(self):`
	`1051`	`+# The fields from the pax header have priority over the`
	`1052`	`+# TarInfo.`
	`1053`	`+pax_header_replacements= (`
	`1054`	`+b" foo=bar\n",`
	`1055`	`+b"0\n",`
	`1056`	`+b"1\n",`
	`1057`	`+b"2\n",`
	`1058`	`+b"3 =\n",`
	`1059`	`+b"4 =a\n",`
	`1060`	`+b"1000000 foo=bar\n",`
	`1061`	`+b"0 foo=bar\n",`
	`1062`	`+b"-12 foo=bar\n",`
	`1063`	`+b"000000000000000000000000036 foo=bar\n",`
	`1064`	`+ )`
	`1065`	`+pax_headers= {"foo":"bar"}`
	`1066`	`+`
	`1067`	`+forreplacementinpax_header_replacements:`
	`1068`	`+withself.subTest(header=replacement):`
	`1069`	`+tar=tarfile.open(tmpname,"w",format=tarfile.PAX_FORMAT,`
	`1070`	`+encoding="iso8859-1")`
	`1071`	`+try:`
	`1072`	`+t=tarfile.TarInfo()`
	`1073`	`+t.name="pax"# non-ASCII`
	`1074`	`+t.uid=1`
	`1075`	`+t.pax_headers=pax_headers`
	`1076`	`+tar.addfile(t)`
	`1077`	`+finally:`
	`1078`	`+tar.close()`
	`1079`	`+`
	`1080`	`+withopen(tmpname,"rb")asf:`
	`1081`	`+data=f.read()`
	`1082`	`+self.assertIn(b"11 foo=bar\n",data)`
	`1083`	`+data=data.replace(b"11 foo=bar\n",replacement)`
	`1084`	`+`
	`1085`	`+withopen(tmpname,"wb")asf:`
	`1086`	`+f.truncate()`
	`1087`	`+f.write(data)`
	`1088`	`+`
	`1089`	`+withself.assertRaisesRegex(tarfile.ReadError,r"file could not be opened successfully"):`
	`1090`	`+tarfile.open(tmpname,encoding="iso8859-1")`
	`1091`	`+`
`1050`	`1092`
`1051`	`1093`	`classWriteTestBase(TarTest):`
`1052`	`1094`	`# Put all write tests in here that are supposed to be tested`

`‎Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst‎`

Lines changed: 2 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+Remove backtracking from tarfile header parsing for ``hdrcharset``, PAX, and
	`2`	`+GNU sparse headers.`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit7d1f50c

File tree

3 files changed

3 files changed

`‎Lib/tarfile.py‎`

`‎Lib/test/test_tarfile.py‎`

`‎Misc/NEWS.d/next/Security/2024-07-02-13-39-20.gh-issue-121285.hrl-yI.rst‎`

0 commit comments