NotificationsYou must be signed in to change notification settings
Fork32.2k
Star67.7k

Commiteae7dad

authored

gh-95534: Improve gzip reading speed by 10% (#97664)

Change summary:

+ There is now a `gzip.READ_BUFFER_SIZE` constant that is 128 KB. Other programs that read in 128 KB chunks include pigz and cat, so this seems to be best practice among good programs. It is also faster than reading in 8 KB chunks.
+ A `zlib._ZlibDecompressor` was added. This is the `_bz2.BZ2Decompressor` ported to zlib. Since the `zlib.Decompress` object is better for in-memory decompression, the `_ZlibDecompressor` is hidden. It only makes sense for file decompression, and that is already implemented in the gzip library, so there is no need to bother users with it.
+ The ZlibDecompressor uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file. This is instead handled by the ZlibDecompressor itself, which has an internal buffer. `_add_read_data` has been inlined, as it was just two calls.

EDIT: While I am adding improvements anyway, I figured I could add another one-liner optimization to the `python -m gzip` application. It previously read chunks of `io.DEFAULT_BUFFER_SIZE`, but has now been updated to use `READ_BUFFER_SIZE` chunks.

1 parent bb38b39, commit eae7dad (Copy full SHA for eae7dad)

File tree

5 files changed

+850

-80

lines changed

Lib
- gzip.py
- test
  - test_zlib.py
Misc/NEWS.d/next/Library
- 2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst
Modules
- clinic
  - zlibmodule.c.h
- zlibmodule.c

5 files changed

+850

-80

lines changed

`‎Lib/gzip.py`

Lines changed: 12 additions & 12 deletions

Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,8 @@`
`21`	`21`	`_COMPRESS_LEVEL_TRADEOFF=6`
`22`	`22`	`_COMPRESS_LEVEL_BEST=9`
`23`	`23`
	`24`	`+READ_BUFFER_SIZE=128*1024`
	`25`	`+`
`24`	`26`
`25`	`27`	`defopen(filename,mode="rb",compresslevel=_COMPRESS_LEVEL_BEST,`
`26`	`28`	`encoding=None,errors=None,newline=None):`
`@@ -446,7 +448,7 @@ def _read_gzip_header(fp):`
`446`	`448`
`447`	`449`	`class_GzipReader(_compression.DecompressReader):`
`448`	`450`	`def__init__(self,fp):`
`449`		`-super().__init__(_PaddedFile(fp),zlib.decompressobj,`
	`451`	`+super().__init__(_PaddedFile(fp),zlib._ZlibDecompressor,`
`450`	`452`	`wbits=-zlib.MAX_WBITS)`
`451`	`453`	`# Set flag indicating start of a new member`
`452`	`454`	`self._new_member=True`
`@@ -494,12 +496,13 @@ def read(self, size=-1):`
`494`	`496`	`self._new_member=False`
`495`	`497`
`496`	`498`	`# Read a chunk of data from the file`
`497`		`-buf=self._fp.read(io.DEFAULT_BUFFER_SIZE)`
	`499`	`+ifself._decompressor.needs_input:`
	`500`	`+buf=self._fp.read(READ_BUFFER_SIZE)`
	`501`	`+uncompress=self._decompressor.decompress(buf,size)`
	`502`	`+else:`
	`503`	`+uncompress=self._decompressor.decompress(b"",size)`
`498`	`504`
`499`		`-uncompress=self._decompressor.decompress(buf,size)`
`500`		`-ifself._decompressor.unconsumed_tail!=b"":`
`501`		`-self._fp.prepend(self._decompressor.unconsumed_tail)`
`502`		`-elifself._decompressor.unused_data!=b"":`
	`505`	`+ifself._decompressor.unused_data!=b"":`
`503`	`506`	`# Prepend the already read bytes to the fileobj so they can`
`504`	`507`	`# be seen by _read_eof() and _read_gzip_header()`
`505`	`508`	`self._fp.prepend(self._decompressor.unused_data)`
`@@ -510,14 +513,11 @@ def read(self, size=-1):`
`510`	`513`	`raiseEOFError("Compressed file ended before the "`
`511`	`514`	`"end-of-stream marker was reached")`
`512`	`515`
`513`		`-self._add_read_data(uncompress )`
	`516`	`+self._crc=zlib.crc32(uncompress,self._crc)`
	`517`	`+self._stream_size+=len(uncompress)`
`514`	`518`	`self._pos+=len(uncompress)`
`515`	`519`	`returnuncompress`
`516`	`520`
`517`		`-def_add_read_data(self,data):`
`518`		`-self._crc=zlib.crc32(data,self._crc)`
`519`		`-self._stream_size=self._stream_size+len(data)`
`520`		`-`
`521`	`521`	`def_read_eof(self):`
`522`	`522`	`# We've read to the end of the file`
`523`	`523`	`# We check that the computed CRC and size of the`
`@@ -647,7 +647,7 @@ def main():`
`647`	`647`	`f=builtins.open(arg,"rb")`
`648`	`648`	`g=open(arg+".gz","wb")`
`649`	`649`	`whileTrue:`
`650`		`-chunk=f.read(io.DEFAULT_BUFFER_SIZE)`
	`650`	`+chunk=f.read(READ_BUFFER_SIZE)`
`651`	`651`	`ifnotchunk:`
`652`	`652`	`break`
`653`	`653`	`g.write(chunk)`

`‎Lib/test/test_zlib.py`

Lines changed: 167 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -944,6 +944,173 @@ def choose_lines(source, number, seed=None, generator=random):`
`944`	`944`	`"""`
`945`	`945`
`946`	`946`
	`947`	`+classZlibDecompressorTest():`
	`948`	`+# Test adopted from test_bz2.py`
	`949`	`+TEXT=HAMLET_SCENE`
	`950`	`+DATA=zlib.compress(HAMLET_SCENE)`
	`951`	`+BAD_DATA=b"Not a valid deflate block"`
	`952`	`+deftest_Constructor(self):`
	`953`	`+self.assertRaises(TypeError,zlib._ZlibDecompressor,42)`
	`954`	`+`
	`955`	`+deftestDecompress(self):`
	`956`	`+zlibd=zlib._ZlibDecompressor()`
	`957`	`+self.assertRaises(TypeError,zlibd.decompress)`
	`958`	`+text=zlibd.decompress(self.DATA)`
	`959`	`+self.assertEqual(text,self.TEXT)`
	`960`	`+`
	`961`	`+deftestDecompressChunks10(self):`
	`962`	`+zlibd=zlib._ZlibDecompressor()`
	`963`	`+text=b''`
	`964`	`+n=0`
	`965`	`+whileTrue:`
	`966`	`+str=self.DATA[n*10:(n+1)*10]`
	`967`	`+ifnotstr:`
	`968`	`+break`
	`969`	`+text+=zlibd.decompress(str)`
	`970`	`+n+=1`
	`971`	`+self.assertEqual(text,self.TEXT)`
	`972`	`+`
	`973`	`+deftestDecompressUnusedData(self):`
	`974`	`+zlibd=zlib._ZlibDecompressor()`
	`975`	`+unused_data=b"this is unused data"`
	`976`	`+text=zlibd.decompress(self.DATA+unused_data)`
	`977`	`+self.assertEqual(text,self.TEXT)`
	`978`	`+self.assertEqual(zlibd.unused_data,unused_data)`
	`979`	`+`
	`980`	`+deftestEOFError(self):`
	`981`	`+zlibd=zlib._ZlibDecompressor()`
	`982`	`+text=zlibd.decompress(self.DATA)`
	`983`	`+self.assertRaises(EOFError,zlibd.decompress,b"anything")`
	`984`	`+self.assertRaises(EOFError,zlibd.decompress,b"")`
	`985`	`+`
	`986`	`+@support.skip_if_pgo_task`
	`987`	`+@bigmemtest(size=_4G+100,memuse=3.3)`
	`988`	`+deftestDecompress4G(self,size):`
	`989`	`+# "Test zlib._ZlibDecompressor.decompress() with >4GiB input"`
	`990`	`+blocksize=10*1024*1024`
	`991`	`+block=random.randbytes(blocksize)`
	`992`	`+try:`
	`993`	`+data=block* (size//blocksize+1)`
	`994`	`+compressed=zlib.compress(data)`
	`995`	`+zlibd=zlib._ZlibDecompressor()`
	`996`	`+decompressed=zlibd.decompress(compressed)`
	`997`	`+self.assertTrue(decompressed==data)`
	`998`	`+finally:`
	`999`	`+data=None`
	`1000`	`+compressed=None`
	`1001`	`+decompressed=None`
	`1002`	`+`
	`1003`	`+deftestPickle(self):`
	`1004`	`+forprotoinrange(pickle.HIGHEST_PROTOCOL+1):`
	`1005`	`+withself.assertRaises(TypeError):`
	`1006`	`+pickle.dumps(zlib._ZlibDecompressor(),proto)`
	`1007`	`+`
	`1008`	`+deftestDecompressorChunksMaxsize(self):`
	`1009`	`+zlibd=zlib._ZlibDecompressor()`
	`1010`	`+max_length=100`
	`1011`	`+out= []`
	`1012`	`+`
	`1013`	`+# Feed some input`
	`1014`	`+len_=len(self.BIG_DATA)-64`
	`1015`	`+out.append(zlibd.decompress(self.BIG_DATA[:len_],`
	`1016`	`+max_length=max_length))`
	`1017`	`+self.assertFalse(zlibd.needs_input)`
	`1018`	`+self.assertEqual(len(out[-1]),max_length)`
	`1019`	`+`
	`1020`	`+# Retrieve more data without providing more input`
	`1021`	`+out.append(zlibd.decompress(b'',max_length=max_length))`
	`1022`	`+self.assertFalse(zlibd.needs_input)`
	`1023`	`+self.assertEqual(len(out[-1]),max_length)`
	`1024`	`+`
	`1025`	`+# Retrieve more data while providing more input`
	`1026`	`+out.append(zlibd.decompress(self.BIG_DATA[len_:],`
	`1027`	`+max_length=max_length))`
	`1028`	`+self.assertLessEqual(len(out[-1]),max_length)`
	`1029`	`+`
	`1030`	`+# Retrieve remaining uncompressed data`
	`1031`	`+whilenotzlibd.eof:`
	`1032`	`+out.append(zlibd.decompress(b'',max_length=max_length))`
	`1033`	`+self.assertLessEqual(len(out[-1]),max_length)`
	`1034`	`+`
	`1035`	`+out=b"".join(out)`
	`1036`	`+self.assertEqual(out,self.BIG_TEXT)`
	`1037`	`+self.assertEqual(zlibd.unused_data,b"")`
	`1038`	`+`
	`1039`	`+deftest_decompressor_inputbuf_1(self):`
	`1040`	`+# Test reusing input buffer after moving existing`
	`1041`	`+# contents to beginning`
	`1042`	`+zlibd=zlib._ZlibDecompressor()`
	`1043`	`+out= []`
	`1044`	`+`
	`1045`	`+# Create input buffer and fill it`
	`1046`	`+self.assertEqual(zlibd.decompress(self.DATA[:100],`
	`1047`	`+max_length=0),b'')`
	`1048`	`+`
	`1049`	`+# Retrieve some results, freeing capacity at beginning`
	`1050`	`+# of input buffer`
	`1051`	`+out.append(zlibd.decompress(b'',2))`
	`1052`	`+`
	`1053`	`+# Add more data that fits into input buffer after`
	`1054`	`+# moving existing data to beginning`
	`1055`	`+out.append(zlibd.decompress(self.DATA[100:105],15))`
	`1056`	`+`
	`1057`	`+# Decompress rest of data`
	`1058`	`+out.append(zlibd.decompress(self.DATA[105:]))`
	`1059`	`+self.assertEqual(b''.join(out),self.TEXT)`
	`1060`	`+`
	`1061`	`+deftest_decompressor_inputbuf_2(self):`
	`1062`	`+# Test reusing input buffer by appending data at the`
	`1063`	`+# end right away`
	`1064`	`+zlibd=zlib._ZlibDecompressor()`
	`1065`	`+out= []`
	`1066`	`+`
	`1067`	`+# Create input buffer and empty it`
	`1068`	`+self.assertEqual(zlibd.decompress(self.DATA[:200],`
	`1069`	`+max_length=0),b'')`
	`1070`	`+out.append(zlibd.decompress(b''))`
	`1071`	`+`
	`1072`	`+# Fill buffer with new data`
	`1073`	`+out.append(zlibd.decompress(self.DATA[200:280],2))`
	`1074`	`+`
	`1075`	`+# Append some more data, not enough to require resize`
	`1076`	`+out.append(zlibd.decompress(self.DATA[280:300],2))`
	`1077`	`+`
	`1078`	`+# Decompress rest of data`
	`1079`	`+out.append(zlibd.decompress(self.DATA[300:]))`
	`1080`	`+self.assertEqual(b''.join(out),self.TEXT)`
	`1081`	`+`
	`1082`	`+deftest_decompressor_inputbuf_3(self):`
	`1083`	`+# Test reusing input buffer after extending it`
	`1084`	`+`
	`1085`	`+zlibd=zlib._ZlibDecompressor()`
	`1086`	`+out= []`
	`1087`	`+`
	`1088`	`+# Create almost full input buffer`
	`1089`	`+out.append(zlibd.decompress(self.DATA[:200],5))`
	`1090`	`+`
	`1091`	`+# Add even more data to it, requiring resize`
	`1092`	`+out.append(zlibd.decompress(self.DATA[200:300],5))`
	`1093`	`+`
	`1094`	`+# Decompress rest of data`
	`1095`	`+out.append(zlibd.decompress(self.DATA[300:]))`
	`1096`	`+self.assertEqual(b''.join(out),self.TEXT)`
	`1097`	`+`
	`1098`	`+deftest_failure(self):`
	`1099`	`+zlibd=zlib._ZlibDecompressor()`
	`1100`	`+self.assertRaises(Exception,zlibd.decompress,self.BAD_DATA*30)`
	`1101`	`+# Previously, a second call could crash due to internal inconsistency`
	`1102`	`+self.assertRaises(Exception,zlibd.decompress,self.BAD_DATA*30)`
	`1103`	`+`
	`1104`	`+@support.refcount_test`
	`1105`	`+deftest_refleaks_in___init__(self):`
	`1106`	`+gettotalrefcount=support.get_attribute(sys,'gettotalrefcount')`
	`1107`	`+zlibd=zlib._ZlibDecompressor()`
	`1108`	`+refs_before=gettotalrefcount()`
	`1109`	`+foriinrange(100):`
	`1110`	`+zlibd.__init__()`
	`1111`	`+self.assertAlmostEqual(gettotalrefcount()-refs_before,0,delta=10)`
	`1112`	`+`
	`1113`	`+`
`947`	`1114`	`classCustomInt:`
`948`	`1115`	`def__index__(self):`
`949`	`1116`	`return100`

`‎Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+:meth:`gzip.GzipFile.read` reads 10% faster.

`‎Modules/clinic/zlibmodule.c.h`

Lines changed: 99 additions & 1 deletion

Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commiteae7dad

File tree

5 files changed

5 files changed

`‎Lib/gzip.py`

`‎Lib/test/test_zlib.py`

`‎Misc/NEWS.d/next/Library/2022-09-30-09-22-37.gh-issue-95534.ndEfPj.rst`

`‎Modules/clinic/zlibmodule.c.h`

0 commit comments