Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit eae7dad

Browse files
authored
gh-95534: Improve gzip reading speed by 10% (#97664)
Change summary:
+ There is now a `gzip.READ_BUFFER_SIZE` constant that is 128 KB. Other programs that read in 128 KB chunks: pigz and cat. So this seems to be best practice among good programs. It is also faster than 8 KB chunks.
+ A `zlib._ZlibDecompressor` was added. This is the `_bz2.BZ2Decompressor` ported to zlib. Since the zlib.Decompress object is better for in-memory decompression, the `_ZlibDecompressor` is hidden. It only makes sense in file decompression, and that is already implemented now in the gzip library. No need to bother the users with this.
+ The ZlibDecompressor uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for the use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file. This is instead handled by the ZlibDecompressor itself, which has an internal buffer. `_add_read_data` has been inlined, as it was just two calls.

EDIT: While I am adding improvements anyway, I figured I could add another one-liner optimization now to the `python -m gzip` application. That read chunks of `io.DEFAULT_BUFFER_SIZE` previously, but has been updated now to use `READ_BUFFER_SIZE` chunks.
1 parent bb38b39 commit eae7dad

File tree

5 files changed

+850
-80
lines changed

5 files changed

+850
-80
lines changed

‎Lib/gzip.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
_COMPRESS_LEVEL_TRADEOFF=6
2222
_COMPRESS_LEVEL_BEST=9
2323

24+
READ_BUFFER_SIZE=128*1024
25+
2426

2527
defopen(filename,mode="rb",compresslevel=_COMPRESS_LEVEL_BEST,
2628
encoding=None,errors=None,newline=None):
@@ -446,7 +448,7 @@ def _read_gzip_header(fp):
446448

447449
class_GzipReader(_compression.DecompressReader):
448450
def__init__(self,fp):
449-
super().__init__(_PaddedFile(fp),zlib.decompressobj,
451+
super().__init__(_PaddedFile(fp),zlib._ZlibDecompressor,
450452
wbits=-zlib.MAX_WBITS)
451453
# Set flag indicating start of a new member
452454
self._new_member=True
@@ -494,12 +496,13 @@ def read(self, size=-1):
494496
self._new_member=False
495497

496498
# Read a chunk of data from the file
497-
buf=self._fp.read(io.DEFAULT_BUFFER_SIZE)
499+
ifself._decompressor.needs_input:
500+
buf=self._fp.read(READ_BUFFER_SIZE)
501+
uncompress=self._decompressor.decompress(buf,size)
502+
else:
503+
uncompress=self._decompressor.decompress(b"",size)
498504

499-
uncompress=self._decompressor.decompress(buf,size)
500-
ifself._decompressor.unconsumed_tail!=b"":
501-
self._fp.prepend(self._decompressor.unconsumed_tail)
502-
elifself._decompressor.unused_data!=b"":
505+
ifself._decompressor.unused_data!=b"":
503506
# Prepend the already read bytes to the fileobj so they can
504507
# be seen by _read_eof() and _read_gzip_header()
505508
self._fp.prepend(self._decompressor.unused_data)
@@ -510,14 +513,11 @@ def read(self, size=-1):
510513
raiseEOFError("Compressed file ended before the "
511514
"end-of-stream marker was reached")
512515

513-
self._add_read_data(uncompress )
516+
self._crc=zlib.crc32(uncompress,self._crc)
517+
self._stream_size+=len(uncompress)
514518
self._pos+=len(uncompress)
515519
returnuncompress
516520

517-
def_add_read_data(self,data):
518-
self._crc=zlib.crc32(data,self._crc)
519-
self._stream_size=self._stream_size+len(data)
520-
521521
def_read_eof(self):
522522
# We've read to the end of the file
523523
# We check that the computed CRC and size of the
@@ -647,7 +647,7 @@ def main():
647647
f=builtins.open(arg,"rb")
648648
g=open(arg+".gz","wb")
649649
whileTrue:
650-
chunk=f.read(io.DEFAULT_BUFFER_SIZE)
650+
chunk=f.read(READ_BUFFER_SIZE)
651651
ifnotchunk:
652652
break
653653
g.write(chunk)

‎Lib/test/test_zlib.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -944,6 +944,173 @@ def choose_lines(source, number, seed=None, generator=random):
944944
"""
945945

946946

947+
classZlibDecompressorTest():
948+
# Test adopted from test_bz2.py
949+
TEXT=HAMLET_SCENE
950+
DATA=zlib.compress(HAMLET_SCENE)
951+
BAD_DATA=b"Not a valid deflate block"
952+
deftest_Constructor(self):
953+
self.assertRaises(TypeError,zlib._ZlibDecompressor,42)
954+
955+
deftestDecompress(self):
956+
zlibd=zlib._ZlibDecompressor()
957+
self.assertRaises(TypeError,zlibd.decompress)
958+
text=zlibd.decompress(self.DATA)
959+
self.assertEqual(text,self.TEXT)
960+
961+
deftestDecompressChunks10(self):
962+
zlibd=zlib._ZlibDecompressor()
963+
text=b''
964+
n=0
965+
whileTrue:
966+
str=self.DATA[n*10:(n+1)*10]
967+
ifnotstr:
968+
break
969+
text+=zlibd.decompress(str)
970+
n+=1
971+
self.assertEqual(text,self.TEXT)
972+
973+
deftestDecompressUnusedData(self):
974+
zlibd=zlib._ZlibDecompressor()
975+
unused_data=b"this is unused data"
976+
text=zlibd.decompress(self.DATA+unused_data)
977+
self.assertEqual(text,self.TEXT)
978+
self.assertEqual(zlibd.unused_data,unused_data)
979+
980+
deftestEOFError(self):
981+
zlibd=zlib._ZlibDecompressor()
982+
text=zlibd.decompress(self.DATA)
983+
self.assertRaises(EOFError,zlibd.decompress,b"anything")
984+
self.assertRaises(EOFError,zlibd.decompress,b"")
985+
986+
@support.skip_if_pgo_task
987+
@bigmemtest(size=_4G+100,memuse=3.3)
988+
deftestDecompress4G(self,size):
989+
# "Test zlib._ZlibDecompressor.decompress() with >4GiB input"
990+
blocksize=10*1024*1024
991+
block=random.randbytes(blocksize)
992+
try:
993+
data=block* (size//blocksize+1)
994+
compressed=zlib.compress(data)
995+
zlibd=zlib._ZlibDecompressor()
996+
decompressed=zlibd.decompress(compressed)
997+
self.assertTrue(decompressed==data)
998+
finally:
999+
data=None
1000+
compressed=None
1001+
decompressed=None
1002+
1003+
deftestPickle(self):
1004+
forprotoinrange(pickle.HIGHEST_PROTOCOL+1):
1005+
withself.assertRaises(TypeError):
1006+
pickle.dumps(zlib._ZlibDecompressor(),proto)
1007+
1008+
deftestDecompressorChunksMaxsize(self):
1009+
zlibd=zlib._ZlibDecompressor()
1010+
max_length=100
1011+
out= []
1012+
1013+
# Feed some input
1014+
len_=len(self.BIG_DATA)-64
1015+
out.append(zlibd.decompress(self.BIG_DATA[:len_],
1016+
max_length=max_length))
1017+
self.assertFalse(zlibd.needs_input)
1018+
self.assertEqual(len(out[-1]),max_length)
1019+
1020+
# Retrieve more data without providing more input
1021+
out.append(zlibd.decompress(b'',max_length=max_length))
1022+
self.assertFalse(zlibd.needs_input)
1023+
self.assertEqual(len(out[-1]),max_length)
1024+
1025+
# Retrieve more data while providing more input
1026+
out.append(zlibd.decompress(self.BIG_DATA[len_:],
1027+
max_length=max_length))
1028+
self.assertLessEqual(len(out[-1]),max_length)
1029+
1030+
# Retrieve remaining uncompressed data
1031+
whilenotzlibd.eof:
1032+
out.append(zlibd.decompress(b'',max_length=max_length))
1033+
self.assertLessEqual(len(out[-1]),max_length)
1034+
1035+
out=b"".join(out)
1036+
self.assertEqual(out,self.BIG_TEXT)
1037+
self.assertEqual(zlibd.unused_data,b"")
1038+
1039+
deftest_decompressor_inputbuf_1(self):
1040+
# Test reusing input buffer after moving existing
1041+
# contents to beginning
1042+
zlibd=zlib._ZlibDecompressor()
1043+
out= []
1044+
1045+
# Create input buffer and fill it
1046+
self.assertEqual(zlibd.decompress(self.DATA[:100],
1047+
max_length=0),b'')
1048+
1049+
# Retrieve some results, freeing capacity at beginning
1050+
# of input buffer
1051+
out.append(zlibd.decompress(b'',2))
1052+
1053+
# Add more data that fits into input buffer after
1054+
# moving existing data to beginning
1055+
out.append(zlibd.decompress(self.DATA[100:105],15))
1056+
1057+
# Decompress rest of data
1058+
out.append(zlibd.decompress(self.DATA[105:]))
1059+
self.assertEqual(b''.join(out),self.TEXT)
1060+
1061+
deftest_decompressor_inputbuf_2(self):
1062+
# Test reusing input buffer by appending data at the
1063+
# end right away
1064+
zlibd=zlib._ZlibDecompressor()
1065+
out= []
1066+
1067+
# Create input buffer and empty it
1068+
self.assertEqual(zlibd.decompress(self.DATA[:200],
1069+
max_length=0),b'')
1070+
out.append(zlibd.decompress(b''))
1071+
1072+
# Fill buffer with new data
1073+
out.append(zlibd.decompress(self.DATA[200:280],2))
1074+
1075+
# Append some more data, not enough to require resize
1076+
out.append(zlibd.decompress(self.DATA[280:300],2))
1077+
1078+
# Decompress rest of data
1079+
out.append(zlibd.decompress(self.DATA[300:]))
1080+
self.assertEqual(b''.join(out),self.TEXT)
1081+
1082+
deftest_decompressor_inputbuf_3(self):
1083+
# Test reusing input buffer after extending it
1084+
1085+
zlibd=zlib._ZlibDecompressor()
1086+
out= []
1087+
1088+
# Create almost full input buffer
1089+
out.append(zlibd.decompress(self.DATA[:200],5))
1090+
1091+
# Add even more data to it, requiring resize
1092+
out.append(zlibd.decompress(self.DATA[200:300],5))
1093+
1094+
# Decompress rest of data
1095+
out.append(zlibd.decompress(self.DATA[300:]))
1096+
self.assertEqual(b''.join(out),self.TEXT)
1097+
1098+
deftest_failure(self):
1099+
zlibd=zlib._ZlibDecompressor()
1100+
self.assertRaises(Exception,zlibd.decompress,self.BAD_DATA*30)
1101+
# Previously, a second call could crash due to internal inconsistency
1102+
self.assertRaises(Exception,zlibd.decompress,self.BAD_DATA*30)
1103+
1104+
@support.refcount_test
1105+
deftest_refleaks_in___init__(self):
1106+
gettotalrefcount=support.get_attribute(sys,'gettotalrefcount')
1107+
zlibd=zlib._ZlibDecompressor()
1108+
refs_before=gettotalrefcount()
1109+
foriinrange(100):
1110+
zlibd.__init__()
1111+
self.assertAlmostEqual(gettotalrefcount()-refs_before,0,delta=10)
1112+
1113+
9471114
classCustomInt:
9481115
def__index__(self):
9491116
return100
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
:meth:`gzip.GzipFile.read` reads 10% faster.

‎Modules/clinic/zlibmodule.c.h

Lines changed: 99 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp