| """Functions that read and write gzipped files. |
| |
| The user of the file doesn't have to worry about the compression, |
| but random access is not allowed.""" |
| |
| # based on Andrew Kuchling's minigzip.py distributed with the zlib module |
| |
| import string, struct, sys, time |
| import zlib |
| import __builtin__ |
| |
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
| |
def write32(output, value):
    """Write the low 32 bits of *value* to *output*, little-endian.

    output -- object with a write() method accepting the packed bytes.
    value  -- integer to write.

    Values in the signed range -2**31 .. 2**31-1 produce exactly the bytes
    that a signed "<l" pack would.  Values outside that range (for example
    an unsigned CRC, or a byte count of 2 GiB or more) used to overflow in
    struct.pack(); they are now reduced modulo 2**32 instead.
    """
    # Masking maps a negative number onto its two's-complement bit pattern,
    # so packing the result as unsigned yields the same four bytes.
    output.write(struct.pack("<L", value & 0xffffffff))
| |
def write32u(output, value):
    """Write *value* to *output* as an unsigned 32-bit little-endian integer.

    output -- object with a write() method accepting the packed bytes.
    value  -- integer to pack with the struct format "<L".
    """
    packed = struct.pack("<L", value)
    output.write(packed)
| |
def read32(input):
    """Read four bytes from *input* and return them as a signed integer.

    input -- object with a read() method.

    The bytes are decoded with the struct format "<l" (little-endian,
    signed 32-bit).
    """
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
| |
def open(filename, mode="rb", compresslevel=9):
    """Return a GzipFile for *filename*.

    The three arguments are passed straight through to the GzipFile
    constructor as its filename, mode and compresslevel.
    """
    gzfile = GzipFile(filename, mode, compresslevel)
    return gzfile
| |
class GzipFile:
    """File-like object that reads or writes gzip-compressed data.

    The object wraps an underlying file object, either one passed in as
    *fileobj* or one opened here from *filename*.  In write mode, data
    passed to write() is compressed and a gzip header and trailer are
    added.  In read mode, one or more concatenated gzip members are
    decompressed and their CRC and length trailers are verified.

    Random access is not supported: seek() and tell() raise IOError.
    Reading calls seek() and tell() on the underlying file object, so that
    object must support them.
    """

    # File object opened by this instance itself (as opposed to one handed
    # in through the fileobj argument); close() closes it.
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Set up the object for reading or writing.

        filename      -- path to open when *fileobj* is None; otherwise
                         only used as the name stored in the gzip header.
                         Defaults to fileobj.name, or '' if there is none.
        mode          -- a mode starting with 'r' selects reading; one
                         starting with 'w' or 'a' selects writing.
                         Defaults to fileobj.mode, or 'rb' if there is none.
        compresslevel -- level passed to zlib.compressobj() when writing.
        fileobj       -- existing file object to read from or write to.

        Raises ValueError if the mode is not supported.  In write mode the
        gzip header is written immediately.
        """
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = 1
            # Decompressed data not yet handed to the caller, and its length
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative window bits: emit a raw deflate stream, because the
            # gzip header and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that was opened above.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise ValueError("Mode " + mode + " not supported")

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ...>' built from the repr of the underlying file."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Initialise the state used while writing.

        The stored filename always ends in '.gz'; the suffix is appended
        when missing so that _write_gzip_header() can strip it again.
        """
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]                 # name without '.gz'
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra-flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')     # null-terminated name

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a gzip member header, discarding its fields.

        Raises IOError if the magic number or compression method is wrong.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        # Skip the modification time (4 bytes), the extra-flags byte and
        # the OS byte; none of them is used.
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present; it is preceded
            # by its length as a little-endian 16-bit number.
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256 * ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while 1:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while 1:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress *data* and write it to the underlying file.

        Raises ValueError if the object has been closed.
        """
        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))

    def read(self, size=-1):
        """Return up to *size* bytes of decompressed data.

        A negative *size* reads everything up to the end of the file.  An
        empty string is returned once all data has been consumed.
        """
        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        # The amount requested from the file doubles on every round.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while 1:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        return chunk

    def _unread(self, buf):
        """Push *buf* back onto the front of the decompressed-data buffer."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize

    def _read(self, size=1024):
        """Read *size* compressed bytes and buffer what they decompress to.

        Raises EOFError when there is nothing left to read.  The trailer
        check in _read_eof() may raise ValueError.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                self.fileobj = None
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = 0

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self.fileobj = None
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data) + 8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = 1

    def _add_read_data(self, data):
        """Append decompressed *data* to the buffer; update CRC and size."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the member trailer.

        Raises ValueError if either value disagrees with what was computed
        from the decompressed data.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)
        # Both trailer fields are 32 bits wide, so compare modulo 2**32:
        # read32() returns signed values, and self.size is not limited to
        # 32 bits.
        if (crc32 & 0xffffffff) != (self.crc & 0xffffffff):
            raise ValueError("CRC check failed")
        elif (isize & 0xffffffff) != (self.size & 0xffffffff):
            raise ValueError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file opened by the constructor is
        closed; a file object supplied by the caller is left open.
        Calling close() more than once is harmless.
        """
        if self.fileobj is not None:
            if self.mode == WRITE:
                self.fileobj.write(self.compress.flush())
                # The trailer fields hold 32 bits each; mask so that a
                # negative CRC or a size of 2 GiB or more cannot overflow.
                write32u(self.fileobj, self.crc & 0xffffffff)
                write32u(self.fileobj, self.size & 0xffffffff)
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def __del__(self):
        """Close the object on deletion if that has not been done yet."""
        try:
            if (self.myfileobj is None and
                    self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get as far as setting fileobj
            return
        self.close()

    def flush(self):
        """Call flush() on the underlying file object."""
        self.fileobj.flush()

    def seek(self, *args):
        """Always raise IOError; random access is not supported.

        Any arguments are accepted and ignored so that file-style calls
        such as seek(0) fail with IOError rather than TypeError.
        """
        raise IOError('Random access not allowed in gzip files')

    def tell(self):
        """Always raise IOError; the position is not tracked."""
        raise IOError('I won\'t tell() you for gzip files')

    def isatty(self):
        """Return 0: a gzip stream is never a terminal."""
        return 0

    def readline(self, size=-1):
        """Return the next line, including its newline if one was found.

        At most *size* bytes are returned; a negative *size* means no
        limit.  An empty string is returned at end of file.
        """
        if size < 0:
            size = sys.maxint
        bufs = []
        readsize = min(100, size)    # Read from the file in small chunks
        while 1:
            if size == 0:
                return ''.join(bufs)    # Return resulting line

            c = self.read(readsize)
            i = c.find('\n')
            # Cut the chunk at size-1 under two conditions: 1) there's no
            # newline, and the chunk is larger than size, or 2) there is a
            # newline, but the resulting line would be longer than 'size'.
            if i == -1 and len(c) > size:
                i = size - 1
            elif size <= i:
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                return ''.join(bufs)      # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return a list of lines read from the file.

        Reading stops at end of file, or once the total length of the
        lines read reaches *sizehint*.  A zero or negative *sizehint*
        reads all lines.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write each string in the sequence *L*, adding no separators."""
        for line in L:
            self.write(line)
| |
| |
def _test():
    """Command-line driver: act like gzip; with -d, act like gunzip.

    Each argument names a file to process; "-" stands for standard
    input/output and is also the default when no files are given.  The
    input file is not deleted, nor are any other gzip options or features
    supported.
    """
    args = sys.argv[1:]
    decompress = args[:1] == ["-d"]
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]

    for arg in args:
        # Choose the source (f) and destination (g) for this argument.
        if decompress and arg == "-":
            f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
            g = sys.stdout
        elif decompress:
            if arg[-3:] != ".gz":
                print("filename doesn't end in .gz: " + repr(arg))
                continue
            f = open(arg, "rb")
            g = __builtin__.open(arg[:-3], "wb")
        elif arg == "-":
            f = sys.stdin
            g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
        else:
            f = __builtin__.open(arg, "rb")
            g = open(arg + ".gz", "wb")

        # Copy everything across in 1 KiB pieces.
        chunk = f.read(1024)
        while chunk:
            g.write(chunk)
            chunk = f.read(1024)

        # The standard streams are left open.
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()
| |
# Run the command-line driver when this file is executed as a script.
if __name__ == "__main__":
    _test()