- Notifications
You must be signed in to change notification settings - Fork20
Open
Description
Test run on Python 3.8, Windows 7:
- I took 4 arbitrary page numbers (pages 4,6,8,9).
- For each of the PDF files listed in the benchmark, I extracted those pages (when the file has that many pages).
- Then I created a new pdf using the extracted pages, and repeated them between 1 and 5 times (to check how well pdfrw / pypdf optimize size of created pdfs containing repetitive information). So output pdfs will have up to 4x5 = 20 pages
- I measured the time taken and the size of each output file.
I recall that my initial code also deleted the original bookmarks/annotations from the PDFs, but I removed that part for simplicity and left a comment pointing to where I had read about it.
Code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Benchmark pdfrw against pypdf when extracting and repeating PDF pages.

For each source PDF, a fixed list of 1-based page numbers is extracted and
written to a new PDF, once with pdfrw and once with pypdf.  The page list is
repeated 1 to 5 times to see how well each library de-duplicates repeated
content.  Elapsed time and output size are printed for every run, followed by
accumulated totals.
"""

import os
import sys
from datetime import datetime


def fsize(filepath):
    """Return the size of *filepath* as ``[size_in_bytes, size_in_KB_string]``.

    The second element is the size in KiB formatted with two decimals.
    """
    size_bytes = os.stat(filepath).st_size
    size_kb = "%.2f" % (size_bytes / 1024)
    return [size_bytes, size_kb]


def select_valid_pages(pageslist, totalpages):
    """Return the 1-based page numbers of *pageslist* that exist in the document.

    A page number ``p`` is valid when ``1 <= p <= totalpages``.  Order and
    repetitions are preserved.  ``None`` is treated as an empty list.
    """
    return [p for p in (pageslist or []) if 1 <= p <= totalpages]


def safe_ratio(numerator, denominator, ndigits=2):
    """Return ``numerator / denominator`` rounded to *ndigits*, or ``None``.

    ``None`` is returned when *denominator* is zero, which happens when a run
    is faster than the timer resolution after rounding.
    """
    if not denominator:
        return None
    return round(numerator / denominator, ndigits)


# @profile
def createpdf_from_sourcepdf_pages_pdfrw(sourcepdf=None, pageslist=None, destpdf=None, debug=False):
    """Write the pages of *sourcepdf* listed in *pageslist* to *destpdf* using pdfrw.

    *pageslist* holds 1-based page numbers; numbers outside the document are
    skipped.  See <https://github.com/pmaupin/pdfrw/blob/master/examples/subset.py>.
    """
    from pdfrw import PdfWriter, PdfReader
    # Bookmark/annotation removal was dropped for simplicity; see
    # https://github.com/pmaupin/pdfrw/issues/52#issuecomment-271190546
    # import pdfrw_bookmarks

    pages = PdfReader(sourcepdf).pages
    outdata = PdfWriter(destpdf)
    for p in select_valid_pages(pageslist, len(pages)):
        if debug:
            print("pdfrw ", p)
        # pdfrw_pageannots(pages[p-1])
        outdata.addpage(pages[p - 1])
    outdata.write()


# @profile
def createpdf_from_sourcepdf_pages_pypdf(sourcepdf=None, pageslist=None, destpdf=None, debug=False, compress=False):
    """Write the pages of *sourcepdf* listed in *pageslist* to *destpdf* using pypdf.

    *pageslist* holds 1-based page numbers; numbers outside the document are
    skipped.  When *compress* is true, the content streams of every output
    page are compressed before writing.

    - <https://pypdf2.readthedocs.io/en/stable/user/merging-pdfs.html>
    - SO [Extract specific pages of PDF and save it with Python](https://stackoverflow.com/a/51885963/710788)
    """
    from pypdf import PdfWriter, PdfReader

    # Context managers guarantee that both files are closed even on error.
    with open(sourcepdf, "rb") as fsource:
        merger = PdfWriter()
        try:
            totalpages = len(PdfReader(fsource).pages)
            for p in select_valid_pages(pageslist, totalpages):
                if debug:
                    print("pypdf ", p)
                # Page p is 1-based; pypdf expects a 0-based half-open range.
                # NOTE(review): append() is handed the raw file object on every
                # iteration, which may make pypdf re-read the source each time
                # -- confirm; kept as-is so the benchmark measures the same code.
                merger.append(fileobj=fsource, pages=(p - 1, p))
            if compress:
                # Compress the data
                for page in merger.pages:
                    page.compress_content_streams()  # This is CPU intensive!
            # Write to an output PDF document
            with open(destpdf, "wb") as output:
                merger.write(output)
        finally:
            merger.close()


def _download_if_missing(pdfurl, sourcepdf):
    """Download *pdfurl* to *sourcepdf* unless that file already exists."""
    if os.path.exists(sourcepdf):
        return
    import requests

    failure_msg = "COULDN'T DOWNLOAD '{}' FILE:\n".format(pdfurl)
    try:
        response = requests.get(pdfurl, headers=None, params=None, timeout=60)
    except requests.RequestException as err:
        # A network failure skips this file rather than aborting the whole run.
        print(err)
        print(failure_msg)
        return
    if response.status_code == 200:
        with open(sourcepdf, "wb") as f:
            f.write(response.content)
    else:
        print(response.status_code)
        print(failure_msg)


def _run_one(label, create_func, sourcepdf, pageslist):
    """Run *create_func* once; print and return ``(seconds, fsize_result)``."""
    suffix = ".".join([str(p) for p in pageslist])
    destpdf = sourcepdf + "_{}-test_{}.pdf".format(label, suffix)
    start = datetime.now()
    create_func(sourcepdf=sourcepdf, pageslist=pageslist, destpdf=destpdf)
    seconds = round((datetime.now() - start).total_seconds(), 3)
    size = fsize(destpdf)
    print("{}: {} KB output size, took {} seconds".format(label, size[1], seconds))
    return seconds, size


def _format_ratio(numerator, denominator):
    """Return the rounded ratio, or ``"n/a"`` when it cannot be computed."""
    ratio = safe_ratio(numerator, denominator)
    return "n/a" if ratio is None else ratio


# from memory_profiler import profile
# @profile
def pypdf_vs_pdfrw(start_time=None):
    """Run the pdfrw / pypdf comparison and print per-file and total results.

    [performance comparative](https://github.com/pmaupin/pdfrw/issues/232#issuecomment-1436153435)
    between two packages:

    - pdfrw
    - pypdf

    *start_time* is the reference ``datetime`` for the initial elapsed-time
    line; it defaults to the moment the function is called.
    """
    if start_time is None:
        start_time = datetime.now()
    print(datetime.now() - start_time, " before comparing")
    pdfurls = [
        "https://arxiv.org/pdf/2201.00151.pdf",
        "https://arxiv.org/pdf/1707.09725.pdf",
        "https://arxiv.org/pdf/2201.00021.pdf",
        "https://arxiv.org/pdf/2201.00037.pdf",
        "https://arxiv.org/pdf/2201.00069.pdf",
        "https://arxiv.org/pdf/2201.00178.pdf",
        "https://arxiv.org/pdf/2201.00201.pdf",
        "https://arxiv.org/pdf/1602.06541.pdf",
        "https://arxiv.org/pdf/2201.00200.pdf",
        "https://arxiv.org/pdf/2201.00022.pdf",
        "https://arxiv.org/pdf/2201.00029.pdf",
        "https://arxiv.org/pdf/1601.03642.pdf",
    ]
    pdfrw_Tsize = 0
    pdfrw_Ttime = 0
    pypdf_Tsize = 0
    pypdf_Ttime = 0
    for pdfurl in pdfurls:
        sourcepdf = pdfurl.split("/")[-1]
        _download_if_missing(pdfurl, sourcepdf)
        if not os.path.exists(sourcepdf):
            print("\n", "-_" * 40, "\n\nSKIPPING '{}' FILE:\n".format(sourcepdf))
            continue
        print("\n", "-_" * 40, "\n\nTESTING WITH '{}' FILE:\n".format(sourcepdf))
        for i in range(1, 6):
            # *5 eats all my memory when using pypdf with large pdf files
            pageslist = [4, 6, 8, 9] * i
            print("-" * 50, "\npageslist:", pageslist)

            pdfrw_t, pdfrw_s = _run_one(
                "pdfrw", createpdf_from_sourcepdf_pages_pdfrw, sourcepdf, pageslist)
            pdfrw_Ttime += pdfrw_t
            pdfrw_Tsize += pdfrw_s[0]

            pypdf_t, pypdf_s = _run_one(
                "pypdf", createpdf_from_sourcepdf_pages_pypdf, sourcepdf, pageslist)
            pypdf_Ttime += pypdf_t
            pypdf_Tsize += pypdf_s[0]

            # Ratios are guarded: a rounded time of 0.0 must not crash the run.
            print("pypdf_time / pdfrw_time = {} ratio".format(_format_ratio(pypdf_t, pdfrw_t)))
            print("pypdf_size / pdfrw_size = {} ratio".format(_format_ratio(pypdf_s[0], pdfrw_s[0])))

    import pdfrw
    import pypdf

    print("-_" * 40)
    print("\n pdfrw.__version__ {}\nAccumulated output file size: {:.2f} MB\nTotal time: {:.2f} seconds".format(
        pdfrw.__version__, pdfrw_Tsize / 1024 / 1024, pdfrw_Ttime))
    print("\n pypdf.__version__ {}\nAccumulated output file size: {:.2f} MB\nTotal time: {:.2f} seconds".format(
        pypdf.__version__, pypdf_Tsize / 1024 / 1024, pypdf_Ttime))


if __name__ == "__main__":
    startTime = datetime.now()
    print("START: ", startTime)
    pypdf_vs_pdfrw(start_time=startTime)
    endTime = datetime.now()
    print("\nEND: ", endTime)
    print("\nTOTAL TIME: ", endTime - startTime)
OUTPUT:
START: 2023-07-01 22:06:17.718288
0:00:00 before comparing
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
TESTING WITH '2201.00151.pdf' FILE:
--------------------------------------------------
pageslist: [4, 6, 8, 9]
pdfrw: 591.29 KB output size, took 0.109 seconds
pypdf: 660.78 KB output size, took 0.499 seconds
pypdf_time / pdfrw_time = 4.58 ratio
pypdf_size / pdfrw_size = 1.12 ratio
--------------------------------------------------
(... LINES DELETED TO AVOID TOO LONG OUTPUT ...)
--------------------------------------------------
pageslist: [4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9, 4, 6, 8, 9]
pdfrw: 130.20 KB output size, took 0.047 seconds
pypdf: 836.60 KB output size, took 1.031 seconds
pypdf_time / pdfrw_time = 21.94 ratio
pypdf_size / pdfrw_size = 6.43 ratio
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 pdfrw.__version__ 0.5.0
Accumulated output file size: 50.73 MB
Total time: 4.47 seconds
 pypdf.__version__ 3.2.0
Accumulated output file size: 193.77 MB
Total time: 108.14 seconds
END: 2023-07-01 22:08:11.767827
TOTAL TIME: 0:01:54.049539