|
| 1 | +# Import Libraries |
| 2 | +fromtypingimportTuple |
| 3 | +fromioimportBytesIO |
| 4 | +importos |
| 5 | +importargparse |
| 6 | +importre |
| 7 | +importfitz |
| 8 | + |
| 9 | + |
def extract_info(input_file: str):
    """
    Collect and print basic information about a PDF file.

    Args:
        input_file: Path of the PDF file to inspect.

    Returns:
        A ``(True, output)`` tuple. ``output`` is a dict holding the "File"
        path, an "Encrypted" flag stored as the string "True" or "False"
        and, when the document is not encrypted, every entry of the
        document metadata.
    """
    pdf_doc = fitz.open(input_file)
    try:
        encrypted = bool(pdf_doc.isEncrypted)
        output = {
            "File": input_file,
            "Encrypted": "True" if encrypted else "False",
        }
        # Metadata is only read for documents that are not encrypted.
        if not encrypted:
            output.update(pdf_doc.metadata or {})
    finally:
        # Always release the document so the file handle is not kept open
        # while the same path is processed (and possibly overwritten) later.
        pdf_doc.close()

    # To Display File Info
    print("## File Information ##################################################")
    print("\n".join("{}:{}".format(key, value) for key, value in output.items()))
    print("######################################################################")

    return True, output
| 30 | + |
| 31 | + |
def search_for_text(lines, search_str):
    """
    Yield every case-insensitive match of a regular expression in the lines.

    Args:
        lines: Iterable of text lines to scan.
        search_str: Regular expression pattern to look for.

    Yields:
        The full text of each non-empty match, in order of appearance.
        Several matches within one line are yielded one by one.
    """
    # Compile once instead of re-resolving the pattern for every line.
    pattern = re.compile(search_str, re.IGNORECASE)
    for line in lines:
        for match in pattern.finditer(line):
            # group(0) is always the whole match; re.findall would return
            # only the captured group(s) when the pattern contains groups.
            matched_text = match.group(0)
            # Zero-width matches carry no text that could be located later.
            if matched_text:
                yield matched_text
| 42 | + |
| 43 | + |
def redact_matching_data(page, matched_values):
    """
    Redact every occurrence of the matched values on a page.

    Args:
        page: Page object exposing ``searchFor``, ``addRedactAnnot`` and
            ``apply_redactions``.
        matched_values: Iterable of matched strings found on the page.

    Returns:
        The number of matched values received (repeated values are counted
        each time they appear).
    """
    matches_found = 0
    already_marked = set()
    for val in matched_values:
        matches_found += 1
        # searchFor() is called once per distinct value: an identical value
        # would return the same areas again and stack duplicate annotations.
        if val in already_marked:
            continue
        already_marked.add(val)
        for area in page.searchFor(val):
            # Cover the area with a black box and blank replacement text
            page.addRedactAnnot(area, text=" ", fill=(0, 0, 0))
    # Apply the redaction
    page.apply_redactions()
    return matches_found
| 59 | + |
| 60 | + |
def frame_matching_data(page, matched_values):
    """
    Draw a red rectangle around every occurrence of the matched values.

    Args:
        page: Page object exposing ``searchFor`` and ``addRectAnnot``.
        matched_values: Iterable of matched strings found on the page.

    Returns:
        The number of matched values received (repeated values are counted
        each time they appear).
    """
    matches_found = 0
    already_marked = set()
    for val in matched_values:
        matches_found += 1
        # searchFor() is called once per distinct value: an identical value
        # would return the same areas again and stack duplicate frames.
        if val in already_marked:
            continue
        already_marked.add(val)
        for area in page.searchFor(val):
            # Rect is reached through the top-level package rather than the
            # nested ``fitz.fitz`` module path.
            if not isinstance(area, fitz.Rect):
                continue
            # Draw a rectangle around matched values
            annot = page.addRectAnnot(area)
            annot.setColors(stroke=fitz.utils.getColor('red'))
            annot.update()
    return matches_found
| 80 | + |
| 81 | + |
def highlight_matching_data(page, matched_values, type):
    """
    Mark every occurrence of the matched values with a text-marker annotation.

    Args:
        page: Page object exposing ``searchFor`` and the ``add*Annot``
            text-marker methods.
        matched_values: Iterable of matched strings found on the page.
        type: One of 'Highlight', 'Squiggly', 'Underline' or 'Strikeout';
            any other value falls back to 'Highlight'. The parameter name
            shadows the builtin but is kept so existing callers keep working.

    Returns:
        The number of matched values received (repeated values are counted
        each time they appear).
    """
    # Name of the page method that creates each kind of marker.
    marker_methods = {
        'Highlight': 'addHighlightAnnot',
        'Squiggly': 'addSquigglyAnnot',
        'Underline': 'addUnderlineAnnot',
        'Strikeout': 'addStrikeoutAnnot',
    }
    add_marker = getattr(page, marker_methods.get(type, 'addHighlightAnnot'))

    matches_found = 0
    already_marked = set()
    for val in matched_values:
        matches_found += 1
        # searchFor() is called once per distinct value: an identical value
        # would return the same areas again and stack duplicate markers.
        if val in already_marked:
            continue
        already_marked.add(val)
        matching_val_area = page.searchFor(val)
        # Nothing located on the page: there is no area to annotate.
        if not matching_val_area:
            continue
        marker = add_marker(matching_val_area)
        marker.update()
    return matches_found
| 109 | + |
| 110 | + |
def process_data(input_file: str, output_file: str, search_str: str, pages: Tuple = None, action: str = 'Highlight'):
    """
    Search the pages of a PDF file and redact, frame or mark the matches.

    Args:
        input_file: Path of the PDF file to read.
        output_file: Path the processed PDF is written to (may be the same
            as ``input_file``).
        search_str: Regular expression to search for on each page.
        pages: Optional collection of zero-based page indexes (ints or
            numeric strings). When empty or None, every page is processed.
        action: 'Redact', 'Frame', 'Highlight', 'Squiggly', 'Underline' or
            'Strikeout'; any other value is treated as 'Highlight'.
    """
    # Normalise the page filter to strings once, so integer and string page
    # numbers both compare correctly against str(page index).
    wanted_pages = {str(page_no).strip() for page_no in pages} if pages else None
    marker_styles = ('Highlight', 'Squiggly', 'Underline', 'Strikeout')

    # The result is saved to memory first so the output path can safely be
    # the same as the input path.
    output_buffer = BytesIO()
    total_matches = 0

    # NOTE(review): getText is a camelCase PyMuPDF name; newer releases
    # prefer snake_case names — confirm against the installed version.
    pdf_doc = fitz.open(input_file)
    try:
        for pg in range(len(pdf_doc)):
            # If required for specific pages
            if wanted_pages is not None and str(pg) not in wanted_pages:
                continue
            page = pdf_doc[pg]
            # Split the page text by lines and collect the matches
            page_lines = page.getText("text").split('\n')
            matched_values = search_for_text(page_lines, search_str)
            if action == 'Redact':
                matches_found = redact_matching_data(page, matched_values)
            elif action == 'Frame':
                matches_found = frame_matching_data(page, matched_values)
            else:
                # Unknown actions fall back to a plain highlight
                style = action if action in marker_styles else 'Highlight'
                matches_found = highlight_matching_data(
                    page, matched_values, style)
            total_matches += matches_found
        print(f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}")
        # Save to output
        pdf_doc.save(output_buffer)
    finally:
        # Release the document even when searching or saving fails
        pdf_doc.close()

    # Save the output buffer to the output file
    with open(output_file, mode='wb') as f:
        f.write(output_buffer.getbuffer())
| 151 | + |
| 152 | + |
def remove_highlght(input_file: str, output_file: str, pages: Tuple = None):
    """
    Delete the annotations found on the pages of a PDF file.

    Args:
        input_file: Path of the PDF file to read.
        output_file: Path the cleaned PDF is written to (may be the same
            as ``input_file``).
        pages: Optional collection of zero-based page indexes (ints or
            numeric strings). When empty or None, every page is processed.
    """
    # Normalise the page filter to strings once, so integer and string page
    # numbers both compare correctly against str(page index).
    wanted_pages = {str(page_no).strip() for page_no in pages} if pages else None

    # The result is saved to memory first so the output path can safely be
    # the same as the input path.
    output_buffer = BytesIO()
    # Initialize a counter for annotations
    annot_found = 0

    pdf_doc = fitz.open(input_file)
    try:
        for pg in range(len(pdf_doc)):
            # If required for specific pages
            if wanted_pages is not None and str(pg) not in wanted_pages:
                continue
            page = pdf_doc[pg]
            annot = page.firstAnnot
            while annot:
                annot_found += 1
                # Continue from the annotation returned by the delete call
                # instead of reading `.next` on the one just deleted.
                annot = page.deleteAnnot(annot)
        # Only report when something was actually removed
        if annot_found > 0:
            print(f"Annotation(s) Found In The Input File: {input_file}")
        # Save to output
        pdf_doc.save(output_buffer)
    finally:
        # Release the document even when deleting or saving fails
        pdf_doc.close()

    # Save the output buffer to the output file
    with open(output_file, mode='wb') as f:
        f.write(output_buffer.getbuffer())
| 181 | + |
| 182 | + |
| 183 | + |
def process_file(**kwargs):
    """
    Handle one single PDF file.

    Depending on the requested action, either the annotations are removed
    from the file or the search string is redacted, framed or marked.

    Keyword Args:
        input_file: Path of the PDF file to read.
        output_file: Path to write to; the input file is overwritten when
            this is None or not given.
        search_str: Text pattern to look for (unused by "Remove").
        pages: Optional page filter passed through unchanged.
        action: Redact, Frame, Highlight, Squiggly, Underline, Strikeout
            or Remove.
    """
    source_path = kwargs.get('input_file')
    target_path = kwargs.get('output_file')
    # Without an explicit destination the source file is rewritten in place
    if target_path is None:
        target_path = source_path
    pattern = kwargs.get('search_str')
    page_filter = kwargs.get('pages')
    requested_action = kwargs.get('action')

    if requested_action != "Remove":
        process_data(input_file=source_path, output_file=target_path,
                     search_str=pattern, pages=page_filter,
                     action=requested_action)
        return

    # Remove the Highlights except Redactions
    remove_highlght(input_file=source_path,
                    output_file=target_path, pages=page_filter)
| 205 | + |
| 206 | + |
def process_folder(**kwargs):
    """
    Process every PDF file found within a folder.

    Each PDF is handed to ``process_file`` with no output file, so files
    are rewritten in place.

    Keyword Args:
        input_folder: Folder to scan for PDF files.
        search_str: Text pattern to look for (unused by "Remove").
        recursive: When truthy, sub-folders are scanned as well; otherwise
            only the top-level folder is processed.
        action: Redact, Frame, Highlight, Squiggly, Underline, Strikeout
            or Remove.
        pages: Optional page filter passed through unchanged.
    """
    input_folder = kwargs.get('input_folder')
    search_str = kwargs.get('search_str')
    # Run in recursive mode
    recursive = kwargs.get('recursive')
    action = kwargs.get('action')
    pages = kwargs.get('pages')
    # Loop through the files within the input folder.
    for foldername, _dirs, filenames in os.walk(input_folder):
        for filename in filenames:
            # Compare the extension case-insensitively so that files named
            # e.g. "REPORT.PDF" are not skipped.
            if not filename.lower().endswith('.pdf'):
                continue
            # PDF File found
            inp_pdf_file = os.path.join(foldername, filename)
            print("Processing file =", inp_pdf_file)
            process_file(input_file=inp_pdf_file, output_file=None,
                         search_str=search_str, action=action, pages=pages)
        # os.walk yields the top folder first: stopping here skips sub-folders
        if not recursive:
            break
| 232 | + |
| 233 | + |
def is_valid_path(path):
    """
    Validate that the given path points to an existing file or folder.

    Args:
        path: Path entered by the user.

    Returns:
        The path, unchanged, when it is an existing file or directory.

    Raises:
        ValueError: If the path is empty or is neither a file nor a folder.
    """
    if not path:
        raise ValueError("Invalid Path")
    if os.path.isfile(path) or os.path.isdir(path):
        return path
    # Keep a space between the message and the offending path for readability
    raise ValueError(f"Invalid Path {path}")
| 246 | + |
| 247 | + |
def _parse_pages(value):
    """Turn the --pages argument (e.g. "[2,4]" or "2,12") into a tuple of page-number strings."""
    # Every run of digits is one page number, so brackets, commas and spaces
    # are ignored and multi-digit numbers stay intact. Ranges such as "2-4"
    # are not expanded.
    page_numbers = tuple(re.findall(r"\d+", value))
    if not page_numbers:
        raise argparse.ArgumentTypeError(f"No page numbers found in {value!r}")
    return page_numbers


def parse_args():
    """
    Get user command line parameters.

    The set of accepted options depends on the values already given:
    ``--search_str`` is required unless the action is "Remove",
    ``--output_file`` is only offered when the input path is a file and
    ``--recursive`` only when it is a folder.

    Returns:
        A dict of the parsed arguments. ``pages`` is None or a tuple of
        page-number strings.
    """
    parser = argparse.ArgumentParser(description="Available Options")
    parser.add_argument('-i', '--input_path', dest='input_path', type=is_valid_path,
                        required=True, help="Enter the path of the file or the folder to process")
    parser.add_argument('-a', '--action', dest='action',
                        choices=['Redact', 'Frame', 'Highlight', 'Squiggly', 'Underline', 'Strikeout', 'Remove'], type=str,
                        default='Highlight', help="Choose whether to Redact or to Frame or to Highlight or to Squiggly or to Underline or to Strikeout or to Remove")
    # A plain `tuple` would split the argument into single characters
    parser.add_argument('-p', '--pages', dest='pages', type=_parse_pages,
                        help="Enter the pages to consider e.g.: [2,4]")

    # First pass: read the options that decide which further options exist.
    known_args = parser.parse_known_args()[0]
    if known_args.action != 'Remove':
        parser.add_argument('-s', '--search_str', dest='search_str',
                            type=str, required=True, help="Enter a valid search string")
    path = known_args.input_path
    if os.path.isfile(path):
        parser.add_argument('-o', '--output_file', dest='output_file', type=str,
                            help="Enter a valid output file")
    if os.path.isdir(path):
        parser.add_argument('-r', '--recursive', dest='recursive', default=False,
                            type=lambda x: (str(x).lower() in ['true', '1', 'yes']),
                            help="Process Recursively or Non-Recursively")

    # Final pass with the complete set of options
    args = vars(parser.parse_args())
    # To Display The Command Line Arguments
    print("## Command Arguments #################################################")
    print("\n".join("{}:{}".format(i, j) for i, j in args.items()))
    print("######################################################################")
    return args
| 276 | + |
| 277 | + |
if __name__ == '__main__':
    # Read the options supplied on the command line
    args = parse_args()
    target_path = args['input_path']
    # The search string is absent from the arguments for the "Remove" action
    search_pattern = args.get('search_str')

    if os.path.isfile(target_path):
        # Single file: show its information, then process it
        extract_info(input_file=target_path)
        process_file(
            input_file=target_path,
            output_file=args['output_file'],
            search_str=search_pattern,
            pages=args['pages'],
            action=args['action'],
        )
    elif os.path.isdir(target_path):
        # Folder: process the PDF files it contains
        process_folder(
            input_folder=target_path,
            search_str=search_pattern,
            action=args['action'],
            pages=args['pages'],
            recursive=args['recursive'],
        )