@@ -437,7 +437,7 @@ def _qimage_to_pil(self, qimage: QImage) -> PIL.Image.Image:
437
437
return img .convert ("RGB" )
438
438
439
439
def _ocr_image (self ,qimage :QImage ,langs :Optional [str ]= None ,timeout :int = 30 )-> bytes :
440
- if shutil .which ("tesseract" )is None :
440
+ if shutil .which (self . editor . args . tesseract_command )is None :
441
441
# tesseract is not installed
442
442
return None
443
443
langs = langs or self .editor .ocr_langs
@@ -448,14 +448,22 @@ def _ocr_image(self, qimage: QImage, langs: Optional[str] = None, timeout: int =
448
448
# TODO? use https://github.com/sirfz/tesserocr
449
449
tiff_bytes = pil_to_tiff_bytes (pil_img )
450
450
args = [
451
- "tesseract" ,
451
+ self . editor . args . tesseract_command ,
452
452
"-" ,# input: stdin
453
453
"-" ,# output: stdout
454
454
"-l" ,langs ,
455
- # "-c", "tessedit_create_hocr=1", # config
456
- "quiet" ,# config: hide "Estimating resolution as N" messages
457
- "hocr" ,# extension
455
+ "-c" ,"tessedit_create_hocr=1" ,
456
+ # TODO get dpi value from hocr file
457
+ # <div class='ocr_page' id='page_1' title='...; scan_res 300 300'>
458
+ # "--dpi", "300",
459
+ "--loglevel" ,"WARN" ,# ALL, TRACE, DEBUG, INFO, WARN, ERROR, FATAL, OFF
458
460
]
461
+ if self .editor .args .tessdata_dir :
462
+ args += [
463
+ "--oem" ,"1" ,
464
+ "--psm" ,"6" ,
465
+ "--tessdata-dir" ,self .editor .args .tessdata_dir ,
466
+ ]
459
467
hocr_bytes = subprocess .check_output (args ,input = tiff_bytes ,timeout = timeout )
460
468
return hocr_bytes
461
469
@@ -1100,6 +1108,15 @@ def main():
1100
1108
default = None ,
1101
1109
help = "Overlay color (color name or #RRGGBB)" ,
1102
1110
)
1111
+ parser .add_argument (
1112
+ "--tesseract-command" ,
1113
+ default = "tesseract" ,
1114
+ )
1115
+ parser .add_argument (
1116
+ "--tessdata-dir" ,
1117
+ default = None ,
1118
+ help = "usually a git clone of https://github.com/tesseract-ocr/tessdata_best" ,
1119
+ )
1103
1120
args = parser .parse_args ()
1104
1121
1105
1122
# handle Ctrl+C from terminal