11import json
22import io
3+ import logging
34import random
45from functools import partial
56
910
1011from .loader import log_and_continue
1112
13+ _logger = logging .getLogger (__name__ )
1214
def filter_no_annotation_or_no_image(sample):
    """Placeholder sample filter that currently accepts every sample.

    Args:
        sample: The sample to inspect. It is not examined yet.

    Returns:
        Always ``True``; no validation is performed at this point.
    """
    # FIXME check sample for valid doc/image and annotation
    return True
1618
1719
@@ -37,16 +39,24 @@ def __init__(
3739def __call__ (self ,sample ):
3840anno = json .loads (sample ['json' ])
3941
40- page_anno = self .anno_preprocess (anno ,generator = self .generator )
41- if isinstance (anno , (tuple ,list )):
42+ try :
43+ page_anno = self .anno_preprocess (anno ,generator = self .generator )
44+ except Exception as exn :
45+ _logger .error (f'Issue processing annotation for{ sample ["__url__" ]} ,{ sample ["__key__" ]} .' )
46+ #_logger.error(json.dumps(anno, indent=4))
47+ raise (exn )
48+
49+ info = None
50+ if isinstance (page_anno ,tuple ):
4251page_anno ,info = page_anno
43- num_pages = info .get ('num_pages' ,1 )# original # pages
4452page_indices = info .get ('page_indices' , [0 ])# the samples page indices
53+ num_decode_pages = len (page_indices )
54+ num_anno_pages = info .get ('num_pages' ,1 )
4555page_image_info = info .get ('image_info' ,None )
4656if page_image_info is not None :
4757assert len (page_image_info )== len (page_indices )
4858else :
49- num_pages = 1
59+ num_decode_pages = num_anno_pages = 1
5060page_indices = [0 ]
5161page_image_info = None
5262
@@ -56,32 +66,35 @@ def __call__(self, sample):
5666if ext in sample :
5767with io .BytesIO (sample [ext ])as b :
5868image = Image .open (b )
59- multi_page_image = getattr (image ,'n_frames' ,1 )> 1
69+ num_image_pages = getattr (image ,'n_frames' ,1 )
70+ if num_image_pages != num_anno_pages :
71+ _logger .warning (
72+ f'Mismatch between num image and num annotation pages{ num_image_pages } !={ num_anno_pages } '
73+ f' for sample{ sample ["__url__" ]} ,{ sample ["__key__" ]} .' )
6074for i ,page_index in enumerate (page_indices ):
61- if multi_page_image :
62- page = image .seek (page_index )
75+ if num_image_pages > 1 :
76+ image .seek (page_index )
6377else :
64- assert num_pages == 1
78+ assert num_anno_pages == 1
6579image .load ()
66- page = image
6780
6881if self .image_fmt :
69- page = page .convert (self .image_fmt )
82+ image = image .convert (self .image_fmt )
7083
7184if page_image_info is not None :
7285# FIXME, if train objective involves masking or otherwise processing image
7386# with knowledge of annotations / text content, anno info should contain
7487# mask locations, etc. For such a task, we need to pass it to image preprocess
75- page = self .image_preprocess (page ,page_info = page_image_info [i ])
88+ image = self .image_preprocess (image ,page_info = page_image_info [i ])
7689else :
77- page = self .image_preprocess (page )
90+ image = self .image_preprocess (image )
7891# FIXME note, should move to torchvision v2 annotations at some point, they should
7992# have a generator arg (eventually) which will make proper state restore possible
80- page_images .append (page )
93+ page_images .append (image )
8194
8295assert len (page_images ),'No page images present'
8396
84- if self .squeeze_pages and num_pages == 1 :
97+ if self .squeeze_pages and num_decode_pages == 1 :
8598# FIXME always list?
8699page_images = page_images [0 ]
87100page_anno = {k :v [0 ]for k ,v in page_anno .items ()}