NotificationsYou must be signed in to change notification settings
Fork10
Star160

Commit 531e6f4

committed

Support reading updated ocr annotation format, extra error handling & checks

1 parent c482062 commit 531e6f4 Copy full SHA for 531e6f4

File tree

1 file changed

+28

-15

lines changed

src/chug/webdataset
- doc_anno_pipe.py

1 file changed

+28

-15

lines changed

`‎src/chug/webdataset/doc_anno_pipe.py‎`

Lines changed: 28 additions & 15 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,6 @@`
`1`	`1`	`import json`
`2`	`2`	`import io`
	`3`	`+import logging`
`3`	`4`	`import random`
`4`	`5`	`from functools import partial`
`5`	`6`
`@@ -9,9 +10,10 @@`
`9`	`10`
`10`	`11`	`from .loader import log_and_continue`
`11`	`12`
	`13`	`+_logger = logging.getLogger(__name__)`
`12`	`14`
`13`	`15`	`def filter_no_annotation_or_no_image(sample):`
`14`		`-# FIXME check sample for valid doc/image + annotation`
	`16`	`+# FIXME check sample for valid doc/image and annotation`
`15`	`17`	`return True`
`16`	`18`
`17`	`19`
`@@ -37,16 +39,24 @@ def __init__(`
`37`	`39`	`def __call__(self, sample):`
`38`	`40`	`anno = json.loads(sample['json'])`
`39`	`41`
`40`		`-page_anno = self.anno_preprocess(anno, generator=self.generator)`
`41`		`-if isinstance(anno, (tuple, list)):`
	`42`	`+try:`
	`43`	`+page_anno = self.anno_preprocess(anno, generator=self.generator)`
	`44`	`+except Exception as exn:`
	`45`	`+_logger.error(f'Issue processing annotation for {sample["__url__"]}, {sample["__key__"]}.')`
	`46`	`+#_logger.error(json.dumps(anno, indent=4))`
	`47`	`+raise(exn)`
	`48`	`+`
	`49`	`+info = None`
	`50`	`+if isinstance(page_anno, tuple):`
`42`	`51`	`page_anno, info = page_anno`
`43`		`-num_pages = info.get('num_pages', 1)  # original # pages`
`44`	`52`	`page_indices = info.get('page_indices', [0])  # the samples page indices`
	`53`	`+num_decode_pages = len(page_indices)`
	`54`	`+num_anno_pages = info.get('num_pages', 1)`
`45`	`55`	`page_image_info = info.get('image_info', None)`
`46`	`56`	`if page_image_info is not None:`
`47`	`57`	`assert len(page_image_info) == len(page_indices)`
`48`	`58`	`else:`
`49`		`-num_pages = 1`
	`59`	`+num_decode_pages = num_anno_pages = 1`
`50`	`60`	`page_indices = [0]`
`51`	`61`	`page_image_info = None`
`52`	`62`
`@@ -56,32 +66,35 @@ def __call__(self, sample):`
`56`	`66`	`if ext in sample:`
`57`	`67`	`with io.BytesIO(sample[ext]) as b:`
`58`	`68`	`image = Image.open(b)`
`59`		`-multi_page_image = getattr(image, 'n_frames', 1) > 1`
	`69`	`+num_image_pages = getattr(image, 'n_frames', 1)`
	`70`	`+if num_image_pages != num_anno_pages:`
	`71`	`+_logger.warning(`
	`72`	`+f'Mismatch between num image and num annotation pages {num_image_pages} != {num_anno_pages}'`
	`73`	`+f' for sample {sample["__url__"]}, {sample["__key__"]}.')`
`60`	`74`	`for i, page_index in enumerate(page_indices):`
`61`		`-if multi_page_image:`
`62`		`-page = image.seek(page_index)`
	`75`	`+if num_image_pages > 1:`
	`76`	`+image.seek(page_index)`
`63`	`77`	`else:`
`64`		`-assert num_pages == 1`
	`78`	`+assert num_anno_pages == 1`
`65`	`79`	`image.load()`
`66`		`-page = image`
`67`	`80`
`68`	`81`	`if self.image_fmt:`
`69`		`-page = page.convert(self.image_fmt)`
	`82`	`+image = image.convert(self.image_fmt)`
`70`	`83`
`71`	`84`	`if page_image_info is not None:`
`72`	`85`	`# FIXME, if train objective involves masking or otherwise processing image`
`73`	`86`	`# with knowledge of annotations / text content, anno info should contain`
`74`	`87`	`# mask locations, etc. For such a task, we need to pass it to image preprocess`
`75`		`-page = self.image_preprocess(page, page_info=page_image_info[i])`
	`88`	`+image = self.image_preprocess(image, page_info=page_image_info[i])`
`76`	`89`	`else:`
`77`		`-page = self.image_preprocess(page)`
	`90`	`+image = self.image_preprocess(image)`
`78`	`91`	`# FIXME note, should move to torchvision v2 annotations at some point, they should`
`79`	`92`	`# have a generator arg (eventually) which will make proper state restore possible`
`80`		`-page_images.append(page)`
	`93`	`+page_images.append(image)`
`81`	`94`
`82`	`95`	`assert len(page_images), 'No page images present'`
`83`	`96`
`84`		`-if self.squeeze_pages and num_pages == 1:`
	`97`	`+if self.squeeze_pages and num_decode_pages == 1:`
`85`	`98`	`# FIXME always list?`
`86`	`99`	`page_images = page_images[0]`
`87`	`100`	`page_anno = {k: v[0] for k, v in page_anno.items()}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 531e6f4

File tree

1 file changed

1 file changed

`‎src/chug/webdataset/doc_anno_pipe.py‎`

0 commit comments