|
1 | 1 | # Chugging Data |
2 | 2 |
|
| 3 | +A library to help w/ efficient training with multi-modal data. Initially focused on image & document + text tasks. |
| 4 | + |
| 5 | +`chug` currently leverages `webdataset` and Huggingface `datasets`. `webdataset` tar files and dataset pipelines are preferred for scalable pretraining. For ease of use, Huggingface `datasets` are also supported and work great for exploration, validation, and fine-tune use cases. |
| 6 | + |
| 7 | +## TODOs |
| 8 | + |
| 9 | +### Nearish |
| 10 | +* Cleanup and refinement, codebase will change. |
| 11 | +* Documentation & unit-tests. |
| 12 | +* Support reading of info .json/.yaml files for automatic shard info resolution for webdatasets (like timm). |
| 13 | +* Support unified preprocessor functions for combined image + text tokenization (img+text token interleaving, etc.). |
| 14 | + |
| 15 | +### Longish |
| 16 | +* Increase range of task pipelines for other tasks, modelling needs. |
| 17 | +* Support additional modalities & targets (video, audio, detection/dense pixel targets, image/video/audio targets). |
| 18 | +* Explore alternatives to .tar shards (array_record, arrow, etc). |
| 19 | + |
| 20 | +## Usage / Examples |
| 21 | + |
| 22 | +### Document Reading, Training w/ IDL |
| 23 | +```python |
| 24 | +import chug |
| 25 | +img_cfg = chug.ImageInputCfg(size=(1024, 768), transform_type='doc_better') |
| 26 | +img_fn = chug.create_image_preprocessor(input_cfg=img_cfg, is_training=True) |
| 27 | +txt_fn = chug.create_text_preprocessor( |
| 28 | +    'naver-clova-ix/donut-base', |
| 29 | +    prompt_end_token='<s_idl>', |
| 30 | +    task_start_token='<s_idl>',  # NOTE needs to be added to tokenizer |
| 31 | +) |
| 32 | + |
| 33 | +task_cfg = chug.DataTaskDocReadCfg( |
| 34 | +    image_process_fn=img_fn, |
| 35 | +    text_process_fn=txt_fn, |
| 36 | +    page_sampling='random', |
| 37 | +    error_handler='dump_and_reraise', |
| 38 | +) |
| 39 | +task_pipe = chug.create_task_pipeline(task_cfg) |
| 40 | +data_cfg = chug.DataCfg( |
| 41 | +    source='pipe:curl -s -f -L https://huggingface.co/datasets/pixparse/IDL-wds/resolve/main/idl-train-0{0000..1000}.tar',  # FIXME range |
| 42 | +    split='train', |
| 43 | +    batch_size=8, |
| 44 | +    num_samples=1000000,  # FIXME get actual value |
| 45 | +    num_workers=0, |
| 46 | +    format='wds', |
| 47 | +) |
| 48 | +lb = chug.create_loader( |
| 49 | +    data_cfg, |
| 50 | +    task_cfg, |
| 51 | +    is_training=True, |
| 52 | +) |
| 53 | +ii = iter(lb.loader) |
| 54 | +sample = next(ii) |
| 55 | +``` |
| 56 | + |
| 57 | +### Document Reading, Exploring IDL |
| 58 | +```python |
| 59 | +import chug |
| 60 | +task_cfg = chug.DataTaskDocReadCfg(page_sampling='all') |
| 61 | +task_pipe = chug.create_task_pipeline(task_cfg) |
| 62 | + |
| 63 | +data_cfg = chug.DataCfg( |
| 64 | +    source='pixparse/IDL-wds', |
| 65 | +    split='train', |
| 66 | +    batch_size=None, |
| 67 | +    format='hfids', |
| 68 | +    num_workers=0, |
| 69 | +) |
| 70 | +lb = chug.create_loader( |
| 71 | +    data_cfg, |
| 72 | +    task_cfg, |
| 73 | +) |
| 74 | +ii = iter(lb.loader) |
| 75 | +sample = next(ii) |
| 76 | +``` |
| 77 | + |
| 78 | +### Document Reading, Training with PDFA |
| 79 | + |
| 80 | +```python |
| 81 | +import chug |
| 82 | +img_cfg = chug.ImageInputCfg(size=(1024, 768), transform_type='doc_nougat') |
| 83 | +img_fn = chug.create_image_preprocessor(input_cfg=img_cfg, is_training=True) |
| 84 | +txt_fn = chug.create_text_preprocessor( |
| 85 | +    'naver-clova-ix/donut-base', |
| 86 | +    prompt_end_token='<s_pdfa>', |
| 87 | +    task_start_token='<s_pdfa>',  # NOTE needs to be added to tokenizer |
| 88 | +) |
| 89 | + |
| 90 | +task_cfg = chug.DataTaskDocReadCfg( |
| 91 | +    image_process_fn=img_fn, |
| 92 | +    text_process_fn=txt_fn, |
| 93 | +    page_sampling='random', |
| 94 | +) |
| 95 | +task_pipe = chug.create_task_pipeline(task_cfg) |
| 96 | +data_cfg = chug.DataCfg( |
| 97 | +    source='pipe:curl -s -f -L https://huggingface.co/datasets/pixparse/pdfa-english-train/resolve/main/pdfa-eng-train-{000000..005000}.tar', |
| 98 | +    split='train', |
| 99 | +    batch_size=8, |
| 100 | +    num_samples=1000000,  # FIXME approx |
| 101 | +    format='wds', |
| 102 | +) |
| 103 | +lb = chug.create_loader( |
| 104 | +    data_cfg, |
| 105 | +    task_cfg, |
| 106 | +    is_training=True, |
| 107 | +) |
| 108 | +ii = iter(lb.loader) |
| 109 | +sample = next(ii) |
| 110 | +``` |
| 111 | + |
| 112 | +### Document Reading, Exploring PDFA |
| 113 | + |
| 114 | +```python |
| 115 | +import chug |
| 116 | + |
| 117 | +task_cfg = chug.DataTaskDocReadCfg( |
| 118 | +    page_sampling='all', |
| 119 | +) |
| 120 | +task_pipe = chug.create_task_pipeline(task_cfg) |
| 121 | +data_cfg = chug.DataCfg( |
| 122 | +    source='pixparse/pdfa-eng-wds', |
| 123 | +    split='train', |
| 124 | +    batch_size=None, |
| 125 | +    format='hfids', |
| 126 | +    num_workers=0, |
| 127 | +) |
| 128 | +lb = chug.create_loader( |
| 129 | +    data_cfg, |
| 130 | +    task_cfg, |
| 131 | +) |
| 132 | +ii = iter(lb.loader) |
| 133 | +sample = next(ii) |
| 134 | +``` |
| 135 | + |
| 136 | + |
| 137 | +### Image + Text |
| 138 | + |
| 139 | +#### Training |
| 140 | + |
| 141 | +```python |
| 142 | +import chug |
| 143 | +import transformers |
| 144 | +from functools import partial |
| 145 | +img_cfg = chug.ImageInputCfg(size=(512, 512), transform_type='image_timm') |
| 146 | +img_fn = chug.create_image_preprocessor(input_cfg=img_cfg, is_training=True) |
| 147 | +tokenizer = transformers.AutoTokenizer.from_pretrained('laion/CLIP-ViT-H-14-laion2B-s32B-b79K') |
| 148 | +txt_fn = partial(chug.tokenize, max_length=1000, tokenizer=tokenizer) |
| 149 | +task_cfg = chug.DataTaskImageTextCfg( |
| 150 | +    image_process_fn=img_fn, |
| 151 | +    text_process_fn=txt_fn, |
| 152 | +) |
| 153 | +task_pipe = chug.create_task_pipeline(task_cfg) |
| 154 | +data_cfg = chug.DataCfg( |
| 155 | +    source='pipe:curl -s -f -L https://huggingface.co/datasets/pixparse/cc12m-wds/resolve/main/cc12m-train-{0000..2175}.tar', |
| 156 | +    split='train', |
| 157 | +    batch_size=8, |
| 158 | +    num_samples=10000000, |
| 159 | +    format='wds', |
| 160 | +) |
| 161 | +lb = chug.create_loader( |
| 162 | +    data_cfg, |
| 163 | +    task_cfg, |
| 164 | +    is_training=True, |
| 165 | +) |
| 166 | +ii = iter(lb.loader) |
| 167 | +sample = next(ii) |
| 168 | +``` |
| 169 | + |
| 170 | +### Document VQA |
| 171 | + |
| 172 | +#### Training, Fine-tuning |
| 173 | +```python |
| 174 | +import chug |
| 175 | +from chug.task_pipeline import create_task_pipeline |
| 176 | +img_cfg = chug.ImageInputCfg(size=(1024, 768), transform_type='doc_basic') |
| 177 | +img_fn = chug.create_image_preprocessor(img_cfg, is_training=True) |
| 178 | +txt_fn = chug.create_text_preprocessor( |
| 179 | +    'naver-clova-ix/donut-base-finetuned-docvqa', |
| 180 | +    prompt_end_token='<s_answer>', |
| 181 | +    task_start_token='<s_docvqa>', |
| 182 | +) |
| 183 | + |
| 184 | +task_cfg = chug.DataTaskDocVqaCfg( |
| 185 | +    image_process_fn=img_fn, |
| 186 | +    text_process_fn=txt_fn, |
| 187 | +) |
| 188 | +task_pipe = create_task_pipeline(task_cfg) |
| 189 | + |
| 190 | +data_cfg = chug.DataCfg( |
| 191 | +    source='pipe:curl -s -f -L https://huggingface.co/datasets/pixparse/docvqa-wds/resolve/main/docvqa-train-{000..383}.tar', |
| 192 | +    split='train', |
| 193 | +    batch_size=8, |
| 194 | +    format='wds', |
| 195 | +    num_samples=39463, |
| 196 | +) |
| 197 | +lb = chug.create_loader( |
| 198 | +    data_cfg, |
| 199 | +    task_cfg, |
| 200 | +    is_training=True, |
| 201 | +) |
| 202 | +ii = iter(lb.loader) |
| 203 | +sample = next(ii) |
| 204 | +``` |
| 205 | + |
| 206 | +#### Exploration |
| 207 | + |
| 208 | +```python |
| 209 | +import chug |
| 210 | +from chug.task_pipeline import create_task_pipeline |
| 211 | +task_cfg = chug.DataTaskDocVqaCfg( |
| 212 | +    question_prefix='Question:', |
| 213 | +    question_suffix='', |
| 214 | +    answer_prefix='Answer:', |
| 215 | +    answer_suffix='', |
| 216 | +) |
| 217 | +task_pipe = create_task_pipeline(task_cfg) |
| 218 | +data_cfg = chug.DataCfg( |
| 219 | +    source='pixparse/docvqa-single-page-questions', |
| 220 | +    split='validation', |
| 221 | +    batch_size=None, |
| 222 | +    format='hfids', |
| 223 | +    num_workers=0, |
| 224 | +) |
| 225 | +lb = chug.create_loader( |
| 226 | +    data_cfg, |
| 227 | +    task_cfg, |
| 228 | +) |
| 229 | +ii = iter(lb.loader) |
| 230 | +``` |