Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 4a1e138

Browse files
authored
[None][feat] Update multimodal utility get_num_tokens_per_image for better generalization (#7544)
Signed-off-by: Chang Liu (Enterprise Products) <9713593+chang-l@users.noreply.github.com>
1 parent dd9627d commit 4a1e138

File tree

3 files changed

+17
-27
lines changed

3 files changed

+17
-27
lines changed

‎tensorrt_llm/inputs/multimodal.py‎

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -505,19 +505,14 @@ def find_mm_token_lengths(mm_data: Dict[str, Any],
505505
ifisinstance(item,torch.Tensor):
506506
item=ToPILImage()(item)
507507
num_tokens=input_processor.get_num_tokens_per_image(
508-
image_width=item.width,
509-
image_height=item.height,
510-
)
508+
image=item, )
511509
modality_token_lengths.append(num_tokens)
512510
elifmodality=="video":
513511
assertisinstance(item,list),"Video must be a list of frames"
514512
ifisinstance(item[0],torch.Tensor):
515513
item= [ToPILImage()(frame)forframeinitem]
516514
num_tokens=input_processor.get_num_tokens_per_video(
517-
video_width=item[0].width,
518-
video_height=item[0].height,
519-
num_frames=len(item),
520-
)
515+
video=item, )
521516
modality_token_lengths.append(num_tokens)
522517
else:
523518
# TODO: add audio support if needed

‎tensorrt_llm/inputs/registry.py‎

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
fromtypingimport (Any,Callable,Dict,List,Optional,Protocol,Tuple,Type,
44
TypeVar)
55

6+
fromPILimportImage
67
fromtorchimportTensor,nn
78

89
from .._utilsimportnvtx_range_debug
@@ -114,8 +115,7 @@ def get_num_multimodal_tokens(self):
114115
defget_num_tokens_per_image(
115116
self,
116117
*,
117-
image_width:int,
118-
image_height:int,
118+
image:Image.Image,
119119
**kwargs,
120120
):
121121
"""
@@ -126,16 +126,16 @@ def get_num_tokens_per_image(
126126
127127
Subclasses can override this method to provide custom logic to calculate the number of tokens.
128128
"""
129+
image_height=image.height
130+
image_width=image.width
129131
image_size= (image_height,image_width)
130132
returnself.get_num_multimodal_tokens([image_size],
131133
**kwargs)["num_image_tokens"][0]
132134

133135
defget_num_tokens_per_video(
134136
self,
135137
*,
136-
video_width:int,
137-
video_height:int,
138-
num_frames:int,
138+
video:List[Image.Image],
139139
**kwargs,
140140
):
141141
"""
@@ -146,15 +146,18 @@ def get_num_tokens_per_video(
146146
147147
Subclasses can override this method to provide custom logic to calculate the number of tokens.
148148
"""
149+
video_width=video[0].width
150+
video_height=video[0].height
151+
num_frames=len(video)
149152
video_size= (num_frames,video_height,video_width)
150153
try:
151154
num_video_tokens=self.get_num_multimodal_tokens(
152155
video_sizes=[video_size],**kwargs)["num_video_tokens"][0]
153156
returnnum_video_tokens
154157
exceptException:
155158
# Fallback: treat video as sequence of frames
156-
num_tokens_per_frame=self.get_num_tokens_per_image(
157-
image_width=video_width,image_height=video_height,**kwargs)
159+
num_tokens_per_frame=self.get_num_tokens_per_image(image=video[0],
160+
**kwargs)
158161
temporal_patch_size=self.temporal_patch_sizeifhasattr(
159162
self,'temporal_patch_size')else1
160163
returnnum_tokens_per_frame*num_frames//temporal_patch_size

‎tests/unittest/_torch/multimodal/test_find_num_image_tokens.py‎

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,10 @@ def test_get_num_tokens_per_image(model_key, multimodal_model_configs):
136136
# Get predicted number of tokens using get_num_tokens_per_image
137137
ifmodel_type=='llava_next':
138138
predicted_num_tokens=input_processor.get_num_tokens_per_image(
139-
image_width=image_width,image_height=image_height)
139+
image=test_image)
140140
elifmodel_type=='qwen2_5_vl':
141141
predicted_num_tokens=input_processor.get_num_tokens_per_image(
142-
image_width=image_width,
143-
image_height=image_height,
144-
num_frames=1,
145-
do_resize=True)
142+
image=test_image)
146143
else:
147144
raiseValueError(f"Unsupported model type:{model_type}")
148145

@@ -235,7 +232,6 @@ def test_get_num_tokens_per_video(model_key, multimodal_model_configs):
235232
test_video=load_video(test_video_url,num_frames=8,format="pil")
236233
# load_video returns a list of frames, we only have one video
237234
video_width,video_height=test_video[0].size
238-
num_frames=len(test_video)
239235

240236
# Get actual embedding tensor for this image
241237
actual_embedding=SharedTensorContainer.from_dict(
@@ -245,17 +241,13 @@ def test_get_num_tokens_per_video(model_key, multimodal_model_configs):
245241
# The first dimension should be the number of image tokens
246242
actual_num_tokens=actual_embedding.shape[0]
247243

248-
# Get predicted number of tokens using get_num_tokens_per_image
244+
# Get predicted number of tokens using get_num_tokens_per_video
249245
ifmodel_type=='llava_next':
250246
predicted_num_tokens=input_processor.get_num_tokens_per_video(
251-
video_width=video_width,
252-
video_height=video_height,
253-
num_frames=num_frames)
247+
video=test_video)
254248
elifmodel_type=='qwen2_5_vl':
255249
predicted_num_tokens=input_processor.get_num_tokens_per_video(
256-
video_width=video_width,
257-
video_height=video_height,
258-
num_frames=num_frames)
250+
video=test_video)
259251

260252
# The key assertion: predicted should match actual
261253
assertpredicted_num_tokens==actual_num_tokens, \

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp