Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 78ac556

Browse files
[None][fix] Fix the aux_stream in Llama4MinLatencyFusedMoE
Signed-off-by: Jinyang Yuan <154768711+jinyangyuan-nvidia@users.noreply.github.com>
1 parent fac5220 commit 78ac556

File tree

2 files changed

+8
-5
lines changed

2 files changed

+8
-5
lines changed

‎tensorrt_llm/_torch/models/modeling_llama.py‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from ..modules.multi_stream_utils import maybe_execute_in_parallel
4242
from ..modules.rms_norm import RMSNorm
4343
from ..speculative import SpecMetadata
44-
from ..utils import Fp4QuantizedTensor
44+
from ..utils import AuxStreamType, Fp4QuantizedTensor
4545
from .modeling_multimodal_utils import fuse_input_embeds
4646
from .modeling_speculative import SpecDecOneEngineForCausalLM
4747
from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
@@ -293,6 +293,7 @@ def __init__(
293293
weight_loading_mode=MoEWeightLoadingMode.FUSED_GATE_UP_PROJ,
294294
model_config=model_config,
295295
apply_router_weight_on_input=True,
296+
aux_stream_dict={AuxStreamType.MoeChunkingOverlap:aux_stream},
296297
layer_idx=layer_idx)
297298

298299
self.router=Linear(

‎tensorrt_llm/_torch/models/modeling_llama_min_latency.py‎

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
WeightsLoadingConfig)
2424
from ..modules.multi_stream_utils import maybe_execute_in_parallel
2525
from ..speculative import SpecMetadata
26-
from ..utils import Fp4QuantizedTensor
26+
from ..utils import AuxStreamType, Fp4QuantizedTensor
2727
from .modeling_llama import Llama4Attention, Llama4DecoderLayer, Llama4MoE
2828

2929
# Perf heuristics thresholds.
@@ -438,7 +438,8 @@ def __init__(
438438
dtype:Optional[torch.dtype]=None,
439439
reduce_results:bool=False,
440440
model_config:ModelConfig=ModelConfig(),
441-
aux_stream:torch.cuda.Stream=torch.cuda.Stream(),
441+
aux_stream_dict:Optional[Dict[AuxStreamType,
442+
torch.cuda.Stream]]=None,
442443
weight_loading_mode:MoEWeightLoadingMode=MoEWeightLoadingMode.
443444
VANILLA,
444445
apply_router_weight_on_input:bool=False,
@@ -452,7 +453,7 @@ def __init__(
452453
dtype=dtype,
453454
reduce_results=reduce_results,
454455
model_config=model_config,
455-
aux_stream=aux_stream,
456+
aux_stream_dict=aux_stream_dict,
456457
weight_loading_mode=weight_loading_mode,
457458
apply_router_weight_on_input=apply_router_weight_on_input,
458459
)
@@ -554,6 +555,7 @@ def __init__(
554555
weight_loading_mode=MoEWeightLoadingMode.FUSED_GATE_UP_PROJ,
555556
model_config=model_config,
556557
apply_router_weight_on_input=True,
558+
aux_stream_dict={AuxStreamType.MoeChunkingOverlap:aux_stream},
557559
)
558560

559561
self.router=Llama4MinLatencyLinear(
@@ -801,7 +803,7 @@ def forward(
801803
or self.fusion_config.POST_MLP_FUSION
802804
if needs_post_allreduce and self.next_layer_layernorm is not None:
803805
if use_fp8_allreduce and self.next_attn is not None \
804-
and hasattr(elf.next_attn.qkv_proj, 'input_scale'):
806+
and hasattr(self.next_attn.qkv_proj, 'input_scale'):
805807
hidden_states,residual=self.all_reduce(
806808
hidden_states,
807809
all_reduce_params=AllReduceParams(

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp