Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit e9f26fe

Browse files
authored
[None][chore] Cherry-pick from (#7598) Make low_precision_combine as a llm arg (#7898)
Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>
1 parent 28b9a81 · commit e9f26fe

File tree

6 files changed

+21
-4
lines changed

6 files changed

+21
-4
lines changed

‎examples/llm-api/quickstart_advanced.py‎

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ def add_llm_args(parser):
7373
parser.add_argument('--moe_ep_size',type=int,default=-1)
7474
parser.add_argument('--moe_tp_size',type=int,default=-1)
7575
parser.add_argument('--moe_cluster_size',type=int,default=-1)
76+
parser.add_argument(
77+
'--use_low_precision_moe_combine',
78+
default=False,
79+
action='store_true',
80+
help='Use low precision combine in MoE (only for NVFP4 quantization)')
7681

7782
# KV cache
7883
parser.add_argument('--kv_cache_dtype',type=str,default='auto')
@@ -236,7 +241,7 @@ def setup_llm(args, **kwargs):
236241
enable_piecewise_cuda_graph= \
237242
args.use_piecewise_cuda_graph)
238243
if args.use_torch_compile else None,
239-
moe_config=MoeConfig(backend=args.moe_backend),
244+
moe_config=MoeConfig(backend=args.moe_backend, use_low_precision_moe_combine=args.use_low_precision_moe_combine),
240245
sampler_type=args.sampler_type,
241246
max_seq_len=args.max_seq_len,
242247
max_batch_size=args.max_batch_size,

‎tensorrt_llm/_torch/model_config.py‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ class ModelConfig(Generic[TConfig]):
133133
moe_backend:str='CUTLASS'# options can be CUTLASS, TRTLLM
134134
# IF true, disables FC2+finalize fusion in CUTLASS MoE backend
135135
moe_disable_finalize_fusion:bool=False
136+
# If true, use low precision combine in MoE operations (only for NVFP4 quantization)
137+
use_low_precision_moe_combine:bool=False
136138

137139
allreduce_strategy:AllReduceStrategy=AllReduceStrategy.AUTO
138140

‎tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py‎

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,7 @@ def __init__(
193193
ifself.enable_alltoall:
194194
self.use_postquant_alltoall= (os.environ.get(
195195
"TRTLLM_MOE_POST_QUANT_ALLTOALLV","1")=="1")
196-
self.use_low_precision_combine= (os.environ.get(
197-
"TRTLLM_MOE_USE_LOW_PRECISION_COMBINE","0")=="1")
196+
self.use_low_precision_combine=model_config.use_low_precision_moe_combine
198197

199198
ifself.alltoall_method_type==AlltoallMethodType.MNNVL:
200199
MnnvlMemory.initialize()

‎tensorrt_llm/_torch/pyexecutor/config.py‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ class PyTorchConfig:
6262
moe_backend:str='CUTLASS'
6363

6464
moe_disable_finalize_fusion:bool=False
65+
use_low_precision_moe_combine:bool=False
6566

6667
sampler_type:SamplerType=SamplerType.auto
6768
"""

‎tensorrt_llm/_torch/pyexecutor/model_loader.py‎

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,9 @@ def _load_and_validate_config(
303303
attn_backend=self.pytorch_backend_config.attn_backend,
304304
moe_backend=self.pytorch_backend_config.moe_backend,
305305
moe_disable_finalize_fusion=self.pytorch_backend_config.
306-
moe_disable_finalize_fusion)
306+
moe_disable_finalize_fusion,
307+
use_low_precision_moe_combine=self.pytorch_backend_config.
308+
use_low_precision_moe_combine)
307309

308310
validate_and_set_kv_cache_quant(
309311
config,self.pytorch_backend_config.kv_cache_dtype)

‎tensorrt_llm/llmapi/llm_args.py‎

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,12 @@ class MoeConfig(StrictBaseModel):
192192
"Disable FC2+finalize kernel fusion in CUTLASS MoE backend. Setting this to True recovers deterministic numerical behavior with top-k > 2."
193193
)
194194

195+
use_low_precision_moe_combine:bool=Field(
196+
default=False,
197+
description=
198+
"Use low precision combine in MoE operations (only for NVFP4 quantization). When enabled, uses lower precision for combining expert outputs to improve performance."
199+
)
200+
195201
@classmethod
196202
def from_dict(cls, data: dict):
197203
return cls(**data)
@@ -2614,6 +2620,8 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
26142620
moe_load_balancer=self.moe_config.load_balancer,
26152621
attn_backend=self.attn_backend,
26162622
moe_backend=self.moe_config.backend,
2623+
use_low_precision_moe_combine=self.moe_config.
2624+
use_low_precision_moe_combine,
26172625
sampler_type=self.sampler_type,
26182626
kv_cache_dtype=self.kv_cache_config.dtype,
26192627
mamba_ssm_cache_dtype=self.kv_cache_config.mamba_ssm_cache_dtype,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp