Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 514f60a

Browse files
committed
[None][chore] Make low_precision_combine as a llm arg (NVIDIA#7598)
Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>
1 parent 8484aa9 commit 514f60a

File tree

6 files changed

+21
-4
lines changed

6 files changed

+21
-4
lines changed

‎examples/llm-api/quickstart_advanced.py‎

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ def add_llm_args(parser):
7373
parser.add_argument('--moe_ep_size',type=int,default=-1)
7474
parser.add_argument('--moe_tp_size',type=int,default=-1)
7575
parser.add_argument('--moe_cluster_size',type=int,default=-1)
76+
parser.add_argument(
77+
'--use_low_precision_moe_combine',
78+
default=False,
79+
action='store_true',
80+
help='Use low precision combine in MoE (only for NVFP4 quantization)')
7681

7782
# KV cache
7883
parser.add_argument('--kv_cache_dtype',type=str,default='auto')
@@ -228,7 +233,7 @@ def setup_llm(args, **kwargs):
228233
enable_piecewise_cuda_graph= \
229234
args.use_piecewise_cuda_graph)
230235
if args.use_torch_compile else None,
231-
moe_config=MoeConfig(backend=args.moe_backend),
236+
moe_config=MoeConfig(backend=args.moe_backend,use_low_precision_moe_combine=args.use_low_precision_moe_combine),
232237
sampler_type=args.sampler_type,
233238
max_seq_len=args.max_seq_len,
234239
max_batch_size=args.max_batch_size,

‎tensorrt_llm/_torch/model_config.py‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ class ModelConfig(Generic[TConfig]):
133133
moe_backend: str = 'CUTLASS'  # options can be CUTLASS, TRTLLM
134134
# If true, disables FC2+finalize fusion in CUTLASS MoE backend
135135
moe_disable_finalize_fusion:bool=False
136+
# If true, use low precision combine in MoE operations (only for NVFP4 quantization)
137+
use_low_precision_moe_combine:bool=False
136138

137139
allreduce_strategy:AllReduceStrategy=AllReduceStrategy.AUTO
138140

‎tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py‎

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,9 +189,8 @@ def __init__(
189189
self.use_postquant_alltoall= (os.environ.get(
190190
"TRTLLM_MOE_POST_QUANT_ALLTOALLV","1")
191191
== "1") and qm.has_nvfp4()
192-
self.use_low_precision_combine= (os.environ.get(
193-
"TRTLLM_MOE_USE_LOW_PRECISION_COMBINE","0")
194-
== "1") and qm.has_nvfp4()
192+
self.use_low_precision_combine = model_config.use_low_precision_moe_combine and qm.has_nvfp4(
193+
)
195194

196195
ifself.alltoall_method_type==AlltoallMethodType.MNNVL:
197196
MnnvlMemory.initialize()

‎tensorrt_llm/_torch/pyexecutor/config.py‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class PyTorchConfig:
5555
moe_backend:str='CUTLASS'
5656

5757
moe_disable_finalize_fusion:bool=False
58+
use_low_precision_moe_combine:bool=False
5859

5960
enable_mixed_sampler:bool=False
6061
"""

‎tensorrt_llm/_torch/pyexecutor/model_engine.py‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,8 @@ def __init__(
311311
moe_backend=pytorch_backend_config.moe_backend,
312312
moe_disable_finalize_fusion=pytorch_backend_config.
313313
moe_disable_finalize_fusion,
314+
use_low_precision_moe_combine=pytorch_backend_config.
315+
use_low_precision_moe_combine,
314316
load_format=pytorch_backend_config.load_format,
315317
max_num_tokens=max_num_tokens,
316318
moe_max_num_tokens=pytorch_backend_config.moe_max_num_tokens,

‎tensorrt_llm/llmapi/llm_args.py‎

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,12 @@ class MoeConfig(StrictBaseModel):
191191
"Disable FC2+finalize kernel fusion in CUTLASS MoE backend. Setting this to True recovers deterministic numerical behavior with top-k > 2."
192192
)
193193

194+
use_low_precision_moe_combine:bool=Field(
195+
default=False,
196+
description=
197+
"Use low precision combine in MoE operations (only for NVFP4 quantization). When enabled, uses lower precision for combining expert outputs to improve performance."
198+
)
199+
194200
@classmethod
195201
def from_dict(cls, data: dict):
196202
return cls(**data)
@@ -2502,6 +2508,8 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
25022508
moe_load_balancer=self.moe_config.load_balancer,
25032509
attn_backend=self.attn_backend,
25042510
moe_backend=self.moe_config.backend,
2511+
use_low_precision_moe_combine=self.moe_config.
2512+
use_low_precision_moe_combine,
25052513
enable_mixed_sampler=self.enable_mixed_sampler,
25062514
sampler_type=self.sampler_type,
25072515
kv_cache_dtype=self.kv_cache_config.dtype,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp