Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 514f60a

Browse files
committed
[None][chore] Make low_precision_combine as a llm arg (NVIDIA#7598)
Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>
1 parent 8484aa9 commit 514f60a

File tree

6 files changed

+21
-4
lines changed

6 files changed

+21
-4
lines changed

‎examples/llm-api/quickstart_advanced.py‎

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ def add_llm_args(parser):
7373
parser.add_argument('--moe_ep_size',type=int,default=-1)
7474
parser.add_argument('--moe_tp_size',type=int,default=-1)
7575
parser.add_argument('--moe_cluster_size',type=int,default=-1)
76+
parser.add_argument(
77+
'--use_low_precision_moe_combine',
78+
default=False,
79+
action='store_true',
80+
help='Use low precision combine in MoE (only for NVFP4 quantization)')
7681

7782
# KV cache
7883
parser.add_argument('--kv_cache_dtype',type=str,default='auto')
@@ -228,7 +233,7 @@ def setup_llm(args, **kwargs):
228233
enable_piecewise_cuda_graph= \
229234
args.use_piecewise_cuda_graph)
230235
if args.use_torch_compile else None,
231-
moe_config=MoeConfig(backend=args.moe_backend),
236+
moe_config=MoeConfig(backend=args.moe_backend,use_low_precision_moe_combine=args.use_low_precision_moe_combine),
232237
sampler_type=args.sampler_type,
233238
max_seq_len=args.max_seq_len,
234239
max_batch_size=args.max_batch_size,

‎tensorrt_llm/_torch/model_config.py‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ class ModelConfig(Generic[TConfig]):
133133
moe_backend: str = 'CUTLASS'  # options can be CUTLASS, TRTLLM
134134
# If true, disables FC2+finalize fusion in CUTLASS MoE backend
135135
moe_disable_finalize_fusion:bool=False
136+
# If true, use low precision combine in MoE operations (only for NVFP4 quantization)
137+
use_low_precision_moe_combine:bool=False
136138

137139
allreduce_strategy:AllReduceStrategy=AllReduceStrategy.AUTO
138140

‎tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py‎

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,9 +189,8 @@ def __init__(
189189
self.use_postquant_alltoall= (os.environ.get(
190190
"TRTLLM_MOE_POST_QUANT_ALLTOALLV","1")
191191
== "1") and qm.has_nvfp4()
192-
self.use_low_precision_combine= (os.environ.get(
193-
"TRTLLM_MOE_USE_LOW_PRECISION_COMBINE","0")
194-
== "1") and qm.has_nvfp4()
192+
self.use_low_precision_combine = model_config.use_low_precision_moe_combine and qm.has_nvfp4(
193+
)
195194

196195
ifself.alltoall_method_type==AlltoallMethodType.MNNVL:
197196
MnnvlMemory.initialize()

‎tensorrt_llm/_torch/pyexecutor/config.py‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class PyTorchConfig:
5555
moe_backend:str='CUTLASS'
5656

5757
moe_disable_finalize_fusion:bool=False
58+
use_low_precision_moe_combine:bool=False
5859

5960
enable_mixed_sampler:bool=False
6061
"""

‎tensorrt_llm/_torch/pyexecutor/model_engine.py‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,8 @@ def __init__(
311311
moe_backend=pytorch_backend_config.moe_backend,
312312
moe_disable_finalize_fusion=pytorch_backend_config.
313313
moe_disable_finalize_fusion,
314+
use_low_precision_moe_combine=pytorch_backend_config.
315+
use_low_precision_moe_combine,
314316
load_format=pytorch_backend_config.load_format,
315317
max_num_tokens=max_num_tokens,
316318
moe_max_num_tokens=pytorch_backend_config.moe_max_num_tokens,

‎tensorrt_llm/llmapi/llm_args.py‎

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,12 @@ class MoeConfig(StrictBaseModel):
191191
"Disable FC2+finalize kernel fusion in CUTLASS MoE backend. Setting this to True recovers deterministic numerical behavior with top-k > 2."
192192
)
193193

194+
use_low_precision_moe_combine:bool=Field(
195+
default=False,
196+
description=
197+
"Use low precision combine in MoE operations (only for NVFP4 quantization). When enabled, uses lower precision for combining expert outputs to improve performance."
198+
)
199+
194200
@classmethod
195201
def from_dict(cls, data: dict):
196202
return cls(**data)
@@ -2502,6 +2508,8 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
25022508
moe_load_balancer=self.moe_config.load_balancer,
25032509
attn_backend=self.attn_backend,
25042510
moe_backend=self.moe_config.backend,
2511+
use_low_precision_moe_combine=self.moe_config.
2512+
use_low_precision_moe_combine,
25052513
enable_mixed_sampler=self.enable_mixed_sampler,
25062514
sampler_type=self.sampler_type,
25072515
kv_cache_dtype=self.kv_cache_config.dtype,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp