@@ -32,7 +32,7 @@ class AlltoallMethodType(IntEnum):
3232NotEnabled = 0
3333# MNNVL
3434MNNVL = 1
35- # DeepEP intranode or internode: no CUDA Graphs support, IBGDA is required by internode
35+ # DeepEP intranode or internode: CUDA Graphs are supported, IBGDA is required by internode
3636DeepEP = 2
3737# DeepEP low latency: CUDA Graphs are supported, IBGDA is required
3838DeepEPLowLatency = 3
@@ -101,6 +101,8 @@ def __init__(
101101self .repeat_idx = 0
102102self .repeat_count = 1
103103
104+ self .use_cuda_graph = model_config .use_cuda_graph
105+
104106moe_load_balancer_config = model_config .moe_load_balancer
105107init_expert_size_per_partition = moe_load_balancer_config .num_local_slots if moe_load_balancer_config else self .num_experts // self .ep_size
106108self .initial_global_assignments = [
@@ -212,6 +214,9 @@ def __init__(
212214str (
213215min (model_config .max_num_tokens ,
214216self .moe_max_num_tokens ))))
217+ # Set nvshmem queue pair depth larger than the number of in-flight WRs (ref: https://github.com/deepseek-ai/DeepEP/issues/427)
218+ os .environ ['NVSHMEM_QP_DEPTH' ]= str (
219+ 2 * (self .deep_ep_max_num_tokens + 1 ))
215220self .deep_ep_buffer = buffer_pool .get_low_latency_buffer (
216221model_config .mapping )
217222self .deep_ep_buffer .reserve (self .deep_ep_max_num_tokens ,
@@ -255,6 +260,25 @@ def _check_configs(self):
255260def select_alltoall_method_type (mapping :Mapping ,top_k :int ,
256261dtype :torch .dtype ,
257262use_cuda_graph :bool )-> AlltoallMethodType :
263+
264+ # Check if DeepEP is feasible for the given number of ranks
265+ # DeepEP supports two modes:
266+ # 1. Intranode: Single node with 2, 4, or 8 ranks
267+ # 2. Internode: 2, 4, 8, or 16 nodes with 8 ranks per node
268+ def is_deepep_feasible (num_ranks :int )-> bool :
269+ NUM_INTRANODE_SUPPORTED_RANKS = {2 ,4 ,8 }
270+ REQUIRED_LOCAL_MPI_SIZE = 8
271+ NUM_INTERNODE_SUPPORTED_RDMA_RANKS = {2 ,4 ,8 ,16 }
272+ mpi_size = local_mpi_size ()
273+ # Intranode cases
274+ if num_ranks == mpi_size and num_ranks in NUM_INTRANODE_SUPPORTED_RANKS :
275+ return True
276+ # Internode cases
277+ if mpi_size != REQUIRED_LOCAL_MPI_SIZE :
278+ return False
279+ num_rdma_nodes = num_ranks // mpi_size
280+ return num_rdma_nodes in NUM_INTERNODE_SUPPORTED_RDMA_RANKS
281+
258282all2all_method_type = os .environ .get ("TRTLLM_FORCE_ALLTOALL_METHOD" )
259283if all2all_method_type is not None :
260284return AlltoallMethodType [all2all_method_type ]
@@ -276,12 +300,10 @@ def select_alltoall_method_type(mapping: Mapping, top_k: int,
276300
277301if os .environ .get ("TRTLLM_CAN_USE_DEEP_EP" ,"0" )== "1" :
278302if deep_ep_installed and dtype == torch .bfloat16 :
279- if use_cuda_graph :
280- # Here we can only choose DeepEPLowLatency since only this method supports CUDA Graphs.
281- return AlltoallMethodType .DeepEPLowLatency
282- else :
283- # Here we can choose DeepEP or DeepEPLowLatency if both are available. Now DeepEP is faster.
303+ # Choose DeepEP if feasible
304+ if is_deepep_feasible (mapping .moe_ep_size ):
284305return AlltoallMethodType .DeepEP
306+ return AlltoallMethodType .DeepEPLowLatency
285307
286308return AlltoallMethodType .NotEnabled
287309
@@ -548,7 +570,7 @@ def forward_chunk(
548570if not use_postquant_alltoall :
549571x ,recv_topk_idx ,token_final_scales ,num_recv_tokens_per_expert_list ,deep_ep_handle = \
550572self .deep_ep_buffer .dispatch (x ,token_selected_slots ,token_final_scales ,self .num_slots ,
551- self .expert_size_per_partition * self .mapping .moe_ep_rank )
573+ self .expert_size_per_partition * self .mapping .moe_ep_rank , all_rank_max_num_tokens , self . ep_size , self . use_cuda_graph )
552574padded ,x ,_ ,token_selected_slots ,token_final_scales = self .pad_empty_recv_tensors (
553575x ,None ,recv_topk_idx ,token_final_scales )
554576elif self .alltoall_method_type == AlltoallMethodType .DeepEPLowLatency :
@@ -636,7 +658,7 @@ def forward_chunk(
636658x_sf = x_sf .view (torch .float32 )
637659 (x ,x_sf ),recv_topk_idx ,token_final_scales ,num_recv_tokens_per_expert_list ,deep_ep_handle = \
638660self .deep_ep_buffer .dispatch ((x ,x_sf ),token_selected_slots ,token_final_scales ,self .num_slots ,
639- self .expert_size_per_partition * self .mapping .moe_ep_rank )
661+ self .expert_size_per_partition * self .mapping .moe_ep_rank , all_rank_max_num_tokens , self . ep_size , self . use_cuda_graph )
640662padded ,x ,x_sf ,token_selected_slots ,token_final_scales = self .pad_empty_recv_tensors (
641663x ,x_sf ,recv_topk_idx ,token_final_scales )
642664if x_sf is not None :