@@ -32,7 +32,7 @@ class AlltoallMethodType(IntEnum):
3232NotEnabled = 0
3333# MNNVL
3434MNNVL = 1
35- # DeepEP intranode or internode: no CUDA Graphs support, IBGDA is required by internode
35+ # DeepEP intranode or internode: CUDA Graphs are supported, IBGDA is required by internode
3636DeepEP = 2
3737# DeepEP low latency: CUDA Graphs are supported, IBGDA is required
3838DeepEPLowLatency = 3
@@ -101,6 +101,8 @@ def __init__(
101101self .repeat_idx = 0
102102self .repeat_count = 1
103103
104+ self .use_cuda_graph = model_config .use_cuda_graph
105+
104106moe_load_balancer_config = model_config .moe_load_balancer
105107init_expert_size_per_partition = moe_load_balancer_config .num_local_slots if moe_load_balancer_config else self .num_experts // self .ep_size
106108self .initial_global_assignments = [
@@ -212,6 +214,9 @@ def __init__(
212214str (
213215min (model_config .max_num_tokens ,
214216self .moe_max_num_tokens ))))
217+ # Set nvshmem queue pair depth larger than the number of in-flight WRs (ref: https://github.com/deepseek-ai/DeepEP/issues/427)
218+ os .environ ['NVSHMEM_QP_DEPTH' ]= str (
219+ 2 * (self .deep_ep_max_num_tokens + 1 ))
215220self .deep_ep_buffer = buffer_pool .get_low_latency_buffer (
216221model_config .mapping )
217222self .deep_ep_buffer .reserve (self .deep_ep_max_num_tokens ,
@@ -255,6 +260,25 @@ def _check_configs(self):
255260def select_alltoall_method_type (mapping :Mapping ,top_k :int ,
256261dtype :torch .dtype ,
257262use_cuda_graph :bool )-> AlltoallMethodType :
263+
264+ # Check if DeepEP is feasible for the given number of ranks
265+ # DeepEP supports two modes:
266+ # 1. Intranode: Single node with 2, 4, or 8 ranks
267+ # 2. Internode: 2, 4, 8, or 16 nodes with 8 ranks per node
268+ def is_deepep_feasible (num_ranks :int )-> bool :
269+ NUM_INTRANODE_SUPPORTED_RANKS = {2 ,4 ,8 }
270+ REQUIRED_LOCAL_MPI_SIZE = 8
271+ NUM_INTERNODE_SUPPORTED_RDMA_RANKS = {2 ,4 ,8 ,16 }
272+ mpi_size = local_mpi_size ()
273+ # Intranode cases
274+ if num_ranks == mpi_size and num_ranks in NUM_INTRANODE_SUPPORTED_RANKS :
275+ return True
276+ # Internode cases
277+ if mpi_size != REQUIRED_LOCAL_MPI_SIZE :
278+ return False
279+ num_rdma_nodes = num_ranks // mpi_size
280+ return num_rdma_nodes in NUM_INTERNODE_SUPPORTED_RDMA_RANKS
281+
258282all2all_method_type = os .environ .get ("TRTLLM_FORCE_ALLTOALL_METHOD" )
259283if all2all_method_type is not None :
260284return AlltoallMethodType [all2all_method_type ]
@@ -276,12 +300,10 @@ def select_alltoall_method_type(mapping: Mapping, top_k: int,
276300
277301if os .environ .get ("TRTLLM_CAN_USE_DEEP_EP" ,"0" )== "1" :
278302if deep_ep_installed and dtype == torch .bfloat16 :
279- if use_cuda_graph :
280- # Here we can only choose DeepEPLowLatency since only this method supports CUDA Graphs.
281- return AlltoallMethodType .DeepEPLowLatency
282- else :
283- # Here we can choose DeepEP or DeepEPLowLatency if both are available. Now DeepEP is faster.
303+ # Choose DeepEP if feasible
304+ if is_deepep_feasible (mapping .moe_ep_size ):
284305return AlltoallMethodType .DeepEP
306+ return AlltoallMethodType .DeepEPLowLatency
285307
286308return AlltoallMethodType .NotEnabled
287309
@@ -548,7 +570,7 @@ def forward_chunk(
548570if not use_postquant_alltoall :
549571x ,recv_topk_idx ,token_final_scales ,num_recv_tokens_per_expert_list ,deep_ep_handle = \
550572self .deep_ep_buffer .dispatch (x ,token_selected_slots ,token_final_scales ,self .num_slots ,
551- self .expert_size_per_partition * self .mapping .moe_ep_rank )
573+ self .expert_size_per_partition * self .mapping .moe_ep_rank , all_rank_max_num_tokens , self . ep_size , self . use_cuda_graph )
552574padded ,x ,_ ,token_selected_slots ,token_final_scales = self .pad_empty_recv_tensors (
553575x ,None ,recv_topk_idx ,token_final_scales )
554576elif self .alltoall_method_type == AlltoallMethodType .DeepEPLowLatency :
@@ -636,7 +658,7 @@ def forward_chunk(
636658x_sf = x_sf .view (torch .float32 )
637659 (x ,x_sf ),recv_topk_idx ,token_final_scales ,num_recv_tokens_per_expert_list ,deep_ep_handle = \
638660self .deep_ep_buffer .dispatch ((x ,x_sf ),token_selected_slots ,token_final_scales ,self .num_slots ,
639- self .expert_size_per_partition * self .mapping .moe_ep_rank )
661+ self .expert_size_per_partition * self .mapping .moe_ep_rank , all_rank_max_num_tokens , self . ep_size , self . use_cuda_graph )
640662padded ,x ,x_sf ,token_selected_slots ,token_final_scales = self .pad_empty_recv_tensors (
641663x ,x_sf ,recv_topk_idx ,token_final_scales )
642664if x_sf is not None :