NVIDIA/TensorRT-LLM (Public)

Notifications: You must be signed in to change notification settings
Fork 1.9k
Star 12.3k

Commit fb6f39f

authored

Merge branch 'main' into docao/support_topk_logprobs_torch_backend

2 parents cac915f + b278d06, commit fb6f39f (Copy full SHA for fb6f39f)

File tree

156 files changed

+4259

-718

lines changed

3rdparty
cpp
- CMakeLists.txt
- cmake/modules
  - cuda_configuration.cmake
- include/tensorrt_llm
  - common
    - cudaUtils.h
  - deep_gemm
    - tma_utils.cuh
- kernels/fmha_v2
  - Makefile
- tensorrt_llm
  - common
    - attentionOp.cpp
  - cutlass_extensions/include/cutlass_extensions
    - gemm_configs.h
  - deep_ep
    - CMakeLists.txt
  - executor
    - tensor.cpp
  - kernels
    - beamSearchKernels.cu
    - beamSearchKernels.h
    - contextFusedMultiHeadAttention
      - fmhaRunner.cpp
    - cutlass_kernels
      - CMakeLists.txt
      - cutlass_heuristic.cpp
      - fp4_gemm
        fp4_gemm_bf16.cu
        fp4_gemm_fp16.cu
        fp4_gemm_fp32.cu
        fp4_gemm_template.h
        nvfp4_nvfp4_gemm_template_sm100.h
      - fp8_blockscale_gemm
        fp8_blockscale_tma_utils.cuh
      - fpA_intB_gemm
        fpA_intB_gemm_template.h
        fpA_intB_gemm_template_sm90.h
        launchers
        fpA_intB_launcher_sm90.h
        fpA_intB_launcher_sm90.inl
      - include
        moe_gemm_kernels.h
      - moe_gemm
        launchers
        fused_moe_gemm_launcher_sm80.h
        fused_moe_gemm_launcher_sm80.inl
        moe_gemm_tma_ws_launcher.h
        moe_gemm_tma_ws_launcher.inl
        moe_gemm_tma_ws_mixed_input_launcher.h
        moe_gemm_tma_ws_mixed_input_launcher.inl
        moe_gemm_template_dispatch.h
        moe_gemm_template_dispatch_tma_ws.h
        moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
      - python
        generate_kernels.py
    - decoderMaskedMultiheadAttention
      - decoderMaskedMultiheadAttentionTemplate.h
    - fmhaDispatcher.cpp
    - internal_cutlass_kernels/include
      - moe_gemm_kernels.h
    - sageAttentionKernels.cu
    - speculativeDecoding
      - eagleDecodingKernels.cu
    - xqaDispatcher.cpp
  - runtime
    - iBuffer.cpp
    - moeLoadBalancer
      - hostAccessibleDeviceAllocator.cpp
    - utils
      - debugUtils.cu
  - thop
- tests/unit_tests/kernels
  - CMakeLists.txt
docker
docs/source
jenkins
requirements.txt
scripts
- build_wheel.py
tensorrt_llm
- _torch
  - auto_deploy
    - custom_ops
      - flashinfer_rope.py
    - models/patches
      - llama4.py
      - mistral3.py
  - custom_ops
    - torch_custom_ops.py
  - cute_dsl_kernels
    - __init__.py
    - blackwell
  - models
    - modeling_deepseekv3.py
  - modules
    - attention.py
    - fused_moe
    - linear.py
  - pyexecutor
    - py_executor_creator.py
- _utils.py
- bench/dataclasses
  - configuration.py
- llmapi
  - trtllm-llmapi-launch
tests
- integration
  - defs
    - accuracy
      - accuracy_core.py
      - test_llm_api_pytorch.py
    - disaggregated/test_configs
      - disagg_config_diff_max_tokens.yaml
    - test_unittests.py
  - test_lists
    - test-db
      - l0_dgx_b300.yml
      - l0_gb300_multi_gpus.yml
    - waives.txt
- unittest
  - _torch
    - auto_deploy/unit/singlegpu
      - test_ad_build_small_single.py
    - misc
      - test_virtual_memory.py
    - modeling
      - test_modeling_mllama.py
    - modules
      - test_fused_moe.py
    - multi_gpu
      - test_linear.py
    - thop/parallel
  - trt/attention
    - test_gpt_attention.py
  - utils
    - util.py
triton_backend/inflight_batcher_llm
- CMakeLists.txt

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

156 files changed

+4259

-718

lines changed

`‎3rdparty/DeepGEMM‎`

Submodule DeepGEMM updated 36 files

`‎3rdparty/cutlass‎`

Submodule cutlass updated 606 files

`‎3rdparty/json‎`

Submodule json updated 856 files

`‎cpp/CMakeLists.txt‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -248,6 +248,7 @@ endif()`
`248`	`248`	`include_directories(`
`249`	`249`	`SYSTEM`
`250`	`250`	`${CUDAToolkit_INCLUDE_DIRS}`
	`251`	`+${CUDAToolkit_INCLUDE_DIRS}/cccl`
`251`	`252`	`${CUDNN_ROOT_DIR}/include`
`252`	`253`	`$<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>`
`253`	`254`	`${3RDPARTY_DIR}/cutlass/include`
`@@ -510,7 +511,6 @@ print(os.path.dirname(torch.__file__),end='');"`
`510`	`511`	`endif()`
`511`	`512`	`endif()`
`512`	`513`	`endif()`
`513`		`-`
`514`	`514`	`else()`
`515`	`515`	`if(NOTWIN32)`
`516`	`516`	`if(NOT USE_CXX11_ABI)`

`‎cpp/cmake/modules/cuda_configuration.cmake‎`

Lines changed: 14 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -138,6 +138,9 @@ function(setup_cuda_architectures)`
`138`	`138`	`message(FATAL_ERROR"Unrecognized CUDA architecture:${CUDA_ARCH}")`
`139`	`139`	`endif()`
`140`	`140`	`endforeach()`
	`141`	`+ if("103"IN_LIST CMAKE_CUDA_ARCHITECTURES_CLEAN)`
	`142`	`+ list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN"100")`
	`143`	`+ endif()`
`141`	`144`	`list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_CLEAN)`
`142`	`145`	`set(CMAKE_CUDA_ARCHITECTURES_RAW${CMAKE_CUDA_ARCHITECTURES_CLEAN})`
`143`	`146`	`endif()`
`@@ -150,6 +153,9 @@ function(setup_cuda_architectures)`
`150`	`153`	`if(CMAKE_CUDA_COMPILER_VERSIONVERSION_GREATER_EQUAL"12.7")`
`151`	`154`	`list(APPEND CMAKE_CUDA_ARCHITECTURES_RAW 100 120)`
`152`	`155`	`endif()`
	`156`	`+ if(CMAKE_CUDA_COMPILER_VERSIONVERSION_GREATER_EQUAL"12.9")`
	`157`	`+ list(APPEND CMAKE_CUDA_ARCHITECTURES_RAW 103)`
	`158`	`+ endif()`
`153`	`159`	`endif()`
`154`	`160`
`155`	`161`	`# CMAKE_CUDA_ARCHITECTURES_ORIG contains all architectures enabled, without`
`@@ -160,7 +166,14 @@ function(setup_cuda_architectures)`
`160`	`166`	`${CMAKE_CUDA_ARCHITECTURES_ORIG}`
`161`	`167`	`PARENT_SCOPE)`
`162`	`168`
`163`		`- set(ARCHITECTURES_WITH_KERNELS 80 86 89 90 100 120)`
	`169`	`+ set(ARCHITECTURES_WITH_KERNELS`
	`170`	`+ 80`
	`171`	`+ 86`
	`172`	`+ 89`
	`173`	`+ 90`
	`174`	`+ 100`
	`175`	`+ 103`
	`176`	`+ 120)`
`164`	`177`	`foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)`
`165`	`178`	`if(NOT${CUDA_ARCH}IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)`
`166`	`179`	`add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")`

`‎cpp/include/tensorrt_llm/common/cudaUtils.h‎`

Lines changed: 6 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -311,6 +311,12 @@ inline int getSMVersion()`
`311`	`311`	`return sm;`
`312`	`312`	`}`
`313`	`313`
	`314`	`+inlineboolisSM100Family()`
	`315`	`+{`
	`316`	`+intconst sm =getSMVersion();`
	`317`	`+return sm ==100 \|\| sm ==103;// To be continued...`
	`318`	`+}`
	`319`	`+`
`314`	`320`	`inlineintgetDevice()`
`315`	`321`	`{`
`316`	`322`	`int deviceID{0};`

`‎cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh‎`

Lines changed: 3 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType()`
`95`	`95`	`}`
`96`	`96`	`}`
`97`	`97`
`98`		`-PFN_cuTensorMapEncodeTiledget_cuTensorMapEncodeTiled()`
	`98`	`+PFN_cuTensorMapEncodeTiled_v12000get_cuTensorMapEncodeTiled()`
`99`	`99`	`{`
`100`	`100`	// Get pointer to `cuTensorMapEncodeTiled`
`101`	`101`	`cudaDriverEntryPointQueryResult driver_status;`
`@@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled()`
`110`	`110`
`111`	`111`	`if (driver_status != cudaDriverEntryPointSuccess)`
`112`	`112`	`throwstd::runtime_error("driver_status != cudaDriverEntryPointSuccess");`
`113`		`-returnreinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr);`
	`113`	`+returnreinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr);`
`114`	`114`	`}`
`115`	`115`
`116`	`116`	`template<typename T>`
`117`	`117`	`CUtensorMapmake_2d_tma_copy_desc(T* global_address,uint64_t gmem_dim[2],uint64_t stride_in_bytes,`
`118`		`-uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type,PFN_cuTensorMapEncodeTiled encode_func =nullptr)`
	`118`	`+uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type,PFN_cuTensorMapEncodeTiled_v12000 encode_func =nullptr)`
`119`	`119`	`{`
`120`	`120`	`CUtensorMap tensor_map{};`
`121`	`121`	`constexpruint32_t rank =2;`

`‎cpp/kernels/fmha_v2/Makefile‎`

Lines changed: 2 additions & 41 deletions

Original file line number	Diff line number	Diff line change
`@@ -90,9 +90,6 @@ NVCC_FLAGS += $(PREPROCESSOR_FLAGS)`
`90`	`90`	`# The include directories.`
`91`	`91`	`INCLUDE_DIRS += -I./src -I./generated -I$(CUDA)/include`
`92`	`92`
`93`		`-GENCODE_SM70 = -gencode=arch=compute_70,code=\"sm_70\"`
`94`		`-GENCODE_SM72 = -gencode=arch=compute_72,code=\"sm_72\"`
`95`		`-GENCODE_SM75 = -gencode=arch=compute_75,code=\"sm_75\"`
`96`	`93`	`GENCODE_SM80 = -gencode=arch=compute_80,code=\"sm_80\"`
`97`	`94`	`GENCODE_SM86 = -gencode=arch=compute_86,code=\"sm_86\"`
`98`	`95`	`GENCODE_SM87 = -gencode=arch=compute_87,code=\"sm_87\"`
`@@ -125,9 +122,8 @@ endif`
`125`	`122`	`CUBIN_CPP =$(patsubst%.cu.cubin,%.cubin.cpp,$(CUBINS))`
`126`	`123`	`CUBIN_OBJ =$(patsubst%.cubin.cpp,%.cubin.o,$(CUBIN_CPP))`
`127`	`124`
`128`		`-GENCODES =$(GENCODE_SM70)`
`129`		`-GENCODES +=$(GENCODE_SM72)`
`130`		`-GENCODES +=$(GENCODE_SM75)`
	`125`	`+GENCODES =`
	`126`	`+`
`131`	`127`	`GENCODES +=$(GENCODE_SM80)`
`132`	`128`	`GENCODES +=$(GENCODE_SM86)`
`133`	`129`	`GENCODES +=$(GENCODE_SM89)`
`@@ -152,20 +148,12 @@ UNIT_TEST_OBJ = $(patsubst %.cu, obj/%.o, $(UNIT_TEST_CPP))`
`152`	`148`	`UNIT_TEST_EXE =$(patsubst%.cu, bin/%.exe,$(UNIT_TEST_CPP))`
`153`	`149`
`154`	`150`	`# arch-dependent boilerplates`
`155`		`-UNIT_TEST_CPP_SM70 =`
`156`		`-ifdefENABLE_SM70`
`157`		`-UNIT_TEST_CPP_SM70 =$(wildcard$(UNIT_TEST_CPP_DIR)/arch/*_sm70.cu)`
`158`		`-UNIT_TEST_OBJ_SM70 =$(patsubst%_sm70.cu, obj/%_sm70.o,$(UNIT_TEST_CPP_SM70))`
`159`		`-UNIT_TEST_EXE_SM70 =$(patsubst%_sm70.cu, bin/%_sm70.exe,$(UNIT_TEST_CPP_SM70))`
`160`		`-endif`
`161`		`-`
`162`	`151`	`UNIT_TEST_CPP_SM80 =$(wildcard$(UNIT_TEST_CPP_DIR)/arch/*_sm80.cu)`
`163`	`152`	`UNIT_TEST_OBJ_SM80 =$(patsubst%_sm80.cu, obj/%_sm80.o,$(UNIT_TEST_CPP_SM80))`
`164`	`153`	`UNIT_TEST_EXE_SM80 =$(patsubst%_sm80.cu, bin/%_sm80.exe,$(UNIT_TEST_CPP_SM80))`
`165`	`154`
`166`	`155`	`# aggregate exes as prerequisite of build target "test"`
`167`	`156`	`UNIT_TEST_EXE_ARCH =`
`168`		`-UNIT_TEST_EXE_ARCH +=$(UNIT_TEST_EXE_SM70)`
`169`	`157`	`UNIT_TEST_EXE_ARCH +=$(UNIT_TEST_EXE_SM80)`
`170`	`158`
`171`	`159`	`# #################################################################################################`
`@@ -248,12 +236,6 @@ bin/libfmha_cubin.a: $(CUBIN_OBJ)`
`248`	`236`
`249`	`237`	`###################################################################################################`
`250`	`238`
`251`		`-obj/%_sm70.cu.o: ./generated/%_sm70.cu ./src/.h ./src/fmha/.h`
`252`		`-$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM70)$(INCLUDE_DIRS) -c -o$@$<`
`253`		`-obj/%_sm72.cu.o: ./generated/%_sm72.cu ./src/.h ./src/fmha/.h`
`254`		`-$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM72)$(INCLUDE_DIRS) -c -o$@$<`
`255`		`-obj/%_sm75.cu.o: ./generated/%_sm75.cu ./src/.h ./src/fmha/.h`
`256`		`-$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM75)$(INCLUDE_DIRS) -c -o$@$<`
`257`	`239`	`obj/%_sm80.cu.o: ./generated/%_sm80.cu ./src/.h ./src/fmha/.h`
`258`	`240`	`$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM80)$(INCLUDE_DIRS) -c -o$@$<`
`259`	`241`	`obj/%_sm86.cu.o: ./generated/%_sm86.cu ./src/.h ./src/fmha/.h`
`@@ -269,12 +251,6 @@ obj/%_sm100.cu.o: ./generated/%_sm100.cu ./src/.h ./src/fmha/.h ./src/fmha/hop`
`269`	`251`	`obj/%_sm120.cu.o: ./generated/%_sm120.cu ./src/.h ./src/fmha/.h`
`270`	`252`	`$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM120)$(INCLUDE_DIRS) -c -o$@$<`
`271`	`253`
`272`		`-obj/%_sm70.no_i2f_f2i.cu.o: ./generated/%_sm70.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`273`		`-$(NVCC)$(NVCC_FLAGS)$(GENCODE_SM70)$(INCLUDE_DIRS) -c -o$@$<`
`274`		`-obj/%_sm72.no_i2f_f2i.cu.o: ./generated/%_sm72.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`275`		`-$(NVCC)$(NVCC_FLAGS)$(GENCODE_SM72)$(INCLUDE_DIRS) -c -o$@$<`
`276`		`-obj/%_sm75.no_i2f_f2i.cu.o: ./generated/%_sm75.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`277`		`-$(NVCC)$(NVCC_FLAGS)$(GENCODE_SM75)$(INCLUDE_DIRS) -c -o$@$<`
`278`	`254`	`obj/%_sm80.no_i2f_f2i.cu.o: ./generated/%_sm80.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`279`	`255`	`$(NVCC)$(NVCC_FLAGS)$(GENCODE_SM80)$(INCLUDE_DIRS) -c -o$@$<`
`280`	`256`	`obj/%_sm86.no_i2f_f2i.cu.o: ./generated/%_sm86.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`@@ -314,20 +290,11 @@ $(UNIT_TEST_OBJ): $(UNIT_TEST_OBJ_DIR)/%.o : ${UNIT_TEST_CPP_DIR}/%.cu ./src/*.h`
`314`	`290`	`$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODES) -c -o$@$< -I./src$(GTEST_INC)`
`315`	`291`
`316`	`292`	`# arch-dependent objs`
`317`		`-$(UNIT_TEST_OBJ_SM70):%.o :$(UNIT_TEST_CPP_SM70) ./src/.h ./src/fmha/.h`
`318`		`-$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM70) -c -o$@$< -I./src$(GTEST_INC)`
`319`		`-`
`320`	`293`	`$(UNIT_TEST_OBJ_SM80):%.o :$(UNIT_TEST_CPP_SM80) ./src/.h ./src/fmha/.h`
`321`	`294`	`$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM80) -c -o$@$< -I./src$(GTEST_INC)`
`322`	`295`
`323`	`296`	`###################################################################################################`
`324`	`297`
`325`		`-cubin/%_sm70.cu.cubin: ./generated/%_sm70.cu ./src/.h ./src/fmha/.h`
`326`		`-$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM70)$(INCLUDE_DIRS) -cubin -o$@$<`
`327`		`-cubin/%_sm72.cu.cubin: ./generated/%_sm72.cu ./src/.h ./src/fmha/.h`
`328`		`-$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM72)$(INCLUDE_DIRS) -cubin -o$@$<`
`329`		`-cubin/%_sm75.cu.cubin: ./generated/%_sm75.cu ./src/.h ./src/fmha/.h`
`330`		`-$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM75)$(INCLUDE_DIRS) -cubin -o$@$<`
`331`	`298`	`cubin/%_sm80.cu.cubin: ./generated/%_sm80.cu ./src/.h ./src/fmha/.h`
`332`	`299`	`$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM80)$(INCLUDE_DIRS) -cubin -o$@$<`
`333`	`300`	`cubin/%_sm86.cu.cubin: ./generated/%_sm86.cu ./src/.h ./src/fmha/.h`
`@@ -343,12 +310,6 @@ cubin/%_sm100.cu.cubin: ./generated/%_sm100.cu ./src/.h ./src/fmha/.h`
`343`	`310`	`cubin/%_sm120.cu.cubin: ./generated/%_sm120.cu ./src/.h ./src/fmha/.h`
`344`	`311`	`$(NVCC)$(NVCC_FLAGS)$(I2F_F2I_FLAGS)$(GENCODE_SM120)$(INCLUDE_DIRS) -cubin -o$@$<`
`345`	`312`
`346`		`-cubin/%_sm70.no_i2f_f2i.cu.cubin: ./generated/%_sm70.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`347`		`-$(NVCC)$(NVCC_FLAGS)$(GENCODE_SM70)$(INCLUDE_DIRS) -cubin -o$@$<`
`348`		`-cubin/%_sm72.no_i2f_f2i.cu.cubin: ./generated/%_sm72.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`349`		`-$(NVCC)$(NVCC_FLAGS)$(GENCODE_SM72)$(INCLUDE_DIRS) -cubin -o$@$<`
`350`		`-cubin/%_sm75.no_i2f_f2i.cu.cubin: ./generated/%_sm75.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`351`		`-$(NVCC)$(NVCC_FLAGS)$(GENCODE_SM75)$(INCLUDE_DIRS) -cubin -o$@$<`
`352`	`313`	`cubin/%_sm80.no_i2f_f2i.cu.cubin: ./generated/%_sm80.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`
`353`	`314`	`$(NVCC)$(NVCC_FLAGS)$(GENCODE_SM80)$(INCLUDE_DIRS) -cubin -o$@$<`
`354`	`315`	`cubin/%_sm86.no_i2f_f2i.cu.cubin: ./generated/%_sm86.no_i2f_f2i.cu ./src/.h ./src/fmha/.h`

`‎cpp/tensorrt_llm/common/attentionOp.cpp‎`

Lines changed: 5 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -2530,22 +2530,22 @@ int AttentionOp::initialize() noexcept`
`2530`	`2530`	`if (mFP8ContextFMHA)`
`2531`	`2531`	`{`
`2532`	`2532`	`TLLM_CHECK_WITH_INFO(mEnableContextFMHA,"FP8 FMHA cannot be enabled because Context FMHA is not supported.");`
`2533`		`-TLLM_CHECK_WITH_INFO(mSM ==89 \|\|mSM ==90 \|\|mSM ==100 \|\|mSM ==120 \|\|mSM ==121,`
`2534`		`-"FP8 FMHA can only be enabled on sm_89, sm_90,sm_100, sm_120 or sm_121.");`
	`2533`	`+TLLM_CHECK_WITH_INFO(mSM ==89 \|\|mSM ==90 \|\|mSM ==100 \|\|mSM ==103 \|\|mSM ==120 \|\|mSM ==121,`
	`2534`	`+"FP8 FMHA can only be enabled on sm_89, sm_90,sm_100f, sm_120 or sm_121.");`
`2535`	`2535`	`}`
`2536`	`2536`
`2537`	`2537`	`// Pre-Check of FP8 Generation MLA.`
`2538`	`2538`	`if (mFP8GenerationMLA)`
`2539`	`2539`	`{`
`2540`	`2540`	`TLLM_CHECK_WITH_INFO(mIsMLAEnabled,"FP8 Generation MLA cannot be enabled because MLA is not supported.");`
`2541`		`-TLLM_CHECK_WITH_INFO(mSM ==89 \|\|mSM ==90 \|\|mSM ==100 \|\|mSM ==120 \|\|mSM ==121,`
	`2541`	`+TLLM_CHECK_WITH_INFO(mSM ==89 \|\|mSM ==90 \|\|mSM ==100 \|\|mSM ==103 \|\|mSM ==120 \|\|mSM ==121,`
`2542`	`2542`	`"FP8 Generation MLA is supported on Ada, Hopper or Blackwell architecture.");`
`2543`	`2543`	`}`
`2544`	`2544`
`2545`	`2545`	`// Check requirements for FP4 output.`
`2546`	`2546`	`TLLM_CHECK_WITH_INFO(!mFuseFp4Quant \|\|mEnableContextFMHA,"Context FMHA must enable if fuse_fp4_quant is enabled");`
`2547`		`-TLLM_CHECK_WITH_INFO(!mFuseFp4Quant \|\|mSM ==100 \|\|mSM ==120 \|\|mSM ==121,`
`2548`		`-"fuse_fp4_quant only supportsSM100 or SM120 or SM121 devices.");`
	`2547`	`+TLLM_CHECK_WITH_INFO(!mFuseFp4Quant \|\|(mSM ==100 \|\|mSM ==103) \|\|mSM ==120 \|\|mSM ==121,`
	`2548`	`+"fuse_fp4_quant only supportsSM100f or SM120 or SM121 devices.");`
`2549`	`2549`
`2550`	`2550`	`// Check requirements for FP4 KV cache.`
`2551`	`2551`	`TLLM_CHECK_WITH_INFO(!mKVCacheQuantMode.hasFp4KvCache() \|\|mFP8ContextFMHA,`

`‎cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h‎`

Lines changed: 8 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,8 @@`
`23`	`23`	`#include<type_traits>`
`24`	`24`
`25`	`25`	`#include"cute/tensor.hpp"`
	`26`	`+#include"tensorrt_llm/common/assert.h"`
	`27`	`+#include"tensorrt_llm/common/tllmException.h"`
`26`	`28`
`27`	`29`	`namespacetensorrt_llm`
`28`	`30`	`{`
`@@ -155,6 +157,9 @@ enum class CutlassTileConfigSM100 : int`
`155`	`157`	`CtaShape128x256x256B = shape_tuple_to_enum(128,256,256),`
`156`	`158`	`};`
`157`	`159`
	`160`	`+// An alias to make the SHAPE_CASE macro work`
	`161`	`+using CutlassTileConfigSM103 = CutlassTileConfigSM100;`
	`162`	`+`
`158`	`163`	`enumclassCutlassTileConfigSM120 :int`
`159`	`164`	`{`
`160`	`165`	`// Signals that we should run heuristics do choose a config`
`@@ -411,16 +416,17 @@ struct CutlassGemmConfig`
`411`	`416`	`CutlassGemmConfig(CutlassTileConfigSM100 tile_config_sm100, MainloopScheduleType mainloop_schedule,`
`412`	`417`	`EpilogueScheduleType epilogue_schedule, ClusterShape cluster_shape,`
`413`	`418`	`ClusterShape dynamic_cluster_shape = ClusterShape::Undefined,`
`414`		`- ClusterShape fallback_cluster_shape = ClusterShape::Undefined)`
	`419`	`+ ClusterShape fallback_cluster_shape = ClusterShape::Undefined,int sm_version =100)`
`415`	`420`	`: tile_config_sm100(tile_config_sm100)`
`416`	`421`	`, mainloop_schedule(mainloop_schedule)`
`417`	`422`	`, epilogue_schedule(epilogue_schedule)`
`418`	`423`	`, cluster_shape(cluster_shape)`
`419`	`424`	`, dynamic_cluster_shape(dynamic_cluster_shape)`
`420`	`425`	`, fallback_cluster_shape(fallback_cluster_shape)`
`421`		`- , sm_version(100)`
	`426`	`+ , sm_version(sm_version)`
`422`	`427`	`, is_tma_warp_specialized(true)`
`423`	`428`	`{`
	`429`	`+TLLM_CHECK_WITH_INFO(sm_version >=100 && sm_version <120,"Expected SM 10x version");`
`424`	`430`	`}`
`425`	`431`
`426`	`432`	`CutlassGemmConfig(CutlassTileConfigSM120 tile_config_sm120, MainloopScheduleType mainloop_schedule,`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit fb6f39f

File tree

156 files changed

Some content is hidden

156 files changed

`‎3rdparty/DeepGEMM‎`

`‎3rdparty/cutlass‎`

`‎3rdparty/json‎`

`‎cpp/CMakeLists.txt‎`

`‎cpp/cmake/modules/cuda_configuration.cmake‎`

`‎cpp/include/tensorrt_llm/common/cudaUtils.h‎`

`‎cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh‎`

`‎cpp/kernels/fmha_v2/Makefile‎`

`‎cpp/tensorrt_llm/common/attentionOp.cpp‎`

`‎cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h‎`

0 commit comments