Models#

classtensorrt_llm.models.BertModel(*args,**kwargs)[source]#

Bases:BertBase

forward(
input_ids=None,
input_lengths=None,
position_ids=None,
token_type_ids=None,
hidden_states=None,
max_input_length=None,
)[source]#
classtensorrt_llm.models.BertForQuestionAnswering(*args,**kwargs)[source]#

Bases:BertBase

forward(
input_ids=None,
input_lengths=None,
token_type_ids=None,
position_ids=None,
hidden_states=None,
max_input_length=None,
)[source]#
classtensorrt_llm.models.BertForSequenceClassification(*args,**kwargs)[source]#

Bases:BertBase

forward(
input_ids,
input_lengths,
token_type_ids=None,
position_ids=None,
hidden_states=None,
max_input_length=None,
)[source]#
tensorrt_llm.models.RobertaModel#

alias ofBertModel

tensorrt_llm.models.RobertaForQuestionAnswering#

alias ofBertForQuestionAnswering

tensorrt_llm.models.RobertaForSequenceClassification#

alias ofBertForSequenceClassification

classtensorrt_llm.models.BloomModel(
config:PretrainedConfig,
)[source]#

Bases:Module

forward(
input_ids:Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_params=None,
)[source]#
classtensorrt_llm.models.BloomForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

classtensorrt_llm.models.CLIPVisionTransformer(
image_size,
num_channels,
patch_size,
hidden_size,
num_attention_heads,
max_position_embeddings,
norm_epsilon,
intermediate_size,
hidden_act,
num_hidden_layers,
require_ln_f,
mapping:Mapping,
dtype,
)[source]#

Bases:Module

forward(pixel_values)[source]#
classtensorrt_llm.models.DiT(*args,**kwargs)[source]#

Bases:PretrainedModel

check_config(
config:PretrainedConfig,
)[source]#
unpatchify(x:Tensor)[source]#
forward(latent,timestep,label)[source]#

Forward pass of DiT. Shapes: latent: (N, C, H, W); timestep: (N,); label: (N,)

forward_without_cfg(x,t,y)[source]#

Forward pass without classifier-free guidance.

forward_with_cfg(x,t,y)[source]#

Forward pass with classifier-free guidance.

prepare_inputs(max_batch_size,**kwargs)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

classtensorrt_llm.models.SD3Transformer2DModel(*args,**kwargs)[source]#

Bases:PretrainedModel

config_class#

alias ofSD3Transformer2DModelConfig

forward(
hidden_states:Tensor,
encoder_hidden_states:Tensor|None=None,
pooled_projections:Tensor|None=None,
timestep:Tensor|None=None,
block_controlnet_hidden_states:List[Tensor]=None,
joint_attention_kwargs:Dict[str,Any]|None=None,
)[source]#
prepare_inputs(max_batch_size,**kwargs)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

classmethodfrom_pretrained(
pretrained_model_name_or_path:str,
dtype='float16',
mapping=<tensorrt_llm.mapping.Mappingobject>,
**kwargs,
)[source]#
load(weights,from_pruned=False)[source]#
enable_forward_chunking(
chunk_size:int|None=None,
dim:int=0,
)[source]#
disable_forward_chunking()[source]#
propertyattn_processors#
set_attn_processor(processor)[source]#
fuse_qkv_projections()[source]#
unfuse_qkv_projections()[source]#
classtensorrt_llm.models.DeepseekForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofDeepSeekV1Config

classmethodfrom_hugging_face(
model_dir,
dtype:str='auto',
mapping:Mapping|None=None,
override_fields={},
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

classtensorrt_llm.models.FalconConfig(
*,
bias:bool=False,
parallel_attention:bool=False,
num_ln_in_parallel_attn:int|None=None,
new_decoder_architecture:bool=False,
rotary_base:float=10000.0,
**kwargs,
)[source]#

Bases:PretrainedConfig

to_dict()[source]#
classmethodfrom_hugging_face(
hf_config_or_dir:str|transformers.PretrainedConfig,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classtensorrt_llm.models.DeepseekV2ForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofDeepSeekV2Config

classmethodfrom_hugging_face(
model_dir,
dtype:str='auto',
hf_model:PreTrainedModel|None=None,
use_preloading:bool=False,
use_safetensors_loading:bool=False,
mapping:Mapping|None=None,
override_fields={},
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

classtensorrt_llm.models.FalconForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofFalconConfig

check_config(config)[source]#
classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a FalconForCausalLM object from the given parameters.

classtensorrt_llm.models.FalconModel(config:FalconConfig)[source]#

Bases:Module

forward(
input_ids:Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
)[source]#
classtensorrt_llm.models.GPTConfig(
*,
gpt_variant:str='gpt2',
bias:bool=True,
q_scaling:float=1.0,
embedding_scale:float|None=None,
apply_query_key_layer_scaling:bool=False,
rotary_pct:float=1.0,
rotary_base:float=10000.0,
rotary_scaling:dict|None=None,
inner_layernorm:bool=False,
norm_before_bmm1:bool=False,
moe:MoeConfig|dict|None=None,
**kwargs,
)[source]#

Bases:PretrainedConfig

to_dict()[source]#
classmethodfrom_hugging_face(
hf_config_or_dir:str|transformers.PretrainedConfig,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classmethodfrom_nemo(
nemo_ckpt_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classtensorrt_llm.models.GPTModel(config:GPTConfig)[source]#

Bases:Module

forward(
input_ids,
position_ids,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
lora_params=None,
spec_decoding_params=None,
)[source]#
classtensorrt_llm.models.GPTForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofGPTConfig

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a GPTForCausalLM object from the given parameters.

classmethodquantize(
hf_model_dir:str,
output_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
*,
device:str='cuda',
calib_dataset:str='cnn_dailymail',
calib_batches:int=512,
calib_batch_size:int=1,
calib_max_seq_length:int=512,
random_seed:int=1234,
tokenizer_max_seq_length:int=2048,
**kwargs,
)[source]#
classmethodfrom_nemo(
nemo_ckpt_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
use_lora(
lora_config:LoraConfig,
)[source]#

Load LoRA weights from the given config into the module.

Parameters:

lora_config – the lora config

classtensorrt_llm.models.OPTForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

check_config(config)[source]#
classtensorrt_llm.models.OPTModel(config:PretrainedConfig)[source]#

Bases:Module

forward(
input_ids:Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
**kwargs,
)[source]#
classtensorrt_llm.models.LLaMAConfig(
*,
mlp_bias:bool=False,
attn_bias:bool=False,
rotary_base:float=10000.0,
rotary_scaling:dict|None=None,
residual_mlp:bool=False,
disable_weight_only_quant_plugin:bool=False,
moe:MoeConfig|dict|None=None,
remove_duplicated_kv_heads:bool=False,
embedding_multiplier:float=1.0,
attention_multiplier:float=1.0,
residual_multiplier:float=1.0,
output_multiplier_scale:float=1.0,
**kwargs,
)[source]#

Bases:PretrainedConfig

to_dict()[source]#
classmethodfrom_hugging_face(
hf_config_or_dir:str|transformers.PretrainedConfig,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classmethodfrom_meta_ckpt(
meta_ckpt_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classtensorrt_llm.models.LLaMAForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofLLaMAConfig

classmethodfrom_hugging_face(
hf_model_or_dir:str|PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a LLaMAForCausalLM object from the given parameters.

default_plugin_config(**kwargs)[source]#

Return the default plugin config for this model.

This is used when the plugin_config value is not given in the to_trt() call. If users need to set different plugin configs, they can start from the returned object and change it.

classmethodfrom_meta_ckpt(
meta_ckpt_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classmethodquantize(
hf_model_dir:str,
output_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
*,
device:str='cuda',
calib_dataset:str='cnn_dailymail',
calib_batches:int=512,
calib_batch_size:int=1,
calib_max_seq_length:int=512,
random_seed:int=1234,
tokenizer_max_seq_length:int=2048,
**kwargs,
)[source]#
use_lora(
lora_config:LoraConfig,
)[source]#

Load LoRA weights from the given config into the module.

Parameters:

lora_config – the lora config

classtensorrt_llm.models.LLaMAModel(config:LLaMAConfig)[source]#

Bases:Module

forward(
input_ids,
position_ids=None,
use_cache=False,
attention_mask=None,
spec_decoding_params=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
hidden_states_for_embed=None,
prompt_embedding_table:Tensor|None=None,
prompt_tasks:Tensor|None=None,
prompt_vocab_size:Tensor|None=None,
lora_params=None,
)[source]#
classtensorrt_llm.models.LlavaNextVisionWrapper(*args,**kwargs)[source]#

Bases:PretrainedModel

forward(pixel_values,position_ids=None)[source]#
classmethodfrom_hugging_face(
hf_model_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a LlavaNextVisionWrapper object from the given parameters.

save_checkpoint(output_dir,save_config=True)[source]#
prepare_inputs(max_batch_size,**kwargs)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

classtensorrt_llm.models.LlavaNextVisionConfig(
*,
image_size:int,
patch_size:int,
text_hidden_size:int,
projector_hidden_act:str='gelu',
num_channels:int=3,
vision_model_type:str='clip_vision_model',
**kwargs,
)[source]#

Bases:PretrainedConfig

classmethodfrom_hugging_face(
hf_config_or_dir:str|transformers.PretrainedConfig,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classtensorrt_llm.models.MedusaConfig(
*,
num_medusa_heads:int=4,
num_medusa_layers:int=1,
max_draft_len:int=63,
**kwargs,
)[source]#

Bases:PretrainedConfig

to_dict()[source]#
classmethodfrom_hugging_face(
hf_config_or_dir:str|transformers.PretrainedConfig,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classtensorrt_llm.models.MedusaForCausalLm(*args,**kwargs)[source]#

Bases:PretrainedModel

config_class#

alias ofMedusaConfig

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

classtensorrt_llm.models.ReDrafterForLLaMALM(*args,**kwargs)[source]#

Bases:ReDrafterMixin,LLaMAForCausalLM

ReDrafter implementation for LLaMA models.

Combines:
- Base LLaMA model functionality from LLaMAForCausalLM
- Drafting/speculative decoding logic from ReDrafterMixin

classtensorrt_llm.models.ReDrafterForQWenLM(*args,**kwargs)[source]#

Bases:ReDrafterMixin,QWenForCausalLM

ReDrafter implementation for QWen models.

Combines:
- Base QWen model functionality from QWenForCausalLM
- Drafting/speculative decoding logic from ReDrafterMixin

classtensorrt_llm.models.GPTJConfig(*,rotary_dim:int=64,**kwargs)[source]#

Bases:PretrainedConfig

This is the configuration class to store the configuration of GPTJ model.

to_dict()[source]#
classmethodfrom_hugging_face(
hf_config_or_dir:str|transformers.PretrainedConfig,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classtensorrt_llm.models.GPTJModel(config:GPTJConfig)[source]#

Bases:Module

forward(
input_ids:Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
)[source]#
classtensorrt_llm.models.GPTJForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofGPTJConfig

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config=None,
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

classtensorrt_llm.models.GPTNeoXModel(
config:PretrainedConfig,
)[source]#

Bases:Module

forward(
input_ids:Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
)[source]#
classtensorrt_llm.models.GPTNeoXForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

classtensorrt_llm.models.PhiModel(config:PretrainedConfig)[source]#

Bases:Module

forward(
input_ids:Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
lora_params=None,
)[source]#
classtensorrt_llm.models.Phi3Model(
config:PretrainedConfig,
)[source]#

Bases:Module

forward(
input_ids:Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
lora_params=None,
)[source]#
classtensorrt_llm.models.PhiForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofPhiConfig

check_config(config)[source]#
classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

use_lora(
lora_config:LoraConfig,
)[source]#

Load LoRA weights from the given config into the module.

Parameters:

lora_config – the lora config

classtensorrt_llm.models.Phi3ForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofPhi3Config

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

use_lora(
lora_config:LoraConfig,
)[source]#

Load LoRA weights from the given config into the module.

Parameters:

lora_config – the lora config

classtensorrt_llm.models.ChatGLMConfig(
*,
chatglm_version:str='chatglm3',
add_bias_linear:bool=False,
add_qkv_bias:bool=True,
apply_query_key_layer_scaling:bool=False,
apply_residual_connection_post_layernorm:bool=False,
rmsnorm:bool=True,
rotary_pct:float=0.5,
rotary_base:float=10000.0,
rotary_scaling:dict|None=None,
**kwargs,
)[source]#

Bases:PretrainedConfig

to_dict()[source]#
classmethodfrom_hugging_face(
hf_config_or_dir:str|transformers.PretrainedConfig,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#
classtensorrt_llm.models.ChatGLMForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofChatGLMConfig

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a ChatGLMForCausalLM object from the given parameters.

classmethodquantize(
hf_model_dir:str,
output_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
*,
device:str='cuda',
calib_dataset:str='cnn_dailymail',
calib_batches:int=512,
calib_batch_size:int=1,
calib_max_seq_length:int=512,
random_seed:int=1234,
tokenizer_max_seq_length:int=2048,
**kwargs,
)[source]#
prepare_inputs(*args,**kwargs)[source]#

See PretrainedModel.prepare_inputs for the detailed parameter list.

classtensorrt_llm.models.ChatGLMModel(
config:ChatGLMConfig,
)[source]#

Bases:Module

forward(
input_ids:Tensor=None,
position_ids:Tensor=None,
use_cache:bool=False,
attention_mask:Tensor=None,
kv_cache_params:KeyValueCacheParams=None,
attention_params:AttentionParams=None,
)[source]#
classtensorrt_llm.models.BaichuanForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofBaichuanConfig

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a BaichuanForCausalLM object from the given parameters.

classmethodquantize(
hf_model_dir:str,
output_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
*,
device:str='cuda',
calib_dataset:str='cnn_dailymail',
calib_batches:int=512,
calib_batch_size:int=1,
calib_max_seq_length:int=512,
random_seed:int=1234,
tokenizer_max_seq_length:int=2048,
**kwargs,
)[source]#
classtensorrt_llm.models.EncoderModel(*args,**kwargs)[source]#

Bases:PretrainedModel

check_config(
config:PretrainedConfig,
)[source]#
forward(
input_ids:Tensor,
input_lengths=None,
position_ids=None,
token_type_ids=None,
hidden_states=None,
max_input_length=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_mask=None,
lora_params:LoraParams=None,
language_adapter_routings:Tensor|None=None,
)[source]#
prepare_inputs(
max_batch_size,
max_input_len,
prompt_embedding_table_size:int=0,
lora_target_modules:List[str]=None,
*args,
**kwargs,
)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

use_lora(
lora_config:LoraConfig,
)[source]#

Load LoRA weights from the given config into the module.

Parameters:

lora_config – the lora config

use_prompt_tuning()[source]#

Enable prompt tuning when building the TRT engine; call this before to_trt.

precompute_relative_attention_bias(build_config)[source]#
classtensorrt_llm.models.DecoderModel(*args,**kwargs)[source]#

Bases:PretrainedModel

check_config(
config:PretrainedConfig,
)[source]#
forward(
decoder_input_ids:Tensor,
encoder_output:Tensor,
position_ids=None,
token_type_ids=None,
use_cache=False,
attention_mask_params=None,
last_token_ids=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
lora_params:LoraParams=None,
cross_kv_cache_gen:Tensor|None=None,
cross_kv_reuse:Tensor|None=None,
language_adapter_routings:Tensor|None=None,
)[source]#
prepare_inputs(
max_batch_size,
max_beam_width,
max_decoder_input_len,
max_seq_len,
max_encoder_input_len,
gather_context_logits:bool=False,
lora_target_modules:List[str]=None,
use_cache=True,
*args,
**kwargs,
)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

use_lora(
lora_config:LoraConfig,
)[source]#

Load LoRA weights from the given config into the module.

Parameters:

lora_config – the lora config

precompute_relative_attention_bias(build_config)[source]#
classtensorrt_llm.models.PretrainedConfig(
*,
architecture:str,
dtype:str,
hidden_size:int,
num_hidden_layers:int,
num_attention_heads:int,
vocab_size:int|None=None,
hidden_act:str='gelu',
logits_dtype:str='float32',
norm_epsilon:float=1e-05,
position_embedding_type:PositionEmbeddingType|str=PositionEmbeddingType.learned_absolute,
max_position_embeddings:int|None=None,
rotary_embedding_dim:int|None=None,
num_key_value_heads:int|None=None,
intermediate_size:int|None=None,
mapping:Mapping|dict|None=None,
quantization:QuantConfig|dict|None=None,
use_parallel_embedding:bool=False,
embedding_sharding_dim:int=0,
head_size:int|None=None,
qk_layernorm:bool=False,
runtime_defaults:RuntimeDefaultsIn=None,
**kwargs,
)[source]#

Bases:object

staticcreate_runtime_defaults(
defaults:RuntimeDefaultsIn=None,
)RuntimeDefaults|None[source]#
propertykv_dtype#
set_if_not_exist(key,value)[source]#
classmethodfrom_dict(config:dict)[source]#
to_dict()[source]#
classmethodfrom_json_file(config_file:str)[source]#
classmethodfrom_checkpoint(ckpt_dir:str)[source]#
to_json_file(config_file:str)[source]#
to_layer_quant_config(config_file:str)[source]#
propertyquant_mode#
propertyquant_algo#
set_rank(rank:int)[source]#
get_config_group(group_cls:Type[CG])CG[source]#
has_config_group(group_cls:Type[CG])bool[source]#
for_each_rank()Generator[Self,None,None][source]#
classtensorrt_llm.models.PretrainedModel(*args,**kwargs)[source]#

Bases:Module,GenerationMixin,TopModelMixin

release()[source]#
check_config(config)[source]#
classmethodfrom_config(
config:PretrainedConfig,
)[source]#
classmethodfrom_checkpoint(
ckpt_dir:str,
rank:int|None=None,
config:PretrainedConfig|None=None,
*,
preprocess_weights_hook:Callable[[Dict[str,Tensor]],Dict[str,Tensor]]|None=None,
)[source]#
load(weights,from_pruned=False)[source]#
save_checkpoint(output_dir,save_config=True)[source]#
prepare_inputs(
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width:int=1,
opt_num_tokens:int=None,
prompt_embedding_table_size:int=0,
position_encoding_2d:bool=False,
max_draft_len:int=0,
speculative_decoding_draft_tokens_external:bool=False,
spec_decoding_is_generation_length_variable:bool=False,
gather_context_logits:bool=False,
lora_target_modules:List[str]=None,
opt_batch_size:int=0,
num_hidden_layers:int=None,
mrope_rotary_cos_sin_size:int=None,
)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

classmethodquantize(
hf_model_dir:str,
output_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
*,
device:str='cuda',
calib_dataset:str='cnn_dailymail',
calib_batches:int=512,
calib_batch_size:int=1,
calib_max_seq_length:int=512,
random_seed:int=1234,
tokenizer_max_seq_length:int=2048,
**kwargs,
)[source]#
classtensorrt_llm.models.WhisperEncoder(*args,**kwargs)[source]#

Bases:PretrainedModel

forward(
input_features:Tensor,
input_lengths=None,
position_ids=None,
)[source]#
prepare_inputs(max_batch_size=16)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

precompute_relative_attention_bias(build_config)[source]#
classtensorrt_llm.models.MambaForCausalLM(*args,**kwargs)[source]#

Bases:PretrainedModel

config_class#

alias ofMambaConfig

forward(
input_ids,
conv_states,
ssm_states,
host_request_types,
last_token_ids,
last_token_ids_for_logits,
host_context_lengths,
slot_mapping:Tensor|None=None,
)[source]#
prepare_inputs(
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width:int=1,
opt_num_tokens:int=None,
opt_batch_size:int=0,
prompt_embedding_table_size:int=0,
max_draft_len:int=0,
gather_context_logits:bool=False,
lora_target_modules:List[str]=None,
speculative_decoding_draft_tokens_external:bool=False,
)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

classtensorrt_llm.models.MPTForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

check_config(config)[source]#
classtensorrt_llm.models.MPTModel(config:PretrainedConfig)[source]#

Bases:Module

forward(
input_ids,
position_ids,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
)[source]#
classtensorrt_llm.models.GemmaConfig(
*,
architecture:str,
rotary_base:float=10000.0,
rotary_scaling:dict|None=None,
attn_bias:bool=False,
mlp_bias:bool=False,
position_embedding_type:PositionEmbeddingType=PositionEmbeddingType.rope_gpt_neox,
query_pre_attn_scalar:int|None=None,
final_logit_softcapping:float|None=None,
attn_logit_softcapping:float|None=None,
mapping:Mapping|dict|None=None,
_sliding_window_pattern:int=None,
rope_local_base_freq:int=None,
sliding_window:int=None,
**kwargs,
)[source]#

Bases:PretrainedConfig

GEMMA_ADDED_FIELDS={'attn_bias','inter_layernorms','mlp_bias','rotary_base','rotary_scaling'}#
GEMMA2_ADDED_FIELDS={'attn_logit_softcapping','final_logit_softcapping','query_pre_attn_scalar'}#
GEMMA3_ADDED_FIELDS={'_sliding_window_pattern','final_logit_softcapping','query_pre_attn_scalar','rope_local_base_freq','sliding_window'}#
VERBATIM={'_sliding_window_pattern','attn_logit_softcapping','final_logit_softcapping','hidden_act','hidden_size','intermediate_size','max_position_embeddings','num_attention_heads','num_hidden_layers','query_pre_attn_scalar','rope_local_base_freq','sliding_window','use_parallel_embedding','vocab_size'}#
propertyis_gemma_2:bool#
gemma2_config()[source]#
propertyis_gemma_3:bool#
gemma3_config()[source]#
to_dict()[source]#

Serialize the fields added in GemmaConfig

staticget_hf_config(config_dir:str|PathLike)[source]#
classmethodfrom_hugging_face(
hf_config_or_dir:HfConfigOrDir,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)GemmaConfig[source]#
classtensorrt_llm.models.GemmaForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofGemmaConfig

classmethodfrom_hugging_face(
hf_model_dir:HfConfigOrDir,
dtype='float16',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
load_model_on_cpu:bool=True,
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

NATIVE_QUANT_FLOW={QuantAlgo.W4A16,QuantAlgo.W8A16,QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN,QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN,QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN}#
classmethodassert_valid_quant_algo(
quant_algo:QuantAlgo|None,
)[source]#
classmethodquantize(
hf_model_dir:str,
output_dir:str,
dtype:str='float16',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
*,
gemma_config_kwargs:Dict[str,Any]=None,
**quantize_kwargs:Dict[str,Any],
)[source]#
use_lora(
lora_config:LoraConfig,
)None[source]#

Load LoRA weights from the given config into the module.

Parameters:

lora_config – the lora config

classtensorrt_llm.models.DbrxConfig(
*,
bias:bool=False,
clip_qkv:float|None=None,
rotary_base:float=500000.0,
rotary_scaling:dict|None=None,
moe:MoeConfig|dict|None=None,
**kwargs,
)[source]#

Bases:PretrainedConfig

to_dict()[source]#
classtensorrt_llm.models.DbrxForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofDbrxConfig

classtensorrt_llm.models.RecurrentGemmaForCausalLM(*args,**kwargs)[source]#

Bases:PretrainedModel

forward(
input_ids,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
conv_states=None,
rnn_states=None,
host_request_types=None,
last_token_ids=None,
last_token_ids_for_logits=None,
host_context_lengths=None,
slot_mapping=None,
)[source]#
prepare_recurrent_inputs(
max_batch_size,
num_profiles,
mapping,
)[source]#
prepare_inputs(
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width:int=1,
opt_num_tokens:int=None,
opt_batch_size:int=0,
prompt_embedding_table_size:int=0,
max_draft_len:int=0,
gather_context_logits:bool=False,
lora_target_modules:List[str]=None,
speculative_decoding_draft_tokens_external:bool=False,
)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

classtensorrt_llm.models.CogVLMConfig(
*,
mlp_bias:bool=False,
attn_bias:bool=False,
rotary_base:float=10000.0,
rotary_scaling:dict|None=None,
**kwargs,
)[source]#

Bases:PretrainedConfig

to_dict()[source]#
classtensorrt_llm.models.CogVLMForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM,TopModelMixin

config_class#

alias ofCogVLMConfig

classmethodfrom_hugging_face(
hf_model_dir,
dtype='float16',
mapping:Mapping|None=None,
quant_mode:QuantMode|None=None,
**kwargs,
)[source]#

Create LLM object and load weights from hugging face.

Parameters:
  • hf_model_dir – the hugging face model directory

  • dtype – str, the default weights data type when loading from the hugging face model

  • mapping – Mapping, specify the multi-gpu parallel strategy, when it’s None, single GPU is used

default_plugin_config(**kwargs)[source]#

Return the default plugin config for this model.

This is used when the plugin_config value is not given in the to_trt() call. If users need to set different plugin configs, they can start from the returned object and change it.

classmethodquantize(
hf_model_dir,
output_dir,
quant_config:QuantConfig,
*,
dtype='float16',
mapping:Mapping|None=None,
calib_batches=512,
calib_batch_size=1,
random_seed=1234,
tokenizer_max_seq_length=2048,
**kwargs,
)[source]#
classtensorrt_llm.models.EagleForCausalLM(*args,**kwargs)[source]#

Bases:LLaMAForCausalLM

config_class#

alias ofEagleConfig

forward(*args,**kwargs)[source]#
prepare_inputs(*args,**kwargs)[source]#
Inputs needed:

device_request_types: [bs]; draft_tokens: [bs, max_draft_len]; draft_lens: [bs]; spec_decoding_generation_lengths: [bs]; spec_decoding_position_offsets: [bs, max_gen_tokens]; spec_decoding_packed_mask: [bs, max_draft_len, packed_length] **; eagle_temperature: [bs]; rand_data_validation: [bs, max_draft_len]

** The mask is tricky since the boolean mask will need to be
packed in runtime. So, the last dim will be:

packed_length = ceil((max_draft_len+1)/32)

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a LLaMAForCausalLM object from the given parameters.

classtensorrt_llm.models.SpeculativeDecodingMode(
value,
names=<notgiven>,
*values,
module=None,
qualname=None,
type=None,
start=1,
boundary=None,
)[source]#

Bases:IntFlag

NONE=1#
DRAFT_TOKENS_EXTERNAL=2#
MEDUSA=4#
LOOKAHEAD_DECODING=8#
EXPLICIT_DRAFT_TOKENS=16#
EAGLE=32#
NGRAM=64#
USER_PROVIDED=128#
AUTO=256#
staticfrom_arguments(args:Namespace)[source]#
classtensorrt_llm.models.CohereForCausalLM(*args,**kwargs)[source]#

Bases:DecoderModelForCausalLM

config_class#

alias ofCohereConfig

classmethodfrom_hugging_face(
hf_model_or_dir:str,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a CohereForCausalLM object from the given parameters.

classtensorrt_llm.models.MLLaMAForCausalLM(*args,**kwargs)[source]#

Bases:PretrainedModel

config_class#

alias ofMLLaMAConfig

forward(
decoder_input_ids:Tensor,
encoder_output:Tensor,
use_cache=False,
attention_mask_params=None,
last_token_ids=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
lora_params:LoraParams=None,
cross_kv_cache_gen:Tensor|None=None,
cross_kv_reuse:Tensor|None=None,
prompt_embedding_table:Tensor|None=None,
prompt_tasks:Tensor|None=None,
prompt_vocab_size:Tensor|None=None,
skip_cross_attn_blocks:Tensor|None=None,
)[source]#
prepare_inputs(
max_batch_size,
max_beam_width,
max_decoder_input_len,
max_seq_len,
max_encoder_input_len,
gather_context_logits:bool=False,
gather_generation_logits:bool=False,
lora_target_modules:List[str]=None,
prompt_embedding_table_size:int=0,
use_cache=True,
*args,
**kwargs,
)[source]#

@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes.

@return: a list containing values which can be fed into self.forward()

use_lora(
lora_config:LoraConfig,
)[source]#

Load LoRA weights from the given config into the module.

Parameters:

lora_config – the lora config

classmethodfrom_hugging_face(
hf_model_or_dir:str|transformers.PreTrainedModel,
dtype:str='auto',
mapping:Mapping|None=None,
quant_config:QuantConfig|None=None,
**kwargs,
)[source]#

Create a MLLaMAForCausalLM object from the given parameters.

On this page