Runtime#

classtensorrt_llm.runtime.ModelConfig(
max_batch_size:int,
max_beam_width:int,
vocab_size:int,
num_layers:int,
num_heads:int,
num_kv_heads:int,
hidden_size:int,
gpt_attention_plugin:bool,
gemm_allreduce_plugin:str=None,
remove_input_padding:bool=False,
model_name:str='',
kv_cache_type:tensorrt_llm.bindings.KVCacheType=KVCacheType.CONTINUOUS,
cross_attention:bool=False,
head_size:int=None,
has_position_embedding:bool=True,
has_token_type_embedding:bool=False,
tokens_per_block:int=32,
max_prompt_embedding_table_size:int=0,
quant_mode:tensorrt_llm.quantization.mode.QuantMode=<QuantMode:0>,
gather_context_logits:bool=False,
gather_generation_logits:bool=False,
dtype:str='',
lora_plugin:bool=False,
lora_target_modules:List[str]=<factory>,
trtllm_modules_to_hf_modules:dict=None,
skip_cross_kv:bool=False,
num_medusa_heads:int=0,
max_medusa_tokens:int=0,
paged_state:bool=True,
mamba_conv1d_plugin:bool=True,
conv_kernel:int=0,
layer_types:List[str]=<factory>,
rnn_hidden_size:int=0,
rnn_head_size:int=0,
rnn_conv_dim_size:int=0,
state_size:int=0,
state_dtype:str='',
gpu_weights_percent:float=1.0,
redrafter_num_beams:int=0,
redrafter_draft_len_per_beam:int=0,
num_kv_heads_per_layer:Optional[List[int]]=None,
num_kv_heads_per_cross_attn_layer:Optional[List[int]]=None,
skip_cross_attn_blocks:bool=False,
language_adapter_config:Optional[tensorrt_llm.layers.language_adapter.LanguageAdapterConfig]=None,
)[source]#

Bases:object

max_batch_size:int#
max_beam_width:int#
vocab_size:int#
num_layers:int#
num_heads:int#
num_kv_heads:int#
hidden_size:int#
gpt_attention_plugin:bool#
gemm_allreduce_plugin:str=None#
remove_input_padding:bool=False#
model_name:str=''#
kv_cache_type:KVCacheType=0#
cross_attention:bool=False#
head_size:int=None#
has_position_embedding:bool=True#
has_token_type_embedding:bool=False#
tokens_per_block:int=32#
max_prompt_embedding_table_size:int=0#
quant_mode:QuantMode=0#
gather_context_logits:bool=False#
gather_generation_logits:bool=False#
dtype:str=''#
lora_plugin:bool=False#
lora_target_modules:List[str]#
trtllm_modules_to_hf_modules:dict=None#
skip_cross_kv:bool=False#
num_medusa_heads:int=0#
max_medusa_tokens:int=0#
paged_state:bool=True#
mamba_conv1d_plugin:bool=True#
conv_kernel:int=0#
layer_types:List[str]#
rnn_hidden_size:int=0#
rnn_head_size:int=0#
rnn_conv_dim_size:int=0#
state_size:int=0#
state_dtype:str=''#
gpu_weights_percent:float=1.0#
redrafter_num_beams:int=0#
redrafter_draft_len_per_beam:int=0#
num_kv_heads_per_layer:List[int]|None=None#
num_kv_heads_per_cross_attn_layer:List[int]|None=None#
skip_cross_attn_blocks:bool=False#
language_adapter_config:LanguageAdapterConfig|None=None#
classtensorrt_llm.runtime.GenerationSession(
model_config:ModelConfig,
engine_buffer,
mapping:Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream:Stream=None,
)[source]#

Bases:object

batch_size:int#
num_draft_tokens:int=0#
medusa_topks:List[int]=None#
medusa_paths:List[List[int]]=None#
medusa_tree_ids:List[int]=None#
medusa_position_offsets:List[int]=None#
medusa_temperature:float=0.0#
mapping:Mapping#
runtime:_Runtime#
device:device#
debug_mode:bool#
cuda_graph_mode:bool#
buffer_allocated:bool#
debug_tensors_to_save:None#
propertycontext_mem_size:int#
propertyvocab_size#
propertynum_layers#
propertyfirst_layer#
propertylast_layer#
propertynum_heads#
propertyhidden_size#
propertyuse_gpt_attention_plugin#
propertyuse_mamba_conv1d_plugin#
propertypaged_kv_cache#
propertykv_cache_type#
propertyuse_kv_cache#
propertytokens_per_block#
propertyremove_input_padding#
get_num_heads_kv(
layer_idx:int|None=None,
)int[source]#
propertyhead_size#
propertymax_prompt_embedding_table_size#
propertyquant_mode#
propertygather_context_logits#
propertygather_generation_logits#
propertydtype#
propertyprofiler#
propertyengine_inspector#
cuda_stream_guard()[source]#

Sync external stream and set current stream to the one bound to the session. Reset on exit.

propertycross_attention#
propertyhas_position_embedding#
propertyhas_token_type_embedding#
propertyuse_lora_plugin#
propertyuse_gemm_allreduce_plugin#
propertygemm_allreduce_plugin#
propertyis_medusa_mode#
propertyis_redrafter_mode#
propertymax_draft_tokens#
propertynum_medusa_heads#
propertypaged_state#
propertyconv_kernel#
propertyrnn_hidden_size#
propertyrnn_head_size#
propertyrnn_conv_dim_size#
propertystate_size#
propertystate_dtype#
setup(
batch_size:int,
max_context_length:int,
max_new_tokens:int,
beam_width:int=1,
max_attention_window_size:int|None=None,
sink_token_length:int|None=None,
encoder_max_input_length:int|None=None,
lora_manager:LoraManager=None,
lora_uids:List[str]=None,
medusa_choices:List[List[int]]=None,
multi_block_mode:bool=True,
enable_context_fmha_fp32_acc:bool=None,
)[source]#
pp_communicate_new_tokens(
should_stop,
cache_indir,
sequence_length,
)[source]#
pp_communicate_final_output_ids(
final_output_ids,
batch_size,
beam_width,
)[source]#
finalize_decoder(
context_lengths,
batch_size,
beam_width,
scfg,
in_progress=False,
)[source]#
find_best_medusa_path(
batch_size,
input_ids:Tensor,
next_logits,
temp=0,
)[source]#
filter_medusa_logits(
batch_size,
best_path,
best_path_lengths,
medusa_logits,
)[source]#

medusa_logits is of shape [nMH, bs, nMT+1, vocab]

Returns [nMH, bs, vocab]

get_next_medusa_tokens(
batch_size,
next_medusa_logits,
)[source]#
locate_accepted_draft_tokens(
batch_size,
best_path,
best_path_len,
draft_paths,
)[source]#
update_output_ids_by_offset(
new_generated_ids,
offsets,
)[source]#
next_medusa_input_ids()[source]#
reorder_kv_cache_for_beam_search(
batch_size:int,
beam_width:int,
max_context_length:int,
step:int,
)[source]#
early_stop_criteria(batch_size,step,should_stop)[source]#
medusa_decode_and_verify(step,batch_size,logits)[source]#
process_logits_including_draft(
step,
batch_size,
logits,
next_step_buffer,
)[source]#
  1. Process logits to tokens and validate (Medusa) or process outputs (ReDrafter)

  2. Extract early stop criteria here : self.accept_length

  3. Update output ids : needs self.new_tokens and past_sequence_length

  4. Get next input_ids : self.[new_tokens, accept_lengths, medusa_output_tokens]

  5. Update KV cache : self.[sequence_length, num_draft_tokens]

  6. Update sequence_length_buffer and past_kv_length

handle_per_step(
*,
cache_indirections:list,
step:int,
batch_size:int,
max_context_length:int,
beam_width:int,
input_ids:Tensor,
hidden_states:Tensor,
scfg:SamplingConfig,
kv_cache_block_offsets:Tensor,
host_kv_cache_block_offsets:Tensor,
cross_kv_cache_block_offsets:Tensor,
host_cross_kv_cache_block_offsets:Tensor,
prompt_embedding_table:Tensor,
tasks:Tensor,
context_lengths:Tensor,
host_context_lengths,
attention_mask:Tensor,
cross_attention_mask_for_context:Tensor,
cross_attention_mask_for_gen:Tensor,
prompt_vocab_size:Tensor,
ite:int,
sequence_limit_lengths:Tensor,
sequence_lengths:Tensor,
next_step_tensors:Dict[str,RuntimeTensor],
stop_words_data,
bad_words_data,
encoder_output:Tensor,
encoder_input_lengths:Tensor,
stopping_criteria:StoppingCriteria,
logits_processor:LogitsProcessor,
output_generation_logits:bool,
**kwargs,
)[source]#
dump_debug_buffers(step:int)None[source]#
decode_regular(
*,
batch_size:int,
scfg:SamplingConfig,
sequence_lengths:Tensor,
context_lengths:Tensor,
host_context_lengths,
max_context_length:int,
beam_width:int,
cache_indirections:list,
input_ids:Tensor,
hidden_states:Tensor,
prompt_embedding_table:Tensor,
tasks:Tensor,
prompt_vocab_size:Tensor,
ite:int,
sequence_limit_lengths:Tensor,
stop_words_data,
bad_words_data,
output_sequence_lengths:bool=False,
output_generation_logits:bool=False,
return_dict:bool=False,
encoder_output:Tensor=None,
encoder_input_lengths:Tensor=None,
stopping_criteria:StoppingCriteria=None,
logits_processor:LogitsProcessor=None,
cross_attention_mask:List[Tensor]=None,
**kwargs,
)[source]#
decode_stream(
*,
batch_size:int,
scfg:SamplingConfig,
sequence_lengths:Tensor,
context_lengths:Tensor,
host_context_lengths,
max_context_length:int,
beam_width:int,
cache_indirections:list,
input_ids:Tensor,
hidden_states:Tensor,
prompt_embedding_table:Tensor,
tasks:Tensor,
prompt_vocab_size:Tensor,
ite:int,
sequence_limit_lengths:Tensor,
stop_words_data,
bad_words_data,
output_sequence_lengths:bool=False,
output_generation_logits:bool=False,
return_dict:bool=False,
encoder_output:Tensor=None,
encoder_input_lengths:Tensor=None,
stopping_criteria:StoppingCriteria=None,
logits_processor:LogitsProcessor=None,
cross_attention_mask:List[Tensor]=None,
**kwargs,
)[source]#
decode_batch(
input_ids:Sequence[Tensor],
sampling_config:SamplingConfig,
streaming:bool=False,
**kwargs,
)[source]#
decode(
input_ids:Tensor,
context_lengths:Tensor,
sampling_config:SamplingConfig,
prompt_embedding_table:Tensor=None,
tasks:Tensor=None,
prompt_vocab_size:Tensor=None,
stop_words_list=None,
bad_words_list=None,
streaming:bool=False,
output_sequence_lengths:bool=False,
output_generation_logits:bool=False,
return_dict:bool=False,
encoder_output:Tensor=None,
encoder_input_lengths:Tensor=None,
stopping_criteria:StoppingCriteria=None,
logits_processor:LogitsProcessor=None,
cross_attention_mask:List[Tensor]=None,
**kwargs,
)[source]#
classtensorrt_llm.runtime.GenerationSequence(seq_idx,batch_idx)[source]#

Bases:object

get_batch_idx()int[source]#

Returns idx of sequence in batch

get_seq_idx()int[source]#

Returns sequence idx

classtensorrt_llm.runtime.KVCacheManager(
*,
num_layers:int,
num_blocks:int,
block_size:int,
tokens_per_block:int,
max_blocks_per_seq:int,
max_attention_window_size:int,
sink_token_len:int,
beam_width:int=1,
use_one_more_block:bool=False,
)[source]#

Bases:object

step(finished:List[bool])[source]#

Iterate to the next generation step. Add new blocks where needed and clear finished sequences.

add_sequence(
sequence:GenerationSequence,
context_len:int,
always_share_across_beam:bool=False,
)[source]#

Add sequence to the manager and allocate minimum amount of blocks for context

get_block_offsets(beam_width:int)Tensor[source]#

Returns array of offsets into memory pools

classtensorrt_llm.runtime.SamplingConfig(
end_id:int,
pad_id:int,
max_new_tokens:int=20,
num_beams:int=1,
num_return_sequences:int|None=None,
max_attention_window_size:int|None=None,
sink_token_length:int|None=None,
output_sequence_lengths:bool=False,
return_dict:bool=False,
stop_words_list:list|numpy.ndarray|torch.Tensor|NoneType=None,
bad_words_list:list|numpy.ndarray|torch.Tensor|NoneType=None,
temperature:float|torch.Tensor=1.0,
top_k:int|torch.Tensor=1,
top_p:float|torch.Tensor=0.0,
top_p_decay:torch.Tensor|None=None,
top_p_min:torch.Tensor|None=None,
top_p_reset_ids:torch.Tensor|None=None,
random_seed:int|torch.Tensor=None,
length_penalty:float|torch.Tensor=1.0,
early_stopping:int|torch.Tensor=1,
repetition_penalty:float|torch.Tensor=1.0,
min_length:int|torch.Tensor=1,
presence_penalty:float|torch.Tensor=0.0,
frequency_penalty:float|torch.Tensor=0.0,
use_beam_hyps:bool=True,
min_p:float|torch.Tensor=0.0,
)[source]#

Bases:object

end_id:int#
pad_id:int#
max_new_tokens:int=20#
num_beams:int=1#
num_return_sequences:int|None=None#
max_attention_window_size:int|None=None#
sink_token_length:int|None=None#
output_sequence_lengths:bool=False#
return_dict:bool=False#
stop_words_list:list|ndarray|Tensor|None=None#
bad_words_list:list|ndarray|Tensor|None=None#
temperature:float|Tensor=1.0#
top_k:int|Tensor=1#
top_p:float|Tensor=0.0#
top_p_decay:Tensor|None=None#
top_p_min:Tensor|None=None#
top_p_reset_ids:Tensor|None=None#
random_seed:int|Tensor=None#
length_penalty:float|Tensor=1.0#
early_stopping:int|Tensor=1#
repetition_penalty:float|Tensor=1.0#
min_length:int|Tensor=1#
presence_penalty:float|Tensor=0.0#
frequency_penalty:float|Tensor=0.0#
use_beam_hyps:bool=True#
beam_search_diversity_rate:float|Tensor=0.0#
output_cum_log_probs:bool=False#
output_log_probs:bool=False#
no_repeat_ngram_size:int|Tensor=None#
min_p:float|Tensor=0.0#
update(**kwargs)[source]#
classtensorrt_llm.runtime.Session(**kwargs)[source]#

Bases:object

Session is a managed TensorRT runtime.

staticfrom_serialized_engine(
engine,
)Session[source]#

@brief: Create a session from a serialized engine. @param engine: a serialized engine. @return: a Session object.

staticfrom_engine(engine)Session[source]#

@brief: Create a session from an existing ICudaEngine engine. @param engine: an ICudaEngine. @return: a Session object.

propertyruntime:Runtime#
propertyengine:ICudaEngine#
propertycontext:IExecutionContext#
@brief: Get the default TensorRT execution context; use self.engine.create_execution_context() to create a new context if needed.

@return: one TensorRT execution context object

propertycontext_mem_size:int#
set_shapes(
tensor_dict:Dict[str,Tensor],
context:IExecutionContext|None=None,
)[source]#
infer_shapes(
inputs:List[TensorInfo],
context:IExecutionContext|None=None,
)List[TensorInfo][source]#
@brief: Set input shapes to given context, and infer the output shapes from the given input shapes.

This function should be called every time the input shapes are changed, before calling run(). Alternatively, call context.set_input_shape on all dynamic-shaped input tensors manually.

@param inputs: list of TensorInfo objects, each item represents an input tensor. @param context: TensorRT execution context; if None, use the default context. @return: list of TensorInfo objects, each item represents an output tensor; returns None if failed.

run(
inputs:Dict[str,Any],
outputs:Dict[str,Any],
stream,
context=None,
)bool[source]#

@brief: Run the TensorRT engine with the given inputs and outputs. @param inputs: dict of input tensors, key is tensor name, value is tensor pointer or torch tensor. @param outputs: dict of output tensors, key is tensor name, value is tensor pointer or torch tensor. @param stream: cuda stream to enqueue the TensorRT engine on. @param context: TensorRT execution context; if None, use the default context. @return: True if enqueue succeeded. Note that the enqueue is an async call, so returning True does not mean the execution is finished.

classtensorrt_llm.runtime.TensorInfo(name:'str',dtype:'trt.DataType',shape:'tuple')[source]#

Bases:object

name:str#
dtype:DataType#
shape:tuple#
numel()[source]#
view(*shape)[source]#
squeeze(dim=0)[source]#
classtensorrt_llm.runtime.ChatGLMGenerationSession(
model_config:ModelConfig,
engine_buffer,
mapping:Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream:Stream=None,
)[source]#

Bases:GenerationSession

classtensorrt_llm.runtime.QWenForCausalLMGenerationSession(
model_config:ModelConfig,
engine_buffer,
mapping:Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream:Stream=None,
global_max_input_length:int=2048,
global_max_output_length:int=4096,
)[source]#

Bases:GenerationSession

generate(
input_ids:Tensor,
input_lengths:Tensor,
sampling_config:SamplingConfig,
max_new_tokens:int,
runtime_rank:int=0,
)[source]#
tensorrt_llm.runtime.decode_words_list(
word_dict:List[List[str]],
tokenizer=None,
add_special_tokens=False,
)[source]#
format of word_dict

len(word_dict) should be the same as batch_size.
word_dict[i] means the words for batch i.
len(word_dict[i]) >= 1, which means it must contain at least 1 string.
For example, word_dict[2] = [" I am happy", " I am sad"].

classtensorrt_llm.runtime.LogitsProcessorList(iterable=(),/)[source]#

Bases:list,LogitsProcessor

classtensorrt_llm.runtime.LogitsProcessor[source]#

Bases:object

Base class for all logit processors that can be applied during generation.

classtensorrt_llm.runtime.StoppingCriteriaList(iterable=(),/)[source]#

Bases:list,StoppingCriteria

classtensorrt_llm.runtime.StoppingCriteria[source]#

Bases:object

Base class for all stopping criteria that can be applied during generation.

classtensorrt_llm.runtime.ModelRunner(
session:GenerationSession,
max_batch_size:int,
max_input_len:int,
max_seq_len:int,
max_beam_width:int,
kv_cache_type:KVCacheType,
lora_manager:LoraManager|None=None,
)[source]#

Bases:ModelRunnerMixin

An interface class that wraps GenerationSession and provides generation methods.

classmethodfrom_engine(
engine:Engine,
*,
max_output_len:int|None,
lora_dir:List[str]|None,
rank:int,
debug_mode:bool,
lora_ckpt_source:str,
medusa_choices:List[List[int]],
stream:Stream,
gpu_weights_percent:float,
enable_context_fmha_fp32_acc:bool|None,
multi_block_mode:bool|None,
)ModelRunner[source]#
classmethodfrom_dir(
engine_dir:str,
*,
max_output_len:int|None=None,
lora_dir:List[str]|None=None,
rank:int=0,
debug_mode:bool=False,
lora_ckpt_source:str='hf',
medusa_choices:List[List[int]]=None,
stream:Stream=None,
gpu_weights_percent:float=1,
enable_context_fmha_fp32_acc:bool|None=None,
multi_block_mode:bool|None=None,
fail_fast_on_attention_window_too_large:bool=False,
)ModelRunner[source]#

Create a ModelRunner instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • max_output_len (Optional[int]) – The maximum output length. This argument might only be available at loading time; generate will still check it when disable_kv_cache is enabled.

  • lora_dir (Optional[List[str]]) – The directories that contain LoRA weights.

  • rank (int) – The runtime rank id.

  • debug_mode (bool) – Whether or not to turn on the debug mode.

  • medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding

  • stream (torch.cuda.Stream) – Stream to use.

  • multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.

  • fail_fast_on_attention_window_too_large (bool) – Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache. Note: This parameter is only applicable to C++ runtime (ModelRunnerCpp).

Returns:

An instance of ModelRunner.

Return type:

ModelRunner

propertydtype:dtype#
propertyvocab_size:int#
propertyvocab_size_padded:int#
propertyhidden_size:int#
propertynum_heads:int#
propertynum_layers:int#
propertymax_sequence_length:int#
propertyremove_input_padding:bool#
propertyuse_lora_plugin:bool#
propertymax_prompt_embedding_table_size:int#
propertymapping:Mapping#
propertygather_context_logits:bool#
propertygather_generation_logits:bool#
generate(
batch_input_ids:List[Tensor],
position_ids:List[Tensor]=None,
sampling_config:SamplingConfig|None=None,
prompt_table:str|Tensor|None=None,
prompt_tasks:str|None=None,
lora_uids:list|None=None,
streaming:bool=False,
output_generation_logits:bool=False,
stopping_criteria:StoppingCriteria|None=None,
logits_processor:LogitsProcessor|None=None,
medusa_choices:List[List[int]]|None=None,
encoder_max_input_length:int=None,
encoder_input_features:List[Tensor]=None,
encoder_output_lengths:List[Tensor]=None,
cross_attention_masks:List[Tensor]=None,
**kwargs,
)Tensor|dict[source]#

Generates sequences of token ids. The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed. You can override any sampling_config’s attributes by passing corresponding parameters.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).

  • sampling_config (SamplingConfig) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.

  • prompt_table (str or torch.Tensor) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor (LogitsProcessor) – Custom logits processors.

  • medusa_choices (List[List[int]]) – Medusa decoding choices.

  • kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.

Returns:

If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict

serialize_engine()IHostMemory[source]#

Serialize the engine.

Returns:

The serialized engine.

Return type:

bytes

classtensorrt_llm.runtime.ModelRunnerCpp(
executor:Executor,
max_batch_size:int,
max_input_len:int,
max_seq_len:int,
max_beam_width:int,
model_config:ModelConfig,
world_config:WorldConfig,
use_kv_cache:bool,
lora_manager:LoraManager|None=None,
)[source]#

Bases:ModelRunnerMixin

An interface class that wraps Executor and provides generation methods.

classmethodfrom_dir(
engine_dir:str,
*,
lora_dir:str|None=None,
rank:int=0,
max_batch_size:int|None=None,
max_input_len:int|None=None,
max_output_len:int|None=None,
max_beam_width:int|None=None,
max_attention_window_size:list[int]|None=None,
sink_token_length:int|None=None,
kv_cache_free_gpu_memory_fraction:float|None=None,
cross_kv_cache_fraction:float|None=None,
medusa_choices:list[list[int]]|None=None,
eagle_choices:list[list[int]]|None=None,
eagle_posterior_threshold:float|None=None,
eagle_use_dynamic_tree:bool=False,
eagle_dynamic_tree_max_top_k:int|None=None,
lookahead_config:list[int]|None=None,
debug_mode:bool=False,
lora_ckpt_source:str='hf',
use_gpu_direct_storage:bool=False,
gpu_weights_percent:float=1,
max_tokens_in_paged_kv_cache:int|None=None,
kv_cache_enable_block_reuse:bool=False,
enable_chunked_context:bool=False,
is_enc_dec:bool=False,
multi_block_mode:bool=True,
enable_context_fmha_fp32_acc:bool|None=None,
cuda_graph_mode:bool|None=None,
logits_processor_map:Dict[str,LogitsProcessor]|None=None,
device_ids:List[int]|None=None,
is_orchestrator_mode:bool=False,
use_runtime_defaults:bool=True,
gather_generation_logits:bool=False,
use_variable_beam_width_search:bool=False,
mm_embedding_offloading:bool=False,
fail_fast_on_attention_window_too_large:bool=False,
)ModelRunnerCpp[source]#

Create a ModelRunnerCpp instance from an engine directory.

Parameters:
  • engine_dir (str) – The directory that contains the serialized engine files and config files.

  • lora_dir (str) – The directory that contains LoRA weights.

  • rank (int) – The runtime rank id.

  • max_batch_size (int) – The runtime batch size limit. If max_batch_size is not None, it should not be larger than the engine’s max_batch_size; otherwise, the engine’s max_batch_size will be used.

  • max_input_len (int) – The runtime input length limit. If max_input_len is not None, it should not be larger than the engine’s max_input_len; otherwise, the engine’s max_input_len will be used.

  • max_output_len (int) – The runtime output length limit. If max_output_len is not None, it should not be larger than the engine’s max_output_len; otherwise, the engine’s max_output_len will be used.

  • max_beam_width (int) – The runtime beam width limit. If max_beam_width is not None, it should not be larger than the engine’s max_beam_width; otherwise, the engine’s max_beam_width will be used.

  • max_attention_window_size (List[int]) – The attention window size that controls the sliding window attention / cyclic kv cache behavior.

  • sink_token_length (int) – The sink token length, default=0.

  • kv_cache_free_gpu_memory_fraction (float) – Free GPU memory fraction that KV cache used.

  • cross_kv_cache_fraction (float) – KV Cache fraction reserved for cross attention, should only be used with enc-dec models.

  • debug_mode (bool) – Whether or not to turn on the debug mode.

  • medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding.

  • eagle_choices (List[List[int]]) – Eagle choices to use when in Eagle-1 decoding.

  • eagle_posterior_threshold (float) – Minimum token probability threshold for typical acceptance. A value other than None enables typical acceptance in Eagle.

  • eagle_use_dynamic_tree (bool) – Whether to use Eagle-2, which uses a dynamic tree.

  • eagle_dynamic_tree_max_top_k (int) – The maximum number of draft tokens to expand for each node in Eagle-2.

  • lora_ckpt_source (str) – Source of checkpoint. Should be one of [‘hf’, ‘nemo’].

  • max_tokens_in_paged_kv_cache (int) – Maximum amount of tokens configured in kv cache.

  • kv_cache_enable_block_reuse (bool) – Enables block reuse in kv cache.

  • enable_chunked_context (bool) – Enables chunked context.

  • is_enc_dec (bool) – Whether the model is encoder-decoder architecture.

  • multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.

  • enable_context_fmha_fp32_acc (bool) – Enable FMHA runner FP32 accumulation.

  • cuda_graph_mode (bool) – Whether to use cuda graph for inference.

  • logits_processor_map (Dict[str,LogitsProcessor]) – A map of logits processor functions indexed by names. A name can be provided later to the generate() function to specify which logits processor to run.

  • device_ids (List[int]) – Device indices to run the Executor on.

  • is_orchestrator_mode (bool) – The mode to run the model-runner, Leader mode by default.

  • gather_generation_logits (bool) – Enable gathering generation logits.

  • fail_fast_on_attention_window_too_large (bool) – Whether to fail fast if the attention window(s) are too large to fit even a single sequence in the KVCache.

Returns:

An instance of ModelRunnerCpp.

Return type:

ModelRunnerCpp

propertydtype:dtype#
propertyvocab_size:int#
propertyvocab_size_padded:int#
propertyhidden_size:int#
propertynum_heads:int#
propertynum_layers:int#
propertymax_sequence_length:int#
propertyremove_input_padding:bool#
propertymax_prompt_embedding_table_size:int#
propertygather_context_logits:bool#
propertygather_generation_logits:bool#
generate(
batch_input_ids:List[Tensor],
*,
position_ids:List[Tensor]=None,
encoder_input_ids:List[Tensor]=None,
encoder_input_features:List[Tensor]=None,
encoder_output_lengths:List[int]=None,
cross_attention_masks:List[Tensor]=None,
mrope_params:MropeParams|None=None,
sampling_config:SamplingConfig|None=None,
lora_uids:list|None=None,
lookahead_config:list[int]|None=None,
streaming:bool=False,
stopping_criteria:StoppingCriteria|None=None,
logits_processor_names:list[str]|None=None,
max_new_tokens:int=1,
end_id:int|None=None,
pad_id:int|None=None,
bad_words_list:list[list[int]]|None=None,
stop_words_list:list[list[int]]|None=None,
return_dict:bool=False,
output_sequence_lengths:bool=False,
output_generation_logits:bool=False,
output_log_probs:bool=False,
output_cum_log_probs:bool=False,
prompt_table:str|Tensor|None=None,
prompt_tasks:str|None=None,
input_token_extra_ids:List[List[int]]=None,
return_all_generated_tokens:bool=False,
language_adapter_uids:List[int]|None=None,
mm_embedding_offloading:bool=False,
**kwargs,
)Tensor|dict[source]#

Generates sequences of token ids. The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed. You can override any sampling_config’s attributes by passing corresponding parameters.

Parameters:
  • batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).

  • position_ids (List[torch.Tensor]) – A list of position id tensors. Each tensor is of shape (sequence_length, ).

  • encoder_input_ids (List[torch.Tensor]) – A list of encoder input id tensors for encoder-decoder models (optional). Each tensor is of shape (sequence_length, ).

  • encoder_input_features (List[torch.Tensor]) – A list of encoder input feature tensors for multimodal encoder-decoder models (optional). Each tensor is of shape (sequence_length, feature_dim).

  • encoder_output_lengths (List[int]) – A list of encoder output lengths (optional), used if the encoder output has a different length from the encoder input (due to convolution down-sampling, etc.).

  • sampling_config (SamplingConfig) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.

  • prompt_table (str or torch.Tensor) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.

  • prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).

  • input_token_extra_ids (List[List[int]]) – Input token extra ids for using p-tuning and KV Cache reuse together

  • lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.

  • streaming (bool) – Whether or not to use streaming mode for generation.

  • stopping_criteria (StoppingCriteria) – Custom stopping criteria.

  • logits_processor_names (List[str]) – Custom logits processor names.

  • return_all_generated_tokens (bool) – Whether the full output is returned at each streaming step

  • kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.

Returns:

If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict

classtensorrt_llm.runtime.EncDecModelRunner(
engine_name,
engine_dir,
lora_dir=None,
lora_task_uids=None,
debug_mode=False,
skip_encoder=False,
stream:Stream=None,
enable_context_fmha_fp32_acc:bool=None,
)[source]#

Bases:object

classmethodfrom_engine(
engine_name,
engine_dir,
lora_dir=None,
lora_task_uids=None,
debug_mode=False,
skip_encoder=False,
stream=None,
enable_context_fmha_fp32_acc=None,
)[source]#
process_input(
input_ids,
remove_input_padding=False,
pad_token_id=0,
prompt_tasks=None,
language_adapter_routings=None,
)[source]#
encoder_run(
input_ids,
input_lengths,
max_input_length,
position_ids=None,
token_type_ids=None,
debug_mode=False,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_mask=None,
language_adapter_routings=None,
)[source]#
generate(
encoder_input_ids,
decoder_input_ids,
max_new_tokens,
num_beams=1,
pad_token_id=None,
eos_token_id=None,
bos_token_id=None,
debug_mode=False,
return_dict=False,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
attention_mask=None,
time_encoder=False,
return_encoder_output=False,
encoder_language_adapter_routings=None,
decoder_language_adapter_routings=None,
)[source]#
classtensorrt_llm.runtime.MultimodalModelRunner(args)[source]#

Bases:object

propertycpp_e2e#
propertycpp_llm_only#
propertypython_e2e#
propertyvisual_engine_dir#
propertyaudio_engine_dir#
propertyllm_engine_dir#
init_tokenizer()[source]#
init_processor()[source]#
init_image_encoder()[source]#
init_audio_encoder()[source]#
init_llm()[source]#
video_preprocess(video_path)[source]#
preprocess(
pre_prompt,
post_prompt,
image,
other_vision_inputs,
other_audio_inputs,
)[source]#
statictokenizer_image_token(
batch_size,
pre_prompt,
post_prompt,
tokenizer,
image_token_index=-200,
)[source]#
split_prompt_by_images(tensor)[source]#
prepare_position_ids_for_cogvlm(input_ids)[source]#
generate(
pre_prompt,
post_prompt,
image,
decoder_input_ids,
max_new_tokens,
other_vision_inputs={},
other_audio_inputs={},
other_decoder_inputs={},
)[source]#
get_visual_features(image,other_vision_inputs)[source]#
get_audio_features(audio,other_audio_inputs)[source]#
setup_fake_prompts_vila(
batch_size,
visual_features,
split_input_ids,
input_lengths,
)[source]#
setup_fake_prompts(
visual_features,
pre_input_ids,
post_input_ids,
input_lengths,
)[source]#
get_rope_index(
input_ids:IntTensor,
image_grid_thw:LongTensor|None=None,
video_grid_thw:LongTensor|None=None,
attention_mask:Tensor|None=None,
)Tuple[Tensor,Tensor][source]#

Calculate the 3D rope index based on image and video’s temporal, height and width in LLM.

Explanation:

Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs. Examples:

input_ids: [T T T T T], here T is for text.
temporal position_ids: [0, 1, 2, 3, 4]
height position_ids: [0, 1, 2, 3, 4]
width position_ids: [0, 1, 2, 3, 4]

For a vision and text embedding sequence, we calculate the 3D rotary position embedding for the vision part and the 1D rotary position embedding for the text part. Examples:

Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
text temporal position_ids: [3, 4, 5, 6, 7]
text height position_ids: [3, 4, 5, 6, 7]
text width position_ids: [3, 4, 5, 6, 7]
Here we calculate the text start position_ids as the max vision position_ids plus 1.

Parameters:
  • input_ids (torch.IntTensor of shape (batch_size, sequence_length)) – Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.

  • image_grid_thw (torch.LongTensor of shape (num_images, 3), optional) – The temporal, height and width of feature shape of each image in LLM.

  • video_grid_thw (torch.LongTensor of shape (num_videos, 3), optional) – The temporal, height and width of feature shape of each video in LLM.

  • attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional) –

    Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]:

    • 1 for tokens that are not masked,

    • 0 for tokens that are masked.

Returns:

position_ids (torch.IntTensor of shape (3, batch_size, sequence_length)), mrope_position_deltas (torch.Tensor of shape (batch_size))

setup_fake_prompts_qwen2vl(
visual_features,
input_ids,
vision_grid_thws,
attention_mask,
input_lengths,
)[source]#
ptuning_setup_fuyu(
input_ids,
image_patches_indices,
)[source]#
ptuning_setup_pixtral(input_ids)[source]#
ptuning_setup_llava_next(
visual_features,
pre_prompt,
post_prompt,
)[source]#
ptuning_setup_phi3(
visual_features,
audio_features,
input_ids,
num_img_tokens,
num_aud_tokens,
)[source]#
ptuning_setup(
prompt_table,
input_ids,
input_lengths,
)[source]#
load_test_data(image_path=None,video_path=None)[source]#
load_test_audio(audio_path)[source]#
setup_inputs(
input_text,
raw_image,
raw_audio=None,
)[source]#
run(
input_text,
input_image,
input_audio,
max_new_tokens,
)[source]#
On this page