Runtime #

classtensorrt_llm.runtime.ModelConfig( max_batch_size:int, max_beam_width:int, vocab_size:int, num_layers:int, num_heads:int, num_kv_heads:int, hidden_size:int, gpt_attention_plugin:bool, gemm_allreduce_plugin:str=None, remove_input_padding:bool=False, model_name:str='', kv_cache_type:tensorrt_llm.bindings.KVCacheType=KVCacheType.CONTINUOUS, cross_attention:bool=False, head_size:int=None, has_position_embedding:bool=True, has_token_type_embedding:bool=False, tokens_per_block:int=32, max_prompt_embedding_table_size:int=0, quant_mode:tensorrt_llm.quantization.mode.QuantMode=<QuantMode:0>, gather_context_logits:bool=False, gather_generation_logits:bool=False, dtype:str='', lora_plugin:bool=False, lora_target_modules:List[str]=<factory>, trtllm_modules_to_hf_modules:dict=None, skip_cross_kv:bool=False, num_medusa_heads:int=0, max_medusa_tokens:int=0, paged_state:bool=True, mamba_conv1d_plugin:bool=True, conv_kernel:int=0, layer_types:List[str]=<factory>, rnn_hidden_size:int=0, rnn_head_size:int=0, rnn_conv_dim_size:int=0, state_size:int=0, state_dtype:str='', gpu_weights_percent:float=1.0, redrafter_num_beams:int=0, redrafter_draft_len_per_beam:int=0, num_kv_heads_per_layer:Optional[List[int]]=None, num_kv_heads_per_cross_attn_layer:Optional[List[int]]=None, skip_cross_attn_blocks:bool=False, language_adapter_config:Optional[tensorrt_llm.layers.language_adapter.LanguageAdapterConfig]=None, )[source]#

Bases:object

max_batch_size:int#

max_beam_width:int#

vocab_size:int#

num_layers:int#

num_heads:int#

num_kv_heads:int#

hidden_size:int#

gpt_attention_plugin:bool#

gemm_allreduce_plugin:str=None#

remove_input_padding:bool=False#

model_name:str=''#

kv_cache_type:KVCacheType=0#

cross_attention:bool=False#

head_size:int=None#

has_position_embedding:bool=True#

has_token_type_embedding:bool=False#

tokens_per_block:int=32#

max_prompt_embedding_table_size:int=0#

quant_mode:QuantMode=0#

gather_context_logits:bool=False#

gather_generation_logits:bool=False#

dtype:str=''#

lora_plugin:bool=False#

lora_target_modules:List[str]#

trtllm_modules_to_hf_modules:dict=None#

skip_cross_kv:bool=False#

num_medusa_heads:int=0#

max_medusa_tokens:int=0#

paged_state:bool=True#

mamba_conv1d_plugin:bool=True#

conv_kernel:int=0#

layer_types:List[str]#

rnn_hidden_size:int=0#

rnn_head_size:int=0#

rnn_conv_dim_size:int=0#

state_size:int=0#

state_dtype:str=''#

gpu_weights_percent:float=1.0#

redrafter_num_beams:int=0#

redrafter_draft_len_per_beam:int=0#

num_kv_heads_per_layer:List[int]|None=None#

num_kv_heads_per_cross_attn_layer:List[int]|None=None#

skip_cross_attn_blocks:bool=False#

language_adapter_config:LanguageAdapterConfig|None=None#

classtensorrt_llm.runtime.GenerationSession( model_config:ModelConfig, engine_buffer, mapping:Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream:Stream=None, )[source]#

Bases:object

batch_size:int#

num_draft_tokens:int=0#

medusa_topks:List[int]=None#

medusa_paths:List[List[int]]=None#

medusa_tree_ids:List[int]=None#

medusa_position_offsets:List[int]=None#

medusa_temperature:float=0.0#

mapping:Mapping#

runtime:_Runtime#

device:device#

debug_mode:bool#

cuda_graph_mode:bool#

buffer_allocated:bool#

debug_tensors_to_save:None#

propertycontext_mem_size:int#

propertyvocab_size#

propertynum_layers#

propertyfirst_layer#

propertylast_layer#

propertynum_heads#

propertyhidden_size#

propertyuse_gpt_attention_plugin#

propertyuse_mamba_conv1d_plugin#

propertypaged_kv_cache#

propertykv_cache_type#

propertyuse_kv_cache#

propertytokens_per_block#

propertyremove_input_padding#

get_num_heads_kv( layer_idx:int|None=None, )→int[source]#

propertyhead_size#

propertymax_prompt_embedding_table_size#

propertyquant_mode#

propertygather_context_logits#

propertygather_generation_logits#

propertydtype#

propertyprofiler#

propertyengine_inspector#

cuda_stream_guard()[source]#: Sync external stream and set current stream to the one bound to the session. Reset on exit.

propertycross_attention#

propertyhas_position_embedding#

propertyhas_token_type_embedding#

propertyuse_lora_plugin#

propertyuse_gemm_allreduce_plugin#

propertygemm_allreduce_plugin#

propertyis_medusa_mode#

propertyis_redrafter_mode#

propertymax_draft_tokens#

propertynum_medusa_heads#

propertypaged_state#

propertyconv_kernel#

propertyrnn_hidden_size#

propertyrnn_head_size#

propertyrnn_conv_dim_size#

propertystate_size#

propertystate_dtype#

setup( batch_size:int, max_context_length:int, max_new_tokens:int, beam_width:int=1, max_attention_window_size:int|None=None, sink_token_length:int|None=None, encoder_max_input_length:int|None=None, lora_manager:LoraManager=None, lora_uids:List[str]=None, medusa_choices:List[List[int]]=None, multi_block_mode:bool=True, enable_context_fmha_fp32_acc:bool=None, )[source]#

pp_communicate_new_tokens( should_stop, cache_indir, sequence_length, )[source]#

pp_communicate_final_output_ids( final_output_ids, batch_size, beam_width, )[source]#

finalize_decoder( context_lengths, batch_size, beam_width, scfg, in_progress=False, )[source]#

find_best_medusa_path( batch_size, input_ids:Tensor, next_logits, temp=0, )[source]#

filter_medusa_logits( batch_size, best_path, best_path_lengths, medusa_logits, )[source]#: medusa_logits is of shape [nMH, bs, nMT+1, vocab]
Returns [nMH, bs, vocab]

get_next_medusa_tokens( batch_size, next_medusa_logits, )[source]#

locate_accepted_draft_tokens( batch_size, best_path, best_path_len, draft_paths, )[source]#

update_output_ids_by_offset( new_generated_ids, offsets, )[source]#

next_medusa_input_ids()[source]#

reorder_kv_cache_for_beam_search( batch_size:int, beam_width:int, max_context_length:int, step:int, )[source]#

early_stop_criteria(batch_size,step,should_stop)[source]#

medusa_decode_and_verify(step,batch_size,logits)[source]#

process_logits_including_draft( step, batch_size, logits, next_step_buffer, )[source]#

Process logits to tokens and validate (Medusa) or process outputs (ReDrafter)
Extract early stop criteria here : self.accept_length
Update output ids : needs self.new_tokens and past_sequence_length
Get next input_ids : self.[new_tokens, accept_lengths, medusa_output_tokens]
Update KV cache : self.[sequence_length, num_draft_tokens]
Update sequence_length_buffer and past_kv_length

handle_per_step(
*,
cache_indirections:list,
step:int,
batch_size:int,
max_context_length:int,
beam_width:int,
input_ids:Tensor,
hidden_states:Tensor,
scfg:SamplingConfig,
kv_cache_block_offsets:Tensor,
host_kv_cache_block_offsets:Tensor,
cross_kv_cache_block_offsets:Tensor,
host_cross_kv_cache_block_offsets:Tensor,
prompt_embedding_table:Tensor,
tasks:Tensor,
context_lengths:Tensor,
host_context_lengths,
attention_mask:Tensor,
cross_attention_mask_for_context:Tensor,
cross_attention_mask_for_gen:Tensor,
prompt_vocab_size:Tensor,
ite:int,
sequence_limit_lengths:Tensor,
sequence_lengths:Tensor,
next_step_tensors:Dict[str,RuntimeTensor],
stop_words_data,
bad_words_data,
encoder_output:Tensor,
encoder_input_lengths:Tensor,
stopping_criteria:StoppingCriteria,
logits_processor:LogitsProcessor,
output_generation_logits:bool,
**kwargs,
)[source]#

dump_debug_buffers(step:int)→None[source]#

decode_regular(
*,
batch_size:int,
scfg:SamplingConfig,
sequence_lengths:Tensor,
context_lengths:Tensor,
host_context_lengths,
max_context_length:int,
beam_width:int,
cache_indirections:list,
input_ids:Tensor,
hidden_states:Tensor,
prompt_embedding_table:Tensor,
tasks:Tensor,
prompt_vocab_size:Tensor,
ite:int,
sequence_limit_lengths:Tensor,
stop_words_data,
bad_words_data,
output_sequence_lengths:bool=False,
output_generation_logits:bool=False,
return_dict:bool=False,
encoder_output:Tensor=None,
encoder_input_lengths:Tensor=None,
stopping_criteria:StoppingCriteria=None,
logits_processor:LogitsProcessor=None,
cross_attention_mask:List[Tensor]=None,
**kwargs,
)[source]#

decode_stream(
*,
batch_size:int,
scfg:SamplingConfig,
sequence_lengths:Tensor,
context_lengths:Tensor,
host_context_lengths,
max_context_length:int,
beam_width:int,
cache_indirections:list,
input_ids:Tensor,
hidden_states:Tensor,
prompt_embedding_table:Tensor,
tasks:Tensor,
prompt_vocab_size:Tensor,
ite:int,
sequence_limit_lengths:Tensor,
stop_words_data,
bad_words_data,
output_sequence_lengths:bool=False,
output_generation_logits:bool=False,
return_dict:bool=False,
encoder_output:Tensor=None,
encoder_input_lengths:Tensor=None,
stopping_criteria:StoppingCriteria=None,
logits_processor:LogitsProcessor=None,
cross_attention_mask:List[Tensor]=None,
**kwargs,
)[source]#

decode_batch(
input_ids:Sequence[Tensor],
sampling_config:SamplingConfig,
streaming:bool=False,
**kwargs,
)[source]#

decode(
input_ids:Tensor,
context_lengths:Tensor,
sampling_config:SamplingConfig,
prompt_embedding_table:Tensor=None,
tasks:Tensor=None,
prompt_vocab_size:Tensor=None,
stop_words_list=None,
bad_words_list=None,
streaming:bool=False,
output_sequence_lengths:bool=False,
output_generation_logits:bool=False,
return_dict:bool=False,
encoder_output:Tensor=None,
encoder_input_lengths:Tensor=None,
stopping_criteria:StoppingCriteria=None,
logits_processor:LogitsProcessor=None,
cross_attention_mask:List[Tensor]=None,
**kwargs,
)[source]#

classtensorrt_llm.runtime.GenerationSequence(seq_idx,batch_idx)[source]#

Bases:object

get_batch_idx()→int[source]#: Returns idx of sequence in batch

get_seq_idx()→int[source]#: Returns sequence idx

classtensorrt_llm.runtime.KVCacheManager( *, num_layers:int, num_blocks:int, block_size:int, tokens_per_block:int, max_blocks_per_seq:int, max_attention_window_size:int, sink_token_len:int, beam_width:int=1, use_one_more_block:bool=False, )[source]#

Bases:object

step(finished:List[bool])[source]#: Iterate to the next generation step. Add new blocks where needed and clear finished sequences.

add_sequence( sequence:GenerationSequence, context_len:int, always_share_across_beam:bool=False, )[source]#: Add sequence to the manager and allocate minimum amount of blocks for context

get_block_offsets(beam_width:int)→Tensor[source]#: Returns array of offsets into memory pools

classtensorrt_llm.runtime.SamplingConfig[source]#

Bases:object

end_id:int#

pad_id:int#

max_new_tokens:int=20#

num_beams:int=1#

num_return_sequences:int|None=None#

max_attention_window_size:int|None=None#

sink_token_length:int|None=None#

output_sequence_lengths:bool=False#

return_dict:bool=False#

stop_words_list:list|ndarray|Tensor|None=None#

bad_words_list:list|ndarray|Tensor|None=None#

temperature:float|Tensor=1.0#

top_k:int|Tensor=1#

top_p:float|Tensor=0.0#

top_p_decay:Tensor|None=None#

top_p_min:Tensor|None=None#

top_p_reset_ids:Tensor|None=None#

random_seed:int|Tensor=None#

length_penalty:float|Tensor=1.0#

early_stopping:int|Tensor=1#

repetition_penalty:float|Tensor=1.0#

min_length:int|Tensor=1#

presence_penalty:float|Tensor=0.0#

frequency_penalty:float|Tensor=0.0#

use_beam_hyps:bool=True#

beam_search_diversity_rate:float|Tensor=0.0#

output_cum_log_probs:bool=False#

output_log_probs:bool=False#

no_repeat_ngram_size:int|Tensor=None#

min_p:float|Tensor=0.0#

update(**kwargs)[source]#

classtensorrt_llm.runtime.Session(**kwargs)[source]#

Bases:object

Session is a managed TensorRT runtime.

staticfrom_serialized_engine( engine, )→Session[source]#:
@brief: Create a session from a serialized engine
@param engine: a serialized engine
@return: a Session object

staticfrom_engine(engine)→Session[source]#:
@brief: Create a session from an existing ICudaEngine engine
@param engine: an ICudaEngine
@return: a Session object

propertyruntime:Runtime#

propertyengine:ICudaEngine#

propertycontext:IExecutionContext#

@brief: Get the default TensorRT execution context; use self.engine.create_execution_context() to create a new context if needed

@return: one TensorRT execution context object

propertycontext_mem_size:int#

set_shapes( tensor_dict:Dict[str,Tensor], context:IExecutionContext|None=None, )[source]#

infer_shapes( inputs:List[TensorInfo], context:IExecutionContext|None=None, )→List[TensorInfo][source]#

@brief: Set input shapes to given context, and infer the output shapes from the given input shapes. This function should be called every time the input shapes are changed before calling run(). Alternatively, call context.set_input_shape on all dynamic-shaped input tensors manually.

@param inputs: list of TensorInfo objects, each item represents an input tensor
@param context: TensorRT execution context, if None, use the default context
@return: list of TensorInfo objects, each item represents an output tensor; returns None if failed

run( inputs:Dict[str,Any], outputs:Dict[str,Any], stream, context=None, )→bool[source]#:
@brief: Run the TensorRT engine with the given inputs and outputs
@param inputs: dict of input tensors, key is tensor name, value is tensor pointer or torch tensor
@param outputs: dict of output tensors, key is tensor name, value is tensor pointer or torch tensor
@param stream: cuda stream to enqueue the TensorRT engine on
@param context: TensorRT execution context, if None, use the default context
@return: True if enqueue succeeded. Note that the enqueue is an async call; returning True does not mean the execution is finished

classtensorrt_llm.runtime.TensorInfo(name:'str',dtype:'trt.DataType',shape:'tuple')[source]#

Bases:object

name:str#

dtype:DataType#

shape:tuple#

numel()[source]#

view(*shape)[source]#

squeeze(dim=0)[source]#

classtensorrt_llm.runtime.ChatGLMGenerationSession( model_config:ModelConfig, engine_buffer, mapping:Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream:Stream=None, )[source]#: Bases:GenerationSession

classtensorrt_llm.runtime.QWenForCausalLMGenerationSession( model_config:ModelConfig, engine_buffer, mapping:Mapping, debug_mode=False, debug_tensors_to_save=None, cuda_graph_mode=False, stream:Stream=None, global_max_input_length:int=2048, global_max_output_length:int=4096, )[source]#

Bases:GenerationSession

generate( input_ids:Tensor, input_lengths:Tensor, sampling_config:SamplingConfig, max_new_tokens:int, runtime_rank:int=0, )[source]#

tensorrt_llm.runtime.decode_words_list( word_dict:List[List[str]], tokenizer=None, add_special_tokens=False, )[source]#

Format of word_dict:
- len(word_dict) should be the same as batch_size.
- word_dict[i] means the words for batch i.
- len(word_dict[i]) >= 1, which means it must contain at least 1 string.
For example, word_dict[2] = [" I am happy", " I am sad"].

classtensorrt_llm.runtime.LogitsProcessorList(iterable=(),/)[source]#: Bases:list,LogitsProcessor

classtensorrt_llm.runtime.LogitsProcessor[source]#

Bases:object

Base class for all logit processors that can be applied during generation.

classtensorrt_llm.runtime.StoppingCriteriaList(iterable=(),/)[source]#: Bases:list,StoppingCriteria

classtensorrt_llm.runtime.StoppingCriteria[source]#

Bases:object

Base class for all stopping criteria that can be applied during generation.

classtensorrt_llm.runtime.ModelRunner( session:GenerationSession, max_batch_size:int, max_input_len:int, max_seq_len:int, max_beam_width:int, kv_cache_type:KVCacheType, lora_manager:LoraManager|None=None, )[source]#

Bases:ModelRunnerMixin

An interface class that wraps GenerationSession and provides generation methods.

classmethodfrom_engine( engine:Engine, *, max_output_len:int|None, lora_dir:List[str]|None, rank:int, debug_mode:bool, lora_ckpt_source:str, medusa_choices:List[List[int]], stream:Stream, gpu_weights_percent:float, enable_context_fmha_fp32_acc:bool|None, multi_block_mode:bool|None, )→ModelRunner[source]#

classmethodfrom_dir( engine_dir:str, *, max_output_len:int|None=None, lora_dir:List[str]|None=None, rank:int=0, debug_mode:bool=False, lora_ckpt_source:str='hf', medusa_choices:List[List[int]]=None, stream:Stream=None, gpu_weights_percent:float=1, enable_context_fmha_fp32_acc:bool|None=None, multi_block_mode:bool|None=None, fail_fast_on_attention_window_too_large:bool=False, )→ModelRunner[source]#

Create a ModelRunner instance from an engine directory.

Parameters:

engine_dir (str) – The directory that contains the serialized engine files and config files.
max_output_len (Optional[int]) – The maximum output length. This argument might only be available at loading time; generate will still check it when disable_kv_cache is enabled.
lora_dir (Optional[List[str]]) – The directories that contain LoRA weights.
rank (int) – The runtime rank id.
debug_mode (bool) – Whether or not to turn on the debug mode.
medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding
stream (torch.cuda.Stream) – Stream to use.
multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.
fail_fast_on_attention_window_too_large (bool) – Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache. Note: This parameter is only applicable to C++ runtime (ModelRunnerCpp).

Returns:

An instance of ModelRunner.

Return type:

ModelRunner

propertydtype:dtype#

propertyvocab_size:int#

propertyvocab_size_padded:int#

propertyhidden_size:int#

propertynum_heads:int#

propertynum_layers:int#

propertymax_sequence_length:int#

propertyremove_input_padding:bool#

propertyuse_lora_plugin:bool#

propertymax_prompt_embedding_table_size:int#

propertymapping:Mapping#

propertygather_context_logits:bool#

propertygather_generation_logits:bool#

generate(
batch_input_ids:List[Tensor],
position_ids:List[Tensor]=None,
sampling_config:SamplingConfig|None=None,
prompt_table:str|Tensor|None=None,
prompt_tasks:str|None=None,
lora_uids:list|None=None,
streaming:bool=False,
output_generation_logits:bool=False,
stopping_criteria:StoppingCriteria|None=None,
logits_processor:LogitsProcessor|None=None,
medusa_choices:List[List[int]]|None=None,
encoder_max_input_length:int=None,
encoder_input_features:List[Tensor]=None,
encoder_output_lengths:List[Tensor]=None,
cross_attention_masks:List[Tensor]=None,
**kwargs,
)→Tensor|dict[source]#

Generates sequences of token ids. The generation-controlling parameters are set in the sampling_config; it will be set to a default one if not passed. You can override any sampling_config’s attributes by passing corresponding parameters.

Parameters:

batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).
sampling_config (SamplingConfig) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.
prompt_table (str or torch.Tensor) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.
prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).
lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.
streaming (bool) – Whether or not to use streaming mode for generation.
stopping_criteria (StoppingCriteria) – Custom stopping criteria.
logits_processor (LogitsProcessor) – Custom logits processors.
medusa_choices (List[List[int]]) – Medusa decoding choices.
kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.

Returns:

If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict

serialize_engine()→IHostMemory[source]#

Serialize the engine.

Returns: The serialized engine.
Return type: bytes

classtensorrt_llm.runtime.ModelRunnerCpp( executor:Executor, max_batch_size:int, max_input_len:int, max_seq_len:int, max_beam_width:int, model_config:ModelConfig, world_config:WorldConfig, use_kv_cache:bool, lora_manager:LoraManager|None=None, )[source]#

Bases:ModelRunnerMixin

An interface class that wraps Executor and provides generation methods.

classmethodfrom_dir( engine_dir:str, *, lora_dir:str|None=None, rank:int=0, max_batch_size:int|None=None, max_input_len:int|None=None, max_output_len:int|None=None, max_beam_width:int|None=None, max_attention_window_size:list[int]|None=None, sink_token_length:int|None=None, kv_cache_free_gpu_memory_fraction:float|None=None, cross_kv_cache_fraction:float|None=None, medusa_choices:list[list[int]]|None=None, eagle_choices:list[list[int]]|None=None, eagle_posterior_threshold:float|None=None, eagle_use_dynamic_tree:bool=False, eagle_dynamic_tree_max_top_k:int|None=None, lookahead_config:list[int]|None=None, debug_mode:bool=False, lora_ckpt_source:str='hf', use_gpu_direct_storage:bool=False, gpu_weights_percent:float=1, max_tokens_in_paged_kv_cache:int|None=None, kv_cache_enable_block_reuse:bool=False, enable_chunked_context:bool=False, is_enc_dec:bool=False, multi_block_mode:bool=True, enable_context_fmha_fp32_acc:bool|None=None, cuda_graph_mode:bool|None=None, logits_processor_map:Dict[str,LogitsProcessor]|None=None, device_ids:List[int]|None=None, is_orchestrator_mode:bool=False, use_runtime_defaults:bool=True, gather_generation_logits:bool=False, use_variable_beam_width_search:bool=False, mm_embedding_offloading:bool=False, fail_fast_on_attention_window_too_large:bool=False, )→ModelRunnerCpp[source]#

Create a ModelRunnerCpp instance from an engine directory.

Parameters:

engine_dir (str) – The directory that contains the serialized engine files and config files.
lora_dir (str) – The directory that contains LoRA weights.
rank (int) – The runtime rank id.
max_batch_size (int) – The runtime batch size limit. If max_batch_size is not None, it should not be larger than the engine’s max_batch_size; otherwise, the engine’s max_batch_size will be used.
max_input_len (int) – The runtime input length limit. If max_input_len is not None, it should not be larger than the engine’s max_input_len; otherwise, the engine’s max_input_len will be used.
max_output_len (int) – The runtime output length limit. If max_output_len is not None, it should not be larger than the engine’s max_output_len; otherwise, the engine’s max_output_len will be used.
max_beam_width (int) – The runtime beam width limit. If max_beam_width is not None, it should not be larger than the engine’s max_beam_width; otherwise, the engine’s max_beam_width will be used.
max_attention_window_size (List[int]) – The attention window size that controls the sliding window attention / cyclic kv cache behavior.
sink_token_length (int) – The sink token length, default=0.
kv_cache_free_gpu_memory_fraction (float) – Free GPU memory fraction that KV cache used.
cross_kv_cache_fraction (float) – KV Cache fraction reserved for cross attention, should only be used with enc-dec models.
debug_mode (bool) – Whether or not to turn on the debug mode.
medusa_choices (List[List[int]]) – Medusa choices to use when in Medusa decoding.
eagle_choices (List[List[int]]) – Eagle choices to use when in Eagle-1 decoding.
eagle_posterior_threshold (float) – Minimum token probability threshold for typical acceptance. Value different from None enables typical acceptance in Eagle.
eagle_use_dynamic_tree (bool) – Whether to use Eagle-2, which is dynamic tree.
eagle_dynamic_tree_max_top_k (int) – The maximum number of draft tokens to expand for each node in Eagle-2.
lora_ckpt_source (str) – Source of checkpoint. Should be one of [‘hf’, ‘nemo’].
max_tokens_in_paged_kv_cache (int) – Maximum amount of tokens configured in kv cache.
kv_cache_enable_block_reuse (bool) – Enables block reuse in kv cache.
enable_chunked_context (bool) – Enables chunked context.
is_enc_dec (bool) – Whether the model is encoder-decoder architecture.
multi_block_mode (bool) – Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.
enable_context_fmha_fp32_acc (bool) – Enable FMHA runner FP32 accumulation.
cuda_graph_mode (bool) – Whether to use cuda graph for inference.
logits_processor_map (Dict[str,LogitsProcessor]) – A map of logits processor functions indexed by names. A name can be provided later to the generate() function to specify which logits processor to run.
device_ids (List[int]) – Device indices to run the Executor on.
is_orchestrator_mode (bool) – The mode to run the model-runner, Leader mode by default.
gather_generation_logits (bool) – Enable gathering generation logits.
fail_fast_on_attention_window_too_large (bool) – Whether to fail fast if the attention window(s) are too large to fit even a single sequence in the KVCache.

Returns:

An instance of ModelRunnerCpp.

Return type:

ModelRunnerCpp

propertydtype:dtype#

propertyvocab_size:int#

propertyvocab_size_padded:int#

propertyhidden_size:int#

propertynum_heads:int#

propertynum_layers:int#

propertymax_sequence_length:int#

propertyremove_input_padding:bool#

propertymax_prompt_embedding_table_size:int#

propertygather_context_logits:bool#

propertygather_generation_logits:bool#

generate(
batch_input_ids:List[Tensor],
*,
position_ids:List[Tensor]=None,
encoder_input_ids:List[Tensor]=None,
encoder_input_features:List[Tensor]=None,
encoder_output_lengths:List[int]=None,
cross_attention_masks:List[Tensor]=None,
mrope_params:MropeParams|None=None,
sampling_config:SamplingConfig|None=None,
lora_uids:list|None=None,
lookahead_config:list[int]|None=None,
streaming:bool=False,
stopping_criteria:StoppingCriteria|None=None,
logits_processor_names:list[str]|None=None,
max_new_tokens:int=1,
end_id:int|None=None,
pad_id:int|None=None,
bad_words_list:list[list[int]]|None=None,
stop_words_list:list[list[int]]|None=None,
return_dict:bool=False,
output_sequence_lengths:bool=False,
output_generation_logits:bool=False,
output_log_probs:bool=False,
output_cum_log_probs:bool=False,
prompt_table:str|Tensor|None=None,
prompt_tasks:str|None=None,
input_token_extra_ids:List[List[int]]=None,
return_all_generated_tokens:bool=False,
language_adapter_uids:List[int]|None=None,
mm_embedding_offloading:bool=False,
**kwargs,
)→Tensor|dict[source]#

Parameters:

batch_input_ids (List[torch.Tensor]) – A list of input id tensors. Each tensor is of shape (sequence_length, ).
position_ids (List[torch.Tensor]) – A list of position id tensors. Each tensor is of shape (sequence_length, ).
encoder_input_ids (List[torch.Tensor]) – A list of encoder input id tensors for encoder-decoder models (optional). Each tensor is of shape (sequence_length, ).
encoder_input_features (List[torch.Tensor]) – A list of encoder input feature tensors for multimodal encoder-decoder models (optional). Each tensor is of shape (sequence_length, feature_dim).
encoder_output_lengths (List[int]) – A list of encoder output lengths (optional) if encoder output has different length from encoder input (due to convolution down-sampling, etc.)
sampling_config (SamplingConfig) – The sampling configuration to be used as base parametrization for the generation call. The passed **kwargs matching the sampling_config’s attributes will override them. If the sampling_config is not provided, a default will be used.
prompt_table (str or torch.Tensor) – The file path of prompt table (.npy format, exported by nemo_prompt_convert.py) or the prompt table itself.
prompt_tasks (str) – The prompt tuning task ids for the input batch, in format of comma-separated list (e.g., 0,3,1,0).
input_token_extra_ids (List[List[int]]) – Input token extra ids for using p-tuning and KV Cache reuse together
lora_uids (list) – The uids of LoRA weights for the input batch. Use -1 to disable the LoRA module.
streaming (bool) – Whether or not to use streaming mode for generation.
stopping_criteria (StoppingCriteria) – Custom stopping criteria.
logits_processor_names (List[str]) – Custom logits processor names.
return_all_generated_tokens (bool) – Whether the full output is returned at each streaming step
kwargs (Dict[str, Any]) – Ad hoc parametrization of sampling_config. The passed **kwargs matching the sampling_config’s attributes will override them.

Returns:

If return_dict=False, the method returns generated output_ids. If return_dict=True, the method returns a dict of output_ids, sequence_lengths (if sampling_config.output_sequence_lengths=True), context_logits and generation_logits (if self.gather_context_logits=True and self.gather_generation_logits=True, respectively).

Return type:

torch.Tensor or dict

classtensorrt_llm.runtime.EncDecModelRunner( engine_name, engine_dir, lora_dir=None, lora_task_uids=None, debug_mode=False, skip_encoder=False, stream:Stream=None, enable_context_fmha_fp32_acc:bool=None, )[source]#

Bases:object

classmethodfrom_engine( engine_name, engine_dir, lora_dir=None, lora_task_uids=None, debug_mode=False, skip_encoder=False, stream=None, enable_context_fmha_fp32_acc=None, )[source]#

process_input( input_ids, remove_input_padding=False, pad_token_id=0, prompt_tasks=None, language_adapter_routings=None, )[source]#

encoder_run( input_ids, input_lengths, max_input_length, position_ids=None, token_type_ids=None, debug_mode=False, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_mask=None, language_adapter_routings=None, )[source]#

generate( encoder_input_ids, decoder_input_ids, max_new_tokens, num_beams=1, pad_token_id=None, eos_token_id=None, bos_token_id=None, debug_mode=False, return_dict=False, prompt_embedding_table=None, prompt_tasks=None, prompt_vocab_size=None, attention_mask=None, time_encoder=False, return_encoder_output=False, encoder_language_adapter_routings=None, decoder_language_adapter_routings=None, )[source]#

classtensorrt_llm.runtime.MultimodalModelRunner(args)[source]#

Bases:object

propertycpp_e2e#

propertycpp_llm_only#

propertypython_e2e#

propertyvisual_engine_dir#

propertyaudio_engine_dir#

propertyllm_engine_dir#

init_tokenizer()[source]#

init_processor()[source]#

init_image_encoder()[source]#

init_audio_encoder()[source]#

init_llm()[source]#

video_preprocess(video_path)[source]#

preprocess( pre_prompt, post_prompt, image, other_vision_inputs, other_audio_inputs, )[source]#

statictokenizer_image_token( batch_size, pre_prompt, post_prompt, tokenizer, image_token_index=-200, )[source]#

split_prompt_by_images(tensor)[source]#

prepare_position_ids_for_cogvlm(input_ids)[source]#

generate( pre_prompt, post_prompt, image, decoder_input_ids, max_new_tokens, other_vision_inputs={}, other_audio_inputs={}, other_decoder_inputs={}, )[source]#

get_visual_features(image,other_vision_inputs)[source]#

get_audio_features(audio,other_audio_inputs)[source]#

setup_fake_prompts_vila( batch_size, visual_features, split_input_ids, input_lengths, )[source]#

setup_fake_prompts( visual_features, pre_input_ids, post_input_ids, input_lengths, )[source]#

get_rope_index( input_ids:IntTensor, image_grid_thw:LongTensor|None=None, video_grid_thw:LongTensor|None=None, attention_mask:Tensor|None=None, )→Tuple[Tensor,Tensor][source]#

Calculate the 3D rope index based on image and video’s temporal, height and width in LLM.

Explanation:

Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs. Examples:

input_ids: [T T T T T], here T is for text.
temporal position_ids: [0, 1, 2, 3, 4]
height position_ids: [0, 1, 2, 3, 4]
width position_ids: [0, 1, 2, 3, 4]

For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part and 1D rotary position embedding for text part. Examples:

Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
text temporal position_ids: [3, 4, 5, 6, 7]
text height position_ids: [3, 4, 5, 6, 7]
text width position_ids: [3, 4, 5, 6, 7]
Here we calculate the text start position_ids as the max vision position_ids plus 1.

Parameters:

input_ids (torch.IntTensor of shape (batch_size, sequence_length)) – Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
image_grid_thw (torch.LongTensor of shape (num_images, 3), optional) – The temporal, height and width of feature shape of each image in LLM.
video_grid_thw (torch.LongTensor of shape (num_videos, 3), optional) – The temporal, height and width of feature shape of each video in LLM.
attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional) –
Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]:
- 1 for tokens that are not masked,
- 0 for tokens that are masked.

Returns:

position_ids (torch.IntTensor of shape (3, batch_size, sequence_length))
mrope_position_deltas (torch.Tensor of shape (batch_size))

setup_fake_prompts_qwen2vl( visual_features, input_ids, vision_grid_thws, attention_mask, input_lengths, )[source]#

ptuning_setup_fuyu( input_ids, image_patches_indices, )[source]#

ptuning_setup_pixtral(input_ids)[source]#

ptuning_setup_llava_next( visual_features, pre_prompt, post_prompt, )[source]#

ptuning_setup_phi3( visual_features, audio_features, input_ids, num_img_tokens, num_aud_tokens, )[source]#

ptuning_setup( prompt_table, input_ids, input_lengths, )[source]#

load_test_data(image_path=None,video_path=None)[source]#

load_test_audio(audio_path)[source]#

setup_inputs( input_text, raw_image, raw_audio=None, )[source]#

run( input_text, input_image, input_audio, max_new_tokens, )[source]#

On this page

Movatterモバイル変換

Runtime#

Runtime #