Executor #

disaggServerUtil.h#

namespacetensorrt_llm#

namespaceexecutor#

namespacedisagg_executor#

classDisaggExecutorOrchestrator#

Public Functions

DisaggExecutorOrchestrator( std::vector<std::filesystem::path>const&ctxEnginePaths, std::vector<std::filesystem::path>const&genEnginePaths, std::vector<executor::ExecutorConfig>const&ctxExecutorConfigs, std::vector<executor::ExecutorConfig>const&genExecutorConfigs, boolhasContextAwaitThreads, boolhasGenAwaitThreads, )#

Constructs a DisaggExecutorOrchestrator object.

Parameters:

ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each context executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.

std::vector<IdType>enqueueContext( std::vector<texec::Request>const&requests, std::optional<int>selectContextId=std::nullopt, boolbatch=false, )#

Enqueue context-only requests to context executors.

Parameters:

requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue all requests in the same context executor. If false, try to use a different executor for each request.

Returns:

A vector of global request ids, corresponding to the order of the requests in requests. The id returned may be different from the request id in each executor.

voidenqueueGeneration( std::vector<texec::Request>const&requests, std::vector<IdType>const&globalRequestIds, std::optional<int>selectGenIdx=std::nullopt, boolbatch=false, )#

Enqueue generation-only requests to generation executors.

Parameters:

requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests; these must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue all requests in the same generation executor. If false, try to use a different executor for each request.

std::vector<ResponseWithId>awaitContextResponses( std::optional<std::chrono::milliseconds>const&timeout, std::optional<int>contextIdx=std::nullopt, )#

Wait for context responses.

Parameters:

timeout – The maximum time to wait for new responses.
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors. If hasContextAwaitThreads is true, then this parameter must be std::nullopt.

Returns:

A vector of responses with corresponding global request ids

std::vector<ResponseWithId>awaitGenerationResponses( std::optional<std::chrono::milliseconds>const&timeout, std::optional<int>genIdx=std::nullopt, )#

Wait for generation responses.

Parameters:

timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors. If hasGenAwaitThreads is true, then this parameter must be std::nullopt.

Returns:

A vector of responses with corresponding global request ids.

boolcanEnqueue()const#: Indicates if the current process is allowed to enqueueRequests.

std::vector<std::unique_ptr<texec::Executor>>const&getContextExecutors( )const#: Get context executors.

std::vector<std::unique_ptr<texec::Executor>>const&getGenExecutors( )const#: Get generation executors.

~DisaggExecutorOrchestrator()#

Private Members

std::unique_ptr<Impl>mImpl#

structResponseWithId#

Public Functions

inlineResponseWithId( tensorrt_llm::executor::Response&&response, IdTypegid, )#

inlineResponseWithId( tensorrt_llm::executor::Responseconst&response, IdTypegid, )#

inlineResponseWithId(ResponseWithId&&other)noexcept#

ResponseWithId(ResponseWithIdconst&other)=default#

inlineResponseWithId&operator=(ResponseWithId&&other)noexcept#

inlineResponseWithId&operator=(ResponseWithIdconst&other)#

~ResponseWithId()=default#

Public Members

tensorrt_llm::executor::Responseresponse#

IdTypegid#

tensor.h#

namespacetensorrt_llm

namespaceexecutor

classShape:publictensorrt_llm::common::ArrayView<detail::DimType64const>#

Public Types

usingBase=tensorrt_llm::common::ArrayView<detail::DimType64const>#

usingDimType64=typenamestd::remove_cv_t<Base::value_type>#

Public Functions

inlineShape()#

inlineShape(DimType64const*data,Base::size_typesize)#

inlineShape(std::initializer_list<DimType64>dims)#

classTensor#

Public Types

usingCudaStreamPtr=std::shared_ptr<runtime::CudaStream>#

Public Functions

TensorcopyToCpu(Tensor::CudaStreamPtrstream=nullptr)const#

TensorcopyToPinned(Tensor::CudaStreamPtrstream=nullptr)const#

TensorcopyToPooledPinned(Tensor::CudaStreamPtrstream=nullptr)const#

TensorcopyToManaged(Tensor::CudaStreamPtrstream=nullptr)const#

TensorcopyToGpu(Tensor::CudaStreamPtrstream)const#

Tensor()noexcept=default#

~Tensor()=default#

Tensor(Tensorconst&other)noexcept=default#

Tensor(Tensor&&other)noexcept=default#

Tensor&operator=(Tensorconst&other)noexcept=default#

Tensor&operator=(Tensor&&other)noexcept=default#

void*getData()#: Returns a pointer to underlying array.

voidconst*getData()const#: Returns a pointer to underlying array.

DataTypegetDataType()const#: Returns the data type of the buffer.

MemoryTypegetMemoryType()const#: Returns the memory type of the buffer.

ShapegetShape()const#: Returns the tensor dimensions.

std::size_tgetSize()const#: Returns the number of elements in the tensor.

std::size_tgetSizeInBytes()const#: Returns the size of the tensor in bytes.

voidsetZero(CudaStreamPtrstream=nullptr)#

Set the entire memory to zero.

Parameters:: stream – Must be a valid CUDA stream if the memory type is GPU.

voidsetFrom(Tensorconst&other,CudaStreamPtrstream=nullptr)#

Copy the data and shape from another tensor.

Parameters:

other – A tensor to copy from.
stream – Must be a valid CUDA stream if the memory type is GPU.

inlineexplicitoperatorbool()const#

inlinebooloperator==(Tensorconst&rhs)const#

inlinebooloperator!=(Tensorconst&rhs)const#

Public Static Functions

staticTensorcpu(DataTypedataType,Shapeshape={})#

Allocate a cpu tensor with the given shape and data type.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.

template<typenameT> staticinlineTensorcpu(Shapeshape={})#

staticTensorpinned(DataTypedataType,Shapeshape={})#

Allocate a cpu tensor in pinned memory with the given shape and data type.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.

template<typenameT> staticinlineTensorpinned(Shapeshape={})#

staticTensorpooledPinned(DataTypedataType,Shapeshape={})#

Allocate a cpu tensor in pooled pinned memory with the given shape and data type.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.

template<typenameT> staticinlineTensorpooledPinned( Shapeshape={}, )#

staticTensormanaged(DataTypedataType,Shapeshape={})#

Allocate a tensor in managed memory (UVM) with the given shape and data type.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.

template<typenameT> staticinlineTensormanaged(Shapeshape={})#

staticTensorgpu( DataTypedataType, CudaStreamPtrstream, Shapeshape={}, )#

Allocate a gpu tensor with the given shape and data type on a particular cuda stream.

Parameters:

shape – The shape of the tensor.
stream – Specifies the CUDA stream on which to allocate the tensor for GPU memory.
dataType – The data type of the tensor.

template<typenameT> staticinlineTensorgpu( CudaStreamPtrstream, Shapeshape={}, )#

staticTensorof(DataTypedataType,void*data,Shapeshape)#

Wrap a data pointer into a tensor without taking ownership.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.
data – The pointer to the existing data to wrap; the tensor does not take ownership of it.

template<typenameT> staticinlineTensorof(T*data,Shapeshape)#

Wrap a data pointer into a tensor without taking ownership.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.
data – The pointer to the existing data to wrap; the tensor does not take ownership of it.

template<typenameT> staticinlineTensorof(T&data)#

Wrap any container into a tensor without taking ownership.

Parameters:

shape – The shape of the tensor.
dataType – The data type of the tensor.
data – The container to wrap; the tensor does not take ownership of it.

Private Types

usingImpl=runtime::ITensor #

Private Functions

explicitTensor(std::shared_ptr<runtime::ITensor>tensor)#

TensorcopyTo( std::shared_ptr<Impl>tensor, CudaStreamPtrstream, )const#

Private Members

std::shared_ptr<Impl>mTensor#

Private Static Functions

template<typenameT> staticinlineDataTypegetRuntimeType()#

Friends

friendclassSerialization

friendstd::shared_ptr<runtime::ITensor>const&toITensor( Tensorconst&tensor, )#

friendTensorofITensor( std::shared_ptr<runtime::ITensor>tensor, )#

namespacedetail#

Typedefs

usingDimType64=int64_t#

Functions

std::shared_ptr<runtime::ITensor>const&toITensor( Tensorconst&tensor, )#

TensorofITensor(std::shared_ptr<runtime::ITensor>tensor)#

namespaceruntime#

transferAgent.h#

namespacetensorrt_llm

namespaceexecutor

namespacekv_cache#

Typedefs

usingTransferDescs=MemoryDescs #

usingRegisterDescs=MemoryDescs #

usingSyncMessage=std::string#

usingConnectionInfoType=std::string#

Enums

enumclassMemoryType:uint8_t#

Values:

enumeratorkDRAM#

enumeratorkVRAM#

enumeratorkBLK#

enumeratorkOBJ#

enumeratorkFILE#

enumclassTransferOp:uint8_t#

Values:

enumeratorkREAD#

enumeratorkWRITE#

Functions

template<typename...Args> std::unique_ptr<BaseTransferAgent>makeTransferAgent( std::stringconst&backend, Args&&...args, )#

template<typename...Args> std::shared_ptr<BaseLoopbackAgent>makeLoopbackAgent( std::stringconst&backend, Args&&...args, )#

classAgentDesc#

Public Functions

inlineAgentDesc(std::stringbackendAgentDesc)#

inlinestd::stringconst&getBackendAgentDesc()constnoexcept#

Private Members

std::stringmBackendAgentDesc#

structBaseAgentConfig#

Public Members

std::stringmName#

booluseProgThread#

boolmultiThread#

classBaseLoopbackAgent#

Public Functions

virtual~BaseLoopbackAgent()=default#

virtualvoidexecuteLoopbackRequest( MemoryDescsconst&memoryDescs, FileDescsconst&fileDescs, boolisOffload, )=0#

classBaseTransferAgent#

Public Functions

virtual~BaseTransferAgent()=default#

virtualvoidregisterMemory(RegisterDescsconst&descs)=0#

Parameters:: descs – Describe the memory regions to be registered.

virtualvoidderegisterMemory(RegisterDescsconst&descs)=0#

Unregister a memory region.

Parameters:: descs – Describe the memory regions to be unregistered.

virtualvoidloadRemoteAgent( std::stringconst&name, AgentDescconst&agentDesc, )=0#

Initialize and establish a connection with a remote agent.

Parameters:

name – Specify the name of the remote agent.
agentDesc – Provide the necessary communication details for connecting to the remote agent.

virtualvoidloadRemoteAgent( std::stringconst&name, ConnectionInfoTypeconst&connectionInfo, )=0#

Initialize and establish a connection with a remote agent.

Parameters:

name – Specify the name of the remote agent.
connectionInfo – Provide the necessary communication details for connecting to the remote agent.

virtualvoidinvalidateRemoteAgent(std::stringconst&name)=0#

Invalidate a connection with a remote agent.

Parameters:: name – Specify the name of the remote agent.

virtualAgentDescgetLocalAgentDesc()=0#

Fetch the descriptor of the local agent.

Returns:: The descriptor of the local agent.

virtualConnectionInfoTypegetLocalConnectionInfo()=0#

Fetch the connection information of the local agent.

Returns:: The connection information of the local agent.

virtualstd::unique_ptr<TransferStatus>submitTransferRequests( TransferRequestconst&request, )=0#

Initiate the transfer by submitting the request.

Parameters:: request – Specify the transmission request.
Returns:: The status of the requests.

virtualvoidnotifySyncMessage( std::stringconst&name, SyncMessageconst&syncMessage, )=0#

Generate a notification, not bound to a transfer, e.g., for control.

Parameters:

name – Specify the name of the remote agent to which the information should be sent.
syncMessage – The data or message intended for synchronization.

virtualstd::unordered_map<std::string,std::vector<SyncMessage>>getNotifiedSyncMessages( )=0#

Retrieve notification messages sent by other agents.

Returns:: A mapping from remote agent names to their respective notification messages.

virtualboolcheckRemoteDescs( std::stringconst&name, MemoryDescsconst&memoryDescs, )=0#

Check if metadata is available for a remote agent.

Returns:: Whether the metadata is available for a remote agent.

classDynLibLoader#

Public Functions

void*getHandle(std::stringconst&name)#

template<typenameFunctionT> inlineFunctionTgetFunctionPointer( std::stringconst&libName, std::stringconst&funcName, )#

~DynLibLoader()#

DynLibLoader()=default#

DynLibLoader(DynLibLoaderconst&)=delete#

DynLibLoader&operator=(DynLibLoaderconst&)=delete#

Public Static Functions

staticDynLibLoader&getInstance()#

Private Members

std::mutexmDllMutex#

std::unordered_map<std::string,void*>mHandlers#

Private Static Functions

staticvoid*dlSym(void*handle,charconst*symbol)#

classFileDesc#

Public Functions

inlineFileDesc( std::stringconst&filename, intflags, mode_tmode, size_tlen, )#

inlineFileDesc(FileDesc&&other)noexcept#

inlineFileDesc&operator=(FileDesc&&other)noexcept#

inline~FileDesc()#

inlineuint64_tgetFd()constnoexcept#

inlinesize_tgetLen()constnoexcept#

FileDesc(FileDescconst&)=delete#

FileDesc&operator=(FileDescconst&)=delete#

Private Members

intfd#

size_tmLen#

classFileDescs#

Public Functions

inlineFileDescs(std::vector<FileDesc>&&descs)#

inlinestd::vector<FileDesc>const&getDescs()constnoexcept#

Private Members

std::vector<FileDesc>mDescs#

classMemoryDesc#

Public Functions

inlineMemoryDesc( std::vector<char>const&vec, uint32_tdeviceId=0, )#

inlineMemoryDesc(void*addr,size_tlen,uint32_tdeviceId)#

inlineMemoryDesc(uintptr_taddr,size_tlen,uint32_tdeviceId)#

inlineuintptr_tgetAddr()constnoexcept#

inlinesize_tgetLen()constnoexcept#

inlineuint32_tgetDeviceId()constnoexcept#

Public Static Functions

staticvoidserialize(MemoryDescconst&memoryDesc,std::ostream&os)#

staticMemoryDescdeserialize(std::istream&is)#

staticsize_tserializedSize(MemoryDescconst&memoryDesc)#

Private Members

uintptr_tmAddr#

size_tmLen#

uint32_tmDeviceId#

classMemoryDescs#

Public Functions

inlineMemoryDescs(MemoryTypetype,std::vector<MemoryDesc>descs)#

inlineMemoryTypegetType()constnoexcept#

inlinestd::vector<MemoryDesc>const&getDescs()constnoexcept#

Private Members

MemoryTypemType#

std::vector<MemoryDesc>mDescs#

classTransferRequest#

Public Functions

inlineTransferRequest( TransferOpop, TransferDescssrcDescs, TransferDescsdstDescs, std::stringconst&remoteName, std::optional<SyncMessage>syncMessage=std::nullopt, )#

The constructor of TransferRequest.

Parameters:

op – The transfer operation to perform (kREAD or kWRITE).
srcDescs – Description of the source memory region.
dstDescs – Description of the destination memory region.
remoteName – Name of the remote counterpart.
syncMessage – Synchronization information for the end of the transfer.

inlineTransferOpgetOp()constnoexcept#

inlineTransferDescsconst&getSrcDescs()constnoexcept#

inlineTransferDescsconst&getDstDescs()constnoexcept#

inlinestd::stringconst&getRemoteName()constnoexcept#

inlinestd::optional<SyncMessage>getSyncMessage()constnoexcept#

Private Members

TransferOpmOp#

TransferDescsmSrcDescs#

TransferDescsmDstDescs#

std::stringmRemoteName#

std::optional<SyncMessage>mSyncMessage#

classTransferStatus#

Public Functions

virtual~TransferStatus()=default#

virtualboolisCompleted()const=0#

virtualvoidwait()const=0#

serialization.h#

namespacetensorrt_llm

namespaceexecutor

classSerialization#

Public Static Functions

staticsize_tserializedSize( tensorrt_llm::batch_manager::kv_cache_manager::BlockKeyconst&key, )#

staticvoidserialize( tensorrt_llm::batch_manager::kv_cache_manager::BlockKeyconst&key, std::ostream&os, )#

statictensorrt_llm::batch_manager::kv_cache_manager::BlockKeydeserializeBlockKey( std::istream&is, )#

staticRequestPerfMetrics::TimePointdeserializeTimePoint( std::istream&is, )#

staticvoidserialize( RequestPerfMetrics::TimePointconst&tp, std::ostream&os, )#

staticsize_tserializedSize(RequestPerfMetrics::TimePointconst&)#

staticRequestPerfMetricsdeserializeRequestPerfMetrics( std::istream&is, )#

staticvoidserialize( RequestPerfMetricsconst&metrics, std::ostream&os, )#

staticsize_tserializedSize(RequestPerfMetricsconst&metrics)#

staticSamplingConfigdeserializeSamplingConfig(std::istream&is)#

staticvoidserialize(SamplingConfigconst&config,std::ostream&os)#

staticsize_tserializedSize(SamplingConfigconst&config)#

staticOutputConfigdeserializeOutputConfig(std::istream&is)#

staticvoidserialize(OutputConfigconst&config,std::ostream&os)#

staticsize_tserializedSize(OutputConfigconst&config)#

staticAdditionalModelOutputdeserializeAdditionalModelOutput( std::istream&is, )#

staticvoidserialize( AdditionalModelOutputconst&additionalModelOutput, std::ostream&os, )#

staticsize_tserializedSize( AdditionalModelOutputconst&additionalModelOutput, )#

staticExternalDraftTokensConfigdeserializeExternalDraftTokensConfig( std::istream&is, )#

staticvoidserialize( ExternalDraftTokensConfigconst&config, std::ostream&os, )#

staticsize_tserializedSize(ExternalDraftTokensConfigconst&config)#

staticPromptTuningConfigdeserializePromptTuningConfig( std::istream&is, )#

staticvoidserialize( PromptTuningConfigconst&config, std::ostream&os, )#

staticsize_tserializedSize(PromptTuningConfigconst&config)#

staticMultimodalInputdeserializeMultimodalInput(std::istream&is)#

staticvoidserialize( MultimodalInputconst&multimodalInput, std::ostream&os, )#

staticsize_tserializedSize(MultimodalInputconst&multimodalInput)#

staticMropeConfigdeserializeMropeConfig(std::istream&is)#

staticvoidserialize(MropeConfigconst&config,std::ostream&os)#

staticsize_tserializedSize(MropeConfigconst&config)#

staticLoraConfigdeserializeLoraConfig(std::istream&is)#

staticvoidserialize(LoraConfigconst&config,std::ostream&os)#

staticsize_tserializedSize(LoraConfigconst&config)#

statickv_cache::CommStatedeserializeCommState(std::istream&is)#

staticvoidserialize( kv_cache::CommStateconst&state, std::ostream&os, )#

staticsize_tserializedSize(kv_cache::CommStateconst&state)#

statickv_cache::SocketStatedeserializeSocketState(std::istream&is)#

staticvoidserialize( kv_cache::SocketStateconst&state, std::ostream&os, )#

staticsize_tserializedSize(kv_cache::SocketStateconst&state)#

statickv_cache::AgentStatedeserializeAgentState(std::istream&is)#

staticvoidserialize( kv_cache::AgentStateconst&state, std::ostream&os, )#

staticsize_tserializedSize(kv_cache::AgentStateconst&state)#

statickv_cache::CacheStatedeserializeCacheState(std::istream&is)#

staticvoidserialize( kv_cache::CacheStateconst&state, std::ostream&os, )#

staticsize_tserializedSize(kv_cache::CacheStateconst&state)#

staticDataTransceiverStatedeserializeDataTransceiverState( std::istream&is, )#

staticDataTransceiverStatedeserializeDataTransceiverState( std::vector<char>&buffer, )#

staticvoidserialize( DataTransceiverStateconst&dataTransceiverState, std::ostream&os, )#

staticstd::vector<char>serialize( DataTransceiverStateconst&dataTransceiverState, )#

staticsize_tserializedSize( DataTransceiverStateconst&dataTransceiverState, )#

staticContextPhaseParamsdeserializeContextPhaseParams( std::istream&is, )#

staticvoidserialize( ContextPhaseParamsconst&contextPhaseParams, std::ostream&os, )#

staticsize_tserializedSize( ContextPhaseParamsconst&contextPhaseParams, )#

staticRequestdeserializeRequest(std::istream&is)#

staticvoidserialize(Requestconst&request,std::ostream&os)#

staticsize_tserializedSize(Requestconst&request)#

staticTensordeserializeTensor(std::istream&is)#

staticvoidserialize(Tensorconst&tensor,std::ostream&os)#

staticsize_tserializedSize(Tensorconst&tensor)#

staticSpeculativeDecodingFastLogitsInfodeserializeSpecDecFastLogitsInfo( std::istream&is, )#

staticvoidserialize( SpeculativeDecodingFastLogitsInfoconst&info, std::ostream&os, )#

staticsize_tserializedSize( SpeculativeDecodingFastLogitsInfoconst&info, )#

staticResultdeserializeResult(std::istream&is)#

staticvoidserialize(Resultconst&result,std::ostream&os)#

staticsize_tserializedSize(Resultconst&result)#

staticAdditionalOutputdeserializeAdditionalOutput(std::istream&is)#

staticvoidserialize( AdditionalOutputconst&additionalOutput, std::ostream&os, )#

staticsize_tserializedSize( AdditionalOutputconst&additionalOutput, )#

staticResponsedeserializeResponse(std::istream&is)#

staticvoidserialize(Responseconst&response,std::ostream&os)#

staticsize_tserializedSize(Responseconst&response)#

staticstd::vector<Response>deserializeResponses( std::vector<char>&buffer, )#

staticstd::vector<char>serialize( std::vector<Response>const&responses, )#

staticKvCacheConfigdeserializeKvCacheConfig(std::istream&is)#

staticvoidserialize( KvCacheConfigconst&kvCacheConfig, std::ostream&os, )#

staticsize_tserializedSize(KvCacheConfigconst&kvCacheConfig)#

staticDynamicBatchConfigdeserializeDynamicBatchConfig( std::istream&is, )#

staticvoidserialize( DynamicBatchConfigconst&dynamicBatchConfig, std::ostream&os, )#

staticsize_tserializedSize( DynamicBatchConfigconst&dynamicBatchConfig, )#

staticSchedulerConfigdeserializeSchedulerConfig(std::istream&is)#

staticvoidserialize( SchedulerConfigconst&schedulerConfig, std::ostream&os, )#

staticsize_tserializedSize(SchedulerConfigconst&schedulerConfig)#

staticExtendedRuntimePerfKnobConfigdeserializeExtendedRuntimePerfKnobConfig( std::istream&is, )#

staticvoidserialize( ExtendedRuntimePerfKnobConfigconst&extendedRuntimePerfKnobConfig, std::ostream&os, )#

staticsize_tserializedSize( ExtendedRuntimePerfKnobConfigconst&extendedRuntimePerfKnobConfig, )#

staticParallelConfigdeserializeParallelConfig(std::istream&is)#

staticvoidserialize( ParallelConfigconst&parallelConfig, std::ostream&os, )#

staticsize_tserializedSize(ParallelConfigconst&parallelConfig)#

staticPeftCacheConfigdeserializePeftCacheConfig(std::istream&is)#

staticvoidserialize( PeftCacheConfigconst&peftCacheConfig, std::ostream&os, )#

staticsize_tserializedSize(PeftCacheConfigconst&peftCacheConfig)#

staticOrchestratorConfigdeserializeOrchestratorConfig( std::istream&is, )#

staticvoidserialize( OrchestratorConfigconst&orchestratorConfig, std::ostream&os, )#

staticsize_tserializedSize( OrchestratorConfigconst&orchestratorConfig, )#

staticDecodingModedeserializeDecodingMode(std::istream&is)#

staticvoidserialize( DecodingModeconst&decodingMode, std::ostream&os, )#

staticsize_tserializedSize(DecodingModeconst&decodingMode)#

staticLookaheadDecodingConfigdeserializeLookaheadDecodingConfig( std::istream&is, )#

staticvoidserialize( LookaheadDecodingConfigconst&lookaheadDecodingConfig, std::ostream&os, )#

staticsize_tserializedSize( LookaheadDecodingConfigconst&lookaheadDecodingConfig, )#

staticEagleConfigdeserializeEagleConfig(std::istream&is)#

staticvoidserialize( EagleConfigconst&eagleConfig, std::ostream&os, )#

staticsize_tserializedSize(EagleConfigconst&eagleConfig)#

staticSpeculativeDecodingConfigdeserializeSpeculativeDecodingConfig( std::istream&is, )#

staticvoidserialize( SpeculativeDecodingConfigconst&specDecConfig, std::ostream&os, )#

staticsize_tserializedSize( SpeculativeDecodingConfigconst&specDecConfig, )#

staticGuidedDecodingConfigdeserializeGuidedDecodingConfig( std::istream&is, )#

staticvoidserialize( GuidedDecodingConfigconst&guidedDecodingConfig, std::ostream&os, )#

staticsize_tserializedSize( GuidedDecodingConfigconst&guidedDecodingConfig, )#

staticGuidedDecodingParamsdeserializeGuidedDecodingParams( std::istream&is, )#

staticvoidserialize( GuidedDecodingParamsconst&guidedDecodingParams, std::ostream&os, )#

staticsize_tserializedSize( GuidedDecodingParamsconst&guidedDecodingParams, )#

staticKvCacheRetentionConfigdeserializeKvCacheRetentionConfig( std::istream&is, )#

staticvoidserialize( KvCacheRetentionConfigconst&kvCacheRetentionConfig, std::ostream&os, )#

staticsize_tserializedSize( KvCacheRetentionConfigconst&kvCacheRetentionConfig, )#

staticKvCacheRetentionConfig::TokenRangeRetentionConfigdeserializeTokenRangeRetentionConfig( std::istream&is, )#

staticvoidserialize( KvCacheRetentionConfig::TokenRangeRetentionConfigconst&tokenRangeRetentionConfig, std::ostream&os, )#

staticsize_tserializedSize( KvCacheRetentionConfig::TokenRangeRetentionConfigconst&tokenRangeRetentionConfig, )#

staticDecodingConfigdeserializeDecodingConfig(std::istream&is)#

staticvoidserialize( DecodingConfigconst&decodingConfig, std::ostream&os, )#

staticsize_tserializedSize(DecodingConfigconst&decodingConfig)#

staticDebugConfigdeserializeDebugConfig(std::istream&is)#

staticvoidserialize( DebugConfigconst&debugConfig, std::ostream&os, )#

staticsize_tserializedSize(DebugConfigconst&debugConfig)#

staticCacheTransceiverConfigdeserializeCacheTransceiverConfig( std::istream&is, )#

staticvoidserialize( CacheTransceiverConfigconst&cacheTransceiverConfig, std::ostream&os, )#

staticsize_tserializedSize( CacheTransceiverConfigconst&cacheTransceiverConfig, )#

staticExecutorConfigdeserializeExecutorConfig(std::istream&is)#

staticvoidserialize( ExecutorConfigconst&executorConfig, std::ostream&os, )#

staticsize_tserializedSize(ExecutorConfigconst&executorConfig)#

staticKvCacheStatsdeserializeKvCacheStats(std::istream&is)#

staticvoidserialize( KvCacheStatsconst&kvCacheStats, std::ostream&os, )#

staticsize_tserializedSize(KvCacheStatsconst&kvCacheStats)#

staticStaticBatchingStatsdeserializeStaticBatchingStats( std::istream&is, )#

staticvoidserialize( StaticBatchingStatsconst&staticBatchingStats, std::ostream&os, )#

staticsize_tserializedSize( StaticBatchingStatsconst&staticBatchingStats, )#

staticInflightBatchingStatsdeserializeInflightBatchingStats( std::istream&is, )#

staticvoidserialize( InflightBatchingStatsconst&inflightBatchingStats, std::ostream&os, )#

staticsize_tserializedSize( InflightBatchingStatsconst&inflightBatchingStats, )#

staticSpecDecodingStatsdeserializeSpecDecodingStats( std::istream&is, )#

staticvoidserialize( SpecDecodingStatsconst&specDecodingStats, std::ostream&os, )#

staticsize_tserializedSize( SpecDecodingStatsconst&specDecodingStats, )#

staticIterationStatsdeserializeIterationStats( std::vector<char>&buffer, )#

staticIterationStatsdeserializeIterationStats(std::istream&is)#

staticvoidserialize( IterationStatsconst&iterStats, std::ostream&os, )#

staticstd::vector<char>serialize(IterationStatsconst&iterStats)#

staticsize_tserializedSize(IterationStatsconst&iterStats)#

staticstd::vector<char>serialize( std::vector<IterationStats>const&iterStatsVec, )#

staticstd::vector<IterationStats>deserializeIterationStatsVec( std::vector<char>&buffer, )#

staticDisServingRequestStatsdeserializeDisServingRequestStats( std::istream&is, )#

staticvoidserialize( DisServingRequestStatsconst&stats, std::ostream&os, )#

staticsize_tserializedSize( DisServingRequestStatsconst&disServingRequestStats, )#

staticRequestStagedeserializeRequestStage(std::istream&is)#

staticvoidserialize( RequestStageconst&requestStage, std::ostream&os, )#

staticsize_tserializedSize(RequestStageconst&requestStage)#

staticRequestStatsdeserializeRequestStats(std::istream&is)#

staticvoidserialize(RequestStatsconst&state,std::ostream&os)#

staticsize_tserializedSize(RequestStatsconst&state)#

staticRequestStatsPerIterationdeserializeRequestStatsPerIteration( std::istream&is, )#

staticRequestStatsPerIterationdeserializeRequestStatsPerIteration( std::vector<char>&buffer, )#

staticvoidserialize( RequestStatsPerIterationconst&state, std::ostream&os, )#

staticstd::vector<char>serialize( RequestStatsPerIterationconst&state, )#

staticsize_tserializedSize(RequestStatsPerIterationconst&state)#

staticstd::vector<char>serialize( std::vector<RequestStatsPerIteration>const&requestStatsVec, )#

staticstd::vector<RequestStatsPerIteration>deserializeRequestStatsPerIterationVec( std::vector<char>&buffer, )#

staticstd::vector<char>serialize( std::deque<KVCacheEvent>const&kvCacheEvents, )#

staticstd::deque<KVCacheEvent>deserializeKVCacheEvents( std::vector<char>&buffer, )#

staticsize_tserializedSize(KVCacheEventconst&event)#

staticvoidserialize(KVCacheEventconst&event,std::ostream&os)#

staticKVCacheEventdeserializeKVCacheEvent(std::istream&is)#

staticsize_tserializedSize(KVCacheCreatedDataconst&data)#

staticvoidserialize( KVCacheCreatedDataconst&data, std::ostream&os, )#

staticKVCacheCreatedDatadeserializeKVCacheCreatedData( std::istream&is, )#

staticsize_tserializedSize(KVCacheStoredDataconst&data)#

staticvoidserialize( KVCacheStoredDataconst&data, std::ostream&os, )#

staticKVCacheStoredDatadeserializeKVCacheStoredData( std::istream&is, )#

staticsize_tserializedSize(KVCacheStoredBlockDataconst&data)#

staticvoidserialize( KVCacheStoredBlockDataconst&data, std::ostream&os, )#

staticKVCacheStoredBlockDatadeserializeKVCacheStoredBlockData( std::istream&is, )#

staticsize_tserializedSize(KVCacheRemovedDataconst&data)#

staticvoidserialize( KVCacheRemovedDataconst&data, std::ostream&os, )#

staticKVCacheRemovedDatadeserializeKVCacheRemovedData( std::istream&is, )#

template<typenameT> staticsize_tserializedSize( KVCacheEventDiff<T>const&data, )#

template<typenameT> staticvoidserialize( KVCacheEventDiff<T>const&data, std::ostream&os, )#

template<typenameT> staticKVCacheEventDiff<T>deserializeKVCacheEventDiff( std::istream&is, )#

staticsize_tserializedSize(KVCacheUpdatedDataconst&data)#

staticvoidserialize( KVCacheUpdatedDataconst&data, std::ostream&os, )#

staticKVCacheUpdatedDatadeserializeKVCacheUpdatedData( std::istream&is, )#

staticsize_tserializedSize( tensorrt_llm::runtime::UniqueTokenconst&token, )#

staticvoidserialize( tensorrt_llm::runtime::UniqueTokenconst&token, std::ostream&os, )#

statictensorrt_llm::runtime::UniqueTokendeserializeUniqueToken( std::istream&is, )#

staticstd::stringdeserializeString(std::istream&is)#

staticbooldeserializeBool(std::istream&is)#

staticModelTypedeserializeModelType(std::istream&is)#

namespacekv_cache

types.h#

namespacetensorrt_llm

namespaceexecutor

Typedefs

usingTensorPtr=std::shared_ptr<Tensor>#

usingSizeType32=std::int32_t#

usingSizeType64=std::int64_t#

usingFloatType=float#

usingTokenIdType=std::int32_t#

usingVecTokens=std::vector<TokenIdType>#

usingBeamTokens=std::vector<VecTokens>#

usingIdType=std::uint64_t#

usingVecTokenExtraIds=std::vector<IdType>#

usingIterationType=std::uint64_t#

usingRandomSeedType=std::uint64_t#

usingVecLogProbs=std::vector<FloatType>#

usingStreamPtr=std::shared_ptr<tensorrt_llm::runtime::CudaStream>#

usingMillisecondsType=std::chrono::milliseconds#

usingCacheSaltIDType=std::uint64_t#

usingLogitsPostProcessor=std::function<void(IdType,Tensor&,BeamTokensconst&,StreamPtrconst&,std::optional<IdType>)>#

usingLogitsPostProcessorMap=std::unordered_map<std::string,LogitsPostProcessor>#

usingLogitsPostProcessorBatched=std::function<void(std::vector<IdType>const&,std::vector<Tensor>&,std::vector<std::reference_wrapper<BeamTokensconst>>const&,StreamPtrconst&,std::vector<std::optional<IdType>>const&)>#

usingMedusaChoices=std::vector<std::vector<SizeType32>>#

usingEagleChoices=std::vector<std::vector<SizeType32>>#

usingPriorityType=float#

usingBufferView=std::basic_string_view<uint8_t>#

Enums

enumclassDataType#

Values:

enumeratorkBOOL#

enumeratorkUINT8#

enumeratorkINT8#

enumeratorkINT32#

enumeratorkINT64#

enumeratorkBF16#

enumeratorkFP8#

enumeratorkFP16#

enumeratorkFP32#

enumeratorkUNKNOWN#

enumclassRequestType#

Values:

enumeratorREQUEST_TYPE_CONTEXT_AND_GENERATION#

enumeratorREQUEST_TYPE_CONTEXT_ONLY#

enumeratorREQUEST_TYPE_GENERATION_ONLY#

enumclassMemoryType#

Values:

enumeratorkCPU#

enumeratorkCPU_PINNED#

enumeratorkCPU_PINNEDPOOL#

enumeratorkGPU#

enumeratorkUVM#

enumeratorkUNKNOWN#

enumclassModelType#

Values:

enumeratorkDECODER_ONLY#

enumeratorkENCODER_ONLY#

enumeratorkENCODER_DECODER#

enumclassBatchingType#

The batching type.

Values:

enumeratorkSTATIC#: STATIC refers to the traditional batching scheme with a batch of requests running in lockstep until the full generation for all of them is complete. Requests in a batch are all padded up to the maximum input and output sequence length of any member of the batch.

enumeratorkINFLIGHT#: INFLIGHT refers to a scheme where newly arrived requests are dynamically incorporated into the batch under execution, and requests are returned as soon as the end condition is met without any padding.

enumclassCapacitySchedulerPolicy#

The policy used to select the subset of available requests in each iteration of the executor generation loop.

Values:

enumeratorkMAX_UTILIZATION#: MAX_UTILIZATION packs as many requests as the underlying TRT engine can support in any iteration of the InflightBatching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.

enumeratorkGUARANTEED_NO_EVICT#: GUARANTEED_NO_EVICT uses KV cache more conservatively guaranteeing that a request, once started, will run to completion without eviction.

enumeratorkSTATIC_BATCH#: kSTATIC_BATCH does not schedule new requests until all requests in current batch are completed. Similar to kGUARANTEED_NO_EVICT, requests will run to completion without eviction.

enumclassContextChunkingPolicy#

Values:

enumeratorkFIRST_COME_FIRST_SERVED#: Sequential chunking, complete the unfinished context phase first.

enumeratorkEQUAL_PROGRESS#: Iterate through each context request in sequence and attempt to increase its chunk count until the constraint is exceeded.

enumclassCommunicationType#

Values:

enumeratorkMPI#

enumclassCommunicationMode#

Values:

enumeratorkLEADER#

enumeratorkORCHESTRATOR#

enumclassRequestStage#

Enum class that represents the state of a request.

Values:

enumeratorkQUEUED#: Requests that have been received but not yet included in the active requests (due to constraints such as maximum batch size for example).

enumeratorkENCODER_IN_PROGRESS#: Active request in encoder phase.

enumeratorkCONTEXT_IN_PROGRESS#: Active request in context phase.

enumeratorkGENERATION_IN_PROGRESS#: Active request in generation phase.

enumeratorkGENERATION_COMPLETE#: Active request for which generation has completed.

enumclassFinishReason#

The reason why the model stopped generating tokens for a request.

Values:

enumeratorkNOT_FINISHED#: The request is not finished.

enumeratorkEND_ID#: The request finished because the end id was generated.

enumeratorkSTOP_WORDS#: The request finished because a stop word was generated.

enumeratorkLENGTH#: The request finished because the maximum number of tokens was reached.

enumeratorkTIMED_OUT#: The request finished because it got timed out (via the mAllotedTime parameter)

enumeratorkCANCELLED#: The request was cancelled by calling cancelRequest.

enumclassKvCacheTransferMode#

Enum describing the transfer mode for KV cache.

Values:

enumeratorDRAM#: Copy to/from CPU memory (original approach).

enumeratorGDS#: Attempt GPUDirect Storage (cuFile).

enumeratorPOSIX_DEBUG_FALLBACK#: Force a POSIX read/write for debugging.

Functions

std::ostream&operator<<( std::ostream&os, CapacitySchedulerPolicypolicy, )#

std::ostream&operator<<( std::ostream&os, ContextChunkingPolicypolicy, )#

structDebugTensorsPerIteration#

#include <types.h>

Struct that holds the debug tensors in an iteration.

Public Members

IterationTypeiter#: The iteration id for these tensors.

std::map<std::string,Tensor>debugTensors#: The debug tensors for this iteration.

classDecodingMode#

#include <types.h>

Mode of the decoder.

Public Types

usingUnderlyingType=uint32_t#

Public Functions

inlineautoconstexpruseTemperature(booluseTemp)#

inlineautoconstexpruseOccurrencePenalties(boolusePenalty)#

inlineautoconstexprusePresencePenalty(boolusePenalty)#

inlineautoconstexpruseRepetitionPenalty(boolusePenalty)#

inlineautoconstexpruseFrequencyPenalty(boolusePenalty)#

inlineautoconstexpruseMinLength(booluseMinLen)#

inlineautoconstexpruseBanTokens(boolbanTokens)#

inlineautoconstexpruseBanWords(boolbanWords)#

inlineautoconstexpruseNoRepeatNgramSize(boolnoRepeatNgramSize)#

inlineautoconstexpruseStopWords(boolstopWords)#

inlineautoconstexpruseMaxLengthStop(boolmaxLengthStop)#

inlineautoconstexpruseExplicitEosStop(boolexplicitEosStop)#

inlineautoconstexpruseMinP(booluseMinP)#

inlineautoconstexpruseVariableBeamWidthSearch( booluseVariableBeamWidthSearch, )#

inlineboolconstexprisAuto()const#

inlineboolconstexprisTopK()const#

inlineboolconstexprisTopP()const#

inlineboolconstexprisTopKorTopP()const#

inlineboolconstexprisTopKandTopP()const#

inlineboolconstexprisBeamSearch()const#

inlineboolconstexprisMedusa()const#

inlineboolconstexprisLookahead()const#

inlineboolconstexprisExplicitDraftTokens()const#

inlineboolconstexprisExternalDraftTokens()const#

inlineboolconstexprisEagle()const#

inlineboolconstexprisUseTemperature()const#

inlineboolconstexprisUsePresencePenalty()const#

inlineboolconstexprisUseFrequencyPenalty()const#

inlineboolconstexprisUseRepetitionPenalty()const#

inlineboolconstexprisUseMinLength()const#

inlineboolconstexprisUseOccurrencePenalty()const#

inlineboolconstexprisUsePenalty()const#

inlineboolconstexprisUseBanWords()const#

inlineboolconstexprisUseNoRepeatNgramSize()const#

inlineboolconstexprisUseBanTokens()const#

inlineboolconstexprisUseStopWords()const#

inlineboolconstexprisUseMaxLengthStop()const#

inlineboolconstexprisUseExplicitEosStop()const#

inlineboolconstexprisUseStopCriteria()const#

inlineboolconstexprisUseMinP()const#

inlineboolconstexprisUseVariableBeamWidthSearch()const#

inlinebooloperator==(DecodingModeconst&other)const#

inlineexplicitconstexprDecodingMode(UnderlyingTypestate)#

inlineconstexprUnderlyingTypegetState()const#

inlineconstexprcharconst*getName()const#

Public Static Functions

staticinlineautoconstexprAuto()#: No mode specified. Config will be determined from the beam width of the first request at runtime: TopKTopP if beamWidth == 1, BeamSearch otherwise.

staticinlineautoconstexprTopK()#

staticinlineautoconstexprTopP()#

staticinlineautoconstexprTopKTopP()#

staticinlineautoconstexprBeamSearch()#

staticinlineautoconstexprMedusa()#

staticinlineautoconstexprLookahead()#

staticinlineautoconstexprExplicitDraftTokens()#

staticinlineautoconstexprExternalDraftTokens()#

staticinlineautoconstexprEagle()#

Private Functions

inlineboolconstexpranyBitSet(UnderlyingTypebits)const#

inlineboolconstexprallBitSet(UnderlyingTypebits)const#

inlineUnderlyingTypeconstexprsetBitTo( UnderlyingTypestate, boolx, )#

Private Members

UnderlyingTypemState={}#

Private Static Attributes

staticSizeType32constexprkNumFlags={12}#

staticUnderlyingTypeconstexprkUseRepetitionPenalties={1u<<0}#

staticUnderlyingTypeconstexprkUseFrequencyPenalties={1u<<1}#

staticUnderlyingTypeconstexprkUsePresencePenalties={1u<<2}#

staticUnderlyingTypeconstexprkUseTemperature={1u<<3}#

staticUnderlyingTypeconstexprkUseMinLength={1u<<4}#

staticUnderlyingTypeconstexprkUseBanWords={1u<<5}#

staticUnderlyingTypeconstexprkUseStopWords={1u<<6}#

staticUnderlyingTypeconstexprkUseMaxLengthStop={1u<<7}#

staticUnderlyingTypeconstexprkUseExplicitEosStop={1u<<8}#

staticUnderlyingTypeconstexprkUseNoRepeatNgramSize={1u<<9}#

staticUnderlyingTypeconstexprkUseMinP={1u<<10}#

staticUnderlyingTypeconstexprkUseVariableBeamWidthSearch={1u<<11}#

staticUnderlyingTypeconstexprkUseStandardStopCriteria={kUseStopWords|kUseMaxLengthStop}#

staticUnderlyingTypeconstexprkUseOccurrencePenalties{kUseRepetitionPenalties|kUseFrequencyPenalties|kUsePresencePenalties}#

staticUnderlyingTypeconstexprkUsePenalties={kUseOccurrencePenalties|kUseTemperature|kUseMinLength}#

staticUnderlyingTypeconstexprkUseBanTokens={kUseNoRepeatNgramSize|kUseBanWords}#

staticUnderlyingTypeconstexprkAuto={1u<<(kNumFlags+0)}#

staticUnderlyingTypeconstexprkTopK={1u<<(kNumFlags+1)}#

staticUnderlyingTypeconstexprkTopP={1u<<(kNumFlags+2)}#

staticUnderlyingTypeconstexprkBeamSearch={1u<<(kNumFlags+3)}#

staticUnderlyingTypeconstexprkMedusa={1u<<(kNumFlags+4)}#

staticUnderlyingTypeconstexprkLookahead={1u<<(kNumFlags+5)}#

staticUnderlyingTypeconstexprkExplicitDraftTokens={1u<<(kNumFlags+6)}#

staticUnderlyingTypeconstexprkExternalDraftTokens={1u<<(kNumFlags+7)}#

staticUnderlyingTypeconstexprkEagle={1u<<(kNumFlags+8)}#

staticUnderlyingTypeconstexprkTopKTopP={kTopK|kTopP}#

structDisServingRequestStats#

#include <types.h>

Struct that holds the request stats in the case of disaggregated serving.

Public Members

doublekvCacheTransferMS#: The total time spent on transferring KV cache from context phase to generation phase (ms)

size_tkvCacheSize#: The total size of KV cache transferred from context phase to generation phase (bytes)

structInflightBatchingStats#

#include <types.h>

Struct that holds the stats of inflight batching models for a single iteration.

Public Members

SizeType32numScheduledRequests#: Number of scheduled requests.

SizeType32numContextRequests#: Number of requests in context stage.

SizeType32numGenRequests#: Number of requests in generation stage.

SizeType32numPausedRequests#: Number of paused requests.

SizeType32numCtxTokens#: Total number of context tokens in the iteration.

SizeType32microBatchId#: Index of micro batch.

floatavgNumDecodedTokensPerIter#: Average number of tokens decoded per request per iteration.

structIterationStats#

#include <types.h>

Struct that holds the stats of a single iteration.

Public Members

std::stringtimestamp#: Ending time of this iteration.

IterationTypeiter#: Iteration id.

doubleiterLatencyMS#: Iteration latency (ms)

doublenewActiveRequestsQueueLatencyMS#: The total time spent in queue by the requests that became active in this iteration (ms)

SizeType32numNewActiveRequests#: Number of new fetched active requests.

SizeType32numActiveRequests#: Number of active requests.

SizeType32numQueuedRequests#: Number of queued requests.

SizeType32numCompletedRequests#: Number of requests that were completed in this iteration.

SizeType32maxNumActiveRequests#: Number of max active requests.

SizeType32maxBatchSizeStatic#: Static max batch size passed to the executor.

SizeType32maxBatchSizeTunerRecommended#: Batch size produced by dynamic tuner based on input stats.

SizeType32maxBatchSizeRuntime#: The min of maxBatchSizeStatic and maxBatchSizeRuntimeUpperbound.

SizeType32maxNumTokensStatic#: Static max num tokens passed to the executor.

SizeType32maxNumTokensTunerRecommended#: Max num tokens produced by dynamic tuner based on input stats.

SizeType32maxNumTokensRuntime#: The runtime max num tokens.

size_tgpuMemUsage#: GPU memory usage in bytes.

size_tcpuMemUsage#: CPU memory usage in bytes.

size_tpinnedMemUsage#: Pinned memory usage in bytes.

std::optional<KvCacheStats>kvCacheStats#: Stats specific to KV caches.

std::optional<KvCacheStats>crossKvCacheStats#: Stats specific to cross KV caches.

std::optional<StaticBatchingStats>staticBatchingStats#: Stats specific to static batching.

std::optional<InflightBatchingStats>inflightBatchingStats#: Stats specific to inflight batching.

std::optional<SpecDecodingStats>specDecodingStats#: Stats specific to speculative decoding.

structKvCacheStats#

#include <types.h>

Struct that holds the stats of a KV cache manager.

Public Members

SizeType32maxNumBlocks#: Max number of blocks.

SizeType32freeNumBlocks#: Number of free blocks.

SizeType32usedNumBlocks#: Number of used blocks.

SizeType32tokensPerBlock#: Number of tokens per block.

SizeType32allocTotalBlocks#: Number of total allocated blocks.

SizeType32allocNewBlocks#: Number of newly allocated blocks.

SizeType32reusedBlocks#: Number of reused blocks.

SizeType32missedBlocks#: Number of not reused blocks.

floatcacheHitRate#: Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).

structRequestPerfMetrics#

#include <types.h>

Struct that holds the stats of a request.

Public Types

usingTimePoint=std::chrono::time_point<std::chrono::steady_clock>#

Public Members

TimingMetricstimingMetrics#

KvCacheMetricskvCacheMetrics#

SpeculativeDecodingMetricsspeculativeDecoding#

std::optional<IterationType>firstIter#: First iteration where the request was processed.

std::optional<IterationType>lastIter#: Last iteration where a token was generated.

std::optional<IterationType>iter#: Current iteration.

structKvCacheMetrics#

Public Members

SizeType32numTotalAllocatedBlocks={0}#: Number of total allocated blocks.

SizeType32numNewAllocatedBlocks={0}#: Number of newly allocated blocks.

SizeType32numReusedBlocks={0}#: Number of reused blocks.

SizeType32numMissedBlocks={0}#: Number of missed blocks.

FloatTypekvCacheHitRate={0.f}#: KV Cache Hit Rate, defined as reusedBlocks / (reusedBlocks + missedBlocks)

structSpeculativeDecodingMetrics#

Public Members

FloatTypeacceptanceRate={0.f}#: Token acceptance rate for speculative decoding requests.

SizeType32totalAcceptedDraftTokens={0}#: Total number of accepted draft tokens.

SizeType32totalDraftTokens={0}#: Total number of draft tokens used in the request.

structTimingMetrics#

Public Members

TimePointarrivalTime#: The time when the request arrived.

TimePointfirstScheduledTime#: The time when the request was first scheduled.

TimePointfirstTokenTime#: The time when the first token was generated.

TimePointlastTokenTime#: The time when the request was finished.

TimePointkvCacheTransferStart#: Start time of the KV cache transfer for disaggregated serving.

TimePointkvCacheTransferEnd#: End time of the KV cache transfer for disaggregated serving.

size_tkvCacheSize=0#: Size of the KV cache transferred for disaggregated serving.

structRequestStats#

#include <types.h>

Struct that holds the stats of a single request.

Public Members

IdTypeid#: The request id.

RequestStagestage#: The current stage the request is in.

SizeType32contextPrefillPosition#: If using chunked context, the current context prefill position.

SizeType32numGeneratedTokens#: The number of generated tokens so far.

floatavgNumDecodedTokensPerIter#: The average number of decoded tokens per iteration. It is >= 1 for speculative decoding.

boolscheduled#: Whether the request is scheduled for the current iteration.

boolpaused#: Whether the request is being paused at the current iteration due to lack of resources (KV cache blocks exhaustion for example)

std::optional<DisServingRequestStats>disServingStats#: Stats specific to disaggregated serving.

SizeType32allocTotalBlocksPerRequest#: Number of total allocated blocks per request.

SizeType32allocNewBlocksPerRequest#: Number of newly allocated blocks per request.

SizeType32reusedBlocksPerRequest#: Number of reused blocks per request.

SizeType32missedBlocksPerRequest#: Number of missed blocks per request.

FloatTypekvCacheHitRatePerRequest#: KV Cache Hit Rate per request, defined as reusedBlocks / (reusedBlocks + missedBlocks)

structRequestStatsPerIteration#

#include <types.h>

Struct that holds the stats of all requests in an iteration.

Public Members

IterationTypeiter#: The iteration id for these stats.

std::vector<RequestStats>requestStats#: The stats of all active requests for this iteration.

structSpecDecodingStats#

#include <types.h>

Struct that holds speculative decoding stats.

Public Members

SizeType64numDraftTokens#: Total number of proposed draft tokens for all requests.

SizeType64numAcceptedTokens#: Total number of accepted draft tokens for all requests.

SizeType64numRequestsWithDraftTokens#: Number of requests with at least one draft token in batch.

doubleacceptanceLength#: Acceptance length, defined as average number of tokens produced per step for all requests with at least one draft token.

doubleiterLatencyMS#: Iteration latency for draft token generation only (ms)

doubledraftOverhead#: Draft overhead, defined as iterLatencyMS (specdec) / iterLatencyMS (total)

structStaticBatchingStats#

#include <types.h>

Struct that holds the stats of static batching models for a single iteration.

Public Members

SizeType32numScheduledRequests#: Number of scheduled requests.

SizeType32numContextRequests#: Number of requests in context stage.

SizeType32numCtxTokens#: Total number of context tokens in the iteration.

SizeType32numGenTokens#: Total number of tokens to generate in the iteration.

SizeType32emptyGenSlots#: Total number of unused generation token slots.

template<typenameT,bool=false> structTypeTraits#: #include <types.h>
For converting a C++ data type to a TrtLmmDataType.

template<> structTypeTraits<bool>#

Public Static Attributes

staticconstexprautovalue=DataType::kBOOL #

template<> structTypeTraits<float>#

Public Static Attributes

staticconstexprautovalue=DataType::kFP32 #

template<> structTypeTraits<half>#

Public Static Attributes

staticconstexprautovalue=DataType::kFP16 #

template<> structTypeTraits<std::int32_t>#

Public Static Attributes

staticconstexprautovalue=DataType::kINT32 #

template<> structTypeTraits<std::int64_t>#

Public Static Attributes

staticconstexprautovalue=DataType::kINT64 #

template<> structTypeTraits<std::int8_t>#

Public Static Attributes

staticconstexprautovalue=DataType::kINT8 #

template<> structTypeTraits<std::uint8_t>#

Public Static Attributes

staticconstexprautovalue=DataType::kUINT8 #

template<typenameT> structTypeTraits<T*>#

Public Static Attributes

staticconstexprautovalue=DataType::kINT64 #

namespaceruntime

executor.h#

namespacetensorrt_llm

namespacebatch_manager#

namespacekv_cache_manager#

namespaceexecutor

Typedefs

usingRetentionPriority=SizeType32 #

usingKVCacheEventData=std::variant<KVCacheCreatedData,KVCacheStoredData,KVCacheRemovedData,KVCacheUpdatedData>#

Functions

charconst*version()noexcept#: Version of TRT-LLM.

classAdditionalModelOutput#

#include <executor.h>

Additional output that should be gathered.

By default gather output of shape [beamWidth, x] from each generation phase. If gatherContext is true, also gather output of shape [promptLen, x] from context phase.

Public Functions

explicitAdditionalModelOutput( std::stringname, boolgatherContext=false, )#

booloperator==(AdditionalModelOutputconst&other)const#

Public Members

std::stringname#

boolgatherContext={false}#

structAdditionalOutput#

Public Functions

inlineAdditionalOutput(std::stringname,Tensoroutput)#

AdditionalOutput(AdditionalOutputconst&other)=default#

AdditionalOutput(AdditionalOutput&&other)noexcept=default#

AdditionalOutput&operator=(AdditionalOutputconst&other)=default#

AdditionalOutput&operator=( AdditionalOutput&&other, )noexcept=default#

~AdditionalOutput()=default#

Public Members

std::stringname#

Tensoroutput#

classCacheTransceiverConfig#

Public Types

enumclassBackendType:std::uint8_t#

Values:

enumeratorDEFAULT#

enumeratorMPI#

enumeratorUCX#

enumeratorNIXL#

Public Functions

explicitCacheTransceiverConfig( std::optional<BackendType>backendType=std::nullopt, std::optional<size_t>maxNumTokens=std::nullopt, std::optional<int>kvTransferTimeoutMs=std::nullopt, std::optional<int>kvTransferSenderFutureTimeoutMs=std::nullopt, )#

booloperator==(CacheTransceiverConfigconst&other)const#

voidsetBackendType(std::optional<BackendType>backendType)#

voidsetMaxTokensInBuffer(std::optional<size_t>maxTokensInBuffer)#

voidsetKvTransferTimeoutMs(std::optional<int>kvTransferTimeoutMs)#

voidsetKvTransferSenderFutureTimeoutMs( std::optional<int>kvTransferSenderFutureTimeoutMs, )#

std::optional<size_t>getMaxTokensInBuffer()const#

std::optional<BackendType>getBackendType()const#

std::optional<int>getKvTransferTimeoutMs()const#

std::optional<int>getKvTransferSenderFutureTimeoutMs()const#

Private Members

std::optional<BackendType>mBackendType#

std::optional<size_t>mMaxTokensInBuffer#: The maximum number of tokens that the CacheTransceiver’s pre-allocated buffer can hold. If the number of kvCache tokens to be transferred for a single request is greater than this value, the performance of the cache transfer may be degraded.

std::optional<int>mKvTransferTimeoutMs#

std::optional<int>mKvTransferSenderFutureTimeoutMs#

classContextPhaseParams#

Public Types

usingRequestIdType=std::uint64_t#

Public Functions

ContextPhaseParams( VecTokensfirstGenTokens, RequestIdTypereqId, std::optional<VecTokens>draftTokens, )#

ContextPhaseParams( VecTokensfirstGenTokens, RequestIdTypereqId, void*state, std::optional<VecTokens>draftTokens, )#

ContextPhaseParams( VecTokensfirstGenTokens, RequestIdTypereqId, std::vector<char>const&serializedState, std::optional<VecTokens>draftTokens, )#

ContextPhaseParams(ContextPhaseParamsconst&)#

ContextPhaseParams(ContextPhaseParams&&)noexcept#

ContextPhaseParams&operator=(ContextPhaseParamsconst&)#

ContextPhaseParams&operator=(ContextPhaseParams&&)noexcept#

~ContextPhaseParams()#

booloperator==(ContextPhaseParamsconst&)constnoexcept#

VecTokensconst&getFirstGenTokens()const&noexcept#

std::optional<VecTokens>const&getDraftTokens()const&noexcept#

VecTokenspopFirstGenTokens()&&noexcept#

RequestIdTypegetReqId()constnoexcept#

voidconst*getState()constnoexcept#

void*getState()noexcept#

void*releaseState()noexcept#

std::vector<char>getSerializedState()constnoexcept#

Private Types

usingStatePtr=std::unique_ptr<void,decltype(&deleter)>#

Private Members

RequestIdTypemReqId={0}#: This request corresponds to the request ID in the context phase.

VecTokensmFirstGenTokens#: The first tokens generated by context executor.

StatePtrmState={nullptr,deleter}#: Context phase state of this request.

std::optional<VecTokens>mDraftTokens#: The draft tokens generated by context executor.

Private Static Functions

staticvoiddeleter(voidconst*data)#

Friends

friendclassSerialization

classDebugConfig#

#include <executor.h>

Configuration class for debugging output.

Public Functions

explicitDebugConfig( booldebugInputTensors=false, booldebugOutputTensors=false, StringVecdebugTensorNames={}, SizeType32debugTensorsMaxIterations=0, )#

booloperator==(DebugConfigconst&other)const#

boolgetDebugInputTensors()const#

boolgetDebugOutputTensors()const#

StringVecconst&getDebugTensorNames()const#

SizeType32getDebugTensorsMaxIterations()const#

voidsetDebugInputTensors(booldebugInputTensors)#

voidsetDebugOutputTensors(booldebugOutputTensors)#

voidsetDebugTensorNames(StringVecconst&debugTensorNames)#

voidsetDebugTensorsMaxIterations( SizeType32debugTensorsMaxIterations, )#

Private Types

usingStringVec=std::vector<std::string>#

Private Members

boolmDebugInputTensors#: If true, debug all input tensors.

boolmDebugOutputTensors#: If true, debug all output tensors.

StringVecmDebugTensorNames#: If not empty, only debug tensors in this list.

SizeType32mDebugTensorsMaxIterations#: If > 0, provide debug tensors for at most debugTensorsMaxIterations past iterations, else dump them to files.

Friends

friendclassSerialization

classDecodingConfig#

#include <executor.h>

Configuration class for the decoding.

Public Functions

explicitDecodingConfig( std::optional<DecodingMode>decodingMode=std::nullopt, std::optional<LookaheadDecodingConfig>lookaheadDecodingConfig=std::nullopt, std::optional<MedusaChoices>medusaChoices=std::nullopt, std::optional<EagleConfig>eagleConfig=std::nullopt, )#

booloperator==(DecodingConfigconst&other)const#

voidsetDecodingMode(DecodingModeconst&)#: Sets decoding mode. Some modes require the use of their own setters.

std::optional<DecodingMode>getDecodingMode()const#

voidsetLookaheadDecodingConfig( LookaheadDecodingConfigconst&lookaheadDecodingConfig, )#: Sets lookahead decoding mode and config.

voidenableSeamlessLookaheadDecoding()#

std::optional<LookaheadDecodingConfig>getLookaheadDecodingConfig( )const#

SizeType32getLookaheadDecodingMaxNumRequest()const#

voidsetMedusaChoices(MedusaChoicesconst&)#: Sets medusa mode and config.

std::optional<MedusaChoices>getMedusaChoices()const#

voidsetEagleConfig(EagleConfigconst&)#: Sets eagle mode and config.

std::optional<EagleConfig>getEagleConfig()const#

Private Members

std::optional<DecodingMode>mDecodingMode#

std::optional<LookaheadDecodingConfig>mLookaheadDecodingConfig#

std::optional<MedusaChoices>mMedusaChoices#

std::optional<EagleConfig>mEagleConfig#

Private Static Attributes

staticconstexprSizeType32mLookaheadDecodingMaxNumRequest=8#

Friends

friendclassSerialization

classDynamicBatchConfig#

#include <executor.h>

Configuration class for dynamic tuning of batch size and max num tokens. During runtime the statistics of input and output lengths are recorded. Based on these statistics, the batch size and max num tokens are tuned dynamically to better serve the requests.

Public Functions

explicitDynamicBatchConfig( boolenableBatchSizeTuning=false, boolenableMaxNumTokensTuning=false, SizeType32dynamicBatchMovingAverageWindow=kDefaultDynamicBatchMovingAverageWindow, std::vector<std::pair<SizeType32,SizeType32>>batchSizeTable=kDefaultBatchSizeTable, )#

SizeType32getDynamicBatchMovingAverageWindow()const#

boolgetEnableBatchSizeTuning()const#

boolgetEnableMaxNumTokensTuning()const#

std::vector<std::pair<SizeType32,SizeType32>>getBatchSizeTable( )const#

Public Static Attributes

staticSizeType32constkDefaultDynamicBatchMovingAverageWindow=128#: The default window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens.

staticstd::vector<std::pair<SizeType32,SizeType32>>constkDefaultBatchSizeTable#: The default value of batch size table.

Private Members

boolmEnableBatchSizeTuning#: Controls if the batch size should be tuned dynamically.

boolmEnableMaxNumTokensTuning#: Controls if the max num tokens should be tuned dynamically.

SizeType32mDynamicBatchMovingAverageWindow#: The window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens.

std::vector<std::pair<SizeType32,SizeType32>>mBatchSizeTable#: A vector of (batchSizeLimit, batchSize). When max capacity batch size is less than.

Friends

friendclassSerialization

structEagleConfig#

Public Functions

explicitEagleConfig( std::optional<EagleChoices>eagleChoices=std::nullopt, boolgreedySampling=true, std::optional<float>posteriorThreshold=std::nullopt, booluseDynamicTree=false, std::optional<SizeType32>dynamicTreeMaxTopK=std::nullopt, )#

booloperator==(EagleConfigconst&other)const#

std::optional<EagleChoices>getEagleChoices()const#

std::optional<float>getPosteriorThreshold()const#

boolisGreedySampling()const#

booluseDynamicTree()const#

std::optional<SizeType32>getDynamicTreeMaxTopK()const#

Private Functions

std::optional<float>const&checkPosteriorValue( std::optional<float>const&value, )#

Private Members

std::optional<EagleChoices>mEagleChoices#: choices forming tree for EAGLE-1.

boolmGreedySampling#: Flag to use greedy or typical acceptance.

std::optional<float>mPosteriorThreshold#: Minimum token probability of the typical acceptance. Corresponds to epsilon in https://arxiv.org/pdf/2401.10774. Default is 0.09f.

boolmUseDynamicTree#: Flag to use Eagle-2.

std::optional<SizeType32>mDynamicTreeMaxTopK#: Number of draft tokens to expand for each node in Eagle-2.

Friends

friendclassSerialization

classExecutor#

#include <executor.h>

The executor is responsible for receiving new requests and sending responses, and running the inference.

Public Functions

Executor( std::filesystem::pathconst&modelPath, ModelTypemodelType, ExecutorConfigconst&executorConfig, )#

Parameters:

modelPath – Path to the folder that defines the model to run
modelType – The type of model
executorConfig – The configuration for the executor

Executor( std::filesystem::pathconst&encoderModelPath, std::filesystem::pathconst&decoderModelPath, ModelTypemodelType, ExecutorConfigconst&executorConfig, )#

Executor( BufferViewconst&engineBuffer, std::stringconst&jsonConfigStr, ModelTypemodelType, ExecutorConfigconst&executorConfig, std::optional<std::map<std::string,Tensor>>const&managedWeights=std::nullopt, )#

Executor( BufferViewconst&encoderEngineBuffer, std::stringconst&encoderJsonConfigStr, BufferViewconst&decoderEngineBuffer, std::stringconst&decoderJsonConfigStr, ModelTypemodelType, ExecutorConfigconst&executorConfig, )#

Executor( std::shared_ptr<Model>model, ExecutorConfigconst&executorConfig, )#

Executor( std::shared_ptr<Model>encoderModel, std::shared_ptr<Model>decoderModel, ExecutorConfigconst&executorConfig, )#

~Executor()#

Executor(Executorconst&executor)=delete#

Executor&operator=(Executorconst&executor)=delete#

Executor(Executor&&)=default#

Executor&operator=(Executor&&)=default#

IdTypeenqueueRequest(Requestconst&request)#

Enqueue a new request.

Parameters:: request – The LLM request which contains input tokens and request parameters
Returns:: A unique id that identifies the request

std::vector<IdType>enqueueRequests( std::vector<Request>const&requests, )#: Enqueue a batch of requests.

std::vector<Response>awaitResponses( std::optional<std::chrono::milliseconds>const&timeout=std::nullopt, )#

Await for ready responses.

This overload awaits for any ready responses. In particular, if several requests have been enqueued, this method will provide any ready responses without order guarantees.

Parameters:: timeout – The maximum time to wait for new responses
Returns:: A vector of responses

std::vector<Response>awaitResponses( IdTypeconst&requestId, std::optional<std::chrono::milliseconds>const&timeout=std::nullopt, )#

Await for ready responses.

Parameters:

requestId – A request id
timeout – The maximum time to wait for new responses

Returns:

A vector of responses

std::vector<std::vector<Response>>awaitResponses( std::vector<IdType>const&requestIds, std::optional<std::chrono::milliseconds>const&timeout=std::nullopt, )#

Await for multiple ready responses.

A multiple ID request behaves as if awaitResponses(IdType, timeout) were invoked on all IDs. The returned vector contains a vector of responses per ID in the same order specified by the requestIds. The same behaviour as awaitResponses(IdType, timeout) applies:
* Responses may be empty.
* If all responses have already been given for one of the requestIds, then this method will hang unless a timeout is specified.

Parameters:

requestIds – Ids requested
timeout – The maximum time to wait for new responses

Returns:

A vector of vector of responses

SizeType32getNumResponsesReady( std::optional<IdType>const&requestId=std::nullopt, )const#

Get the number of ready responses.

Parameters:: requestId – An optional request id
Returns:: The number of ready responses

voidcancelRequest(IdTyperequestId)#

Cancel the request with provided request id.

Parameters:: requestId – The request id for which to cancel the response

voidshutdown()#

Signals the server to shutdown.

This call is blocking. Only returns when all requests have terminated or timeout has been reached.

std::deque<IterationStats>getLatestIterationStats()#

Returns the per-iteration statistics computed since last call to getLatestIterationStats. Contains at most iterStatsMaxIterations iterations.

Returns:: Iteration stats

std::deque<RequestStatsPerIteration>getLatestRequestStats()#

Returns the request stats of each iteration computed since last call to getLatestRequestStats. Contains at most requestStatsMaxIterations iterations.

Returns:: Request stats grouped by iterations

std::deque<DebugTensorsPerIteration>getLatestDebugTensors()#

Returns the debug tensors of each iteration computed since last call to getLatestDebugTensors. Contains at most debugTensorsMaxIterations iterations.

Returns:: Request debug tensors grouped by iterations

boolcanEnqueueRequests()const#: Indicates if the current process is allowed to enqueueRequests.

boolisParticipant()const#: Indicates if the current process participates in this executor instance.

std::optional<std::shared_ptr<KVCacheEventManager>>getKVCacheEventManager( )const#

Private Members

std::unique_ptr<Impl>mImpl#

classExecutorConfig#

#include <executor.h>

Configuration class for the model executor.

Public Functions

explicitExecutorConfig( SizeType32maxBeamWidth=1, SchedulerConfigschedulerConfig=SchedulerConfig(), KvCacheConfigkvCacheConfig=KvCacheConfig(), boolenableChunkedContext=true, boolnormalizeLogProbs=true, SizeType32iterStatsMaxIterations=kDefaultIterStatsMaxIterations, SizeType32requestStatsMaxIterations=kDefaultRequestStatsMaxIterations, BatchingTypebatchingType=BatchingType::kINFLIGHT, std::optional<SizeType32>maxBatchSize=std::nullopt, std::optional<SizeType32>maxNumTokens=std::nullopt, std::optional<ParallelConfig>parallelConfig=std::nullopt, std::optional<PeftCacheConfig>const&peftCacheConfig=std::nullopt, std::optional<LogitsPostProcessorConfig>logitsPostProcessorConfig=std::nullopt, std::optional<DecodingConfig>decodingConfig=std::nullopt, booluseGpuDirectStorage=false, floatgpuWeightsPercent=1, std::optional<SizeType32>maxQueueSize=std::nullopt, ExtendedRuntimePerfKnobConfigconst&extendedRuntimePerfKnobConfig=ExtendedRuntimePerfKnobConfig(), std::optional<DebugConfig>debugConfig=std::nullopt, SizeType32recvPollPeriodMs=0, uint64_tmaxSeqIdleMicroseconds=kDefaultMaxSeqIdleMicroseconds, std::optional<SpeculativeDecodingConfig>specDecConfig=std::nullopt, std::optional<GuidedDecodingConfig>guidedDecodingConfig=std::nullopt, std::optional<std::vector<AdditionalModelOutput>>additionalModelOutputs=std::nullopt, std::optional<CacheTransceiverConfig>cacheTransceiverConfig=std::nullopt, boolgatherGenerationLogits=false, boolpromptTableOffloading=false, boolenableTrtOverlap=false, boolfailFastOnAttentionWindowTooLarge=false, )#

SizeType32getMaxBeamWidth()const#

SchedulerConfiggetSchedulerConfig()const#

KvCacheConfiggetKvCacheConfig()const#

SchedulerConfig&getSchedulerConfigRef()#

KvCacheConfig&getKvCacheConfigRef()#

boolgetEnableChunkedContext()const#

boolgetNormalizeLogProbs()const#

SizeType32getIterStatsMaxIterations()const#

SizeType32getRequestStatsMaxIterations()const#

BatchingTypegetBatchingType()const#

std::optional<SizeType32>getMaxBatchSize()const#

std::optional<SizeType32>getMaxNumTokens()const#

std::optional<ParallelConfig>getParallelConfig()const#

std::optional<PeftCacheConfig>getPeftCacheConfig()const#

std::optional<LogitsPostProcessorConfig>getLogitsPostProcessorConfig( )const#

std::optional<DecodingConfig>getDecodingConfig()const#

boolgetUseGpuDirectStorage()const#

floatgetGpuWeightsPercent()const#

std::optional<SizeType32>getMaxQueueSize()const#

ExtendedRuntimePerfKnobConfiggetExtendedRuntimePerfKnobConfig( )const#

std::optional<DebugConfig>getDebugConfig()const#

SizeType32getRecvPollPeriodMs()const#

uint64_tgetMaxSeqIdleMicroseconds()const#

std::optional<SpeculativeDecodingConfig>getSpecDecConfig()const#

std::optional<GuidedDecodingConfig>getGuidedDecodingConfig()const#

std::optional<std::vector<AdditionalModelOutput>>getAdditionalModelOutputs( )const#

boolgetGatherGenerationLogits()const#

boolgetPromptTableOffloading()const#

std::optional<CacheTransceiverConfig>getCacheTransceiverConfig( )const#

boolgetEnableTrtOverlap()const#

boolgetFailFastOnAttentionWindowTooLarge()const#

voidsetMaxBeamWidth(SizeType32maxBeamWidth)#

voidsetMaxBatchSize(SizeType32maxBatchSize)#

voidsetMaxNumTokens(SizeType32maxNumTokens)#

voidsetSchedulerConfig(SchedulerConfigconst&schedulerConfig)#

voidsetKvCacheConfig(KvCacheConfigconst&kvCacheConfig)#

voidsetEnableChunkedContext(boolenableChunkedContext)#

voidsetNormalizeLogProbs(boolnormalizeLogProbs)#

voidsetIterStatsMaxIterations(SizeType32iterStatsMaxIterations)#

voidsetRequestStatsMaxIterations( SizeType32requestStatsMaxIterations, )#

voidsetBatchingType(BatchingTypebatchingType)#

voidsetParallelConfig(ParallelConfigconst&parallelConfig)#

voidsetPeftCacheConfig(PeftCacheConfigconst&peftCacheConfig)#

voidsetLogitsPostProcessorConfig( LogitsPostProcessorConfigconst&logitsPostProcessorConfig, )#

voidsetDecodingConfig(DecodingConfigconst&decodingConfig)#

voidsetUseGpuDirectStorage(boolconst&useGpuDirectStorage)#

voidsetGpuWeightsPercent(floatconst&gpuWeightsPercent)#

voidsetMaxQueueSize(std::optional<SizeType32>const&maxQueueSize)#

voidsetExtendedRuntimePerfKnobConfig( ExtendedRuntimePerfKnobConfigconst&extendedRuntimePerfKnobConfig, )#

voidsetDebugConfig(DebugConfigconst&debugConfig)#

voidsetRecvPollPeriodMs(SizeType32const&recvPollPeriodMs)#

voidsetMaxSeqIdleMicroseconds(uint64_tmaxSeqIdleMicroseconds)#

voidsetSpecDecConfig(SpeculativeDecodingConfigconst&specDecConfig)#

voidsetGuidedDecodingConfig( GuidedDecodingConfigconst&guidedDecodingConfig, )#

voidsetAdditionalModelOutputs( std::vector<AdditionalModelOutput>const&additionalModelOutputs, )#

voidsetGatherGenerationLogits(boolgatherGenerationLogits)#

voidsetPromptTableOffloading(boolpromptTableOffloading)#

voidsetCacheTransceiverConfig( CacheTransceiverConfigconst&cacheTransceiverConfig, )#

voidsetEnableTrtOverlap(boolenableTrtOverlap)#

voidsetFailFastOnAttentionWindowTooLarge( boolfailFastOnAttentionWindowTooLarge, )#

Public Static Attributes

staticconstexpruint64_tkDefaultMaxSeqIdleMicroseconds=std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count()#

staticconstexprSizeType32kDefaultIterStatsMaxIterations=1000#

staticconstexprSizeType32kDefaultRequestStatsMaxIterations=0#

Private Members

SizeType32mMaxBeamWidth#: The beam width value of requests that will be sent to the executor.

SchedulerConfigmSchedulerConfig#: The scheduler configuration.

KvCacheConfigmKvCacheConfig#: The KV cache configuration.

boolmEnableChunkedContext#: Controls whether context is allowed to be chunked.

boolmNormalizeLogProbs#: Controls if log probabilities should be normalized or not.

SizeType32mIterStatsMaxIterations#: Controls the maximum number of iterations for which to keep statistics.

SizeType32mRequestStatsMaxIterations#: Controls the maximum number of iterations for which to keep per-request statistics.

BatchingTypemBatchingType#: The type of batching strategy to use. See BatchingType.

std::optional<SizeType32>mMaxBatchSize#: The max batch size of requests.

std::optional<SizeType32>mMaxNumTokens#: The max number of tokens per batch.

std::optional<ParallelConfig>mParallelConfig#: The parallel execution configuration.

std::optional<PeftCacheConfig>mPeftCacheConfig#

std::optional<LogitsPostProcessorConfig>mLogitsPostProcessorConfig#: Logits post processor configuration.

std::optional<DecodingConfig>mDecodingConfig#: Decoding configuration.

boolmUseGpuDirectStorage#: Enable/disable use of GPU Direct Storage (GDS) to load engines.

floatmGpuWeightsPercent#: GPU weights percent for weight streaming.

std::optional<SizeType32>mMaxQueueSize#: The maximum number of requests allowed in queue before rejecting new requests.

ExtendedRuntimePerfKnobConfigmExtendedRuntimePerfKnobConfig#: Config for perf knobs that can be set in runtime.

std::optional<DebugConfig>mDebugConfig#: Debugging configuration.

SizeType32mRecvPollPeriodMs#: The time in ms between polls for new communication in orchestrator mode. Use 0 for busy loop.

uint64_tmMaxSeqIdleMicroseconds#: The maximum time in microseconds a scheduled request can remain idle before getting terminated. Default value is 3 minutes.

std::optional<SpeculativeDecodingConfig>mSpeculativeDecodingConfig#: The speculative decoding configuration.

std::optional<GuidedDecodingConfig>mGuidedDecodingConfig#: The guided decoding configuration.

std::optional<std::vector<AdditionalModelOutput>>mAdditionalModelOutputs#: The additional outputs to gather from the model.

std::optional<CacheTransceiverConfig>mCacheTransceiverConfig#: The cache transceiver configuration.

boolmGatherGenerationLogits={false}#: Controls if generation logits should be gathered, so that returnGenerationLogits can be requested.

boolmPromptTableOffloading={false}#: Controls if prompt table offloading is enabled.

boolmEnableTrtOverlap={false}#: Controls whether preparation and TRT engine execution should be overlapped.

boolmFailFastOnAttentionWindowTooLarge={false}#: Controls whether to fail fast when attention window is too large to fit even a single sequence in the KV cache.

Friends

friendclassSerialization

classExtendedRuntimePerfKnobConfig#

#include <executor.h>

Configuration class for the runtime perf knobs.

Public Functions

explicitExtendedRuntimePerfKnobConfig( boolmultiBlockMode=true, boolenableContextFMHAFP32Acc=false, boolcudaGraphMode=false, SizeType32cudaGraphCacheSize=0, )#

inlinebooloperator==( ExtendedRuntimePerfKnobConfigconst&other, )const#

boolgetMultiBlockMode()const#

boolgetEnableContextFMHAFP32Acc()const#

boolgetCudaGraphMode()const#

SizeType32getCudaGraphCacheSize()const#

voidsetMultiBlockMode(boolmultiBlockMode)#

voidsetEnableContextFMHAFP32Acc(boolenableContextFMHAFP32Acc)#

voidsetCudaGraphMode(boolcudaGraphMode)#

voidsetCudaGraphCacheSize(SizeType32cacheSize)#

Private Members

boolmMultiBlockMode#: Control if multi block mode should be enabled or not.

boolmEnableContextFMHAFP32Acc#: Controls whether FP32 accumulation is enabled in the FMHA runner.

boolmCudaGraphMode#: Controls whether CUDA graph is enabled.

SizeType32mCudaGraphCacheSize#: Number of cuda graphs to be cached in the runtime. The larger the cache, the better the perf, but more GPU memory is consumed.

Friends

friendclassSerialization

classExternalDraftTokensConfig#

#include <executor.h>

Configuration for speculative decoding with external draft tokens. Allows to include draft tokens, draft logits and specify acceptance threshold.

Public Functions

explicitExternalDraftTokensConfig( VecTokenstokens, std::optional<Tensor>logits=std::nullopt, std::optional<FloatType>const&acceptanceThreshold=std::nullopt, std::optional<bool>const&fastLogits=std::nullopt, )#

VecTokensgetTokens()const#

std::optional<Tensor>getLogits()const#

std::optional<FloatType>getAcceptanceThreshold()const#

std::optional<bool>getFastLogits()const#

Private Members

VecTokensmTokens#: The draft tokens.

std::optional<Tensor>mLogits#: The draft logits. Expected shape: [num_draft_tokens, vocab_size].

std::optional<FloatType>mAcceptanceThreshold#: The acceptance threshold. Must be > 0.f and <= 1.f.

std::optional<bool>mFastLogits#: Use direct transfer for draft logits.

Friends

friendclassSerialization

classGuidedDecodingConfig#

#include <executor.h>

Guided decoding configurations for executor.

Public Types

enumclassGuidedDecodingBackend#

Values:

enumeratorkXGRAMMAR#: Enable guided decoding with XGrammar backend.

enumeratorkLLGUIDANCE#: Enable guided decoding with LLGuidance backend.

Public Functions

explicitGuidedDecodingConfig( GuidedDecodingBackendbackend, std::optional<std::vector<std::string>>encodedVocab=std::nullopt, std::optional<std::string>tokenizerStr=std::nullopt, std::optional<std::vector<TokenIdType>>stopTokenIds=std::nullopt, )#

booloperator==(GuidedDecodingConfigconst&other)const#

voidsetBackend(GuidedDecodingBackendconst&backend)#

GuidedDecodingBackendgetBackend()const#

voidsetEncodedVocab(std::vector<std::string>const&encodedVocab)#

std::optional<std::vector<std::string>>getEncodedVocab()const#

voidsetTokenizerStr(std::stringconst&tokenizerStr)#

std::optional<std::string>getTokenizerStr()const#

voidsetStopTokenIds(std::vector<TokenIdType>const&stopTokenIds)#

std::optional<std::vector<TokenIdType>>getStopTokenIds()const#

voidvalidate()const#

Private Members

GuidedDecodingBackendmBackend#: Guided decoding backend. Currently supports XGrammar and LLGuidance.

std::optional<std::vector<std::string>>mEncodedVocab#

Encoded vocabulary. For a huggingface tokenizer, it can be extracted by:

encoded_vocab = tokenizer.get_vocab()
encoded_vocab = [token for token, _ in sorted(encoded_vocab.items(), key=lambda x: x[1])]

std::optional<std::string>mTokenizerStr#

Tokenizer string. For a huggingface fast tokenizer, it can be extracted by:

tokenizer_str=tokenizer.backend_tokenizer.to_str()

std::optional<std::vector<TokenIdType>>mStopTokenIds#: Stop token ids. If not provided, it can be automatically detected.

Friends

friendclassSerialization

classGuidedDecodingParams#

#include <executor.h>

Guided decoding parameters for a request.

Public Types

enumclassGuideType#

Values:

enumeratorkJSON#: The generated text is amenable to json format.

enumeratorkJSON_SCHEMA#: The generated text is amenable to json format with additional user-specified restrictions, namely schema.

enumeratorkREGEX#: The generated text is amenable to the user-specified regular expression.

enumeratorkEBNF_GRAMMAR#: The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. EBNF grammar is widely-used to express context-free grammars.

enumeratorkSTRUCTURAL_TAG#: The generated text is amenable to the XGrammar structural tag.

Public Functions

explicitGuidedDecodingParams( GuideTypeguideType, std::optional<std::string>guide=std::nullopt, )#

booloperator==(GuidedDecodingParamsconst&other)const#

GuideTypegetGuideType()const#

std::optional<std::string>getGuide()const#

Private Members

GuideTypemGuideType#: The guide type. See GuideType.

std::optional<std::string>mGuide#: The detailed guide string. It could be a json schema, a regular expression or an EBNF grammar depending on mGuideType.

Friends

friendclassSerialization

classJsonSerialization#

#include <executor.h>

Class with utility functions to serialize statistics to json string.

Public Static Functions

staticstd::stringtoJsonStr(IterationStatsconst&iterationStats)#: Utility function to convert an iterationStats struct to a json serialized string.

staticstd::stringtoJsonStr( RequestStatsPerIterationconst&requestStatsPerIter, )#: Utility function to convert a requestStatsPerIteration struct to a json serialized string.

staticstd::stringtoJsonStr(RequestStatsconst&requestStats)#: Utility function to convert a requestStats struct to a json serialized string.

classKvCacheConfig#

#include <executor.h>

Configuration class for the KV cache.

Public Functions

explicitKvCacheConfig( boolenableBlockReuse=true, std::optional<SizeType32>const&maxTokens=std::nullopt, std::optional<std::vector<SizeType32>>const&maxAttentionWindowVec=std::nullopt, std::optional<SizeType32>const&sinkTokenLength=std::nullopt, std::optional<FloatType>const&freeGpuMemoryFraction=std::nullopt, std::optional<size_t>const&hostCacheSize=std::nullopt, boolonboardBlocks=true, std::optional<FloatType>const&crossKvCacheFraction=std::nullopt, std::optional<RetentionPriority>secondaryOffloadMinPriority=std::nullopt, size_teventBufferMaxSize=0, boolenablePartialReuse=true, boolcopyOnPartialReuse=true, booluseUvm=false, SizeType32attentionDpEventsGatherPeriodMs=5, std::optional<tensorrt_llm::runtime::RuntimeDefaults>const&runtimeDefaults=std::nullopt, uint64_tconst&maxGpuTotalBytes=0, )#

boolgetEnableBlockReuse()const#

boolgetEnablePartialReuse()const#

boolgetCopyOnPartialReuse()const#

std::optional<SizeType32>getMaxTokens()const#

std::optional<std::vector<SizeType32>>getMaxAttentionWindowVec( )const#

std::optional<SizeType32>getSinkTokenLength()const#

std::optional<FloatType>getFreeGpuMemoryFraction()const#

std::optional<FloatType>getCrossKvCacheFraction()const#

std::optional<size_t>getHostCacheSize()const#

boolgetOnboardBlocks()const#

std::optional<RetentionPriority>getSecondaryOffloadMinPriority( )const#

size_tgetEventBufferMaxSize()const#

boolgetUseUvm()const#

SizeType32getAttentionDpEventsGatherPeriodMs()const#

uint64_tgetMaxGpuTotalBytes()const#

voidsetEnableBlockReuse(boolenableBlockReuse)#

voidsetEnablePartialReuse(boolenablePartialReuse)#

voidsetCopyOnPartialReuse(boolcopyOnPartialReuse)#

voidsetMaxTokens(std::optional<SizeType32>maxTokens)#

voidsetMaxAttentionWindowVec( std::vector<SizeType32>maxAttentionWindowVec, )#

voidsetSinkTokenLength(SizeType32sinkTokenLength)#

voidsetFreeGpuMemoryFraction(FloatTypefreeGpuMemoryFraction)#

voidsetCrossKvCacheFraction(FloatTypecrossKvCacheFraction)#

voidsetHostCacheSize(size_thostCacheSize)#

voidsetOnboardBlocks(boolonboardBlocks)#

voidsetSecondaryOffloadMinPriority( std::optional<RetentionPriority>secondaryOffloadMinPriority, )#

voidsetEventBufferMaxSize(size_teventBufferMaxSize)#

voidsetUseUvm(booluseUvm)#

voidsetAttentionDpEventsGatherPeriodMs( SizeType32attentionDpEventsGatherPeriodMs, )#

voidsetMaxGpuTotalBytes(uint64_tmaxGpuTotalBytes)#

voidfillEmptyFieldsFromRuntimeDefaults( tensorrt_llm::runtime::RuntimeDefaultsconst&runtimeDefaults, )#

Public Static Attributes

staticconstexprautokDefaultGpuMemFraction=0.9F#

Private Members

boolmEnableBlockReuse#: Controls if KV cache blocks can be reused for different requests.

std::optional<SizeType32>mMaxTokens#: The maximum number of tokens that should be stored in the KV cache. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.

std::optional<std::vector<SizeType32>>mMaxAttentionWindowVec#: Size of the attention window for each sequence. Only the last mMaxAttentionWindow tokens of each sequence will be stored in the KV cache. Different layers may have different max attention window sizes. If the number of elements in mMaxAttentionWindowVec is less than the number of layers, mMaxAttentionWindowVec will be repeated multiple times to the number of layers.

std::optional<SizeType32>mSinkTokenLength#: Number of sink tokens (tokens to always keep in attention window)

std::optional<FloatType>mFreeGpuMemoryFraction#: The fraction of GPU memory that should be allocated for the KV cache. Default is 90%. If both mMaxTokens and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.

std::optional<FloatType>mCrossKvCacheFraction#: The fraction of the KV cache memory that should be reserved for cross attention. If set to p, self attention will use 1-p of KV cache memory and cross attention will use p of KV cache memory. Default is 50%. Should only be set when using encoder-decoder model.

std::optional<size_t>mHostCacheSize#: Size of secondary memory pool in bytes. Default is 0. Having a secondary memory pool increases KV cache block reuse potential.

boolmOnboardBlocks#: Controls whether offloaded blocks should be onboarded back into primary memory before being reused.

std::optional<RetentionPriority>mSecondaryOffloadMinPriority#: Only blocks with priority > mSecondaryOffloadMinPriority can be offloaded to secondary memory.

size_tmEventBufferMaxSize#: Max size of the KV cache event buffer.

boolmEnablePartialReuse#: Whether blocks that are only partially matched can be reused.

boolmCopyOnPartialReuse#: Whether partially matched blocks that are in use can be reused after copying them.

boolmUseUvm#: Whether to use UVM for the KV cache.

SizeType32mAttentionDpEventsGatherPeriodMs#: The period in milliseconds to gather attention DP events across ranks.

uint64_tmMaxGpuTotalBytes#: The maximum size in bytes of GPU memory that can be allocated for the KV cache. If both mMaxGpuTotalBytes and mFreeGpuMemoryFraction are specified, memory corresponding to the minimum will be allocated.

Friends

friendclassSerialization

structKVCacheCreatedData#

Public Members

std::vector<SizeType32>numBlocksPerCacheLevel#: The amount of blocks at each cache level.

structKVCacheEvent#

Public Functions

KVCacheEvent( IdTypeeventId, KVCacheEventDatadata, SizeType32windowSize, std::optional<SizeType32>attentionDpRank=std::nullopt, )#

Public Members

IdTypeeventId#: The unique id of this event.

KVCacheEventDatadata#: The data corresponding to this event.

SizeType32windowSize#: The sliding window size.

std::optional<SizeType32>attentionDpRank#: The attention DP rank of the event, if applicable.

template<typenameT> structKVCacheEventDiff#

Public Members

ToldValue#

TnewValue#

classKVCacheEventManager#

#include <executor.h>

Exposes a limited set of KV cache manager functionalities.

Public Functions

KVCacheEventManager( std::shared_ptr<tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager>kvCacheManager, )#

std::deque<KVCacheEvent>getLatestEvents( std::optional<std::chrono::milliseconds>timeout=std::nullopt, )#

Get the latest KV Cache events.

Parameters:: timeout – The maximum time to wait for new events. If nullopt, will only return when new events are available, or when the executor instance has shutdown.

Private Members

std::shared_ptr<tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager>kvCacheManager#

structKVCacheRemovedData#

Public Members

std::vector<IdType>blockHashes#: The hashes of blocks being removed.

classKvCacheRetentionConfig#

#include <executor.h>

Configuration for the request’s retention in the KV Cache.

Public Functions

inlineexplicitKvCacheRetentionConfig()#

explicitKvCacheRetentionConfig( std::vector<TokenRangeRetentionConfig>const&tokenRangeRetentionPriorities, RetentionPrioritydecodeRetentionPriority=kDefaultRetentionPriority, std::optional<std::chrono::milliseconds>decodeDurationMs=std::nullopt, KvCacheTransferModetransferMode=KvCacheTransferMode::DRAM, std::stringconst&directory="", )#

std::vector<TokenRangeRetentionConfig>getTokenRangeRetentionConfigs( )const#

RetentionPrioritygetDecodeRetentionPriority()const#

std::optional<std::chrono::milliseconds>getDecodeDurationMs( )const#

KvCacheTransferModegetTransferMode()const#

std::stringconst&getDirectory()const#

std::vector<RetentionPriorityAndDuration>getPerBlockRetentionPriorityDuration( SizeType32blockSize, SizeType32seqLen, )const#: Convert the token range data into an entry per kv block. Returns a vector holding the priority and duration for each block.

inlinebooloperator==(KvCacheRetentionConfigconst&other)const#

Public Static Attributes

staticconstexprRetentionPrioritykMinRetentionPriority=0#

staticconstexprRetentionPrioritykMaxRetentionPriority=100#

staticconstexprRetentionPrioritykDefaultRetentionPriority=35#

Private Members

std::vector<TokenRangeRetentionConfig>mTokenRangeRetentionConfigs#: The token ranges and priority levels to update. Ranges must be non-overlapping. For example [(0, 64), (100, 128), (70, 80)] is valid, whereas [(0, 64), (60, 128)] is not.

RetentionPrioritymDecodeRetentionPriority#: The priority level to assign to blocks allocated in the decode phase.

std::optional<std::chrono::milliseconds>mDecodeDurationMs#: The duration in ms that decode blocks should remain at their assigned priority level.

KvCacheTransferModemTransferMode#: The transfer mode for the block.

std::stringmDirectory#: Name of the directory if transfer mode is GDS or POSIX_DEBUG_FALLBACK.

structTokenRangeRetentionConfig#

#include <executor.h>

A single entry to set block priorities over a token range. Earlier ranges always take priority over later ones. For example, with a block size of 16, a range of [0, 17] would be applied to the first two blocks.

Public Functions

explicitTokenRangeRetentionConfig( SizeType32tokenStart, std::optional<SizeType32>tokenEnd=std::nullopt, RetentionPrioritypriority=KvCacheRetentionConfig::kDefaultRetentionPriority, std::optional<std::chrono::milliseconds>durationMs=std::nullopt, )#

booloperator==(TokenRangeRetentionConfigconst&other)const#

Public Members

SizeType32tokenStart#: The first token of this range.

std::optional<SizeType32>tokenEnd#: The final token of this range. The end is not included in the range. This can be set to std::nullopt to extend the range to the end of the sequence.

RetentionPrioritypriority#: The priority of this token range. Higher priorities are less likely to be evicted or offloaded.

std::optional<std::chrono::milliseconds>durationMs#: The duration in ms that the block should remain at the given priority level. Set to std::nullopt to have no expiration time, and keep the block at the given priority level until it gets reclaimed. After the duration has passed, the block will be moved back to the kDefaultRetentionPriority level.

structKVCacheStoredBlockData#

#include <executor.h>

An entry for a single block stored into the tree.

Public Functions

inlineKVCacheStoredBlockData( IdTypeblockHash, tensorrt_llm::runtime::VecUniqueTokenstokens, std::optional<tensorrt_llm::runtime::LoraTaskIdType>loraId, SizeType32cacheLevel, SizeType32priority, )#

Public Members

IdTypeblockHash#: The hash of the block.

tensorrt_llm::runtime::VecUniqueTokenstokens#: The unique tokens of the block.

std::optional<tensorrt_llm::runtime::LoraTaskIdType>loraId#: The Lora task id of the block.

SizeType32cacheLevel#: The cache level of the block.

SizeType32priority#: The priority of the block.

structKVCacheStoredData#

Public Members

std::optional<IdType>parentHash#: The parent of this sequence of stored blocks.

std::vector<KVCacheStoredBlockData>blocks#: A sequence of blocks. The parent of block i is block i-1.

structKVCacheUpdatedData#

Public Functions

inlineexplicitKVCacheUpdatedData(IdTypeblockHash)#

inlineexplicitKVCacheUpdatedData( IdTypeblockHash, std::optional<KVCacheEventDiff<SizeType32>>cacheLevel, std::optional<KVCacheEventDiff<SizeType32>>priority, )#

inlineKVCacheUpdatedData&cacheLevelUpdated( SizeType32oldValue, SizeType32newValue, )#

inlineKVCacheUpdatedData&priorityUpdated( SizeType32oldValue, SizeType32newValue, )#

Public Members

IdTypeblockHash#: The hash of the updated block.

std::optional<KVCacheEventDiff<SizeType32>>cacheLevel=std::nullopt#: The updated value of the cacheLevel field.

std::optional<KVCacheEventDiff<SizeType32>>priority=std::nullopt#: The updated value of the priority field.

classLogitsPostProcessorConfig#

Public Functions

explicitLogitsPostProcessorConfig( std::optional<LogitsPostProcessorMap>processorMap=std::nullopt, std::optional<LogitsPostProcessorBatched>processorBatched=std::nullopt, boolreplicate=true, )#

std::optional<LogitsPostProcessorMap>getProcessorMap()const#

std::optional<LogitsPostProcessorBatched>getProcessorBatched( )const#

boolgetReplicate()const#

voidsetProcessorMap(LogitsPostProcessorMapconst&processorMap)#

voidsetProcessorBatched( LogitsPostProcessorBatchedconst&processorBatched, )#

voidsetReplicate(boolreplicate)#

Private Members

std::optional<LogitsPostProcessorMap>mProcessorMap#: mapping from post processor names to non-batched post processors

std::optional<LogitsPostProcessorBatched>mProcessorBatched#: single batched post processor

boolmReplicate#: If set to true, logits post processor will run on all TP ranks in last PP rank.

structLookaheadDecodingConfig#

#include <executor.h>

Configuration for Look-Ahead speculative decoding. Allows specifying the window size, ngram size and verification set size.

Public Functions

LookaheadDecodingConfig( SizeType32windowSize, SizeType32ngramSize, SizeType32verificationSetSize, )#

inlineexplicitLookaheadDecodingConfig()#

booloperator==(LookaheadDecodingConfigconst&other)const#

std::tuple<SizeType32const,SizeType32const,SizeType32const>get( )const#

SizeType32getWindowSize()const#

SizeType32getNgramSize()const#

SizeType32getVerificationSetSize()const#

std::tuple<SizeType32,SizeType32,SizeType32,SizeType32>calculateSpeculativeResource( )const#: return <maxDecodingTokens, maxPathLen, maxDraftTokens, maxDraftPathLen>

boolisLE(LookaheadDecodingConfigconst&that)const#: return true when this can be executed on resources defined by that

Public Static Functions

staticstd::tuple<SizeType32,SizeType32,SizeType32,SizeType32>calculateSpeculativeResourceTuple( SizeType32windowSize, SizeType32ngramSize, SizeType32verificationSetSize, )#

staticboolisLegal( SizeType32windowSize, SizeType32ngramSize, SizeType32verificationSetSize, )noexcept#: return true when the parameter combination is valid.

Public Static Attributes

staticconstexprSizeType32kDefaultLookaheadDecodingWindow=4#

staticconstexprSizeType32kDefaultLookaheadDecodingNgram=3#

staticconstexprSizeType32kDefaultLookaheadDecodingVerificationSet=4#

Private Members

SizeType32mWindowSize#

SizeType32mNgramSize#

SizeType32mVerificationSetSize#

Friends

friendclassSerialization

classLoraConfig#

#include <executor.h>

Configuration for LoRA.

Public Functions

explicitLoraConfig( IdTypetaskId, std::optional<Tensor>weights=std::nullopt, std::optional<Tensor>config=std::nullopt, )#

IdTypegetTaskId()const#

std::optional<Tensor>getWeights()const#

std::optional<Tensor>getConfig()const#

Private Members

IdTypemTaskId#: The Lora task id.

std::optional<Tensor>mWeights#: The Lora weights. See TRT-LLM documentation for expected shapes and types.

std::optional<Tensor>mConfig#: The Lora configuration. See TRT-LLM documentation for detailed description of the config tensor.

Friends

friendclassSerialization

classMropeConfig#

#include <executor.h>

Configuration for mrope.

Public Functions

explicitMropeConfig( TensormropeRoratySinCos, SizeType32mropePositionDeltas, )#

TensorgetMRopeRotaryCosSin()const#

SizeType32getMRopePositionDeltas()const#

Private Members

TensormMRopeRotaryCosSin#: The mrope rotary sin and cos cache. Expected shape: [maxPositionEmbeddings*rotaryEmbeddingDim]. Data type must be float32.

SizeType32mMRopePositionDeltas#: The mrope position deltas.

Friends

friendclassSerialization

classMultimodalInput#

#include <executor.h>

Multimodal input data class.

Public Functions

explicitMultimodalInput( std::vector<std::vector<SizeType32>>multimodalHashes, std::vector<SizeType32>multimodalPositions, std::vector<SizeType32>multimodalLengths, )#

std::vector<std::vector<SizeType32>>getMultimodalHashes()const#

std::vector<SizeType32>getMultimodalPositions()const#

std::vector<SizeType32>getMultimodalLengths()const#

Private Members

std::vector<std::vector<SizeType32>>mMultimodalHashes#: The multimodal hashes.

std::vector<SizeType32>mMultimodalPositions#: The multimodal positions.

std::vector<SizeType32>mMultimodalLengths#: The multimodal lengths.

Friends

friendclassSerialization

classOrchestratorConfig#

Public Functions

explicitOrchestratorConfig( boolisOrchestrator=true, std::stringworkerExecutablePath="", std::shared_ptr<mpi::MpiComm>orchLeaderComm=nullptr, boolspawnProcesses=true, )#

boolgetIsOrchestrator()const#

std::stringgetWorkerExecutablePath()const#

std::shared_ptr<mpi::MpiComm>getOrchLeaderComm()const#

boolgetSpawnProcesses()const#

voidsetIsOrchestrator(boolisOrchestrator)#

voidsetWorkerExecutablePath(std::stringconst&workerExecutablePath)#

voidsetOrchLeaderComm( std::shared_ptr<mpi::MpiComm>const&orchLeaderComm, )#

voidsetSpawnProcesses(boolspawnProcesses)#

Private Members

boolmIsOrchestrator#

std::stringmWorkerExecutablePath#

std::shared_ptr<mpi::MpiComm>mOrchLeaderComm#

boolmSpawnProcesses#

classOutputConfig#

#include <executor.h>

Configuration that controls the outputs of a Result.

Public Functions

explicitOutputConfig( boolreturnLogProbs=false, boolreturnContextLogits=false, boolreturnGenerationLogits=false, boolexcludeInputFromOutput=false, boolreturnEncoderOutput=false, boolreturnPerfMetrics=false, std::optional<std::vector<AdditionalModelOutput>>additionalModelOutputs=std::nullopt, )#

Public Members

boolreturnLogProbs#: Controls if Result should contain log probabilities. Default is false.

boolreturnContextLogits#: Controls if Result should contain the context logits. Default is false.

boolreturnGenerationLogits#: Controls if Result should contain the generation logits. Default is false.

boolexcludeInputFromOutput#: Controls if output tokens in Result should include the input tokens. Default is false.

boolreturnEncoderOutput#: Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Default is false.

boolreturnPerfMetrics#: Controls if Result should contain performance metrics.

std::optional<std::vector<AdditionalModelOutput>>additionalModelOutputs#: The additional outputs to gather from the model.

classParallelConfig#

#include <executor.h>

A configuration class for the parallel execution parameters. Currently only supports commType = CommunicationType::kMPI.

Public Functions

explicitParallelConfig( CommunicationTypecommType=CommunicationType::kMPI, CommunicationModecommMode=CommunicationMode::kLEADER, std::optional<std::vector<SizeType32>>deviceIds=std::nullopt, std::optional<std::vector<SizeType32>>participantIds=std::nullopt, std::optional<OrchestratorConfig>const&orchestratorConfig=std::nullopt, std::optional<SizeType32>numNodes=std::nullopt, )#

Constructor.

Parameters:

commType – The communication type. See CommunicationType.
commMode – The communication mode. See CommunicationMode.
deviceIds – The IDs of the GPUs involved in the execution of the model
participantIds – The participant IDs (MPI ranks if commType == kMPI) involved in the execution of the model. The first participant is considered to be the leader.
orchestratorConfig – The orchestrator configuration. See OrchestratorConfig.
numNodes – The number of nodes to use for execution. Default is 1.

CommunicationTypegetCommunicationType()const#

CommunicationModegetCommunicationMode()const#

std::optional<std::vector<SizeType32>>getDeviceIds()const#

std::optional<std::vector<SizeType32>>getParticipantIds()const#

std::optional<OrchestratorConfig>getOrchestratorConfig()const#

std::optional<SizeType32>getNumNodes()const#

voidsetCommunicationType(CommunicationTypetype)#

voidsetCommunicationMode(CommunicationModemode)#

voidsetDeviceIds(std::vector<SizeType32>const&deviceIds)#

voidsetParticipantIds( std::vector<SizeType32>const&participantIds, )#

voidsetOrchestratorConfig( OrchestratorConfigconst&orchestratorConfig, )#

voidsetNumNodes(SizeType32numNodes)#

Private Members

CommunicationTypemCommType#: The type of communication protocol used. Default is MPI.

CommunicationModemCommMode#: The mode of communication. See CommunicationMode.

std::optional<std::vector<SizeType32>>mDeviceIds#: The GPU device ids to use for executing this model.

std::optional<std::vector<SizeType32>>mParticipantIds#: The participant ids (MPI ranks for example) used for executing this model.

std::optional<OrchestratorConfig>mOrchestratorConfig#: Optional orchestrator configuration.

std::optional<SizeType32>mNumNodes#: The number of nodes to use for execution. Default is 1.

Friends

friendclassSerialization

classPeftCacheConfig#

#include <executor.h>

config for PeftCacheManager

Public Functions

explicitPeftCacheConfig( SizeType32numHostModuleLayer=0, SizeType32numDeviceModuleLayer=0, SizeType32optimalAdapterSize=kDefaultOptimalAdapterSize, SizeType32maxAdapterSize=kDefaultMaxAdapterSize, SizeType32numPutWorkers=1, SizeType32numEnsureWorkers=1, SizeType32numCopyStreams=1, SizeType32maxPagesPerBlockHost=kDefaultMaxPagesPerBlockHost, SizeType32maxPagesPerBlockDevice=kDefaultMaxPagesPerBlockDevice, std::optional<float>const&deviceCachePercent=std::nullopt, std::optional<size_t>const&hostCacheSize=std::nullopt, std::optional<std::string>const&loraPrefetchDir=std::nullopt, )#

booloperator==(PeftCacheConfigconst&other)const#

SizeType32getNumHostModuleLayer()const#

SizeType32getNumDeviceModuleLayer()const#

SizeType32getOptimalAdapterSize()const#

SizeType32getMaxAdapterSize()const#

SizeType32getNumPutWorkers()const#

SizeType32getNumEnsureWorkers()const#

SizeType32getNumCopyStreams()const#

SizeType32getMaxPagesPerBlockHost()const#

SizeType32getMaxPagesPerBlockDevice()const#

std::optional<float>getDeviceCachePercent()const#

std::optional<size_t>getHostCacheSize()const#

std::optional<std::string>getLoraPrefetchDir()const#

Public Static Attributes

staticconstexprSizeType32kDefaultOptimalAdapterSize=8#

staticconstexprSizeType32kDefaultMaxAdapterSize=64#

staticconstexprSizeType32kDefaultMaxPagesPerBlockHost=24#

staticconstexprSizeType32kDefaultMaxPagesPerBlockDevice=8#

Private Members

SizeType32mNumHostModuleLayer#

SizeType32mNumDeviceModuleLayer#

SizeType32mOptimalAdapterSize#

SizeType32mMaxAdapterSize#

SizeType32mNumPutWorkers#

SizeType32mNumEnsureWorkers#

SizeType32mNumCopyStreams#

SizeType32mMaxPagesPerBlockHost#

SizeType32mMaxPagesPerBlockDevice#

std::optional<FloatType>mDeviceCachePercent#

std::optional<size_t>mHostCacheSize#

std::optional<std::string>mLoraPrefetchDir#

Friends

friendclassSerialization

classPromptTuningConfig#

#include <executor.h>

Configuration for prompt tuning.

Public Functions

explicitPromptTuningConfig( TensorembeddingTable, std::optional<VecTokenExtraIds>inputTokenExtraIds=std::nullopt, )#

TensorgetEmbeddingTable()const#

std::optional<VecTokenExtraIds>getInputTokenExtraIds()const#

Private Members

TensormEmbeddingTable#: The prompt embedding table. Expected shape: [task vocab_size, hidden_size]. Data type must match model weights.

std::optional<VecTokenExtraIds>mInputTokenExtraIds#: The input token extra ids for KV Cache reuse when p-tuning is enabled.

Friends

friendclassSerialization

classRequest#

#include <executor.h>

A class that holds information about the request.

Public Functions

Request( VecTokensinputTokenIds, SizeType32maxTokens, boolstreaming=false, SamplingConfigconst&samplingConfig=SamplingConfig(), OutputConfigconst&outputConfig=OutputConfig(), std::optional<SizeType32>const&endId=std::nullopt, std::optional<SizeType32>const&padId=std::nullopt, std::optional<std::vector<SizeType32>>positionIds=std::nullopt, std::optional<std::list<VecTokens>>badWords=std::nullopt, std::optional<std::list<VecTokens>>stopWords=std::nullopt, std::optional<Tensor>embeddingBias=std::nullopt, std::optional<ExternalDraftTokensConfig>externalDraftTokensConfig=std::nullopt, std::optional<PromptTuningConfig>pTuningConfig=std::nullopt, std::optional<MultimodalInput>multimodalInput=std::nullopt, std::optional<Tensor>multimodalEmbedding=std::nullopt, std::optional<MropeConfig>mRopeConfig=std::nullopt, std::optional<LoraConfig>loraConfig=std::nullopt, std::optional<LookaheadDecodingConfig>lookaheadConfig=std::nullopt, std::optional<KvCacheRetentionConfig>kvCacheRetentionConfig=std::nullopt, std::optional<std::string>logitsPostProcessorName=std::nullopt, std::optional<LogitsPostProcessor>logitsPostProcessor=std::nullopt, std::optional<VecTokens>encoderInputTokenIds=std::nullopt, std::optional<IdType>clientId=std::nullopt, boolreturnAllGeneratedTokens=false, PriorityTypepriority=kDefaultPriority, RequestTypetype=RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, std::optional<ContextPhaseParams>contextPhaseParams=std::nullopt, std::optional<Tensor>encoderInputFeatures=std::nullopt, std::optional<SizeType32>encoderOutputLength=std::nullopt, std::optional<Tensor>crossAttentionMask=std::nullopt, SizeType32numReturnSequences=1, std::optional<EagleConfig>eagleConfig=std::nullopt, std::optional<Tensor>skipCrossAttnBlocks=std::nullopt, std::optional<GuidedDecodingParams>guidedDecodingParams=std::nullopt, std::optional<SizeType32>languageAdapterUid=std::nullopt, std::optional<MillisecondsType>allottedTimeMs=std::nullopt, 
std::optional<CacheSaltIDType>cacheSaltID=std::nullopt, )#

The Request constructor.

Parameters:

inputTokenIds – The input token ids
maxTokens – The maximum number of tokens to generate
streaming – Indicates if the responses should be streamed or not. Default is false.
samplingConfig – The sampling configuration
outputConfig – The output configuration
endId – The end token id
padId – The pad token id
positionIds – The input position ids
badWords – A list of bad words tokens. Each “word” can be composed of multiple tokens
stopWords – A list of stop words tokens. Each “word” can be composed of multiple tokens
embeddingBias – The embedding bias tensor. Expected shape is [vocab_size]
externalDraftTokensConfig – The speculative decoding with external draft tokens configuration
pTuningConfig – The prompt tuning configuration
multimodalInput – The multimodal input {multimodalHashes, multimodalPositions, multimodalLengths}
multimodalEmbedding – The multimodal embedding tensor. Expected shape is [num_multimodal_tokens, hidden_dim]
mRopeConfig – The mrope configuration
loraConfig – The LoRA configuration
lookaheadConfig – The lookahead speculative decoding configuration
kvCacheRetentionConfig – The configuration used for KV cache block eviction.
logitsPostProcessorName – The logits postprocessor name. Must correspond to one of the logits postprocessor names provided to the ExecutorConfig.
logitsPostProcessor – The logits postprocessor dynamically specified per request; only supported with replicate=false or no tensor parallelism.
encoderInputTokenIds – The encoder input token ids for encoder-decoder models, or encoder-only models
clientId – The client id of the request (see Response::getClientId).
returnAllGeneratedTokens – Indicates whether to return the full beams or just the newly generated tokens after every streaming step.
priority – Sets the execution priority of this request.
type – Indicate the request type for disaggregated serving mode.
contextPhaseParams – Generated token ID from context only executor.
encoderInputFeatures – Encoder input features for multimodal models.
encoderOutputLength – Encoder output length if encoder input and output have different lengths (due to convolution down-sampling, etc.)
crossAttentionMask – Cross attention mask.
numReturnSequences – The number of returning sequences.
eagleConfig – The EAGLE speculative decoding configuration
skipCrossAttnBlocks – Skip the cross attention transformer blocks or not.
guidedDecodingParams – The guided decoding parameters.
languageAdapterUid – Task Uid for language adapter.
allottedTimeMs – The allotted time in milliseconds after which the request is cancelled with a timedOut finish reason. The request may exceed this time slightly, but at most by 1 forward pass (in pipeline parallelism that may involve multiple micro-batches). A request can be timed-out before ever being scheduled.
cacheSaltID – Salt ID for KV cache blocks to limit the kv cache reuse to the requests with the same string.

Request(Requestconst&other)#

Request(Request&&other)noexcept#

Request&operator=(Requestconst&other)#

Request&operator=(Request&&other)noexcept#

~Request()#

VecTokensgetInputTokenIds()const#

SizeType32getMaxTokens()const#

boolgetStreaming()const#

SamplingConfiggetSamplingConfig()const#

OutputConfiggetOutputConfig()const#

std::optional<SizeType32>getEndId()const#

std::optional<SizeType32>getPadId()const#

std::optional<std::vector<SizeType32>>getPositionIds()const#

std::optional<std::list<VecTokens>>getBadWords()const#

std::optional<std::list<VecTokens>>getStopWords()const#

std::optional<Tensor>getEmbeddingBias()const#

std::optional<ExternalDraftTokensConfig>getExternalDraftTokensConfig( )const#

std::optional<PromptTuningConfig>getPromptTuningConfig()const#

std::optional<MultimodalInput>getMultimodalInput()const#

std::optional<Tensor>getMultimodalEmbedding()const#

std::optional<MropeConfig>getMropeConfig()const#

std::optional<LoraConfig>getLoraConfig()const#

std::optional<LookaheadDecodingConfig>getLookaheadConfig()const#

std::optional<KvCacheRetentionConfig>getKvCacheRetentionConfig( )const#

std::optional<std::string>getLogitsPostProcessorName()const#

std::optional<LogitsPostProcessor>getLogitsPostProcessor()const#

std::optional<VecTokens>getEncoderInputTokenIds()const#

std::optional<IdType>getClientId()const#

PriorityTypegetPriority()const#

boolgetReturnAllGeneratedTokens()const#

std::optional<ContextPhaseParams>const&getContextPhaseParams( )const#

std::optional<Tensor>getEncoderInputFeatures()const#

std::optional<SizeType32>getEncoderOutputLength()const#

std::optional<Tensor>getCrossAttentionMask()const#

RequestTypegetRequestType()const#

std::optional<EagleConfig>getEagleConfig()const#

std::optional<Tensor>getSkipCrossAttnBlocks()const#

std::optional<GuidedDecodingParams>getGuidedDecodingParams()const#

std::optional<SizeType32>getLanguageAdapterUid()const#

std::optional<MillisecondsType>getAllottedTimeMs()const#

std::optional<CacheSaltIDType>getCacheSaltID()const#

std::optional<std::vector<std::string>>getAdditionalOutputNames( )const#

voidsetStreaming(boolstreaming)#

voidsetSamplingConfig(SamplingConfigconst&config)#

voidsetOutputConfig(OutputConfigconst&outputConfig)#

voidsetEndId(SizeType32endId)#

voidsetPadId(SizeType32padId)#

voidsetPositionIds(std::vector<SizeType32>const&positionIds)#

voidsetBadWords(std::list<VecTokens>const&badWords)#

voidsetStopWords(std::list<VecTokens>const&stopWords)#

voidsetEmbeddingBias(Tensorconst&embeddingBias)#

voidsetExternalDraftTokensConfig( ExternalDraftTokensConfigconst&externalDraftTokensConfig, )#

voidsetPromptTuningConfig(PromptTuningConfigconst&pTuningConfig)#

voidsetMultimodalEmbedding(Tensorconst&multimodalEmbedding)#

voidsetMultimodalInput(MultimodalInputconst&multimodalInput)#

voidsetMropeConfig(MropeConfigconst&mRopeConfig)#

voidsetLoraConfig(LoraConfigconst&loraConfig)#

voidsetLookaheadConfig( LookaheadDecodingConfigconst&lookaheadConfig, )#

voidsetKvCacheRetentionConfig( KvCacheRetentionConfigconst&kvCacheRetentionConfig, )#

voidsetLogitsPostProcessorName( std::stringconst&logitsPostProcessorName, )#

voidsetLogitsPostProcessor( std::optional<LogitsPostProcessor>const&logitsPostProcessor, )#

voidsetEncoderInputTokenIds(VecTokensconst&encoderInputTokenIds)#

voidsetClientId(IdTypeclientId)#

voidsetPriority(PriorityTypepriority)#

voidsetReturnAllGeneratedTokens(boolreturnAllGeneratedTokens)#

voidsetRequestType(RequestTypeconst&requestType)#

voidsetContextPhaseParams(ContextPhaseParamscontextPhaseParams)#

voidsetEncoderInputFeatures(TensorencoderInputFeatures)#

voidsetEncoderOutputLength(SizeType32encoderOutputLength)#

voidsetCrossAttentionMask(TensorcrossAttentionMask)#

voidsetEagleConfig(std::optional<EagleConfig>const&eagleConfig)#

voidsetSkipCrossAttnBlocks(TensorskipCrossAttnBlocks)#

voidsetGuidedDecodingParams( GuidedDecodingParamsconst&guidedDecodingParams, )#

voidsetLanguageAdapterUid(SizeType32languageAdapterUid)#

voidsetAllottedTimeMs(MillisecondsTypeallottedTimeMs)#

voidsetCacheSaltID(CacheSaltIDTypecacheSaltID)#

Public Static Attributes

staticconstexprPriorityTypekDefaultPriority=0.5#

staticautoconstexprkBatchedPostProcessorName="batched"#: This logits postprocessor name will dispatch to the batched logits postprocessor.

staticautoconstexprkDynamicPostProcessorNamePrefix="dynamic"#: Dynamic logits postprocessor name will be “dynamic” + requestId.

Private Members

std::unique_ptr<Impl>mImpl#

Friends

friendclassSerialization

classResponse#

#include <executor.h>

Class that holds either an error or a result.

Public Functions

Response( IdTyperequestId, std::stringerrorMsg, std::optional<IdType>clientId=std::nullopt, )#

Response( IdTyperequestId, ResultResult, std::optional<IdType>clientId=std::nullopt, )#

~Response()#

Response(Responseconst&other)#

Response(Response&&other)noexcept#

Response&operator=(Responseconst&other)#

Response&operator=(Response&&other)noexcept#

IdTypegetRequestId()const#: Get the id of the request for which this response was generated.

std::optional<IdType>getClientId()const#: Get the client id of the request for which this response was generated.

boolhasError()const#: Indicates if this response has an error or not.

std::stringconst&getErrorMsg()const#: Get the error msg for this response. Will throw an exception if hasError is false.

Resultconst&getResult()const#: Get the result for this response. Will throw an exception if hasError is true.

Private Members

std::unique_ptr<Impl>mImpl#

Friends

friendclassSerialization

structResult#

#include <executor.h>

Struct that holds the generation result.

Public Members

boolisFinal#: Indicates if this is the final result for the request.

BeamTokensoutputTokenIds#: The output tokens for each beam.

std::optional<VecLogProbs>cumLogProbs#: The cumulative log probabilities. Size beamSize.

std::optional<std::vector<VecLogProbs>>logProbs#: The log probabilities for each generated token. Size [beamSize, outputLen].

std::optional<Tensor>contextLogits#: The context logits. Size [promptLen, vocabSizePadded].

std::optional<Tensor>generationLogits#: The generation logits. Size [beamSize, maxTokens, vocabSizePadded] (non-streaming) or [maxTokens, beamSize, vocabSizePadded] (streaming and allGeneratedTokens) or [1, beamSize, vocabSizePadded] (streaming and non-allGeneratedTokens)

std::optional<SpeculativeDecodingFastLogitsInfo>specDecFastLogitsInfo#: Logits information for direct transfer when using fast logits.

std::optional<Tensor>encoderOutput#: The encoder output. Size [encoderLen, hiddenSize].

std::vector<FinishReason>finishReasons#: The reason why the model stopped generating tokens for each beam in this request. Size [beamSize]. Currently only supported when beamSize is 1 and when using BatchingType::kINFLIGHT.

std::optional<ContextPhaseParams>contextPhaseParams#: The params of the context phase.

SizeType32decodingIter={0}#: The number of the decoding iterations used to generate the result. In autoregressive decoding, it is equal to the maximum length of the beam in outputTokenIds. In speculative decoding, might be less than maximum length of the beam in outputTokenIds as more than one token can be generated per iteration. Used for speculative decoding statistics.

floatavgDecodedTokensPerIter={0.0f}#: The average number of decoded tokens per iteration. For standard model it is 1. For speculative decoding model >= 1 — number of draft tokens accepted per step + 1.

SizeType32sequenceIndex={0}#: The index of the output sequence of this result where 0 <= sequenceIndex < numReturnSequences. In beam search (beamWidth > 1), this index will be always zero because all beams to be returned are included in this result.

boolisSequenceFinal#: Indicates if this is the final result for a given sequence in the request. In beam search (beamWidth > 1), the value will always equal to the value of isFinal.

std::optional<RequestPerfMetrics>requestPerfMetrics#: Performance metrics if returnPerfMetrics is set in OutputConfig.

std::vector<AdditionalOutput>additionalOutputs#: The additional outputs.

structRetentionPriorityAndDuration#

Public Functions

inlineRetentionPriorityAndDuration( std::optional<RetentionPriority>const&retentionPriority, std::optional<std::chrono::milliseconds>const&durationMs, )#

Public Members

std::optional<RetentionPriority>retentionPriority#

std::optional<std::chrono::milliseconds>durationMs#

classSamplingConfig#

#include <executor.h>

Sampling configuration.

Public Functions

explicitSamplingConfig( SizeType32beamWidth=1, std::optional<SizeType32>const&topK=std::nullopt, std::optional<FloatType>const&topP=std::nullopt, std::optional<FloatType>const&topPMin=std::nullopt, std::optional<TokenIdType>const&topPResetIds=std::nullopt, std::optional<FloatType>const&topPDecay=std::nullopt, std::optional<RandomSeedType>const&seed=std::nullopt, std::optional<FloatType>const&temperature=std::nullopt, std::optional<SizeType32>const&minTokens=std::nullopt, std::optional<FloatType>const&beamSearchDiversityRate=std::nullopt, std::optional<FloatType>const&repetitionPenalty=std::nullopt, std::optional<FloatType>const&presencePenalty=std::nullopt, std::optional<FloatType>const&frequencyPenalty=std::nullopt, std::optional<SizeType32>const&promptIgnoreLength=std::nullopt, std::optional<FloatType>const&lengthPenalty=std::nullopt, std::optional<SizeType32>const&earlyStopping=std::nullopt, std::optional<SizeType32>const&noRepeatNgramSize=std::nullopt, std::optional<SizeType32>const&numReturnSequences=std::nullopt, std::optional<FloatType>const&minP=std::nullopt, std::optional<std::vector<SizeType32>>const&beamWidthArray=std::nullopt, )#: Constructor for SamplingConfig. See description of parameters below.

booloperator==(SamplingConfigconst&other)const#

SizeType32getBeamWidth()const#

SizeType32getNumReturnBeams()const#

std::optional<SizeType32>getTopK()const#

std::optional<FloatType>getTopP()const#

std::optional<FloatType>getTopPMin()const#

std::optional<SizeType32>getTopPResetIds()const#

std::optional<FloatType>getTopPDecay()const#

std::optional<RandomSeedType>getSeed()const#

std::optional<FloatType>getTemperature()const#

std::optional<SizeType32>getMinTokens()const#

std::optional<FloatType>getBeamSearchDiversityRate()const#

std::optional<FloatType>getRepetitionPenalty()const#

std::optional<FloatType>getPresencePenalty()const#

std::optional<FloatType>getFrequencyPenalty()const#

std::optional<SizeType32>getPromptIgnoreLength()const#

std::optional<FloatType>getLengthPenalty()const#

std::optional<SizeType32>getEarlyStopping()const#

std::optional<SizeType32>getNoRepeatNgramSize()const#

std::optional<SizeType32>getNumReturnSequences()const#

std::optional<FloatType>getMinP()const#

std::optional<std::vector<SizeType32>>getBeamWidthArray()const#

voidsetBeamWidth(SizeType32beamWidth)#

voidsetTopK(std::optional<SizeType32>const&topK)#

voidsetTopP(std::optional<FloatType>const&topP)#

voidsetTopPMin(std::optional<FloatType>const&topPMin)#

voidsetTopPResetIds( std::optional<TokenIdType>const&topPResetIds, )#

voidsetTopPDecay(std::optional<FloatType>const&topPDecay)#

voidsetSeed(std::optional<RandomSeedType>const&seed)#

voidsetTemperature(std::optional<FloatType>const&temperature)#

voidsetMinTokens(std::optional<SizeType32>const&minTokens)#

voidsetBeamSearchDiversityRate( std::optional<FloatType>const&beamSearchDiversityRate, )#

voidsetRepetitionPenalty( std::optional<FloatType>const&repetitionPenalty, )#

voidsetPresencePenalty( std::optional<FloatType>const&presencePenalty, )#

voidsetFrequencyPenalty( std::optional<FloatType>const&frequencyPenalty, )#

voidsetPromptIgnoreLength( std::optional<SizeType32>const&promptIgnoreLength, )#

voidsetLengthPenalty( std::optional<FloatType>const&lengthPenalty, )#

voidsetEarlyStopping( std::optional<SizeType32>const&earlyStopping, )#

voidsetNoRepeatNgramSize( std::optional<SizeType32>const&noRepeatNgramSize, )#

voidsetNumReturnSequences( std::optional<SizeType32>const&numReturnSequences, )#

voidsetMinP(std::optional<FloatType>const&minP)#

voidsetBeamWidthArray( std::optional<std::vector<SizeType32>>const&beamWidthArray, )#

Private Functions

voidupdateNumReturnBeams()#

Private Members

SizeType32mBeamWidth#: The beam width. Default is 1 which disables beam search.

std::optional<SizeType32>mTopK#: Controls number of logits to sample from. Default is 0 (all logits).

std::optional<FloatType>mTopP#: Controls the top-P probability to sample from. Default is 0.f.

std::optional<FloatType>mTopPMin#: Controls decay in the top-P algorithm. topPMin is lower-bound. Default is 1.e-6.

std::optional<TokenIdType>mTopPResetIds#: Controls decay in the top-P algorithm. Indicates where to reset the decay. Default is 1.

std::optional<FloatType>mTopPDecay#: Controls decay in the top-P algorithm. The decay value. Default is 1.f.

std::optional<RandomSeedType>mSeed#: Controls the random seed used by the random number generator in sampling. Default is 0.

std::optional<FloatType>mTemperature#: Controls the modulation of logits when sampling new tokens. It can have values > 0.f. Default is 1.0f.

std::optional<SizeType32>mMinTokens#: Lower bound on the number of tokens to generate. Values < 1 have no effect. Default is 1.

std::optional<FloatType>mBeamSearchDiversityRate#: Controls the diversity in beam search.

std::optional<FloatType>mRepetitionPenalty#: Used to penalize tokens based on how often they appear in the sequence. It can have any value > 0.f. Values < 1.f encourages repetition, values > 1.f discourages it. Default is 1.f.

std::optional<FloatType>mPresencePenalty#: Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.

std::optional<FloatType>mFrequencyPenalty#: Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values < 0.f encourage repetition, values > 0.f discourage it. Default is 0.f.

std::optional<SizeType32>mPromptIgnoreLength#: Controls how many tokens to ignore from the prompt for presence and frequency penalties. Values <= 0 have no effect. Values > input (prompt) length will be clamped. Default is 0.

std::optional<FloatType>mLengthPenalty#: Controls how to penalize longer sequences in beam search. Default is 0.f.

std::optional<SizeType32>mEarlyStopping#: Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token). Default is 1.

std::optional<SizeType32>mNoRepeatNgramSize#: Controls how many repeat ngram size are acceptable. Default is 1 << 30.

std::optional<SizeType32>mNumReturnSequences#: The number of return sequences or beams. In beam search, the value should be less than or equal to mBeamWidth. In sampling, it specifies the total number of independently generated sequences.

SizeType32mNumReturnBeams#: The number of beams to return. It is equal to beamWidth unless numReturnSequences is set. If beamWidth > 1 and numReturnSequences is set, then numReturnBeams is equal to numReturnSequences.

std::optional<FloatType>mMinP#: Controls the min_p scaling for sampling. It masks x which P_x < min_p * P_max, where P_x is probability of candidate x. Default is 0.f.

std::optional<std::vector<SizeType32>>mBeamWidthArray#: Controls the beam width for each step for Variable-Beam-Width-Search.

Private Static Functions

staticSizeType32checkBeamWidth(SizeType32beamWidth)#

staticstd::optional<FloatType>const&checkTopK( std::optional<FloatType>const&topK, )#

staticstd::optional<FloatType>const&checkTopP( std::optional<FloatType>const&topP, )#

staticstd::optional<FloatType>const&checkTopPMin( std::optional<FloatType>const&topPMin, )#

staticstd::optional<TokenIdType>const&checkTopPResetIds( std::optional<TokenIdType>const&topPResetIds, )#

staticstd::optional<FloatType>const&checkTopPDecay( std::optional<FloatType>const&topPDecay, )#

staticstd::optional<FloatType>const&checkTemperature( std::optional<FloatType>const&temperature, )#

staticstd::optional<SizeType32>const&checkMinTokens( std::optional<SizeType32>const&minTokens, )#

staticstd::optional<FloatType>const&checkBeamSearchDiversityRate( std::optional<FloatType>const&beamSearchDiversityRate, )#

staticstd::optional<FloatType>const&checkRepetitionPenalty( std::optional<FloatType>const&repetitionpenalty, )#

staticstd::optional<SizeType32>const&checkPromptIgnoreLength( std::optional<SizeType32>const&promptIgnoreLength, )#

staticstd::optional<FloatType>const&checkLengthPenalty( std::optional<FloatType>const&lengthPenalty, )#

staticstd::optional<SizeType32>const&checkEarlyStopping( std::optional<SizeType32>const&earlyStopping, )#

staticstd::optional<SizeType32>const&checkNoRepeatNgramSize( std::optional<SizeType32>const&noRepeatNgramSize, )#

staticstd::optional<SizeType32>const&checkNumReturnSequences( std::optional<SizeType32>const&numReturnSequences, SizeType32beamWidth, )#

staticstd::optional<FloatType>const&checkMinP( std::optional<FloatType>const&minP, )#

staticstd::pair<std::optional<std::vector<SizeType32>>const&,SizeType32const>constcheckBeamWidthArray( std::optional<std::vector<SizeType32>>const&beamWidthArray, SizeType32constbeamWidth, )#

Friends

friendclassSerialization

classSchedulerConfig#

#include <executor.h>

Configuration class for the scheduler.

Public Functions

explicitSchedulerConfig( CapacitySchedulerPolicycapacitySchedulerPolicy=CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, std::optional<ContextChunkingPolicy>contextChunkingPolicy=std::nullopt, std::optional<DynamicBatchConfig>dynamicBatchConfig=std::nullopt, )#

booloperator==(SchedulerConfigconst&other)const#

CapacitySchedulerPolicygetCapacitySchedulerPolicy()const#

std::optional<ContextChunkingPolicy>getContextChunkingPolicy( )const#

std::optional<DynamicBatchConfig>getDynamicBatchConfig()const#

Private Members

CapacitySchedulerPolicymCapacitySchedulerPolicy#: The capacity scheduler policy. See CapacitySchedulerPolicy.

std::optional<ContextChunkingPolicy>mContextChunkingPolicy#: The context chunking policy. See ContextChunkingPolicy.

std::optional<DynamicBatchConfig>mDynamicBatchConfig#: The config for tuning batch size dynamically. See DynamicBatchConfig.

Friends

friendclassSerialization

classSpeculativeDecodingConfig#

#include <executor.h>

Configuration for speculative decoding (both draft and target models).

Public Functions

explicitSpeculativeDecodingConfig(boolfastLogits=false)#

booloperator==(SpeculativeDecodingConfigconst&other)const#

Public Members

boolfastLogits#: Send logits tensor directly from draft to target model.

Friends

friendclassSerialization

structSpeculativeDecodingFastLogitsInfo#

#include <executor.h>

Struct that holds the logits information when using direct transfer.

Public Functions

TensortoTensor()const#: Returns the struct serialized into a tensor that can be used as generation logits input.

Public Members

uint64_tdraftRequestId#: Draft request id.

int32_tdraftParticipantId#: MPI world rank of the draft model leader.

namespacempi#

dataTransceiverState.h#

namespacetensorrt_llm

namespaceexecutor

classDataTransceiverState#

Public Functions

DataTransceiverState()=default#

inlineDataTransceiverState( kv_cache::CacheStatecacheState, kv_cache::CommStatecommState, )#

inlinevoidsetCacheState(kv_cache::CacheStatestate)#

inlinestd::optional<kv_cache::CacheState>const&getCacheState( )constnoexcept#

inlinevoidsetCommState(kv_cache::CommStatestate)#

inlinestd::optional<kv_cache::CommState>const&getCommState( )constnoexcept#

inlinebooloperator==( DataTransceiverStateconst&other, )constnoexcept#

inlinestd::stringtoString()const#

Private Members

std::optional<kv_cache::CacheState>mCacheState#

std::optional<kv_cache::CommState>mCommState#

Friends

friendclassSerialization

namespacekv_cache

structAgentState#

Public Functions

inlineAgentState(std::stringagentName,std::stringconnectionInfo)#

AgentState()=default#

inlinebooloperator==(AgentStateconst&other)constnoexcept#

inlinestd::stringtoString()const#

Public Members

std::stringmAgentName#

std::stringmConnectionInfo#

classCacheState#

Public Types

enumclassAttentionType:std::uint8_t#

Values:

enumeratorkDEFAULT#

enumeratorkMLA#

Public Functions

inlineCacheState( ModelConfigmodelConfig, runtime::WorldConfigconst&worldConfig, std::vector<SizeType32>const&attentionLayerNumPerPP, nvinfer1::DataTypedataType, AttentionTypeattentionType=AttentionType::kDEFAULT, intkvFactor=2, boolenableBlockReuse=false, boolhasIndexerKCache=false, SizeType32indexerDimPerHead=0, SizeType32indexerKCacheQuantBlockSize=128, )#

inlineCacheState( std::vector<SizeType32>nbKvHeadPerLayer, SizeType32sizePerHead, SizeType32tokensPerBlock, SizeType32tensorParallelism, SizeType32pipelineParallelism, SizeType32contextParallelism, std::vector<SizeType32>const&attentionLayerNumPerPP, nvinfer1::DataTypedataType, AttentionTypeattentionType=AttentionType::kDEFAULT, intkvFactor=2, boolenableAttentionDP=false, intDPrank=0, intDPsize=0, boolenableBlockReuse=false, boolhasIndexerKCache=false, SizeType32indexerDimPerHead=0, SizeType32indexerKCacheQuantBlockSize=128, )#

inlineCacheState( SizeType32nbAttentionLayers, SizeType32nbKvHeads, SizeType32sizePerHead, SizeType32tokensPerBlock, SizeType32tensorParallelism, SizeType32pipelineParallelism, SizeType32contextParallelism, std::vector<SizeType32>const&attentionLayerNumPerPP, nvinfer1::DataTypedataType, AttentionTypeattentionType=AttentionType::kDEFAULT, intkvFactor=2, boolenableAttentionDP=false, intDPrank=0, intDPsize=0, boolenableBlockReuse=false, boolhasIndexerKCache=false, SizeType32indexerDimPerHead=0, SizeType32indexerKCacheQuantBlockSize=128, )#

inlinebooloperator==( kv_cache::CacheStateconst&other, )constnoexcept#

inlineModelConfigconst&getModelConfig()const#

inlineParallelConfigconst&getParallelConfig()const#

inlineAttentionConfigconst&getAttentionConfig()const#

inlinenvinfer1::DataTypeconst&getDataType()const#

inlineboolgetEnableBlockReuse()const#

inlineboolgetHasIndexerKCache()const#

inlineSizeType32getIndexerDimPerHead()const#

inlineSizeType32getIndexerKCacheQuantBlockSize()const#

inlinestd::stringtoString()const#

Private Members

ModelConfigmModelConfig#

ParallelConfigmParallelConfig#

nvinfer1::DataTypemDataType#

AttentionConfigmAttentionConfig#

boolmEnableBlockReuse={false}#

boolmHasIndexerKCache={false}#

SizeType32mIndexerDimPerHead={0}#

SizeType32mIndexerKCacheQuantBlockSize={128}#

Friends

friendclasstensorrt_llm::executor::Serialization

structAttentionConfig#

Public Functions

inlineAttentionConfig(AttentionTypeattentionType,intkvFactor)#

inlinebooloperator==(AttentionConfigconst&other)constnoexcept#

Public Members

AttentionTypemAttentionType#

intmKvFactor#

structModelConfig#

Public Functions

inlinebooloperator==(ModelConfigconst&other)constnoexcept#

Public Members

std::vector<SizeType32>mNbKvHeadsPerLayer#

SizeType32mSizePerHead#

SizeType32mTokensPerBlock#

structParallelConfig#

Public Functions

inlinebooloperator==(ParallelConfigconst&other)constnoexcept#

Public Members

SizeType32mTensorParallelism#

SizeType32mPipelineParallelism#

SizeType32mContextParallelism#

boolmEnableAttentionDP#

SizeType32mDPrank#

SizeType32mDPsize#

std::vector<SizeType32>mAttentionLayerNumPerPP#

classCommState#

Public Functions

CommState()=default#

inlineexplicitCommState( std::vector<SizeType32>ranks, intselfIdx=-1, )#

inlineexplicitCommState( std::vector<SocketState>socketState, intselfIdx=-1, )#

inlineCommState(std::uint16_tport,std::stringip)#

inlineexplicitCommState( std::vector<AgentState>agentState, intselfIdx=-1, )#

inlineboolisMpiState()constnoexcept#

inlineboolisSocketState()constnoexcept#

inlineboolisAgentState()constnoexcept#

inlineMpiStateconst&getMpiState()const#

inlinestd::vector<SocketState>const&getSocketState()const#

inlinestd::vector<AgentState>const&getAgentState()const#

inlineintgetSelfIdx()constnoexcept#

inlinebooloperator==(CommStateconst&other)constnoexcept#

inlinestd::stringtoString()const#

Private Members

std::variant<std::monostate,MpiState,std::vector<SocketState>,std::vector<AgentState>>mState#

intmSelfIdx={-1}#

Friends

friendclasstensorrt_llm::executor::Serialization

structMpiState#

Public Functions

inlinebooloperator==(MpiStateconst&other)constnoexcept#

inlinestd::stringtoString()const#

Public Members

std::vector<SizeType32>mRanks#

structSocketState#

Public Functions

inlinebooloperator==(SocketStateconst&other)constnoexcept#

inlinestd::stringtoString()const#

Public Members

std::uint16_tmPort#

std::stringmIp#

cacheCommunicator.h#

namespacetensorrt_llm

namespaceexecutor

namespacekv_cache

classConnection#

Public Functions

virtual~Connection()=default#

virtualvoidsend( DataContextconst&ctx, voidconst*data, size_tsize, )const=0#

virtualvoidrecv( DataContextconst&ctx, void*data, size_tsize, )const=0#

inlinevirtualboolisThreadSafe()constnoexcept#

classConnectionManager#

Public Functions

virtual~ConnectionManager()=default#

virtualConnectionconst*recvConnect( DataContextconst&ctx, void*data, size_tsize, )=0#

virtualstd::vector<Connectionconst*>getConnections( CommStateconst&state, )=0#

virtualCommStateconst&getCommState()const=0#

structDataContext#

Public Functions

inlineexplicitDataContext(inttag)#

inlineintgetTag()constnoexcept#

Private Members

intconstmTag#

On this page

Executor#

disaggServerUtil.h#

tensor.h#

transferAgent.h#

serialization.h#

types.h#

executor.h#

dataTransceiverState.h#

cacheCommunicator.h#

Executor #