Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

[TRTLLM-7292][feat] Support multi-threaded tokenizers for trtllm-serve#7515

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
fix llm api stability
Signed-off-by: Yilin Fan <206948969+nv-yilinf@users.noreply.github.com>
  • Loading branch information
@nv-yilinf
nv-yilinf committed Sep 4, 2025
commit c16d826cf6b70153a1ec2bcdd7f9427d271097a2
168 changes: 90 additions & 78 deletions tensorrt_llm/llmapi/llm.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -313,84 +313,6 @@ def _item_at(maybe_batched: Union[Any, Sequence[Any]], pos: int) -> Any:

return futures

def preprocess_inputs(
        self,
        inputs: PromptInputs,
        sampling_params: Optional[SamplingParams] = None
) -> PreprocessedInputs:
    """Tokenize/process a single prompt into a ``PreprocessedInputs`` bundle.

    Normalizes ``sampling_params``, runs the configured input processor
    (including multimodal handling), and packages the result so it can be
    handed to the request-submission path.

    Args:
        inputs: A single prompt — text or token ids — possibly carrying
            multimodal data or pre-computed multimodal embeddings.
        sampling_params: Sampling parameters for the generation; a default
            is prepared when None.

    Returns:
        A ``PreprocessedInputs`` holding the prompt token ids, the prompt
        text (None when only token ids were given), optional query token
        ids, the resolved sampling params, and optional multimodal params.

    Raises:
        TypeError: If ``inputs`` carries neither ``prompt`` nor
            ``prompt_token_ids``.
    """
    sampling_params = self._prepare_sampling_params(sampling_params)

    # Normalize raw str / token-id-list input into a prompt dict.
    inputs = prompt_inputs(inputs)

    # Token ids plus multimodal payload with a non-default input processor:
    # decode the ids back to text so the VLM processor can re-tokenize.
    if not inputs.get("prompt") and inputs.get("prompt_token_ids") and (
            inputs.get("multi_modal_data")
            or inputs.get("multi_modal_embeddings")) and not isinstance(
                self.input_processor, DefaultInputProcessor):
        # VLMs need to process/tokenize the prompt in their own way
        prompt = self.tokenizer.decode(inputs['prompt_token_ids'])
        inputs = TextPrompt(
            prompt=prompt,
            multi_modal_data=inputs.get("multi_modal_data"),
            mm_processor_kwargs=inputs.get("mm_processor_kwargs"))
        if sampling_params.add_special_tokens:
            logger.debug(
                "Setting add_special_tokens to False because prompt_token_ids were provided to generate. VLMs will re-encode the prompt."
            )
            # Avoid adding special tokens twice when the VLM re-encodes.
            sampling_params.add_special_tokens = False

    query_token_ids = None
    multimodal_params = None

    if "prompt_token_ids" in inputs:
        # TODO: if specify prompt_token_ids, the mm hashing is not supported yet
        prompt_token_ids = inputs['prompt_token_ids']
        prompt = None
        query_token_ids = inputs.get("query_token_ids", None)
    elif "prompt" in inputs:
        if 'multi_modal_data' in inputs:
            # TODO: The current design uses a wrapper for existing input processor (input_processor_with_hash)
            # to handle/add multimodal hashes, positions, and lengths. Now we only support image modality.
            # In the future, we should refactor this to:
            # 1. Extend support for more modalities and models
            # 2. Decouple input processor into distinct phases (preprocessor (all preprocessing logics), vision model (fuse in model fwd), etc.
            input_processor_with_hash = create_input_processor_with_hash(
                self.input_processor)
            with nvtx_range_debug("input_processor_with_hash"):
                prompt_token_ids, extra_processed_inputs = input_processor_with_hash(
                    inputs, sampling_params)
        elif 'multi_modal_embeddings' in inputs:
            # Embeddings were computed ahead of time; attach them instead of
            # running the multimodal preprocessing path.
            mm_embedding_info = inputs['multi_modal_embeddings']
            prompt_token_ids, extra_processed_inputs = self.input_processor.attach_multimodal_embeddings(
                inputs, mm_embedding_info, sampling_params)
        else:
            # Plain text prompt: run the standard input processor.
            with nvtx_range_debug("input_processor"):
                prompt_token_ids, extra_processed_inputs = self.input_processor(
                    inputs, sampling_params)
        prompt = inputs['prompt']
        if extra_processed_inputs is not None:
            query_token_ids = extra_processed_inputs.get('query_token_ids')
            # Create unified MultimodalParams
            multimodal_params = MultimodalParams(
                multimodal_input=extra_processed_inputs.get(
                    'multimodal_input'),
                multimodal_data=extra_processed_inputs.get(
                    'multimodal_data'))
            # Only pass it if it has content
            if not multimodal_params.has_content():
                multimodal_params = None
            else:
                # Convert to shared tensor handle to reduce IPC overhead
                multimodal_params.to_handle("multimodal_data")
    else:
        raise TypeError(
            f"The inputs must be type str or list of int, but got {type(inputs)}"
        )
    return PreprocessedInputs(prompt_token_ids=prompt_token_ids,
                              prompt=prompt,
                              query_token_ids=query_token_ids,
                              sampling_params=sampling_params,
                              multimodal_params=multimodal_params)

@nvtx_range_debug("LLM.generate_async", color="green", category="LLM")
def generate_async(
self,
Expand DownExpand Up@@ -418,6 +340,7 @@ def generate_async(
kv_cache_retention_config (tensorrt_llm.bindings.executor.KvCacheRetentionConfig, optional): Configuration for the request's retention in the KV Cache. Defaults to None.
disaggregated_params (tensorrt_llm.disaggregated_params.DisaggregatedParams, optional): Disaggregated parameters. Defaults to None.
scheduling_params (tensorrt_llm.scheduling_params.SchedulingParams, optional): Scheduling parameters. Defaults to None.
preprocessed_inputs (tensorrt_llm.inputs.data.PreprocessedInputs, optional): If preprocessed_inputs is not None, params `inputs` and `sampling_params` will be ignored. Defaults to None.

Returns:
tensorrt_llm.llmapi.RequestOutput: The output data of the completion request to the LLM.
Expand DownExpand Up@@ -470,6 +393,95 @@ def generate_async(
return RequestOutput._from_generation_result(result, prompt,
self.tokenizer)

@set_api_status("beta")
def preprocess_inputs(
        self,
        inputs: PromptInputs,
        sampling_params: Optional[SamplingParams] = None
) -> PreprocessedInputs:
    '''Preprocess inputs and sampling_params before passing them into generate_async.

    Normally you don't need this. It can be useful if you want to process
    many inputs in parallel (e.g. tokenizing on multiple threads) before
    submitting them.

    Args:
        inputs (tensorrt_llm.inputs.data.PromptInputs): The prompt text or token ids; it must be a single prompt.
        sampling_params (tensorrt_llm.sampling_params.SamplingParams, optional): The sampling params for the generation. Defaults to None.

    Returns:
        tensorrt_llm.inputs.data.PreprocessedInputs: Can be passed to generate_async as an optional argument.
    '''
    sampling_params = self._prepare_sampling_params(sampling_params)

    # Normalize raw str / token-id-list input into a prompt dict.
    inputs = prompt_inputs(inputs)

    # Token ids plus multimodal payload with a non-default input processor:
    # decode the ids back to text so the VLM processor can re-tokenize.
    if not inputs.get("prompt") and inputs.get("prompt_token_ids") and (
            inputs.get("multi_modal_data")
            or inputs.get("multi_modal_embeddings")) and not isinstance(
                self.input_processor, DefaultInputProcessor):
        # VLMs need to process/tokenize the prompt in their own way
        prompt = self.tokenizer.decode(inputs['prompt_token_ids'])
        inputs = TextPrompt(
            prompt=prompt,
            multi_modal_data=inputs.get("multi_modal_data"),
            mm_processor_kwargs=inputs.get("mm_processor_kwargs"))
        if sampling_params.add_special_tokens:
            logger.debug(
                "Setting add_special_tokens to False because prompt_token_ids were provided to generate. VLMs will re-encode the prompt."
            )
            # Avoid adding special tokens twice when the VLM re-encodes.
            sampling_params.add_special_tokens = False

    query_token_ids = None
    multimodal_params = None

    if "prompt_token_ids" in inputs:
        # TODO: if specify prompt_token_ids, the mm hashing is not supported yet
        prompt_token_ids = inputs['prompt_token_ids']
        prompt = None
        query_token_ids = inputs.get("query_token_ids", None)
    elif "prompt" in inputs:
        if 'multi_modal_data' in inputs:
            # TODO: The current design uses a wrapper for existing input processor (input_processor_with_hash)
            # to handle/add multimodal hashes, positions, and lengths. Now we only support image modality.
            # In the future, we should refactor this to:
            # 1. Extend support for more modalities and models
            # 2. Decouple input processor into distinct phases (preprocessor (all preprocessing logics), vision model (fuse in model fwd), etc.
            input_processor_with_hash = create_input_processor_with_hash(
                self.input_processor)
            with nvtx_range_debug("input_processor_with_hash"):
                prompt_token_ids, extra_processed_inputs = input_processor_with_hash(
                    inputs, sampling_params)
        elif 'multi_modal_embeddings' in inputs:
            # Embeddings were computed ahead of time; attach them instead of
            # running the multimodal preprocessing path.
            mm_embedding_info = inputs['multi_modal_embeddings']
            prompt_token_ids, extra_processed_inputs = self.input_processor.attach_multimodal_embeddings(
                inputs, mm_embedding_info, sampling_params)
        else:
            # Plain text prompt: run the standard input processor.
            with nvtx_range_debug("input_processor"):
                prompt_token_ids, extra_processed_inputs = self.input_processor(
                    inputs, sampling_params)
        prompt = inputs['prompt']
        if extra_processed_inputs is not None:
            query_token_ids = extra_processed_inputs.get('query_token_ids')
            # Create unified MultimodalParams
            multimodal_params = MultimodalParams(
                multimodal_input=extra_processed_inputs.get(
                    'multimodal_input'),
                multimodal_data=extra_processed_inputs.get(
                    'multimodal_data'))
            # Only pass it if it has content
            if not multimodal_params.has_content():
                multimodal_params = None
            else:
                # Convert to shared tensor handle to reduce IPC overhead
                multimodal_params.to_handle("multimodal_data")
    else:
        raise TypeError(
            f"The inputs must be type str or list of int, but got {type(inputs)}"
        )
    return PreprocessedInputs(prompt_token_ids=prompt_token_ids,
                              prompt=prompt,
                              query_token_ids=query_token_ids,
                              sampling_params=sampling_params,
                              multimodal_params=multimodal_params)

@set_api_status("beta")
def get_stats(self, timeout: Optional[float] = 2) -> List[dict]:
'''Get iteration statistics from the runtime.
Expand Down
13 changes: 13 additions & 0 deletionstests/unittest/api_stability/references/llm.yaml
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -196,7 +196,20 @@ methods:
annotation: Optional[tensorrt_llm.scheduling_params.SchedulingParams]
default: null
status: prototype
preprocessed_inputs:
annotation: Optional[tensorrt_llm.inputs.data.PreprocessedInputs]
default: null
return_annotation: tensorrt_llm.llmapi.llm.RequestOutput
preprocess_inputs:
parameters:
inputs:
annotation: tensorrt_llm.inputs.PromptInputs
default: inspect._empty
sampling_params:
annotation: Optional[tensorrt_llm.sampling_params.SamplingParams]
default: null
return_annotation: tensorrt_llm.inputs.data.PreprocessedInputs
status: beta
get_kv_cache_events:
parameters:
timeout:
Expand Down

[8]ページ先頭

©2009-2025 Movatter.jp