- Notifications
You must be signed in to change notification settings - Fork 1.9k
[TRTLLM-7292][feat] Support multi-threaded tokenizers for trtllm-serve#7515
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
kaiyux merged 7 commits into NVIDIA:release/1.1.0rc2 from nv-yilinf:optimize-serve-host-overhead on Sep 5, 2025
Uh oh!
There was an error while loading. Please reload this page.
Merged
Changes from 1 commit
Commits
Show all changes
7 commits — Select commit. Hold shift + click to select a range.
6a074cd Use multithread for llm-serve tokenization
nv-yilinfe89b4c3 disable gc in trtllm-serve
nv-yilinfe6c4bc0 Refactor generate_async and extract preprocess inputs logic for multi…
nv-yilinf13ccda8 Revert "disable gc in trtllm-serve"
nv-yilinfca94fad Minor fix
nv-yilinfc16d826 fix llm api stability
nv-yilinf260b4bd Address comment and avoid changing llmapi
nv-yilinfFile filter
Filter by extension
Conversations
Failed to load comments.
Loading
Uh oh!
There was an error while loading.Please reload this page.
Jump to
Jump to file
Failed to load files.
Loading
Uh oh!
There was an error while loading.Please reload this page.
Diff view
Diff view
fix llm api stability
Signed-off-by: Yilin Fan <206948969+nv-yilinf@users.noreply.github.com>
- Loading branch information
Uh oh!
There was an error while loading.Please reload this page.
commit c16d826cf6b70153a1ec2bcdd7f9427d271097a2
There are no files selected for viewing
168 changes: 90 additions & 78 deletions tensorrt_llm/llmapi/llm.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -313,84 +313,6 @@ def _item_at(maybe_batched: Union[Any, Sequence[Any]], pos: int) -> Any: | ||
| return futures | ||
| @nvtx_range_debug("LLM.generate_async", color="green", category="LLM") | ||
| def generate_async( | ||
| self, | ||
| @@ -418,6 +340,7 @@ def generate_async( | ||
| kv_cache_retention_config (tensorrt_llm.bindings.executor.KvCacheRetentionConfig, optional): Configuration for the request's retention in the KV Cache. Defaults to None. | ||
| disaggregated_params (tensorrt_llm.disaggregated_params.DisaggregatedParams, optional): Disaggregated parameters. Defaults to None. | ||
| scheduling_params (tensorrt_llm.scheduling_params.SchedulingParams, optional): Scheduling parameters. Defaults to None. | ||
| preprocessed_inputs (tensorrt_llm.inputs.data.PreprocessedInputs, optional): If preprocessed_inputs is not None, params `inputs` and `sampling_params` will be ignored. Defaults to None. | ||
nv-yilinf marked this conversation as resolved. OutdatedShow resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
| Returns: | ||
| tensorrt_llm.llmapi.RequestOutput: The output data of the completion request to the LLM. | ||
| @@ -470,6 +393,95 @@ def generate_async( | ||
| return RequestOutput._from_generation_result(result, prompt, | ||
| self.tokenizer) | ||
| @set_api_status("beta") | ||
| def preprocess_inputs( | ||
| self, | ||
| inputs: PromptInputs, | ||
| sampling_params: Optional[SamplingParams] = None | ||
| ) -> PreprocessedInputs: | ||
| '''Preprocess inputs and sampling_params before passing them into generate_async. | ||
| Normally you don't need this. But it could be useful if you want to process many inputs in parallel | ||
| Args: | ||
| inputs (tensorrt_llm.inputs.data.PromptInputs): The prompt text or token ids; it must be single prompt. | ||
| sampling_params (tensorrt_llm.sampling_params.SamplingParams, optional): The sampling params for the generation. Defaults to None. | ||
| Returns: | ||
| tensorrt_llm.inputs.data.PreprocessedInputs: could be passed to generate_async as an optional argument | ||
| ''' | ||
| sampling_params = self._prepare_sampling_params(sampling_params) | ||
| inputs = prompt_inputs(inputs) | ||
| if not inputs.get("prompt") and inputs.get("prompt_token_ids") and ( | ||
| inputs.get("multi_modal_data") | ||
| or inputs.get("multi_modal_embeddings")) and not isinstance( | ||
| self.input_processor, DefaultInputProcessor): | ||
| # VLMs need to process/tokenize the prompt in their own way | ||
| prompt = self.tokenizer.decode(inputs['prompt_token_ids']) | ||
| inputs = TextPrompt( | ||
| prompt=prompt, | ||
| multi_modal_data=inputs.get("multi_modal_data"), | ||
| mm_processor_kwargs=inputs.get("mm_processor_kwargs")) | ||
| if sampling_params.add_special_tokens: | ||
| logger.debug( | ||
| "Setting add_special_tokens to False because prompt_token_ids were provided to generate. VLMs will re-encode the prompt." | ||
| ) | ||
| sampling_params.add_special_tokens = False | ||
| query_token_ids = None | ||
| multimodal_params = None | ||
| if "prompt_token_ids" in inputs: | ||
| # TODO: if specify prompt_token_ids, the mm hashing is not supported yet | ||
| prompt_token_ids = inputs['prompt_token_ids'] | ||
| prompt = None | ||
| query_token_ids = inputs.get("query_token_ids", None) | ||
| elif "prompt" in inputs: | ||
| if 'multi_modal_data' in inputs: | ||
| # TODO: The current design uses a wrapper for existing input processor (input_processor_with_hash) | ||
| # to handle/add multimodal hashes, positions, and lengths. Now we only support image modality. | ||
| # In the future, we should refactor this to: | ||
| # 1. Extend support for more modalities and models | ||
| # 2. Decouple input processor into distinct phases (preprocessor (all preprocessing logics), vision model (fuse in model fwd), etc. | ||
| input_processor_with_hash = create_input_processor_with_hash( | ||
| self.input_processor) | ||
| with nvtx_range_debug("input_processor_with_hash"): | ||
| prompt_token_ids, extra_processed_inputs = input_processor_with_hash( | ||
| inputs, sampling_params) | ||
| elif 'multi_modal_embeddings' in inputs: | ||
| mm_embedding_info = inputs['multi_modal_embeddings'] | ||
| prompt_token_ids, extra_processed_inputs = self.input_processor.attach_multimodal_embeddings( | ||
| inputs, mm_embedding_info, sampling_params) | ||
| else: | ||
| with nvtx_range_debug("input_processor"): | ||
| prompt_token_ids, extra_processed_inputs = self.input_processor( | ||
| inputs, sampling_params) | ||
| prompt = inputs['prompt'] | ||
| if extra_processed_inputs is not None: | ||
| query_token_ids = extra_processed_inputs.get('query_token_ids') | ||
| # Create unified MultimodalParams | ||
| multimodal_params = MultimodalParams( | ||
| multimodal_input=extra_processed_inputs.get( | ||
| 'multimodal_input'), | ||
| multimodal_data=extra_processed_inputs.get( | ||
| 'multimodal_data')) | ||
| # Only pass it if it has content | ||
| if not multimodal_params.has_content(): | ||
| multimodal_params = None | ||
| else: | ||
| # Convert to shared tensor handle to reduce IPC overhead | ||
| multimodal_params.to_handle("multimodal_data") | ||
| else: | ||
| raise TypeError( | ||
| f"The inputs must be type str or list of int, but got {type(inputs)}" | ||
| ) | ||
| return PreprocessedInputs(prompt_token_ids=prompt_token_ids, | ||
| prompt=prompt, | ||
| query_token_ids=query_token_ids, | ||
| sampling_params=sampling_params, | ||
| multimodal_params=multimodal_params) | ||
| @set_api_status("beta") | ||
| def get_stats(self, timeout: Optional[float] = 2) -> List[dict]: | ||
| '''Get iteration statistics from the runtime. | ||
13 changes: 13 additions & 0 deletions tests/unittest/api_stability/references/llm.yaml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.