NVIDIA/TensorRT-LLM (Public)

Notifications: You must be signed in to change notification settings
Fork 1.9k
Star 12.3k

Commit 4c98e8b

committed

feat: batched sampling by strategy (supersedes enable_mixed_sampler)

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>

1 parent 870cfcf, commit 4c98e8b (Copy full SHA for 4c98e8b)

File tree

18 files changed

+1005

-296

lines changed

setup.py
tensorrt_llm
- _torch
  - auto_deploy
    - llm_args.py
    - shim
      - ad_executor.py
  - modules
    - rms_norm.py
  - pyexecutor
  - speculative
    - eagle3.py
    - model_drafter.py
- evaluate
  - json_mode_eval.py
  - mmlu.py
- llmapi
  - llm_args.py
- scaffolding
  - worker.py
tests/unittest
- api_stability/references
  - llm.yaml
- llmapi
  - apps
    - _test_openai_misc.py
  - test_llm_pytorch.py

18 files changed

+1005

-296

lines changed

`‎setup.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -260,4 +260,4 @@ def extract_from_precompiled(precompiled_location: str, package_data: List[str],`
`260`	`260`	`install_requires=required_deps,`
`261`	`261`	`dependency_links=`
`262`	`262`	`extra_URLs,# Warning: Dependency links support has been dropped by pip 19.0`
`263`		`-python_requires=">=3.7, <4")`
	`263`	`+python_requires=">=3.10, <4")`

`‎tensorrt_llm/_torch/auto_deploy/llm_args.py‎`

Lines changed: 0 additions & 6 deletions

Original file line number	Diff line number	Diff line change
`@@ -105,12 +105,6 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings):`
`105`	`105`	`description="Disable the overlap scheduler in trtllm runtime",`
`106`	`106`	`)`
`107`	`107`
`108`		`-enable_mixed_sampler:bool=Field(`
`109`		`-default=False,`
`110`		`-description="If true, will iterate over sampling_params of each request and use the corresponding "`
`111`		`-"sampling strategy, e.g. top-k, top-p, etc.",`
`112`		`- )`
`113`		`-`
`114`	`108`	`world_size:int=Field(`
`115`	`109`	`default=1,`
`116`	`110`	`ge=0,`

`‎tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py‎`

Lines changed: 0 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -337,16 +337,11 @@ def create_autodeploy_executor(ad_config: LlmArgs):`
`337`	`337`	`scheduler=SimpleScheduler(capacitor_scheduler,mb_scheduler)`
`338`	`338`
`339`	`339`	`# search sampler with speculative decoding`
`340`		`-# TODO (lucaslie, fridah-nv): some models require enable_mixed_sampler=True to have good outputs, see`
`341`		`-# https://github.com/NVIDIA/TensorRT-LLM/issues/5254`
`342`		`-# We should expose mixed_sample to our build_and_run_ad script so we can configure this`
`343`		`-# correctly for models as needed.`
`344`	`340`	`sampler_args=TorchSampler.Args(`
`345`	`341`	`max_seq_len=ad_config.max_seq_len,`
`346`	`342`	`max_draft_len=max_draft_len,`
`347`	`343`	`max_num_sequences=max_num_sequences,`
`348`	`344`	`max_beam_width=ad_config.max_beam_width,`
`349`		`-enable_mixed_sampler=ad_config.enable_mixed_sampler,`
`350`	`345`	`)`
`351`	`346`	`sampler=TorchSampler(sampler_args)`
`352`	`347`

`‎tensorrt_llm/_torch/modules/rms_norm.py‎`

Lines changed: 25 additions & 12 deletions

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,8 @@`
`14`	`14`	`# limitations under the License.`
`15`	`15`
`16`	`16`	`importenum`
`17`		`-fromtypingimportOptional,Tuple,Union`
	`17`	`+fromtypesimportEllipsisType# https://stackoverflow.com/a/66636313`
	`18`	`+fromtypingimportOptional,Tuple,TypeAlias,Union,cast`
`18`	`19`
`19`	`20`	`importtorch`
`20`	`21`	`fromtorchimportnn`
`@@ -24,6 +25,9 @@`
`24`	`25`
`25`	`26`	`classRMSNorm(nn.Module):`
`26`	`27`
	`28`	`+_ARGUMENT_NOT_SPECIFIED_SENTINEL= ...`
	`29`	`+_ArgumentNotSpecifiedSentinelType:TypeAlias=EllipsisType`
	`30`	`+`
`27`	`31`	`def__init__(`
`28`	`32`	`self,`
`29`	`33`	`*,`
`@@ -48,12 +52,19 @@ def __init__(`
`48`	`52`	`defforward(`
`49`	`53`	`self,`
`50`	`54`	`hidden_states:torch.Tensor,`
`51`		`-residual:Optional[torch.Tensor]= ...,`
`52`		`- )->Union[torch.Tensor,Tuple[torch.Tensor,torch.Tensor]]:`
	`55`	`+residual:Union[`
	`56`	`+Optional[torch.Tensor],`
	`57`	`+_ArgumentNotSpecifiedSentinelType]=_ARGUMENT_NOT_SPECIFIED_SENTINEL,`
	`58`	`+ )->Union[torch.Tensor,Tuple[torch.Tensor,Optional[torch.Tensor]]]:`
	`59`	`+return_residual=True`
	`60`	`+ifresidualisself._ARGUMENT_NOT_SPECIFIED_SENTINEL:`
	`61`	`+return_residual=False`
	`62`	`+residual=None`
	`63`	`+`
`53`	`64`	`ifIS_FLASHINFER_AVAILABLE:`
`54`	`65`	`from ..custom_opsimport (flashinfer_fused_add_rmsnorm,`
`55`	`66`	`flashinfer_rmsnorm)`
`56`		`-ifisinstance(residual,torch.Tensor):`
	`67`	`+ifresidualisnotNone:`
`57`	`68`	`flashinfer_fused_add_rmsnorm(hidden_states,residual,`
`58`	`69`	`self.weight,self.variance_epsilon)`
`59`	`70`	`else:`
`@@ -62,7 +73,7 @@ def forward(`
`62`	`73`	`else:`
`63`	`74`	`input_dtype=hidden_states.dtype`
`64`	`75`	`hidden_states=hidden_states.to(torch.float32)`
`65`		`-ifisinstance(residual,torch.Tensor):`
	`76`	`+ifresidualisnotNone:`
`66`	`77`	`hidden_states=hidden_states+residual.to(torch.float32)`
`67`	`78`	`residual=hidden_states.to(input_dtype)`
`68`	`79`
`@@ -71,20 +82,22 @@ def forward(`
`71`	`82`	`self.variance_epsilon)`
`72`	`83`	`hidden_states=self.weight*hidden_states.to(input_dtype)`
`73`	`84`
`74`		`-ifresidualis ...:`
`75`		`-returnhidden_states`
	`85`	`+ifreturn_residual:`
	`86`	`+returnhidden_states,cast(Optional[torch.Tensor],residual)`
`76`	`87`	`else:`
`77`		`-returnhidden_states,residual`
	`88`	`+returnhidden_states`
`78`	`89`
`79`	`90`	`defskip_forward(`
`80`	`91`	`self,`
`81`	`92`	`hidden_states:torch.Tensor,`
`82`		`-residual:Optional[torch.Tensor]= ...,`
`83`		`- )->Union[torch.Tensor,Tuple[torch.Tensor,torch.Tensor]]:`
`84`		`-ifresidualis ...:`
	`93`	`+residual:Union[`
	`94`	`+Optional[torch.Tensor],`
	`95`	`+_ArgumentNotSpecifiedSentinelType]=_ARGUMENT_NOT_SPECIFIED_SENTINEL,`
	`96`	`+ )->Union[torch.Tensor,Tuple[torch.Tensor,Optional[torch.Tensor]]]:`
	`97`	`+ifresidualisself._ARGUMENT_NOT_SPECIFIED_SENTINEL:`
`85`	`98`	`returnhidden_states`
`86`	`99`	`else:`
`87`		`-returnhidden_states,residual`
	`100`	`+returnhidden_states,cast(Optional[torch.Tensor],residual)`
`88`	`101`
`89`	`102`
`90`	`103`	`classGroupRMSNormKernelSelection(enum.Enum):`

`‎tensorrt_llm/_torch/pyexecutor/_util.py‎`

Lines changed: 1 addition & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -686,7 +686,7 @@ def create_py_executor_instance(`
`686`	`686`
`687`	`687`
`688`	`688`	`defcreate_torch_sampler_args(mapping:Mapping,*,max_seq_len:int,`
`689`		`-enable_mixed_sampler:bool,max_batch_size:int,`
	`689`	`+max_batch_size:int,`
`690`	`690`	`speculative_config:SpeculativeConfig,`
`691`	`691`	`max_beam_width:int):`
`692`	`692`	`max_num_sequences=max_batch_size*mapping.pp_size`
`@@ -697,7 +697,6 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,`
`697`	`697`	`max_draft_len=max_draft_len,`
`698`	`698`	`max_num_sequences=max_num_sequences,`
`699`	`699`	`max_beam_width=max_beam_width,`
`700`		`-enable_mixed_sampler=enable_mixed_sampler,`
`701`	`700`	`)`
`702`	`701`
`703`	`702`
`@@ -711,7 +710,6 @@ def instantiate_sampler(engine: PyTorchModelEngine,`
`711`	`710`	`sampler_args=create_torch_sampler_args(`
`712`	`711`	`mapping,`
`713`	`712`	`max_seq_len=engine.max_seq_len,`
`714`		`-enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler,`
`715`	`713`	`max_batch_size=max_batch_size,`
`716`	`714`	`speculative_config=speculative_config,`
`717`	`715`	`max_beam_width=max_beam_width)`

`‎tensorrt_llm/_torch/pyexecutor/config.py‎`

Lines changed: 0 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -56,11 +56,6 @@ class PyTorchConfig:`
`56`	`56`
`57`	`57`	`moe_disable_finalize_fusion:bool=False`
`58`	`58`
`59`		`-enable_mixed_sampler:bool=False`
`60`		`-"""`
`61`		`- If true, will iterate over sampling_params of each request and use the`
`62`		`- corresponding sampling strategy, e.g. top-k, top-p, etc.`
`63`		`- """`
`64`	`59`	`sampler_type:SamplerType=SamplerType.auto`
`65`	`60`	`"""`
`66`	`61`	`The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto.`

`‎tensorrt_llm/_torch/pyexecutor/llm_request.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -365,7 +365,7 @@ def __init__(`
`365`	`365`	`exclude_last_generation_logits)`
`366`	`366`	`self.child_requests= []`
`367`	`367`
`368`		`-self._py_embedding_bias_1d=None`
	`368`	`+self._py_embedding_bias_1d:Optional[torch.Tensor]=None`
`369`	`369`	`ifhasattr(self,'embedding_bias')andself.embedding_biasisnotNone:`
`370`	`370`	`# Pre-squeeze to 1D if needed (remove batch dimension)`
`371`	`371`	`ifself.embedding_bias.dim()>1:`

`‎tensorrt_llm/_torch/pyexecutor/py_executor_creator.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -350,7 +350,7 @@ def create_py_executor(`
`350`	`350`	`if_get_allow_chain_drafter():`
`351`	`351`	`use_chain_drafter= (`
`352`	`352`	`guided_decoding_configisNone`
`353`		`-andnotpytorch_backend_config.enable_mixed_sampler`
	`353`	`+anddraft_spec_config._allow_greedy_draft_tokens`
`354`	`354`	`andpytorch_backend_config.attn_backend=="TRTLLM")`
`355`	`355`	`else:`
`356`	`356`	`use_chain_drafter=False`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 4c98e8b

File tree

18 files changed

18 files changed

`‎setup.py‎`

`‎tensorrt_llm/_torch/auto_deploy/llm_args.py‎`

`‎tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py‎`

`‎tensorrt_llm/_torch/modules/rms_norm.py‎`

`‎tensorrt_llm/_torch/pyexecutor/_util.py‎`

`‎tensorrt_llm/_torch/pyexecutor/config.py‎`

`‎tensorrt_llm/_torch/pyexecutor/llm_request.py‎`

`‎tensorrt_llm/_torch/pyexecutor/py_executor_creator.py‎`

0 commit comments