Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 80c5491

Browse files
Specdec Bench: vLLM reqid, SGL path, conc > 1 metric fix (#541)
## What does this PR do?

**SGLang** Fix for actually passing the draft model path to the engine

**vLLM** Fix for multiturn to not overlap request_id strings

**Acceptance Rate** Fix for potential race condition on multiturn datasets in writing back AR

**Overview:** ?

## Usage

<!-- You can potentially add a usage example below. -->

```python
# Add a code snippet demonstrating how to use this
```

## Testing

<!-- Mention how have you tested your change if applicable. -->

## Before your PR is "*Ready for review*"

<!-- If you haven't finished some of the above items you can still open a `Draft` PR. -->

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No <!-- If No, explain why. -->
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No <!-- Only for new features, API changes, critical bug fixes or bw breaking changes. -->

## Additional Information

<!-- E.g. related issue. -->

Signed-off-by: Izzy Putterman <iputterman@nvidia.com>
1 parent 592a499 commit 80c5491

File tree

12 files changed

+48
-39
lines changed

12 files changed

+48
-39
lines changed

‎examples/specdec_bench/run.py‎

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,14 @@ async def process_single_request(request, i):
4949
ifrequest.system_promptisnotNone:
5050
messages.append({"role":"system","content":request.system_prompt})
5151

52-
forquestioninrequest.turns:
52+
forturn_id,questioninenumerate(request.turns):
5353
messages.append({"role":"user","content":question})
5454
entry_encoded=encode_chat(tokenizer,messages)
5555

5656
# Run the async runner.run directly
57-
output_tokens=awaitrunner.run(entry_encoded,max_length,end_id,i)
57+
output_tokens=awaitrunner.run(
58+
entry_encoded,max_length,end_id,request_id=i,turn_id=turn_id
59+
)
5860
output_text=decode_chat(tokenizer,output_tokens["output_ids"][0])
5961
output_text=postprocess(output_text)
6062
messages.append({"role":"assistant","content":output_text})

‎examples/specdec_bench/specdec_bench/metrics/aa_timing.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def __init__(self, base_tokenizer):
3434
self.base_tokenizer=base_tokenizer
3535
self.total_tokens= []
3636

37-
defprocess_step(self,step_outputs,new_turn=True):
37+
defprocess_step(self,step_outputs,request_id,turn_id):
3838
self.timing.append(step_outputs["token_times"])
3939
target_tokens= [
4040
tfortok_listinstep_outputs["output_ids"]fortokintok_listfortintok

‎examples/specdec_bench/specdec_bench/metrics/acceptance_rate.py‎

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,17 @@
2222
classAcceptanceRate(Metric):
2323
def__init__(self):
2424
super().__init__()
25-
self.prompt_ar=[]
25+
self.prompt_ar={}
2626
self.name="acceptance_rate"
2727

28-
defprocess_step(self,step_outputs,new_turn=True):
29-
ifnew_turn:
30-
self.prompt_ar.append([])
28+
defprocess_step(self,step_outputs,request_id,turn_id):
29+
ifrequest_idnotinself.prompt_ar:
30+
self.prompt_ar[request_id]= {}
31+
ifturn_idnotinself.prompt_ar[request_id]:
32+
self.prompt_ar[request_id][turn_id]= []
3133
fori,beam_outputinenumerate(step_outputs["output_ids"]):
3234
foroutput_id_iterinbeam_output:
33-
self.prompt_ar[-1].append(len(output_id_iter))
35+
self.prompt_ar[request_id][turn_id].append(len(output_id_iter))
3436

3537
def_get_lengths(self,turn,lengths):
3638
forjinturn:
@@ -55,16 +57,19 @@ def _process_lengths(self, lengths):
5557
running_len-=v
5658

5759
defprocess_final(self,text_outputs):
58-
i=0
60+
all_ar=[]
5961
lengths= {}
6062
self.out["Request_AR"]= {}
61-
whilei<len(self.prompt_ar):
62-
turn_1=self.prompt_ar[i]
63-
self.out["Request_AR"][i]=sum(turn_1)/len(turn_1)
64-
self._get_lengths(turn_1,lengths)
65-
print(i,self.out["Request_AR"][i])
66-
i+=1
67-
average_ar=sum(self.out["Request_AR"].values())/len(self.out["Request_AR"])
63+
self.prompt_ar=dict(sorted(self.prompt_ar.items(),key=lambdax:x[0]))
64+
forrequest_id,turnsinself.prompt_ar.items():
65+
self.out["Request_AR"][request_id]= {}
66+
forturn_id,turninturns.items():
67+
ar=sum(turn)/len(turn)
68+
self.out["Request_AR"][request_id][turn_id]=ar
69+
all_ar.append(ar)
70+
self._get_lengths(turn,lengths)
71+
print(request_id,turn_id,self.out["Request_AR"][request_id][turn_id])
72+
average_ar=sum(all_ar)/len(all_ar)
6873
print("Average AR:",average_ar)
6974
self.out["Average_AR"]=average_ar
7075
self._process_lengths(lengths)

‎examples/specdec_bench/specdec_bench/metrics/base.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __init__(self):
2424
self.out= {}
2525
self.name="metric"
2626

27-
defprocess_step(self,step_outputs,new_turn=True):
27+
defprocess_step(self,step_outputs,request_id,turn_id):
2828
raiseNotImplementedError
2929

3030
defprocess_final(self,text_outputs):

‎examples/specdec_bench/specdec_bench/metrics/mtbench.py‎

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,16 @@ def process_final(self, text_outputs):
3535
i=0
3636
lengths= {}
3737
self.out["Request_AR"]= {}
38-
whilei<len(self.prompt_ar):
39-
turn_1=self.prompt_ar[i]
40-
turn_2=self.prompt_ar[i+1]
41-
q_id=i//2
38+
self.prompt_ar=dict(sorted(self.prompt_ar.items(),key=lambdax:x[0]))
39+
forrequest_id,turnsinself.prompt_ar.items():
40+
turn_1=turns[0]
41+
turn_2=turns[1]
42+
q_id=request_id
4243
mtbench_topic=MTBENCH_TOPICS[q_id//10]
43-
self.out["Request_AR"][q_id]=sum(turn_1+turn_2)/len(turn_1+turn_2)
44+
self.out["Request_AR"][request_id]=sum(turn_1+turn_2)/len(turn_1+turn_2)
4445
self._get_lengths(turn_1,lengths)
4546
self._get_lengths(turn_2,lengths)
4647
print(mtbench_topic,sum(turn_1+turn_2)/len(turn_1+turn_2))
47-
i+=2
4848
per_category= [[]for_inrange(len(MTBENCH_TOPICS))]
4949
forq_id,arinself.out["Request_AR"].items():
5050
per_category[q_id//10].append(ar)

‎examples/specdec_bench/specdec_bench/metrics/timing.py‎

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def __init__(self, tp_size):
2626
self.total_tokens= []
2727
self.tp_size=tp_size
2828

29-
defprocess_step(self,step_outputs,new_turn=True):
29+
defprocess_step(self,step_outputs,request_id,turn_id):
3030
self.timing.append(step_outputs["token_times"])
3131
self.total_tokens.append(
3232
sum([sum([len(j)forjini])foriinstep_outputs["output_ids"]])
@@ -42,8 +42,9 @@ def process_final(self, text_outputs):
4242
self.out["Output TPS"]=sum(self.total_tokens)/ (end_time-start_time)
4343
self.out["Output TPS/gpu"]=self.out["Output TPS"]/self.tp_size
4444
fortokens,timesinzip(self.total_tokens,self.timing):
45-
e2e_time.append(times[-1]-times[0])
46-
ttft_time.append(times[1]-times[0])
45+
iflen(times)>1:
46+
e2e_time.append(times[-1]-times[0])
47+
ttft_time.append(times[1]-times[0])
4748
iflen(times)>2:
4849
gen_tp_time.append((tokens-1)/ (times[-1]-times[1]))
4950
tpot_time.extend([a-bfora,binzip(times[1:],times[:-1])])

‎examples/specdec_bench/specdec_bench/models/base.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class Model:
1818
def__init__(self,model_dir,tokenizer,max_draft_length):
1919
raiseNotImplementedError
2020

21-
asyncdefrun(self,prompt_ids,max_length,end_id,request_id):
21+
asyncdefrun(self,prompt_ids,max_length,end_id,request_id,turn_id):
2222
"""
2323
prompt_ids is list of tokens
2424
output is list of list of tokens

‎examples/specdec_bench/specdec_bench/models/sglang.py‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def __init__(
5050
speculative_num_steps=kwargs.get("speculative_num_steps",3),
5151
speculative_eagle_topk=kwargs.get("speculative_eagle_topk",1),
5252
speculative_num_draft_tokens=kwargs.get("speculative_num_draft_tokens",4),
53+
speculative_draft_model_path=kwargs.get("draft_model_dir"),
5354
torch_compile_max_bs=max_concurrent_requests,
5455
attention_backend=kwargs.get("attention_backend"),
5556
enable_torch_compile=kwargs.get("enable_torch_compile",False),
@@ -70,7 +71,7 @@ def __init__(
7071

7172
self.sampling_config=sampling_kwargs
7273

73-
asyncdefrun(self,prompt_ids,max_length,end_id,request_id):
74+
asyncdefrun(self,prompt_ids,max_length,end_id,request_id,turn_id):
7475
timing= []
7576
output_dict= {}
7677
self.sampling_config["max_new_tokens"]=max_length

‎examples/specdec_bench/specdec_bench/models/trtllm_torch_api.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def __init__(
4343
self.model=create_executor(model_path,max_concurrent_requests,kwargs)
4444
self.sampling_kwargs=sampling_kwargs
4545

46-
asyncdefrun(self,prompt_ids,max_length,end_id,request_id):
46+
asyncdefrun(self,prompt_ids,max_length,end_id,request_id,turn_id):
4747
output_dict= {}
4848
sampling_config=check_sampling_config(self.sampling_kwargs,max_length,end_id)
4949
outputs= []

‎examples/specdec_bench/specdec_bench/models/vllm.py‎

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,12 +84,12 @@ def __init__(self, model_dir, max_concurrent_requests, sampling_kwargs, **kwargs
8484
self.loop=asyncio.new_event_loop()
8585
asyncio.set_event_loop(self.loop)
8686

87-
asyncdefrun(self,prompt_ids,max_length,end_id,request_id):
87+
asyncdefrun(self,prompt_ids,max_length,end_id,request_id,turn_id):
8888
output_dict= {}
8989
self.sampling_config.max_tokens=max_length
9090
self.sampling_config.stop_token_ids= [end_id]
9191

92-
outputs,timing,full_tokens=awaitself.generate(prompt_ids,request_id)
92+
outputs,timing,full_tokens=awaitself.generate(prompt_ids,request_id,turn_id)
9393

9494
reformatted_output_ids= [[]for_inrange(self.sampling_kwargs.get("beam_width",1))]
9595
start=0
@@ -114,13 +114,13 @@ async def run(self, prompt_ids, max_length, end_id, request_id):
114114
]
115115
returnoutput_dict
116116

117-
asyncdefgenerate(self,prompt_ids,request_id):
117+
asyncdefgenerate(self,prompt_ids,request_id,turn_id):
118118
timing= []
119119
timing.append(time.perf_counter())
120120
outputs= []
121121
full_tokens= []
122122
asyncforoutputinself.model.generate(
123-
request_id=str(request_id),
123+
request_id=f"{request_id}.{turn_id}",
124124
prompt=TokensPrompt(prompt_token_ids=prompt_ids),
125125
sampling_params=self.sampling_config,
126126
):

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp