Generate text in streaming

Source: NVIDIA/TensorRT-LLM.

 1importasyncio 2 3fromtensorrt_llmimportLLM,SamplingParams 4 5 6defmain(): 7 8# model could accept HF model name or a path to local HF model. 9llm=LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")1011# Sample prompts.12prompts=[13"Hello, my name is",14"The capital of France is",15"The future of AI is",16]1718# Create a sampling params.19sampling_params=SamplingParams(temperature=0.8,top_p=0.95)2021# Async based on Python coroutines22asyncdeftask(id:int,prompt:str):2324# streaming=True is used to enable streaming generation.25asyncforoutputinllm.generate_async(prompt,26sampling_params,27streaming=True):28print(f"Generation for prompt-{id}:{output.outputs[0].text!r}")2930asyncdefmain():31tasks=[task(id,prompt)forid,promptinenumerate(prompts)]32awaitasyncio.gather(*tasks)3334asyncio.run(main())3536# Got output like follows:37# Generation for prompt-0: '\n'38# Generation for prompt-3: 'an'39# Generation for prompt-2: 'Paris'40# Generation for prompt-1: 'likely'41# Generation for prompt-0: '\n\n'42# Generation for prompt-3: 'an exc'43# Generation for prompt-2: 'Paris.'44# Generation for prompt-1: 'likely to'45# Generation for prompt-0: '\n\nJ'46# Generation for prompt-3: 'an exciting'47# Generation for prompt-2: 'Paris.'48# Generation for prompt-1: 'likely to nomin'49# Generation for prompt-0: '\n\nJane'50# Generation for prompt-3: 'an exciting time'51# Generation for prompt-1: 'likely to nominate'52# Generation for prompt-0: '\n\nJane Smith'53# Generation for prompt-3: 'an exciting time for'54# Generation for prompt-1: 'likely to nominate a'55# Generation for prompt-0: '\n\nJane Smith.'56# Generation for prompt-3: 'an exciting time for us'57# Generation for prompt-1: 'likely to nominate a new'585960if__name__=='__main__':61main()