# Distributed LLM Generation

Source: NVIDIA/TensorRT-LLM.

 1fromtensorrt_llmimportLLM,SamplingParams 2 3 4defmain(): 5# model could accept HF model name or a path to local HF model. 6llm=LLM( 7model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", 8# Enable 2-way tensor parallelism 9tensor_parallel_size=210# Enable 2-way pipeline parallelism if needed11# pipeline_parallel_size=212# Enable 2-way expert parallelism for MoE model's expert weights13# moe_expert_parallel_size=214# Enable 2-way tensor parallelism for MoE model's expert weights15# moe_tensor_parallel_size=216)1718# Sample prompts.19prompts=[20"Hello, my name is",21"The capital of France is",22"The future of AI is",23]2425# Create a sampling params.26sampling_params=SamplingParams(temperature=0.8,top_p=0.95)2728foroutputinllm.generate(prompts,sampling_params):29print(30f"Prompt:{output.prompt!r}, Generated text:{output.outputs[0].text!r}"31)3233# Got output like34# Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'35# Prompt: 'The capital of France is', Generated text: 'Paris.'36# Prompt: 'The future of AI is', Generated text: 'an exciting time for us. We are constantly researching, developing, and improving our platform to create the most advanced and efficient model available. We are'373839# The entry point of the program need to be protected for spawning processes.40if__name__=='__main__':41main()