Runtime Configuration Examples

Source: NVIDIA/TensorRT-LLM.

 1 2importargparse 3 4fromtensorrt_llmimportLLM,SamplingParams 5fromtensorrt_llm.llmapiimportCudaGraphConfig,KvCacheConfig 6 7 8defexample_cuda_graph_config(): 9"""10    Example demonstrating CUDA graph configuration for performance optimization.1112    CUDA graphs help with:13    - Reduced kernel launch overhead14    - Better GPU utilization15    - Improved throughput for repeated operations16    """17print("\n=== CUDA Graph Configuration Example ===")1819cuda_graph_config=CudaGraphConfig(20batch_sizes=[1,2,4],21enable_padding=True,22)2324llm=LLM(25model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",26cuda_graph_config=cuda_graph_config,# Enable CUDA graphs27max_batch_size=4,28max_seq_len=512,29kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5))3031prompts=[32"Hello, my name is",33"The capital of France is",34"The future of AI is",35]3637sampling_params=SamplingParams(max_tokens=50,temperature=0.8,top_p=0.95)3839# This should benefit from CUDA graphs40outputs=llm.generate(prompts,sampling_params)41foroutputinoutputs:42print(f"Prompt:{output.prompt}")43print(f"Generated:{output.outputs[0].text}")44print()454647defexample_kv_cache_config():48print("\n=== KV Cache Configuration Example ===")49print("\n1. 
KV Cache Configuration:")5051llm_advanced=LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",52max_batch_size=8,53max_seq_len=1024,54kv_cache_config=KvCacheConfig(55free_gpu_memory_fraction=0.5,56enable_block_reuse=True))5758prompts=[59"Hello, my name is",60"The capital of France is",61"The future of AI is",62]6364outputs=llm_advanced.generate(prompts)65fori,outputinenumerate(outputs):66print(f"Query{i+1}:{output.prompt}")67print(f"Answer:{output.outputs[0].text[:100]}...")68print()697071defmain():72"""73    Main function to run all runtime configuration examples.74    """75parser=argparse.ArgumentParser(76description="Runtime Configuration Examples")77parser.add_argument("--example",78type=str,79choices=["kv_cache","cuda_graph","all"],80default="all",81help="Which example to run")8283args=parser.parse_args()8485ifargs.example=="kv_cache"orargs.example=="all":86example_kv_cache_config()8788ifargs.example=="cuda_graph"orargs.example=="all":89example_cuda_graph_config()909192if__name__=="__main__":93main()