Run trtllm-bench with pytorch backend on Slurm

Source: NVIDIA/TensorRT-LLM.

 1#!/bin/bash 2#SBATCH -A <account> 3#SBATCH -p <partition> 4#SBATCH -t 01:00:00 5#SBATCH -N 2 6#SBATCH --ntasks-per-node=8 7#SBATCH -o logs/trtllm-bench.out 8#SBATCH -e logs/trtllm-bench.err 9#SBATCH -J trtllm-bench101112# NOTE, this feature is experimental and may not work on all systems.13# The trtllm-llmapi-launch is a script that launches the LLM-API code on14# Slurm-like systems, and can support multi-node and multi-GPU setups.1516# Note that, the number of MPI processes should be the same as the model world17# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for18# each, or 4 nodes with 4 gpus for each or other combinations.1920# This docker image should have tensorrt_llm installed, or you need to install21# it in the task.2223# The following variables are expected to be set in the environment:24# You can set them via --export in the srun/sbatch command.25#   CONTAINER_IMAGE: the docker image to use, you'd better install tensorrt_llm in it, or install it in the task.26#   MOUNT_DIR: the directory to mount in the container27#   MOUNT_DEST: the destination directory in the container28#   WORKDIR: the working directory in the container29#   SOURCE_ROOT: the path to the TensorRT LLM source30#   PROLOGUE: the prologue to run before the script31#   LOCAL_MODEL: the local model directory to use, NOTE: downloading from HF is32#      not supported in Slurm mode, you need to download the model and put it in33#      the LOCAL_MODEL directory.3435exportprepare_dataset="$SOURCE_ROOT/benchmarks/cpp/prepare_dataset.py"36exportdata_path="$WORKDIR/token-norm-dist.txt"3738echo"Preparing dataset..."39srun-l\40-N1\41-n1\42--container-image=${CONTAINER_IMAGE}\43--container-name="prepare-name"\44--container-mounts=${MOUNT_DIR}:${MOUNT_DEST}\45--container-workdir=${WORKDIR}\46--export=ALL\47--mpi=pmix\48bash-c"49$PROLOGUE50        python3$prepare_dataset \51            --tokenizer=$LOCAL_MODEL \52            --stdout token-norm-dist \53            
--num-requests=100 \54            --input-mean=128 \55            --output-mean=128 \56            --input-stdev=0 \57            --output-stdev=0 >$data_path58    "5960echo"Running benchmark..."61# Just launch trtllm-bench job with trtllm-llmapi-launch command.6263srun-l\64--container-image=${CONTAINER_IMAGE}\65--container-mounts=${MOUNT_DIR}:${MOUNT_DEST}\66--container-workdir=${WORKDIR}\67--export=ALL,PYTHONPATH=${SOURCE_ROOT}\68--mpi=pmix\69bash-c"70        set -ex71$PROLOGUE72        export PATH=$PATH:~/.local/bin7374        # This is optional75        cat > /tmp/pytorch_extra_args.txt << EOF76cuda_graph_config: null77print_iter_log: true78enable_attention_dp: false79EOF8081        # launch the benchmark82        trtllm-llmapi-launch \83         trtllm-bench \84            --model$MODEL_NAME \85            --model_path$LOCAL_MODEL \86            throughput \87            --dataset$data_path \88            --backend pytorch \89            --tp 16 \90            --extra_llm_api_options /tmp/pytorch_extra_args.txt \91$EXTRA_ARGS92    "