[WIP] R1-Zero-like experiments #569


Draft
lewtun wants to merge 51 commits into main from r1-zero

Changes from 1 commit (Revert slurm). All 51 commits:
b5e6f9c  Add R1 Zero 7B (lewtun, Mar 29, 2025)
8a4af61  Fix chat template (lewtun, Mar 29, 2025)
9e0e478  Add new difficulty levels (lewtun, Mar 29, 2025)
b35213c  Add medium, hard, ultra hard recipes (lewtun, Mar 31, 2025)
1d6c0bb  Fix accuracy rewards (lewtun, Mar 31, 2025)
5747cfc  Return None for invalid samples (lewtun, Mar 31, 2025)
1078b73  Fix order of inputs (lewtun, Apr 1, 2025)
d9c8cd8  Use None for unferified (lewtun, Apr 1, 2025)
8f26046  Merge branch 'main' into r1-zero (lewtun, Apr 1, 2025)
5fe41f0  Pin trl (lewtun, Apr 1, 2025)
f22657b  Set defaults (lewtun, Apr 1, 2025)
82a1167  Log unique only (lewtun, Apr 1, 2025)
2897519  Revert config (lewtun, Apr 1, 2025)
d51de45  Use proper dataset (lewtun, Apr 2, 2025)
f1832c5  Pin TRL (lewtun, Apr 3, 2025)
995beb8  Clean up (lewtun, Apr 4, 2025)
1d7d66a  Merge branch 'main' into r1-zero (lewtun, Apr 4, 2025)
10a555b  Add soft format reward (lewtun, Apr 7, 2025)
0f98a5a  Fix soft reward to be really soft (lewtun, Apr 7, 2025)
23b7b69  Merge branch 'main' into r1-zero (lewtun, Apr 8, 2025)
f62e42a  Pin TRL for overlong masking (lewtun, Apr 8, 2025)
939c74c  Fix liger (lewtun, Apr 9, 2025)
9bed487  Add v01 (lewtun, Apr 9, 2025)
b29e672  Add level configs and DAPO (lewtun, Apr 10, 2025)
7a8dead  Fix (lewtun, Apr 11, 2025)
2d74588  Merge branch 'main' into r1-zero (lewtun, Apr 11, 2025)
c1d2352  Add q3 (lewtun, Apr 11, 2025)
8500f41  Parse GAS (lewtun, Apr 12, 2025)
3c312f8  Add hack for lighteval (lewtun, Apr 14, 2025)
b6a73c0  Merge branch 'main' into r1-zero (lewtun, Apr 16, 2025)
a5f3baa  Merge branch 'main' into r1-zero (lewtun, Apr 17, 2025)
f3920f8  Pin TRL (lewtun, Apr 17, 2025)
06bdd50  Merge branch 'main' into r1-zero (lewtun, Apr 17, 2025)
2f0b983  Add 32B recipe (lewtun, Apr 22, 2025)
be72ce6  Fix sharding in Slurm (lewtun, Apr 23, 2025)
0df1654  Tune recipe (lewtun, Apr 23, 2025)
c24ffd7  Fix attempt on Slurm (lewtun, Apr 23, 2025)
2715d31  Hack (lewtun, Apr 23, 2025)
cebaad5  Wait (lewtun, Apr 23, 2025)
2f4b0da  Revert slurm (lewtun, Apr 23, 2025)
f27c732  Fix (lewtun, Apr 23, 2025)
5f0b8f8  Remove hf-transfer in favour of hf-xet (lewtun, Apr 24, 2025)
46c1656  Pin transformers (lewtun, Apr 26, 2025)
2c0cac5  Merge branch 'main' into r1-zero (lewtun, Apr 26, 2025)
8d993d5  add gen batch exp config (edbeeching, May 5, 2025)
a82c1fd  adds weighted code reward (edbeeching, May 7, 2025)
d9a6c08  add latest configs (edbeeching, May 7, 2025)
464d951  Merge branch 'main' into r1-zero (lewtun, May 8, 2025)
b430693  Merge branch 'main' into r1-zero (edbeeching, May 9, 2025)
0ed9ea3  Merge branch 'main' into r1-zero (edbeeching, May 10, 2025)
a401d64  Merge branch 'main' into r1-zero (lewtun, May 25, 2025)
Revert slurm
lewtun committed Apr 23, 2025
commit 2f4b0daba915b13d9278529a46a855134284d484

recipes/OpenR1-Zero-32B-Math/grpo/config_v00.00.yaml (4 changes: 2 additions & 2 deletions)
@@ -24,8 +24,8 @@ use_vllm: true
 do_eval: false
 gradient_accumulation_steps: 16
 gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
+#gradient_checkpointing_kwargs:
+#  use_reentrant: false
 hub_model_id: open-r1/R1-Zero-Qwen-32B-Math
 hub_model_revision: v00.00
 hub_strategy: every_save
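
The train.slurm script further down reads gradient_accumulation_steps straight out of this recipe so the accelerate launcher and the trainer agree on one value. A minimal Bash sketch of that extraction, assuming only the recipe path from this diff (the grep/awk line is the same one train.slurm uses):

CONFIG_FILE=recipes/OpenR1-Zero-32B-Math/grpo/config_v00.00.yaml
# Print the value after the YAML key; yields 16 for this recipe
GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')
echo "Gradient accumulation steps: $GRAD_ACC_STEPS"
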
setup.py (3 changes: 1 addition & 2 deletions)
@@ -67,8 +67,7 @@
     "sentencepiece>=0.1.99",
     "torch==2.6.0",
     "transformers==4.51.2",
-    "trl @ git+https://github.com/huggingface/trl.git@294f35bf3c0043d3ee6b9b5d22385e5736f6ce9e", # Generate once per batch: https://github.com/huggingface/trl/pull/3283
-    "vllm==0.8.3",
+    "trl[vllm] @ git+https://github.com/huggingface/trl.git@294f35bf3c0043d3ee6b9b5d22385e5736f6ce9e", # Generate once per batch: https://github.com/huggingface/trl/pull/3283
     "wandb>=0.19.1",
 ]

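
Since the requirement string is a standard PEP 508 direct reference, the same pin can be reproduced outside setup.py with pip; a sketch (not part of the diff), using the commit hash pinned above:

# Installs TRL at the pinned commit (per-batch generation, TRL PR 3283) together with its vllm extra
pip install "trl[vllm] @ git+https://github.com/huggingface/trl.git@294f35bf3c0043d3ee6b9b5d22385e5736f6ce9e"
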
slurm/train.slurm (69 changes: 32 additions & 37 deletions)
@@ -5,7 +5,7 @@
 #SBATCH --gres=gpu:8
 #SBATCH --partition=hopper-prod # Adjust this for your cluster
 #SBATCH --output=./logs/%x-%j.out
-#SBATCH --err=./logs/%x-%j.err
+#SBATCH --error=./logs/%x-%j.err
 #SBATCH --requeue

 # Specific configuration optimized for the Hugging Face Compute Cluster
@@ -14,30 +14,15 @@ set -x -e

 source ~/.bashrc
 source openr1/bin/activate
 echo "START TIME: $(date)"

 MODEL=$1
 TASK=$2
 CONFIG_SUFFIX=$3
 ACCELERATOR=$4
 OPTIONAL_ARGS=$5
 CONFIG_FILE=recipes/$MODEL/$TASK/config_$CONFIG_SUFFIX.yaml

-# Special parsing to align GAS on accelerate and training configs
 GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')
-
-# Split the string into individual arguments
-IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"
-# Loop through the arguments and find the one with "--gradient_accumulation_steps"
-for arg in "${ARGS[@]}"; do
-    if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
-        # Extract the value after the equals sign
-        GRAD_ACC_STEPS="${arg#*=}"
-        break # Exit the loop once we find the desired argument
-    fi
-done
-
-echo "Gradient accumulation steps: $GRAD_ACC_STEPS"

 MODEL=$(grep 'model_name_or_path:' $CONFIG_FILE | awk '{print $2}')
 REVISION=$(grep 'model_revision:' $CONFIG_FILE | head -n 1 | awk '{print $2}')

@@ -54,17 +39,15 @@ USE_VLLM="false"
 if [[ -f "$CONFIG_FILE" ]] && grep -qE '^\s*use_vllm:\s*true' "$CONFIG_FILE"; then
     USE_VLLM="true"
 fi
-# If using vLLM we need to reserve one node for the vLLM server and retain the rest for training
+# if using vllm
 if [[ "$USE_VLLM" == "true" ]]; then
     TRAIN_NODES=("${NODELIST[@]:0:$((NUM_NODES - 1))}")
     VLLM_NODE=${NODELIST[-1]} # Last node
     echo "Using vLLM server on node: $VLLM_NODE"
-    echo "Training nodes: ${TRAIN_NODES[*]}"
     TP=$(python scripts/get_tensor_parallel_size.py --model_name $MODEL --revision $REVISION --default_tp $GPUS_PER_NODE)
     WORLD_SIZE=$((WORLD_SIZE - GPUS_PER_NODE))
     NUM_NODES=$((NUM_NODES - 1))
-    echo "Reduced WORLD_SIZE: $WORLD_SIZE and NUM_NODES: $NUM_NODES"
     srun --nodes=1 --ntasks=1 --nodelist=$VLLM_NODE trl vllm-serve --model $MODEL --revision $REVISION --tensor_parallel_size $TP &

     OPTIONAL_ARGS="$OPTIONAL_ARGS --vllm_server_host=$VLLM_NODE"
 fi

@@ -76,22 +59,34 @@ export NCCL_ASYNC_ERROR_HANDLING=1
 # export NCCL_NSOCKS_PERTHREAD=1
 # export CUDA_LAUNCH_BLOCKING=1

+export CMD=" \
+    src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS
+    "

-TRAIN_NODES_CSV=$(IFS=,; echo "${TRAIN_NODES[*]}")
+export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
+    --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \
+    --gradient_accumulation_steps $GRAD_ACC_STEPS \
+    --num_machines $NUM_NODES \
+    --num_processes $WORLD_SIZE \
+    --main_process_ip $MASTER_ADDR \
+    --main_process_port $MASTER_PORT \
+    --machine_rank $SLURM_PROCID \
+    --rdzv_backend=c10d \
+    --max_restarts 1 \
+    --tee 3 \
+    "
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+NODELIST=$(IFS=,; echo "${TRAIN_NODES[*]}")

-srun --nodes=$NUM_NODES \
-    --ntasks=$NUM_NODES \
-    --nodelist=$TRAIN_NODES_CSV \
-    accelerate launch \
-    --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \
-    --gradient_accumulation_steps $GRAD_ACC_STEPS \
-    --num_machines $NUM_NODES \
-    --num_processes $WORLD_SIZE \
-    --main_process_ip $MASTER_ADDR \
-    --main_process_port $MASTER_PORT \
-    --machine_rank $SLURM_PROCID \
-    --rdzv_backend=c10d \
-    src/open_r1/$TASK.py --config $CONFIG_FILE $OPTIONAL_ARGS
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    --nodes=$NUM_NODES \
+    --ntasks=$NUM_NODES \
+    --nodelist=$NODELIST
+    "
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER $CMD" 2>&1

+# wait for any background jobs (vLLM) before exiting
+wait
 echo "END TIME: $(date)"
