Start agent traces #414

Draft

aymeric-roucher wants to merge 88 commits into main from agent-traces

Changes from 1 commit (88 commits in total)

Commits:
352008b  Start agent traces (aymeric-roucher, Feb 24, 2025)
6c231d2  Working local version with o1 (aymeric-roucher, Feb 25, 2025)
69b2651  Update api addr (aymeric-roucher, Feb 26, 2025)
ad948c2  Increase concurrent requests (aymeric-roucher, Feb 26, 2025)
a00f0ee  Update sbatch params (aymeric-roucher, Feb 26, 2025)
143fcfa  Add conda activation (aymeric-roucher, Feb 26, 2025)
0af9e75  Use local model (aymeric-roucher, Feb 26, 2025)
6cffffe  128 concurrent (aymeric-roucher, Feb 26, 2025)
cf13c2b  Log (aymeric-roucher, Feb 26, 2025)
cffa362  Add conda init (aymeric-roucher, Feb 26, 2025)
e35800c  Fix slurm script (aymeric-roucher, Feb 26, 2025)
b47a4be  Add await (aymeric-roucher, Feb 26, 2025)
0cd0999  Try fixing async func (aymeric-roucher, Feb 26, 2025)
dd15ad8  Add stop sequences (aymeric-roucher, Feb 26, 2025)
d2588cd  Add port (aymeric-roucher, Feb 27, 2025)
b738e58  Make synchronous (aymeric-roucher, Feb 28, 2025)
f78b865  Small adapts to script (aymeric-roucher, Feb 28, 2025)
cb2a2c2  More detailed error logging (aymeric-roucher, Feb 28, 2025)
9a2d16f  Even more detailed request error logging (aymeric-roucher, Feb 28, 2025)
2a1ff76  Reduce context length (aymeric-roucher, Feb 28, 2025)
a97eb27  Add token counting (aymeric-roucher, Feb 28, 2025)
d8cb19b  Fix message roles an add token counting (aymeric-roucher, Feb 28, 2025)
e42b1cd  Add dummy completion (aymeric-roucher, Feb 28, 2025)
83a679f  Test (aymeric-roucher, Feb 28, 2025)
d87e3f3  Running with gpt-4o (aymeric-roucher, Feb 28, 2025)
8e70ca4  Update timeouts (aymeric-roucher, Feb 28, 2025)
2876d52  Adjust (aymeric-roucher, Feb 28, 2025)
cf52433  Flatten messages (aymeric-roucher, Feb 28, 2025)
a07cd54  Prompt more around testing the function (aymeric-roucher, Feb 28, 2025)
ddc1cdd  Improve explanations in prompt (aymeric-roucher, Feb 28, 2025)
4c2fce6  Also store final outputs (aymeric-roucher, Mar 13, 2025)
4a20ba4  Try Qwen Coder 32B (aymeric-roucher, Apr 2, 2025)
6961c36  Remove some dependencies to work on mac (aymeric-roucher, Apr 3, 2025)
2b1bc05  Merge branch 'main' into agent-traces (aymeric-roucher, Apr 3, 2025)
38efcfc  Working trace generation with auto verification by running test cases (aymeric-roucher, Apr 3, 2025)
b7522e3  Add training scripts for agents (aymeric-roucher, Apr 3, 2025)
2ddf70e  Change job name (aymeric-roucher, Apr 3, 2025)
49083cc  Intervert sft training configs (aymeric-roucher, Apr 3, 2025)
de2b792  Point to proper config file (aymeric-roucher, Apr 3, 2025)
5647c26  Add distributed type (aymeric-roucher, Apr 3, 2025)
8a7951c  Revert to zero3 config (aymeric-roucher, Apr 3, 2025)
d28d07b  Remove deepspeed config (aymeric-roucher, Apr 4, 2025)
cae3c7c  Update train slurm (aymeric-roucher, Apr 4, 2025)
2a08444  Switch to new venv (aymeric-roucher, Apr 8, 2025)
1eaf1d1  Move script to proper file (aymeric-roucher, Apr 8, 2025)
2043be9  Change job name (aymeric-roucher, Apr 8, 2025)
2030e16  Increase epochs (aymeric-roucher, Apr 8, 2025)
08a449c  Update dataset name (aymeric-roucher, Apr 9, 2025)
60472f6  Increase epochs (aymeric-roucher, Apr 9, 2025)
9347590  adding qwen 3b training setup (Apr 15, 2025)
a66a5e6  Merge branch 'main' into agent-traces (aymeric-roucher, Jun 23, 2025)
a9b5411  Add aguvis download script (aymeric-roucher, Jun 23, 2025)
80f7ce8  Improve collection script (aymeric-roucher, Jun 23, 2025)
984d631  Add Readme for agents (aymeric-roucher, Jun 23, 2025)
a675552  Fix env variables (aymeric-roucher, Jun 23, 2025)
fbd987c  Remove weka (aymeric-roucher, Jun 23, 2025)
3aee6ef  Modify train slurm (aymeric-roucher, Jun 23, 2025)
7cb592c  Remove parsing (aymeric-roucher, Jun 23, 2025)
b7a700e  Revert training script to the good old time when it worked (aymeric-roucher, Jun 23, 2025)
3b77977  Revert to new shitty script (aymeric-roucher, Jun 23, 2025)
81c64ac  Change weka path (aymeric-roucher, Jun 23, 2025)
eb39096  Try edit (aymeric-roucher, Jun 23, 2025)
0ee52fc  Fix env (aymeric-roucher, Jun 23, 2025)
c4d4126  Working SFT for text model (aymeric-roucher, Jun 24, 2025)
3c3e954  Start adapting script for VLM training (aymeric-roucher, Jun 24, 2025)
a452f2f  Impreove data collection script (aymeric-roucher, Jun 25, 2025)
5fa7e51  Deactivate multinodes (aymeric-roucher, Jun 26, 2025)
933ea92  Merge branch 'agent-traces' of github.com:huggingface/open-r1 into ag… (aymeric-roucher, Jun 26, 2025)
a658db9  Fix sft collate function for vlms (aymeric-roucher, Jun 26, 2025)
24ea112  Fix collate fn in sft.py (aymeric-roucher, Jun 26, 2025)
db30467  Working VLM training 🥳 (aymeric-roucher, Jun 26, 2025)
5eadb06  Add single-GPU training script (aymeric-roucher, Jun 26, 2025)
b316210  Add second dataset in mix (aymeric-roucher, Jun 26, 2025)
2ba1c65  Add aguvis conversion script (aymeric-roucher, Jul 1, 2025)
f6b8f7c  Conversion script (aymeric-roucher, Jul 1, 2025)
22b84cf  Merge branch 'agent-traces' of github.com:huggingface/open-r1 into ag… (aymeric-roucher, Jul 1, 2025)
035f134  Integrate aguvis conversion to smolagents (aymeric-roucher, Jul 1, 2025)
f692c10  Try catch wrap for processing (aymeric-roucher, Jul 1, 2025)
b0d794c  override existing split (aymeric-roucher, Jul 1, 2025)
31cf3a2  Nit script args (aymeric-roucher, Jul 1, 2025)
029dc60  Update train instructions (aymeric-roucher, Jul 3, 2025)
880a585  Merge branch 'agent-traces' of github.com:huggingface/open-r1 into ag… (aymeric-roucher, Jul 3, 2025)
1b50860  Remove merge artifact (aymeric-roucher, Jul 3, 2025)
868d4a4  Small fixes in recipe (aymeric-roucher, Jul 9, 2025)
4c83688  Modify aguvis conversion script (aymeric-roucher, Jul 9, 2025)
6a63f2f  Unify conversion in only one script (aymeric-roucher, Jul 9, 2025)
e8a4c2b  Update imports (aymeric-roucher, Jul 9, 2025)
18fea48  Fix script (aymeric-roucher, Jul 10, 2025)

Start adapting script for VLM training
aymeric-roucher committed Jun 24, 2025
commit 3c3e9545781eb5a556011221c94fce4374829c70

README_AGENTS.md: 14 changes (13 additions, 1 deletion)

@@ -3,4 +3,16 @@ Launch:
 sbatch --nodes=1 slurm/train.slurm --model SmolLM2-1.7B-Instruct --task sft --config agent --accelerator zero3
 ```
 Refers to the config recipes/SmolLM2-1.7B-Instruct/sft/config_agent.yaml
-zero3 is one of the accelerate configs in recipes/accelerate_configs
+zero3 is one of the accelerate configs in recipes/accelerate_configs
+
+
+
+Launch VLM training:
+```bash
+sbatch --nodes=1 slurm/train.slurm --model Qwen2.5-VL-3B-Instruct --task sft --config agent --accelerator zero3
+```
+
+Simple mode (DDP, no DeepSpeed):
+```bash
+sbatch --nodes=1 slurm/train.slurm --model Qwen2.5-VL-3B-Instruct --task sft --config agent --accelerator ddp
+```

logs/.gitkeep: empty file removed

recipes/Qwen2.5-VL-3B-Instruct/sft/config_agent.yaml: 15 changes (14 additions, 1 deletion)

@@ -1,6 +1,7 @@
 # Model arguments
 # You can download the model and manually change the rope to 300k/500k and max_position_embeddings to 32768
 model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct
+vision_model: true
 model_revision: main
 torch_dtype: bfloat16
 attn_implementation: sdpa
@@ -42,4 +43,16 @@ report_to:
 save_strategy: "steps"
 save_steps: 500
 save_total_limit: 1
-seed: 42
+seed: 42
+
+dataset_mixture:
+  datasets: # List of datasets to include in the mixture
+    - id: smolagents/aguvis-stage-2 # Hub dataset ID
+      config: mind2web # Name of the dataset config
+      split: train # Split to use from the dataset
+      columns: # Columns to keep
+        - images
+        - texts
+      weight: 1. # Fraction of dataset to use
+  seed: 42 # Seed for shuffling the combined dataset
+  test_split_size: 0.1
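
For orientation, here is a hedged sketch (not part of this PR; the loader layout is inferred from the YAML comments above) of how such a weighted mixture could be materialized with the `datasets` library:

```python
# Illustrative only: mirrors the dataset_mixture spec above. `weight` is read as
# the fraction of each dataset to keep, per the YAML comment.
from datasets import concatenate_datasets, load_dataset

mixture = [
    {"id": "smolagents/aguvis-stage-2", "config": "mind2web", "split": "train",
     "columns": ["images", "texts"], "weight": 1.0},
]

parts = []
for spec in mixture:
    ds = load_dataset(spec["id"], spec["config"], split=spec["split"])
    ds = ds.select_columns(spec["columns"])
    ds = ds.select(range(int(spec["weight"] * len(ds))))  # keep the requested fraction
    parts.append(ds)

mixed = concatenate_datasets(parts).shuffle(seed=42)
splits = mixed.train_test_split(test_size=0.1, seed=42)  # test_split_size: 0.1
train_ds, eval_ds = splits["train"], splits["test"]
```
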
recipes/SmolLM2-1.7B-Instruct/sft/config_agent.yaml: 2 changes (1 addition, 1 deletion)

@@ -22,7 +22,7 @@ per_device_train_batch_size: 4 # Change this depending on the context length of
 
 # SFT trainer config
 max_steps: -1
-num_train_epochs: 6
+num_train_epochs: 1
 bf16: true
 do_eval: false
 eval_strategy: 'no'

slurm/train.slurm: 12 changes (4 additions, 8 deletions)

@@ -23,8 +23,6 @@ if [[ "$*" == *"--help"* ]]; then
     exit 0
 fi
 
-HF_DATASETS_CACHE="/fsx/aymeric/.cache/datasets"
-TRANSFORMERS_CACHE="/fsx/aymeric/.cache/transformers"
 
 # Specific configuration optimized for the Hugging Face Compute Cluster
 module load cuda/12.4
@@ -88,10 +86,8 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
-export HF_DATASETS_CACHE="/fsx/aymeric/.cache/datasets"
-export TRANSFORMERS_CACHE="/fsx/aymeric/.cache/transformers"
+HF_DATASETS_CACHE="/fsx/aymeric/.cache/datasets"
+TRANSFORMERS_CACHE="/fsx/aymeric/.cache/transformers"
-export HF_HOME="/fsx/aymeric/.cache/"
-HF_HOME="/fsx/aymeric/.cache/"
 
 # Validate required arguments
 if [[ -z "$MODEL" || -z "$TASK" || -z "$CONFIG_SUFFIX" || -z "$ACCELERATOR" ]]; then
@@ -143,8 +139,8 @@ if [[ "$USE_VLLM" == "true" ]]; then
 fi
 
 # force crashing on nccl issues like hanging broadcast
-export NCCL_ASYNC_ERROR_HANDLING=1
-#export NCCL_DEBUG=INFO
+export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
+export NCCL_DEBUG=INFO
 # export NCCL_DEBUG_SUBSYS=COLL
 # export NCCL_SOCKET_NTHREADS=1
 # export NCCL_NSOCKS_PERTHREAD=1
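
One note on the swap above: `NCCL_ASYNC_ERROR_HANDLING` is the older PyTorch spelling of this knob and `TORCH_NCCL_ASYNC_ERROR_HANDLING` the current one, so a launcher that must survive mixed torch versions can set both. A minimal sketch:

```python
# Hedged sketch: set both spellings so old and new torch builds crash loudly
# on hung NCCL collectives instead of deadlocking.
import os

os.environ.setdefault("TORCH_NCCL_ASYNC_ERROR_HANDLING", "1")  # current name
os.environ.setdefault("NCCL_ASYNC_ERROR_HANDLING", "1")        # legacy alias
```
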

src/open_r1/configs.py: 4 changes (4 additions, 0 deletions)

@@ -185,6 +185,10 @@ class SFTConfig(trl.SFTConfig):
         default=None,
         metadata={"help": "The optional system prompt to use for benchmarking."},
     )
+    vision_model: bool = field(
+        default=False,
+        metadata={"help": "Whether this is a vision-language model training."},
+    )
     hub_model_revision: Optional[str] = field(
         default="main",
         metadata={"help": "The Hub model branch to push the model to."},
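
Since the flag is a regular `SFTConfig` field, it should parse from a recipe YAML or the CLI like any other argument. A hedged sketch of the usual entry point (assuming open-r1's standard TrlParser flow):

```python
# Illustrative: `vision_model: true` in the recipe (or --vision_model true on
# the CLI) lands on training_args and drives the VLM code paths.
from trl import ModelConfig, TrlParser

from open_r1.configs import ScriptArguments, SFTConfig

parser = TrlParser((ScriptArguments, SFTConfig, ModelConfig))
script_args, training_args, model_args = parser.parse_args_and_config()
print(training_args.vision_model)  # False unless the recipe or CLI sets it
```
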

src/open_r1/sft.py: 83 changes (69 additions, 14 deletions)

@@ -13,7 +13,7 @@
 # limitations under the License.
 
 """
-Supervised fine-tuning script for decoder language models.
+Supervised fine-tuning script for decoder language models and vision-language models.
 
 Usage:
 
@@ -39,18 +39,48 @@
 
 import datasets
 import transformers
-from transformers import set_seed
+from transformers import set_seed, AutoModelForVision2Seq, AutoProcessor, LlavaForConditionalGeneration
 from transformers.trainer_utils import get_last_checkpoint
 from trl import ModelConfig, SFTTrainer, TrlParser, get_peft_config, setup_chat_format
 
 from open_r1.configs import ScriptArguments, SFTConfig
-from open_r1.utils import get_dataset, get_model, get_tokenizer
+from open_r1.utils import get_dataset, get_model, get_tokenizer, get_processor
 from open_r1.utils.callbacks import get_callbacks
 from open_r1.utils.wandb_logging import init_wandb_training
 
 logger = logging.getLogger(__name__)
 
 
+def create_vlm_collate_fn(processor):
+    """Create a data collator for VLM training that handles images and text."""
+
+    def collate_fn(examples):
+        # Get the texts and images, and apply the chat template
+        texts = [processor.apply_chat_template(example["messages"], tokenize=False) for example in examples]
+        images = [example["images"] for example in examples]
+
+        # Handle LLaVA 1.5, which doesn't support multiple images (detect via the processor class, since processors carry no `.model` attribute)
+        if "llava" in type(processor).__name__.lower():
+            images = [image[0] if image else None for image in images]
+
+        # Tokenize the texts and process the images
+        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
+
+        # The labels are the input_ids, and we mask the padding tokens in the loss computation
+        labels = batch["input_ids"].clone()
+        labels[labels == processor.tokenizer.pad_token_id] = -100
+
+        # Ignore the image token index in the loss computation (model specific)
+        if hasattr(processor, "image_token"):
+            image_token_id = processor.tokenizer.convert_tokens_to_ids(processor.image_token)
+            labels[labels == image_token_id] = -100
+
+        batch["labels"] = labels
+        return batch
+
+    return collate_fn
+
+
 def main(script_args, training_args, model_args):
     set_seed(training_args.seed)
 
@@ -84,29 +114,54 @@ def main(script_args, training_args, model_args):
     init_wandb_training(training_args)
 
     ######################################
-    # Load dataset, tokenizer, and model #
+    # Load dataset, processor/tokenizer, and model #
     ######################################
     dataset = get_dataset(script_args)
-    tokenizer = get_tokenizer(model_args, training_args)
-    model = get_model(model_args, training_args)
 
-    if tokenizer.chat_template is None:
-        logger.info("No chat template provided, defaulting to ChatML.")
-        model, tokenizer = setup_chat_format(model, tokenizer, format="chatml")
+    if training_args.vision_model:
+        logger.info("Setting up vision-language model training")
+
+        # Set VLM-specific training arguments (following TRL reference)
+        training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)
+        training_args.remove_unused_columns = False
+        training_args.dataset_kwargs = {"skip_prepare_dataset": True}
+
+        # Load processor and model for VLM
+        processor = get_processor(model_args, training_args)
+        model = get_model(model_args, training_args)  # This should return AutoModelForVision2Seq
+        data_collator = create_vlm_collate_fn(processor)
+        processing_class = processor.tokenizer
+        model_tags = ["open-r1", "vision-language", "vlm"]
+
+    else:
+        logger.info("Setting up text-only model training")
+
+        # Load tokenizer and model for text-only
+        tokenizer = get_tokenizer(model_args, training_args)
+        model = get_model(model_args, training_args)
+
+        if tokenizer.chat_template is None:
+            logger.info("No chat template provided, defaulting to ChatML.")
+            model, tokenizer = setup_chat_format(model, tokenizer, format="chatml")
+
+        data_collator = None  # Use default
+        processing_class = tokenizer
+        model_tags = ["open-r1"]
 
     ############################
     # Initialize the SFT Trainer
     ############################
     trainer = SFTTrainer(
         model=model,
         args=training_args,
+        data_collator=data_collator,
         train_dataset=dataset[script_args.dataset_train_split],
         eval_dataset=(
             dataset[script_args.dataset_test_split]
             if training_args.eval_strategy != "no"
             else None
         ),
-        processing_class=tokenizer,
+        processing_class=processing_class,
         peft_config=get_peft_config(model_args),
         callbacks=get_callbacks(training_args, model_args),
     )
@@ -131,16 +186,13 @@ def main(script_args, training_args, model_args):
     # Save model and create model card
     ##################################
     logger.info("*** Save model ***")
-    # Align the model's generation config with the tokenizer's eos token
-    # to avoid unbounded generation in the transformers `pipeline()` function
-    trainer.model.generation_config.eos_token_id = tokenizer.eos_token_id
     trainer.save_model(training_args.output_dir)
     logger.info(f"Model saved to {training_args.output_dir}")
 
     # Save everything else on main process
     kwargs = {
         "dataset_name": script_args.dataset_name,
-        "tags": ["open-r1"],
+        "tags": model_tags,
     }
     if trainer.accelerator.is_main_process:
         trainer.create_model_card(**kwargs)
@@ -164,6 +216,9 @@ def main(script_args, training_args, model_args):
     if training_args.push_to_hub:
         logger.info("Pushing to hub...")
         trainer.push_to_hub(**kwargs)
+        # Also push processor for VLM models
+        if training_args.vision_model and trainer.accelerator.is_main_process:
+            processor.push_to_hub(training_args.hub_model_id)
 
 
 if __name__ == "__main__":
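
A hedged smoke test for the new collator (illustrative, not from this PR: it assumes the Qwen2.5-VL processor is downloadable and that examples carry the aguvis-style `messages`/`images` fields):

```python
from PIL import Image
from transformers import AutoProcessor

from open_r1.sft import create_vlm_collate_fn

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
collate_fn = create_vlm_collate_fn(processor)

example = {
    "messages": [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Which button should I click?"}]},
        {"role": "assistant", "content": [{"type": "text", "text": "The Submit button."}]},
    ],
    "images": [Image.new("RGB", (64, 64))],  # dummy screenshot stand-in
}

batch = collate_fn([example])
# input_ids and labels share one padded shape; pad and image tokens are -100 in labels
print(batch["input_ids"].shape, batch["labels"].shape)
```
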

src/open_r1/utils/__init__.py: 4 changes (2 additions, 2 deletions)

@@ -1,6 +1,6 @@
 from .data import get_dataset
 from .import_utils import is_e2b_available, is_morph_available
-from .model_utils import get_model, get_tokenizer
+from .model_utils import get_model, get_tokenizer, get_processor
 
 
-__all__ = ["get_tokenizer", "is_e2b_available", "is_morph_available", "get_model", "get_dataset"]
+__all__ = ["get_tokenizer", "get_processor", "is_e2b_available", "is_morph_available", "get_model", "get_dataset"]

src/open_r1/utils/model_utils.py: 39 changes (32 additions, 7 deletions)

@@ -1,5 +1,5 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer, AutoProcessor, AutoModelForVision2Seq
 
 from trl import ModelConfig, get_kbit_device_map, get_quantization_config
 
@@ -20,8 +20,22 @@ def get_tokenizer(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig
     return tokenizer
 
 
-def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoModelForCausalLM:
-    """Get the model"""
+def get_processor(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoProcessor:
+    """Get the processor for VLM models."""
+    processor = AutoProcessor.from_pretrained(
+        model_args.model_name_or_path,
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+    )
+
+    if training_args.chat_template is not None:
+        processor.chat_template = training_args.chat_template
+
+    return processor
+
+
+def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoModelForCausalLM | AutoModelForVision2Seq:
+    """Get the model - supports both text-only and vision-language models"""
     torch_dtype = (
         model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
     )
@@ -35,8 +49,19 @@ def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) ->
         device_map=get_kbit_device_map() if quantization_config is not None else None,
         quantization_config=quantization_config,
     )
-    model = AutoModelForCausalLM.from_pretrained(
-        model_args.model_name_or_path,
-        **model_kwargs,
-    )
+
+    # Check if this is a VLM model using the explicit flag
+    if hasattr(training_args, "vision_model") and training_args.vision_model:
+        # Load as vision-language model
+        model = AutoModelForVision2Seq.from_pretrained(
+            model_args.model_name_or_path,
+            **model_kwargs,
+        )
+    else:
+        # Load as text-only model
+        model = AutoModelForCausalLM.from_pretrained(
+            model_args.model_name_or_path,
+            **model_kwargs,
+        )
 
     return model
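
Taken together, the one flag now selects both halves of the loading path. A condensed sketch of the dispatch (the helper below is illustrative, not a function from this PR):

```python
# Hedged sketch: the flag picks the model class and the matching processing class.
from transformers import (
    AutoModelForCausalLM,
    AutoModelForVision2Seq,
    AutoProcessor,
    AutoTokenizer,
)

def load_backbone(name: str, vision_model: bool):
    """Illustrative helper mirroring the get_model/get_processor vs get_tokenizer pairing."""
    if vision_model:
        return AutoModelForVision2Seq.from_pretrained(name), AutoProcessor.from_pretrained(name)
    return AutoModelForCausalLM.from_pretrained(name), AutoTokenizer.from_pretrained(name)

model, processing = load_backbone("Qwen/Qwen2.5-VL-3B-Instruct", vision_model=True)
```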