Orbax Loading and Sharding Support feature #21903
Open
amitsrivastava78 wants to merge 18 commits into keras-team:master from amitsrivastava78:orbax-pending-features
+1,207 −286
Commits (18), all by amitsrivastava78:

bc9060c  Added Load method for orbax
43d45d0  Added Sharding Support
4125ae0  Fix memory corruption in Model.load() by simplifying checkpoint loading
77689d9  Fix bare except clauses in orbax_checkpoint_test.py
ece275d  Refactor duplicated tensor-to-numpy conversion code
0464c3b  Multi-host feature support
d8a86e8  Fixed CI failure
43fbecd  Implement JAX-only multi-host checkpointing with proper Orbax APIs
82b345f  Re-run CI
9e35729  Re-run CI
5a5c810  Fixed review comments
b503b79  Fixed review comments
e45c186  Fixed review comments
f027ba9  Refactor Orbax integration to use LazyModule consistently
802d785  Fix Orbax checkpoint preservation policy and asset functionality
f98e27c  Fix Orbax checkpoint preservation policy and asset functionality
5ff095a  Integrate assets into Orbax pytree as checkpointables
1f28e74  Implement asset loading support in model.load_weights for Orbax check…
140 changes: 113 additions & 27 deletions in keras/src/callbacks/orbax_checkpoint.py
@@ -8,7 +8,6 @@
from keras.src.callbacks.monitor_callback import (
    MonitorCallback,  # For metric monitoring logic
)
from keras.src.utils.module_utils import ocp

# Context and AsyncOptions are accessed through the lazy-loaded ocp module
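The import above goes through Keras's lazy module utilities (see the `LazyModule` refactor commit f027ba9). As a rough illustration only, and not the actual `module_utils` implementation, a lazy importer can be sketched like this:

```python
import importlib


class LazyModule:
    """Illustrative stand-in: import the real module on first attribute access."""

    def __init__(self, name):
        self._name = name
        self._module = None

    def __getattr__(self, attr):
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, attr)


# Hypothetical equivalent of `from keras.src.utils.module_utils import ocp`;
# nothing is imported until an attribute such as `ocp.multihost` is touched.
ocp = LazyModule("orbax.checkpoint")
```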
@@ -62,6 +61,11 @@ class OrbaxCheckpoint(MonitorCallback):
    This callback saves the model's weights and optimizer state asynchronously
    using Orbax, allowing training to continue without blocking for I/O.

    **Multi-host Support**: When running in a multi-host distributed training
    environment with JAX backend, this callback automatically coordinates
    checkpointing across all hosts to ensure consistency and proper
    synchronization. Multi-host checkpointing is only supported on JAX.

    Example:

    ```python
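The docstring's example is cut off at this hunk boundary. A hedged usage sketch of the callback it describes follows; the import path and the constructor arguments (`directory`, `save_freq`, `max_to_keep`, `save_weights_only`) are inferred from elsewhere in this diff and may not match the final API:

```python
import numpy as np
import keras

model = keras.Sequential([keras.Input(shape=(4,)), keras.layers.Dense(1)])
model.compile(optimizer="adam", loss="mse")

# Assumed constructor arguments; see the note above.
checkpoint_cb = keras.callbacks.OrbaxCheckpoint(
    directory="/tmp/orbax_ckpts",  # checkpoint root, created by Orbax as needed
    save_freq="epoch",             # or an integer number of batches
    max_to_keep=3,                 # wired into the LatestN preservation policy
    save_weights_only=False,       # also store model_config and layer assets
)

x, y = np.random.rand(32, 4), np.random.rand(32, 1)
model.fit(x, y, epochs=2, callbacks=[checkpoint_cb])
```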
@@ -138,6 +142,9 @@ def __init__(
        self._current_epoch = 0  # Keep track of epoch
        self._total_batches_seen = 0  # Global batch counter for step tracking

        # Multi-host support
        self._multihost_initialized = self._is_multihost_initialized()

        if self.save_freq != "epoch" and not isinstance(self.save_freq, int):
            raise ValueError(
                f"Unrecognized save_freq: {self.save_freq}. "
@@ -151,14 +158,18 @@ def __init__(
                ocp.training.preservation_policies.LatestN(max_to_keep)
            )

        # Use AnyPreservationPolicy to combine them, or use directly
        # if single policy
        preservation_policy = None
        if policies:
            if len(policies) == 1:
                preservation_policy = policies[0]
            else:
                preservation_policy = (
                    ocp.training.preservation_policies.AnyPreservationPolicy(
                        policies
                    )
                )

        # Create the V1 Checkpointer with direct parameter passing
        # Orbax will handle directory creation on all processes as needed
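The branch above reduces to a small helper; a minimal sketch, assuming `ocp` resolves to the same lazily loaded Orbax module the callback uses and that `policies` is a list of preservation-policy objects (e.g. `LatestN`):

```python
def combine_preservation_policies(policies):
    """Return a single policy, combining several with AnyPreservationPolicy."""
    if not policies:
        return None  # let the checkpointer fall back to its default behavior
    if len(policies) == 1:
        return policies[0]
    return ocp.training.preservation_policies.AnyPreservationPolicy(policies)
```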
@@ -167,6 +178,54 @@
            preservation_policy=preservation_policy,
        )

    def _is_multihost_initialized(self):
        """Check if multi-host environment is initialized."""
        # Multi-host checkpointing is only supported on JAX backend
        if backend.backend() != "jax":
            return False

        multihost = ocp.multihost
        # Check if JAX distributed client is initialized
        # (indicates multihost setup)
        return multihost.is_jax_distributed_client_initialized()

    def _sync_processes(self, key=None):
        """Synchronize all processes across hosts."""
        if not self._multihost_initialized:
            return  # No-op for single host

        multihost = ocp.multihost
        sync_key = key or "orbax_checkpoint_sync"
        multihost.sync_global_processes(sync_key)

    def is_multihost_enabled(self):
        """Return True if multi-host checkpointing is enabled and initialized.

        This method can be used to check if the callback is operating in
        a multi-host distributed training environment. Multi-host
        checkpointing is only supported on JAX backend.

        Returns:
            bool: True if multi-host support is active, False otherwise.
        """
        return self._multihost_initialized

    def is_primary_host(self):
        """Return True if this process is the primary host in multi-host setup.

        In multi-host environments, only the primary host typically handles
        logging and coordination tasks. Multi-host checkpointing is only
        supported on JAX backend.

        Returns:
            bool: True if this is the primary host, False otherwise.
            Always returns True in single-host environments.
        """
        if not self._multihost_initialized:
            return True  # Single host is always primary

        multihost = ocp.multihost
        return multihost.is_primary_host()

    def _should_save_on_batch(self, batch):
        """Check if we should save on this batch."""
        if self.save_freq == "epoch":
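A hedged sketch of how user code might use the two public helpers defined above, assuming `checkpoint_cb` is an `OrbaxCheckpoint` instance like the one constructed earlier:

```python
if checkpoint_cb.is_multihost_enabled():
    # Multi-host JAX run: keep logging and other side effects on the primary host.
    if checkpoint_cb.is_primary_host():
        print("Primary host: coordinating checkpoint saves across hosts.")
else:
    # Single-host run: is_primary_host() always returns True here.
    print("Single-host run: no cross-host synchronization needed.")
```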
@@ -186,7 +245,7 @@ def _should_save_on_batch(self, batch):
        return False

    def _save_checkpoint(self, step, logs=None):
        """Save a checkpoint at the given step with multi-host coordination."""
        # --- Prepare Composite State (Backend-Agnostic) ---
        state_tree = _get_state_tree(self.model)
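For orientation, here is a hedged sketch of the shape the composite state takes before it is handed to Orbax; only the keys visible in this diff are shown, and the nesting produced by `_get_state_tree` is an assumption:

```python
composite_state = {
    "model_config": {},             # only when save_weights_only is False
    "trainable_variables": {},      # nested per-layer weight arrays
    "non_trainable_variables": {},  # e.g. BatchNorm moving statistics
    "assets": {                     # per-layer asset_data, bytes stored as base64 text
        "string_lookup": {"vocabulary": "aGVsbG8gd29ybGQ="},
    },
}
```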
@@ -201,17 +260,46 @@ def _save_checkpoint(self, step, logs=None):
            composite_state["non_trainable_variables"] = state_tree[
                "non_trainable_variables"
            ]

            # Include assets even for weights-only checkpoints
            assets_tree = {}
            for layer in self.model.layers:
                if hasattr(layer, "asset_data"):
                    # Convert TrackedDict to dict, handle bytes as base64
                    asset_dict = {}
                    for key, value in layer.asset_data.items():
                        if isinstance(value, bytes):
                            import base64

                            asset_dict[key] = base64.b64encode(value).decode(
                                "ascii"
                            )
                        else:
                            asset_dict[key] = value
                    assets_tree[layer.name] = asset_dict
            composite_state["assets"] = assets_tree
        else:
            composite_state = {
                "model_config": self.model.get_config(),
                **state_tree,
            }

            # Include assets as part of the tree
            if not self.save_weights_only:
                assets_tree = {}
                for layer in self.model.layers:
                    if hasattr(layer, "asset_data"):
                        # Convert TrackedDict to dict, handle bytes as base64
                        asset_dict = {}
                        for key, value in layer.asset_data.items():
                            if isinstance(value, bytes):
                                import base64

                                asset_dict[key] = base64.b64encode(
                                    value
                                ).decode("ascii")
                            else:
                                asset_dict[key] = value
                        assets_tree[layer.name] = asset_dict
                composite_state["assets"] = assets_tree

        # Use a single with statement. If context_options is empty,
        # Context() uses defaults.
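The asset handling above stores raw bytes as base64 text so they fit in the checkpoint tree alongside other serializable values; a self-contained round-trip sketch of that encoding (the load path decodes the same way):

```python
import base64

raw = b"\x00\x01 binary vocab blob"                  # e.g. a layer's asset bytes
stored = base64.b64encode(raw).decode("ascii")        # what lands in assets_tree
restored = base64.b64decode(stored.encode("ascii"))   # what loading recovers
assert restored == raw
```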
@@ -282,18 +370,16 @@ def on_train_end(self, logs=None):
        except Exception:
            pass  # Ignore errors during cleanup

        # Multi-host synchronization: ensure all hosts complete cleanup
        self._sync_processes("checkpoint_cleanup")

    def wait_until_finished(self):
        """Wait for any in-progress checkpoint operations to complete.

        This method blocks until all asynchronous checkpoint save operations
        have completed across all hosts in a multi-host setup.
        """
        # Wait for any async operations to complete on this host
        self.checkpointer.wait()

        # Multi-host synchronization: ensure all hosts complete
        self._sync_processes("checkpoint_wait_complete")
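Since saves run asynchronously, callers are expected to flush pending writes before reading checkpoints back or shutting down; a hedged usage sketch reusing the `checkpoint_cb`, `x`, and `y` from the earlier example:

```python
model.fit(x, y, epochs=2, callbacks=[checkpoint_cb])
checkpoint_cb.wait_until_finished()  # blocks until every host has finished saving
```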