Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

recompute support offload tensor#10981

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
blacksheep-Aristotle wants to merge 1 commit into PaddlePaddle:dsv3-sft
base:dsv3-sft
Choose a base branch
Loading
from blacksheep-Aristotle:recompute_offload
Open
Show file tree
Hide file tree
Changes fromall commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletionpaddlenlp/transformers/deepseek_v2/configuration.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -179,6 +179,7 @@ def __init__(
attention_dropout=0.0,
speculate_model_type=False,
using_flex_token=False,
decoderlayer_act_offload_settings={},
**kwargs,
):
self.vocab_size = vocab_size
Expand DownExpand Up@@ -227,7 +228,7 @@ def __init__(
self.speculate_model_type = speculate_model_type
self.use_fp8 = False
self.using_flex_token = using_flex_token

self.decoderlayer_act_offload_settings = decoderlayer_act_offload_settings
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
Expand Down
28 changes: 28 additions & 0 deletionspaddlenlp/transformers/deepseek_v2/modeling.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -1660,6 +1660,33 @@ def recompute_training_full(
use_cache: bool,
attn_mask_startend_row_indices: Optional[Tensor] = None,
):
decoderlayer_act_offload_settings = self.config.get(
"decoderlayer_act_offload_settings", {"type": "", "value": ""}
)

setting_type = decoderlayer_act_offload_settings["type"]
offload_value = decoderlayer_act_offload_settings["value"]

def get_offload_kwargs(layer_idx, setting_type, offload_value):
offload_kwargs = {}
if "mod" == setting_type:
assert isinstance(offload_value, (list, tuple))
v1, v2 = offload_value
offload_kwargs["offload_indices"] = [0] if layer_idx % v1 == v2 else []
else:
raise ValueError(
f"decoderlayer_act_offload_settings only support type == 'mod' ,but get type {setting_type}"
)
return offload_kwargs

layer_idx = layer_module.layer_idx
# NOTE: the first layer inputs will be used in mtp, so do not offload it
if layer_idx == 0:
offload_kwargs = {}
else:
offload_kwargs = get_offload_kwargs(layer_idx, setting_type, offload_value)
print("recompute offload ", offload_kwargs)

def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs)
Expand All@@ -1676,6 +1703,7 @@ def custom_forward(*inputs):
use_cache,
attn_mask_startend_row_indices,
use_reentrant=self.config.recompute_use_reentrant,
**offload_kwargs,
)

return hidden_states
Expand Down
39 changes: 37 additions & 2 deletionspaddlenlp/transformers/deepseek_v2/modeling_pp.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -93,6 +93,27 @@ def get_attr(layer, name):
return get_attr(layer._layer, name)


def get_offload_kwargs(layer_idx, decoderlayer_act_offload_settings):
    """Build the extra kwargs controlling activation offload for one decoder layer's recompute.

    Args:
        layer_idx: index of the decoder layer in the model.
        decoderlayer_act_offload_settings: dict with keys ``"type"`` and ``"value"``.
            Only ``type == "mod"`` is supported; ``value`` must then be a
            ``(divisor, remainder)`` pair, and layers where
            ``layer_idx % divisor == remainder`` have their first recompute
            input offloaded. An empty string or ``None`` type disables offload.

    Returns:
        dict: ``{}`` (no offload) or ``{"offload_indices": [...]}`` to be
        forwarded to ``recompute(...)``.

    Raises:
        ValueError: if a non-empty ``type`` other than ``"mod"`` is supplied.
    """
    setting_type = decoderlayer_act_offload_settings["type"]
    offload_value = decoderlayer_act_offload_settings["value"]

    offload_kwargs = {}
    # NOTE: the first layer inputs will be used in mtp, so do not offload it
    if layer_idx == 0:
        return offload_kwargs

    if setting_type == "mod":
        assert isinstance(offload_value, (list, tuple))
        divisor, remainder = offload_value
        offload_kwargs["offload_indices"] = [0] if layer_idx % divisor == remainder else []
    elif setting_type is not None and setting_type != "":
        raise ValueError(
            f"decoderlayer_act_offload_settings only supports type == 'mod', but got type {setting_type}"
        )
    return offload_kwargs


class DeepseekV2EmbeddingPipe(nn.Layer):
def __init__(self, config: DeepseekV2Config):
super(DeepseekV2EmbeddingPipe, self).__init__()
Expand DownExpand Up@@ -132,12 +153,14 @@ def forward(self, args):
attention_mask = attention_mask[
:, :, : -self.config.num_nextn_predict_layers, : -self.config.num_nextn_predict_layers
]

# attn_mask_startend_row_indices: [b, num_head, seq_len] or [b, num_head, seq_len, C], C is 2 or 4
if attn_mask_startend_row_indices is not None:
if attn_mask_startend_row_indices.ndim == 3:
attn_mask_startend_row_indices = attn_mask_startend_row_indices[
:, :, : -self.config.num_nextn_predict_layers,
:,
:,
: -self.config.num_nextn_predict_layers,
]
elif attn_mask_startend_row_indices.ndim == 4:
attn_mask_startend_row_indices = attn_mask_startend_row_indices[
Expand DownExpand Up@@ -222,6 +245,10 @@ def forward(self, args):
attn_mask_startend_row_indices, position_ids = None, attn_mask_startend_row_indices

if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
decoderlayer_act_offload_settings = self.config.get(
"decoderlayer_act_offload_settings", {"type": "", "value": ""}
)
offload_kwargs = get_offload_kwargs(self.layer_idx, decoderlayer_act_offload_settings)
if attention_mask is not None or attn_mask_startend_row_indices is not None:
hidden_states = recompute(
super().forward,
Expand All@@ -230,6 +257,7 @@ def forward(self, args):
attention_mask=attention_mask,
attn_mask_startend_row_indices=attn_mask_startend_row_indices,
use_reentrant=False,
**offload_kwargs,
)
else:
# for pretrain
Expand All@@ -239,6 +267,7 @@ def forward(self, args):
position_ids=position_ids,
attn_mask_startend_row_indices=attn_mask_startend_row_indices,
use_reentrant=self.config.recompute_use_reentrant,
**offload_kwargs,
)
else:
hidden_states = super().forward(
Expand DownExpand Up@@ -279,6 +308,10 @@ def forward(self, args):
for depth in range(self.config.num_nextn_predict_layers):
inputs_embeds_cur_depth = inputs_embeds_cur_depth_list[depth]
if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
decoderlayer_act_offload_settings = self.config.get(
"decoderlayer_act_offload_settings", {"type": "", "value": ""}
)
offload_kwargs = get_offload_kwargs(self.layer_idx, decoderlayer_act_offload_settings)
if attention_mask is not None or attn_mask_startend_row_indices is not None:
hidden_states = recompute(
super().forward,
Expand All@@ -288,6 +321,7 @@ def forward(self, args):
attention_mask=attention_mask,
attn_mask_startend_row_indices=attn_mask_startend_row_indices,
use_reentrant=False,
**offload_kwargs,
)
else:
# for pretrain
Expand All@@ -298,6 +332,7 @@ def forward(self, args):
position_ids=position_ids,
attn_mask_startend_row_indices=attn_mask_startend_row_indices,
use_reentrant=self.config.recompute_use_reentrant,
**offload_kwargs,
)
else:
hidden_states = super().forward(
Expand Down
Loading

[8]ページ先頭

©2009-2026 Movatter.jp