InternLM
diff --git a/‎xtuner/v1/data_proto/utils.py‎
Lines changed: 86 additions & 0 deletions b/‎xtuner/v1/data_proto/utils.py‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎xtuner/v1/rl/base/loss.py‎
Lines changed: 24 additions & 2 deletions b/‎xtuner/v1/rl/base/loss.py‎
Lines changed: 24 additions & 2 deletions
@@ -110,3 +110,89 @@ def gather_for_sequence_parallel(input, dim: int, sp_group: dist.ProcessGroup):
     output = torch.cat(tensor_list, dim=dim).contiguous()
 
     return output
+
+
+def convert_padded_to_packed(
+    input: torch.Tensor, num_tokens: torch.Tensor | list, padding_side: str = "right"
+) -> torch.Tensor:
+    """Convert a padded tensor (B, L, ...) to a packed tensor (1,
+    sum(num_tokens), ...).
+
+    Args:
+        input: The padded input tensor to be converted.
+        num_tokens: The number of valid tokens of each sequence in the padded
+            input. A tensor is converted to a plain list first.
+        padding_side: Which side of every row holds the padding. With
+            ``"right"`` the first ``num_tokens[i]`` positions of row ``i`` are
+            kept, with ``"left"`` the last ``num_tokens[i]`` positions.
+
+    Returns:
+        The kept tokens of all rows concatenated along dim 0 and wrapped in a
+        leading dimension of size 1.
+
+    Raises:
+        ValueError: If ``padding_side`` is neither ``"right"`` nor ``"left"``.
+    """
+    if padding_side not in ("right", "left"):
+        raise ValueError(f"Invalid padding_side: {padding_side}. Must be 'right' or 'left'.")
+    if isinstance(num_tokens, torch.Tensor):
+        num_tokens = num_tokens.tolist()
+    if padding_side == "right":
+        pieces = [input[i, :n] for i, n in enumerate(num_tokens)]
+    else:
+        # ``input[i, -0:]`` is the whole row, not an empty slice, so a
+        # zero-length sequence has to be handled explicitly.
+        pieces = [input[i, -n:] if n else input[i, :0] for i, n in enumerate(num_tokens)]
+    return torch.cat(pieces, dim=0).unsqueeze(0)
+
+
+def convert_packed_to_padded(
+    input: torch.Tensor, num_tokens: torch.Tensor | list, padding_value: float, padding_side: str = "right"
+) -> torch.Tensor:
+    """Convert a packed tensor (1, sum(num_tokens), ...) to a padded tensor
+    (len(num_tokens), max(num_tokens), ...).
+
+    Args:
+        input: The packed input tensor to be converted.
+        num_tokens: The number of tokens of each sequence in the packed input.
+        padding_value: The value written to every position that is not
+            covered by a sequence.
+        padding_side: Which side of every row receives the padding, either
+            ``"right"`` or ``"left"``.
+
+    Returns:
+        A new tensor with the dtype and device of ``input`` holding one
+        sequence per row.
+
+    Raises:
+        ValueError: If ``padding_side`` is neither ``"right"`` nor ``"left"``.
+    """
+    # Validate once up front instead of on every loop iteration.
+    if padding_side not in ("right", "left"):
+        raise ValueError(f"Invalid padding_side: {padding_side}. Must be 'right' or 'left'.")
+    unpacked_input = unpack_sequence(input, num_tokens)  # list of (1, num_tokens[i], ...)
+    # Work with plain Python ints so that max() and the slice bounds below do
+    # not depend on tensor scalars (mirrors convert_padded_to_packed).
+    lengths = num_tokens.tolist() if isinstance(num_tokens, torch.Tensor) else list(num_tokens)
+    max_length = max(lengths)
+    padded_input = torch.full(
+        (len(lengths), max_length, *input.shape[2:]), padding_value, dtype=input.dtype, device=input.device
+    )
+    for i, seq in enumerate(unpacked_input):
+        length = lengths[i]
+        if length == 0:
+            # Nothing to copy. Skipping also avoids ``-0:``, which addresses
+            # the whole row rather than an empty slice.
+            continue
+        if padding_side == "right":
+            padded_input[i, :length] = seq[0]
+        else:
+            padded_input[i, -length:] = seq[0]
+    return padded_input
+
+
+def masked_sum(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    axis: int | None = None,
+    num_tokens: torch.Tensor | list | None = None,
+    unpack_sequence: bool = False,
+) -> torch.Tensor:
+    """Sum ``input`` over ``axis`` while ignoring positions where ``mask`` is
+    falsy.
+
+    Args:
+        input: The input tensor to be masked.
+        mask: The mask tensor to be applied.
+        axis: The dimension passed to ``Tensor.sum`` for the reduction.
+        num_tokens: The number of tokens of each sequence in the packed input.
+            Only used when ``unpack_sequence`` is True.
+        unpack_sequence: Whether to convert the packed ``input`` and ``mask``
+            to right-padded tensors (padding value 0) before reducing.
+
+    Returns:
+        The masked sum along ``axis``.
+    """
+    if unpack_sequence:
+        input, mask = (
+            convert_packed_to_padded(packed, num_tokens, padding_value=0, padding_side="right")
+            for packed in (input, mask)
+        )
+    keep = mask.bool()
+    # Replace masked-out entries by 0.0 first so that values such as NaN at
+    # those positions cannot survive the multiplication below.
+    zeroed = torch.where(keep, input, 0.0)
+    weighted = zeroed * mask
+    return weighted.sum(axis=axis)
+
+
+def masked_mean(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    axis: int | None = None,
+    num_tokens: torch.Tensor | list | None = None,
+    unpack_sequence: bool = False,
+) -> torch.Tensor:
+    """Average ``input`` over ``axis`` while ignoring positions where ``mask``
+    is falsy.
+
+    Args:
+        input: The input tensor to be masked.
+        mask: The mask tensor to be applied.
+        axis: The dimension along which the tensor should be reduced.
+        num_tokens: The number of tokens of each sequence in the packed input.
+            Only used when ``unpack_sequence`` is True.
+        unpack_sequence: Whether to convert the packed ``input`` and ``mask``
+            to right-padded tensors before reducing.
+
+    Returns:
+        The masked sum divided by the mask sum along ``axis``; ``1e-8`` is
+        added to the denominator to avoid a division by zero.
+    """
+    total = masked_sum(input, mask, axis=axis, num_tokens=num_tokens, unpack_sequence=unpack_sequence)
+    if unpack_sequence:
+        # masked_sum reduced the padded (len(num_tokens), max(num_tokens), ...)
+        # layout, so the denominator must be computed on the same layout;
+        # reducing the packed mask along ``axis`` would give mismatching counts.
+        mask = convert_packed_to_padded(mask, num_tokens, padding_value=0, padding_side="right")
+    return total / (mask.sum(axis=axis) + 1e-8)
@@ -7,10 +7,11 @@
 
 from xtuner.v1.loss import BaseLossConfig
 from xtuner.v1.loss.base_loss_ctx import BaseLossContext
-
+from .rollout_is import RolloutImportanceSampling
 from ..utils import sp_split
 
 
+
 T = TypeVar("T")
 
 
@@ -32,7 +33,9 @@ class BaseRLLossConfig(BaseLossConfig):
         kl_loss_type (Literal["kl", "k1", "abs", "mse", "k2", "low_var_kl", "k3"] | None):
             Type of KL penalty computation method. Different types provide various
             regularization behaviors and numerical stability properties. Defaults to None.
-
+        rollout_is (RolloutImportanceSampling): Configuration parameters for the rollout importance sampling.
+            Contains algorithm-specific parameters for rollout importance sampling.
+            Defaults to RolloutImportanceSampling().
     **Abstract Method:**
         loss_ctx_cls: Must be implemented by subclasses to return the appropriate
         loss context class for the specific RL algorithm.
@@ -72,6 +75,7 @@ class BaseRLLossConfig(BaseLossConfig):
     use_kl_loss: bool = False
     kl_loss_coef: float = 0.001
     kl_loss_type: Literal["kl", "k1", "abs", "mse", "k2", "low_var_kl", "k3"] | None = None
+    rollout_is: RolloutImportanceSampling = RolloutImportanceSampling()
 
     @property
     def loss_ctx_cls(self) -> type[BaseLossContext]:
@@ -86,24 +90,38 @@ class RLLossContextInputItem(BaseModel):
         advantages (torch.Tensor): Advantage estimates for the actions taken.
         old_logprobs (torch.Tensor | None): Log probabilities from the old policy.
         ref_logprobs (torch.Tensor | None): Reference log probabilities for KL penalty, if used.
+        rollout_logprobs (torch.Tensor | None): Rollout log probabilities from inference engine, used for importance sampling.
+        is_weights (torch.Tensor | None): Importance sampling weights. If None, importance sampling is not used.
     """
 
     model_config = ConfigDict(title="RLLossContextInputItem", extra="allow", arbitrary_types_allowed=True)
     shifted_labels: torch.Tensor
     advantages: torch.Tensor
     old_logprobs: torch.Tensor | None = None
     ref_logprobs: torch.Tensor | None = None
+    rollout_logprobs: torch.Tensor | None = None
+    is_weights: torch.Tensor | None = None
 
     def sp_split(self, sp_mesh: DeviceMesh) -> Self:
+        """Build a new item whose externally supplied tensors are split over
+        ``sp_mesh``.
+
+        ``shifted_labels`` and ``advantages`` are always split along dim 1
+        (padding values -100 and 0.0). ``rollout_logprobs`` and ``is_weights``
+        are split the same way (padding values 0.0 and 1.0) when they are not
+        None. ``old_logprobs`` and ``ref_logprobs`` are passed through as is.
+        """
+
+        def _split_optional(tensor, padding_value):
+            # Optional fields stay None when they were never provided.
+            if tensor is None:
+                return None
+            return sp_split(tensor, sp_mesh=sp_mesh, split_dim=1, padding_value=padding_value)
+
+        # old_logprobs and ref_logprobs do not need sp_split here because they
+        # are produced by the model forward pass, and seq_ctx is always
+        # sp_split before the model forward pass.
+        return type(self)(
+            shifted_labels=sp_split(self.shifted_labels, sp_mesh=sp_mesh, split_dim=1, padding_value=-100),
+            advantages=sp_split(self.advantages, sp_mesh=sp_mesh, split_dim=1, padding_value=0.0),
+            old_logprobs=self.old_logprobs,
+            ref_logprobs=self.ref_logprobs,
+            rollout_logprobs=_split_optional(self.rollout_logprobs, 0.0),
+            is_weights=_split_optional(self.is_weights, 1.0),
+        )
 
     def to(self, device: torch.device | str) -> Self:
@@ -113,4 +131,8 @@ def to(self, device: torch.device | str) -> Self:
             self.old_logprobs = self.old_logprobs.to(device)
         if self.ref_logprobs is not None:
             self.ref_logprobs = self.ref_logprobs.to(device)
+        if self.rollout_logprobs is not None:
+            self.rollout_logprobs = self.rollout_logprobs.to(device)
+        if self.is_weights is not None:
+            self.is_weights = self.is_weights.to(device)
         return self