@@ -68,14 +68,24 @@ def state_dict(self) -> dict[str, str | Tensor]:
 class AdafactorNormalizer(Normalizer):
     """
     Row and column sums of second moments of gradients for a matrix-valued parameter.
+
+    Args:
+        row: Row statistics [O]
+        col: Column statistics [I]
+        bias_avg_sq: Optional second moments for bias [O]
     """
 
     row: Tensor  # shape [O]
     col: Tensor  # shape [I]
+    bias_avg_sq: Tensor | None = None  # shape [O]
 
     def __post_init__(self):
         assert self.row.ndim == 1, f"Expected 1D tensor for row, got {self.row.ndim}D"
         assert self.col.ndim == 1, f"Expected 1D tensor for col, got {self.col.ndim}D"
+        if self.bias_avg_sq is not None:
+            assert self.bias_avg_sq.ndim == 1, (
+                f"Expected 1D tensor for bias_avg_sq, got {self.bias_avg_sq.ndim}D"
+            )
 
     @torch.compile
     def normalize_(
@@ -120,22 +130,29 @@ def to_adam(self) -> "AdamNormalizer":
         """
         Convert this Adafactor normalizer to an Adam normalizer by materializing the
         rank-one second moment matrix.
+
+        Preserves bias_avg_sq if present.
         """
         # Compute the second moment matrix as a full matrix of shape [O, I]
         # NOTE: We don't add the epsilon here, since the AdamNormalizer is going to
         # add it outside the square root. This could cause infs though if there are
         # any exactly zero rows or columns, so we should be careful.
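         # The factored estimate is avg_sq[i, j] ≈ row[i] * col[j] / mean(row): row and col
         # hold per-row and per-column means, so mean(row) is the grand mean of the original
         # matrix, and this is Adafactor's usual rank-one reconstruction of the second moments.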
         avg_sq = torch.outer(self.row, self.col) / self.row.mean()
-        return AdamNormalizer(avg_sq=avg_sq)
+        return AdamNormalizer(avg_sq=avg_sq, bias_avg_sq=self.bias_avg_sq)
 
 
 @dataclass
 class AdamNormalizer(Normalizer):
     """
     Contains the second moments of the gradients.
+
+    Args:
+        avg_sq: Second moments for weights [O, I]
+        bias_avg_sq: Optional second moments for bias [O]
     """
 
     avg_sq: Tensor
+    bias_avg_sq: Tensor | None = None
 
     @torch.compile
     def normalize_(
@@ -153,16 +170,19 @@ def to_adafactor(self) -> AdafactorNormalizer:
         Convert this Adam normalizer to an Adafactor normalizer, minimizing the
         I-divergence (generalized Kullback-Leibler divergence) between the original
         and the factored second moments.
+
+        Preserves bias_avg_sq if present.
         """
         # We assume avg_sq is a full matrix of shape [O, I]
-        assert (
-            self.avg_sq.ndim == 2
-        ), f"Expected 2D tensor for avg_sq, got {self.avg_sq.ndim}D"
+        assert self.avg_sq.ndim == 2, (
+            f"Expected 2D tensor for avg_sq, got {self.avg_sq.ndim}D"
+        )
 
         # Compute row and column means
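         # For a nonnegative matrix, the row and column means (rescaled by the grand mean
         # at reconstruction time in to_adam) give the rank-one factorization that minimizes
         # the I-divergence to the original matrix, which is the criterion named above.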
         return AdafactorNormalizer(
             row=self.avg_sq.mean(dim=1),  # shape [O]
             col=self.avg_sq.mean(dim=0),  # shape [I]
+            bias_avg_sq=self.bias_avg_sq,  # Preserve bias second moments
         )
 
 
@@ -551,8 +571,22 @@ def _process_grad(self, module: nn.Module, _, grad_out):
         i = getattr(module, LayerAdapter.in_attr(module))
         o = getattr(module, LayerAdapter.out_attr(module))
 
-        # Pre-scale G by the Adafactor row statistics
+        # Handle bias gradients if needed (must be computed from raw G)
         norm = self.processor.normalizers.get(name)
+        bias_grad = None
+        if include_bias:
+            # Compute bias from raw G (before any normalization)
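+            # (summing the output gradient over the sequence dimension yields the bias
+            # gradient, since the bias contributes additively at every position)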
+            bias_grad = G.sum(dim=1)  # [N, S, O] -> [N, O]
+
+            # Normalize bias with appropriate second moments
+            if (
+                isinstance(norm, (AdamNormalizer, AdafactorNormalizer))
+                and hasattr(norm, "bias_avg_sq")
+                and norm.bias_avg_sq is not None
+            ):
+                bias_grad = bias_grad / norm.bias_avg_sq.sqrt().add_(1e-8)
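+                # As with the weight second moments, epsilon is added outside the
+                # square root rather than inside it.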
+
+        # Pre-scale G by the Adafactor row statistics (for weight gradients)
         if isinstance(norm, AdafactorNormalizer):
             # Compare to the normalize_ method in AdafactorNormalizer
             r = norm.row.add(1e-30)
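             # (the tiny 1e-30 offset guards against exactly-zero row statistics)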
@@ -563,17 +597,15 @@ def _process_grad(self, module: nn.Module, _, grad_out):
         # If we are using AdamNormalizer, or including bias gradients
         # we need to materialize the full gradient and then project
         if isinstance(norm, AdamNormalizer) or include_bias:
-
             P = G.mT @ I  # [N, O, S] @ [N, S, I] → [N, O, I]
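             # Each per-sample weight gradient is the sum over the sequence of outer
             # products between output gradients and inputs, materialized here as [N, O, I].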
             if isinstance(norm, AdamNormalizer):
                 # Normalize the gradients using the second moment matrix
                 P /= norm.avg_sq.sqrt().add_(1e-8)
 
-            if include_bias:
-                # TODO: should we normalize the bias gradients?
-                # Append the raw bias gradient to the input
+            if include_bias and bias_grad is not None:
+                # Append pre-computed and normalized bias gradient
                 P = torch.cat(
-                    [P, G.sum(dim=1).unsqueeze(2)],  # [N, S, O] -> [N, O]  # [N, O, 1]
+                    [P, bias_grad.unsqueeze(2)],  # [N, O, 1]
                     dim=2,
                 )
                 i += 1