 from collections import defaultdict
-from typing import Any
-import numpy as np
 import torch
-from torch import nn
 import torch.distributed as dist
+from torch import nn
 from torch.utils.hooks import RemovableHandle
+from typing_extensions import TypedDict
 
-from xtuner.v1.module import (
-    RMSNorm,
-    MultiHeadAttention,
-    MultiLatentAttention,
-    LMHead
-)
-from xtuner.v1.module.decoder_layer.moe_decoder_layer import MoEGate, MoEBlock, MoEDecoderLayer
-from xtuner.v1.module.decoder_layer.dense_decoder_layer import DenseDecoderLayer
+from xtuner.v1.engine.train_engine import TrainEngine
 from xtuner.v1.model import MoE
 from xtuner.v1.model.base import ModelItem
-from xtuner.v1.engine.train_engine import TrainEngine
+from xtuner.v1.module import LMHead, MultiHeadAttention, MultiLatentAttention
+from xtuner.v1.module.decoder_layer.dense_decoder_layer import DenseDecoderLayer
+from xtuner.v1.module.decoder_layer.moe_decoder_layer import MoEDecoderLayer
 from xtuner.v1.utils.grad_norm import group_tensors_by_device_mesh_and_placements, cal_total_norm
 
-from typing_extensions import TypedDict
-
 
 class InternalMetrics(TypedDict):
     weight_rms: dict[str, float]
@@ -45,6 +37,7 @@ class InternalMetrics(TypedDict):
 ATTN_MAX_LSE: dict[str, torch.Tensor] = {}
 ATTN_MAX_LOGITS: dict[str, torch.Tensor] = {}
 
+
 class InternalMetricsRecorder:
     def __init__(self, engine: TrainEngine):
         self.model = engine.model
@@ -60,8 +53,10 @@ def __init__(self, engine: TrainEngine):
6053 "attn_max_logits" : {},
6154 }
6255
56+ @torch .no_grad ()
6357 def calculate_module_weight_rms (self , module : nn .Module , layer_name : str , dtype : torch .dtype = torch .float32 ):
64- all_params = [param for param in module .parameters () if param .requires_grad ]
58+ """Calculate the RMS of the module's parameters"""
59+ all_params = [param .data for param in module .parameters () if param .requires_grad ]
6560 if not all_params :
6661 return
6762 grouped_params = group_tensors_by_device_mesh_and_placements (all_params )
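A note on the quantity computed here: the weight RMS is the L2 norm of all trainable parameters divided by the square root of their total element count. A minimal single-device sketch of the same statistic (plain PyTorch, without the device-mesh grouping used above; names are illustrative):

```python
import torch
from torch import nn


def weight_rms(module: nn.Module) -> float:
    # Collect trainable parameter data, as the method above does.
    params = [p.data for p in module.parameters() if p.requires_grad]
    if not params:
        return 0.0
    # Overall L2 norm = norm of the per-parameter norms;
    # rms = ||W||_2 / sqrt(N) == sqrt(mean(W**2)) over all elements.
    total_norm = torch.linalg.vector_norm(
        torch.stack([torch.linalg.vector_norm(p.float(), ord=2.0) for p in params]),
        ord=2.0,
    )
    total_numel = sum(p.numel() for p in params)
    return (total_norm / total_numel**0.5).item()


# e.g. weight_rms(nn.Linear(4, 4)) -> a small positive float
```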
@@ -73,16 +68,14 @@ def calculate_module_weight_rms(self, module: nn.Module, layer_name: str, dtype:
             total_numel += sum(p.numel() for p in params)
         param_l2_norm = torch.linalg.vector_norm(torch.stack(total_norms), ord=2.0, dtype=dtype)
         param_rms = param_l2_norm / total_numel ** 0.5
-        self.metrics['weight_rms'][layer_name] = param_rms.item()
+        self.metrics["weight_rms"][layer_name] = param_rms.item()
 
     def register_attn_extra_info_hook(self, module: nn.Module, layer_name: str):
-        """
-        Register attention extra info hook as a forward hook
-        """
+        """Register attention extra info hook as a forward hook"""
         def hook(module, input, output):
             extra_info = output[1]
             if extra_info.get("softmax_lse", None) is not None:
-                if layer_name not in ATTN_MAX_LSE:
+                if layer_name not in ATTN_MAX_LSE:
                     # original shape: [n_head, seq]
                     ATTN_MAX_LSE[layer_name] = extra_info["softmax_lse"].max()
                 else:
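For readers unfamiliar with the pattern above: `register_forward_hook` attaches a callback that sees a module's output after every forward pass, which is how the recorder captures the maximum log-sum-exp without touching the attention code itself. A minimal self-contained sketch with a toy module (the module and its output layout are illustrative, not xtuner's):

```python
import torch
from torch import nn


class ToyAttention(nn.Module):
    # Mimics a module whose forward returns (output, extra_info_dict).
    def forward(self, x):
        extra_info = {"softmax_lse": torch.logsumexp(x, dim=-1)}
        return x, extra_info


max_lse: dict[str, torch.Tensor] = {}


def make_hook(layer_name: str):
    def hook(module, args, output):
        lse = output[1]["softmax_lse"].max()
        # Keep a running maximum across forward passes, like ATTN_MAX_LSE.
        prev = max_lse.get(layer_name)
        max_lse[layer_name] = lse if prev is None else torch.maximum(prev, lse)
    return hook


attn = ToyAttention()
handle = attn.register_forward_hook(make_hook("layers.0.attn"))  # RemovableHandle
attn(torch.randn(2, 8))
handle.remove()
```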
@@ -101,6 +94,7 @@ def hook(module, input, output):
 
     @torch.no_grad()
     def get_metrics(self, data_batches: list[ModelItem]):
+        """Run a dummy forward to get metrics"""
         additional_kwargs = {}
         if isinstance(self.model, MoE):
             # for MoE model, add additional kwargs to return necessary stats
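A sketch of how the recorder is presumably driven from a training step, based only on the constructor and method signatures in this diff (the surrounding engine/dataloader wiring is illustrative, and it is assumed that `get_metrics` returns the populated metrics dict):

```python
# Hypothetical wiring; `engine` is an xtuner TrainEngine and `data_batches`
# a list[ModelItem], as required by the signatures above.
recorder = InternalMetricsRecorder(engine)
metrics = recorder.get_metrics(data_batches)  # runs a dummy forward pass
print(metrics["weight_rms"], metrics["drop_ratio"])
```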
@@ -140,7 +134,6 @@ def get_metrics(self, data_batches: list[ModelItem]):
                 else:
                     tokens_per_expert_global += output["tokens_per_expert_global"].float()
 
-
             if output.get("router_logits", None) is not None:
                 for layer_name, router_logits in output["router_logits"].items():
                     # [bsz, packed_len, num_experts]
@@ -151,7 +144,9 @@ def get_metrics(self, data_batches: list[ModelItem]):
         avg_count_load = tokens_per_expert_global.mean(1)
         max_load_i = torch.amax(tokens_per_expert_global, dim=1)
         maxvio_all_layers = (max_load_i - avg_count_load) / avg_count_load
-        drop_ratio_all_layers = (tokens_per_expert_global - avg_count_load[:, None]).abs().mean(dim=1) / avg_count_load
+        drop_ratio_all_layers = (
+            tokens_per_expert_global - avg_count_load[:, None]
+        ).abs().mean(dim=1) / avg_count_load
         drop_ratio = drop_ratio_all_layers.mean()
         self.metrics["drop_ratio"].update(
             {f"layer{idx}": drop_ratio_all_layers[idx].item() for idx in range(drop_ratio_all_layers.shape[0])}