
Commit 41a161a

add health check (#1275)
1 parent 7f4f794 commit 41a161a

File tree

3 files changed: +141 -0 lines changed


tests/utils/test_check_health.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@

import os
import torch
import torch.distributed as dist
from torch.testing._internal.common_distributed import DistributedTestBase
from unittest.mock import patch, Mock

import xtuner.v1.utils.check_health as check_health

from xtuner.v1.utils.device import get_device


DEVICE = get_device()


def fake_health_job(dtype, loop=10):
    if dist.get_rank() == 1:
        print(f"rank {dist.get_rank()} world size {dist.get_world_size()} return 0.0")
        return torch.tensor(0.0, dtype=dtype, device=DEVICE)
    else:
        print(f"rank {dist.get_rank()} world size {dist.get_world_size()} return 1.0")
        return torch.tensor(1.0, dtype=dtype, device=DEVICE)


class TestCheckHealth(DistributedTestBase):
    def create_pg(self, device):
        ret = super().create_pg(device)
        os.environ["LOCAL_RANK"] = str(dist.get_rank())
        torch.accelerator.set_device_index(int(os.environ["LOCAL_RANK"]))
        return ret

    def test_check_health_normal(self):
        self.create_pg(DEVICE)

        self.assertTrue(check_health.check_health())

    def test_check_health_failed(self):
        self.create_pg(DEVICE)

        with patch("xtuner.v1.utils.check_health.health_job", fake_health_job):
            self.assertFalse(check_health.check_health())
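
For context, test_check_health_failed works by patching the module-level health_job so that rank 1 returns a different scalar than every other rank; since check_health MIN-reduces its pass/fail flags across ranks, every rank then sees the failure. A hypothetical standalone version of the same experiment is sketched below; the script name, launch command, and the assumption that no per-rank device pinning is needed are illustrative and not part of this commit.

# repro_health_check.py -- hypothetical script, not part of this commit.
# Launch with e.g.: torchrun --nproc-per-node=2 repro_health_check.py
# Assumes torchrun's environment is enough for init_process_group() and that
# per-rank device pinning (done in the test's create_pg) is not needed here.
from unittest.mock import patch

import torch
import torch.distributed as dist

import xtuner.v1.utils.check_health as check_health
from xtuner.v1.utils.device import get_device


def fake_health_job(dtype, loop=10):
    # Rank 1 deliberately disagrees with every other rank.
    value = 0.0 if dist.get_rank() == 1 else 1.0
    return torch.tensor(value, dtype=dtype, device=get_device())


if __name__ == "__main__":
    dist.init_process_group()
    print("baseline healthy:", check_health.check_health())      # expected: True
    with patch("xtuner.v1.utils.check_health.health_job", fake_health_job):
        print("patched healthy:", check_health.check_health())   # expected: False
    dist.destroy_process_group()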

xtuner/v1/train/trainer.py

Lines changed: 16 additions & 0 deletions
@@ -47,6 +47,7 @@
    log_format,
    record_git_info,
)
+from xtuner.v1.utils.check_health import check_health
from xtuner.v1.utils.device import get_device, get_torch_device_module

from .toy_tokenizer import UTF8ByteTokenizer

@@ -172,6 +173,7 @@ class TrainerConfig(BaseModel):
    checkpoint_maxkeep: int | None = -1
    skip_checkpoint_validation: bool = False  # Suggest enabled if fsdp_size is larger than 512
    snapshot_interval: int | None = None
+    check_health_interval: int | None = None
    hf_interval: int | None = None
    hf_max_keep: int | None = None
    exp_tracker: Literal["tensorboard", "jsonl"] = "jsonl"

@@ -289,6 +291,7 @@ def __init__(
        checkpoint_maxkeep: int | None = -1,
        skip_checkpoint_validation: bool = False,  # Suggest enabled if fsdp_size is larger than 512
        snapshot_interval: int | None = None,
+        check_health_interval: int | None = None,
        hf_interval: int | None = None,
        hf_max_keep: int | None = None,
        exp_tracker: Literal["tensorboard", "jsonl"] = "jsonl",

@@ -337,6 +340,7 @@ def __init__(
        self._checkpoint_interval = checkpoint_interval
        self._checkpoint_maxkeep = checkpoint_maxkeep
        self._snapshot_interval = snapshot_interval
+        self._check_health_interval = check_health_interval
        self._hf_max_keep = hf_max_keep
        self._hf_interval = hf_interval

@@ -481,6 +485,7 @@ def from_config(cls, config: TrainerConfig) -> Self:
            checkpoint_maxkeep=config.checkpoint_maxkeep,
            skip_checkpoint_validation=config.skip_checkpoint_validation,
            snapshot_interval=config.snapshot_interval,
+            check_health_interval=config.check_health_interval,
            hf_interval=config.hf_interval,
            hf_max_keep=config.hf_max_keep,
            exp_tracker=config.exp_tracker,

@@ -586,6 +591,7 @@ def fit(self):
                )

            self._lr_scheduler.step()
+            self._maybe_check_health()
            self._maybe_save_hf()
            ckpt_saved = self._maybe_save(is_snapshot=False)
            if not ckpt_saved:

@@ -806,6 +812,16 @@ def warmup_fn(x):
        )
        return lr_scheduler

+    def _maybe_check_health(self):
+        if (
+            (self._check_health_interval is not None and self.cur_step % self._check_health_interval == 0)
+            or (self._checkpoint_interval is not None and self.cur_step % self._checkpoint_interval == 0)
+            or (self._snapshot_interval is not None and self.cur_step % self._snapshot_interval == 0)
+        ):
+            if not check_health():
+                raise RuntimeError("Health check failed, exit training")
+            logger.info(f"Health check passed at step {self.cur_step}")
+
    def _maybe_save(self, is_snapshot: bool = False) -> bool:
        ckp_interval = self._checkpoint_interval if not is_snapshot else self._snapshot_interval
        if ckp_interval is None:
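
Configuration-wise, check_health_interval is optional; as _maybe_check_health above shows, the check also fires on checkpoint and snapshot steps even when it is left as None. A hedged sketch of enabling it through TrainerConfig follows; the Trainer class name, the concrete interval values, and which other config fields are required are assumptions, since this diff only shows the new field.

# Hypothetical configuration snippet; every field not shown here is elided.
from xtuner.v1.train.trainer import Trainer, TrainerConfig  # Trainer name assumed from the file path

config = TrainerConfig(
    # ... model / dataset / optimizer / checkpoint fields as usual ...
    snapshot_interval=500,       # the health check piggybacks on snapshot and checkpoint steps
    check_health_interval=100,   # plus an explicit check every 100 steps
)

trainer = Trainer.from_config(config)
trainer.fit()  # raises RuntimeError("Health check failed, exit training") if a check fails mid-run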

xtuner/v1/utils/check_health.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@

from collections import defaultdict

import torch
import torch.distributed as dist
import torch.nn.functional as F

from xtuner.v1.utils import get_logger
from xtuner.v1.utils.device import get_device


logger = get_logger()

DEVICE = get_device()


def health_job(dtype, loop=10):
    # use independent generator to avoid affecting the global generator
    x = torch.rand(128, 128, generator=torch.Generator(device=DEVICE).manual_seed(12345), dtype=dtype, device=DEVICE)
    dist.broadcast(x, src=0)

    y = x
    for _ in range(loop):
        y = F.normalize(y, dim=0)
        torch.matmul(x, y, out=y)
    y = y.mean()
    return y


def check_health(loop=10):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    dtype = torch.bfloat16
    rtol = 1.6e-2
    atol = 1e-5
    # from torch.testing.assert_close:
    # +---------------------------+------------+----------+
    # | ``dtype``                 | ``rtol``   | ``atol`` |
    # +===========================+============+==========+
    # | :attr:`~torch.float16`    | ``1e-3``   | ``1e-5`` |
    # +---------------------------+------------+----------+
    # | :attr:`~torch.bfloat16`   | ``1.6e-2`` | ``1e-5`` |
    # +---------------------------+------------+----------+
    # | :attr:`~torch.float32`    | ``1.3e-6`` | ``1e-5`` |
    # +---------------------------+------------+----------+

    y = health_job(dtype, loop)

    # gather check
    y_list = [torch.tensor(0.0, dtype=dtype, device=DEVICE) for _ in range(world_size)] if rank == 0 else None
    dist.gather(y, y_list)
    gather_check = torch.tensor(1, dtype=torch.int32, device=DEVICE)
    if rank == 0:
        for i in range(world_size):
            if not torch.allclose(y, y_list[i], rtol=rtol, atol=atol):
                gather_check = torch.tensor(0, dtype=torch.int32, device=DEVICE)
                break
    dist.all_reduce(gather_check, op=dist.ReduceOp.MIN)

    # all reduce check
    z = y.clone()
    dist.all_reduce(z, op=dist.ReduceOp.AVG)
    all_reduce_check = (
        torch.tensor(1, dtype=torch.int32, device=DEVICE)
        if torch.allclose(y, z, rtol=rtol, atol=atol)
        else torch.tensor(0, dtype=torch.int32, device=DEVICE)
    )
    dist.all_reduce(all_reduce_check, op=dist.ReduceOp.MIN)

    if gather_check.item() == 1 and all_reduce_check.item() == 1:
        return True

    if rank == 0:  # log
        logger.error(
            f"Health check failed: gather_check={gather_check.item()}, all_reduce_check={all_reduce_check.item()}. rtol={rtol}, atol={atol}."
        )
        logger.error(f"All reduce check info: y: {y.item()}, z: {z.item()}")

        y2rank = defaultdict(list)
        for ranki, yi in enumerate(y_list):
            y2rank[yi.item()].append(ranki)
        for yi, ranks in y2rank.items():
            logger.error(f"Gather check info: rank {sorted(ranks)}: {yi}")

    return False
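
For intuition, the per-rank workload in health_job is fully deterministic (fixed seed, fixed shapes), so healthy ranks should agree to within the bfloat16 tolerances quoted in the comment above; both the gather check and the all-reduce check are plain torch.allclose comparisons at those tolerances, and MIN-reducing the two flags means every rank returns the same boolean. The single-process sketch below illustrates just the workload and the tolerance comparison; it skips the broadcast, rewrites the in-place matmul functionally, and is illustrative only, not how check_health is meant to be invoked.

# Single-process sketch of the workload and the tolerance comparison; this is
# an illustration, not part of the commit.
import torch
import torch.nn.functional as F


def local_health_job(dtype=torch.bfloat16, loop=10, device="cpu"):
    # Same fixed-seed 128x128 workload as health_job, minus dist.broadcast.
    x = torch.rand(
        128, 128,
        generator=torch.Generator(device=device).manual_seed(12345),
        dtype=dtype, device=device,
    )
    y = x
    for _ in range(loop):
        y = F.normalize(y, dim=0)
        y = torch.matmul(x, y)  # health_job writes this result in place via out=y
    return y.mean()


a = local_health_job()
b = local_health_job()
# The same comparison check_health applies across ranks (bfloat16 tolerances).
print(torch.allclose(a, b, rtol=1.6e-2, atol=1e-5))  # expected: True on a healthy device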
