ROCm · LJ-underdog · Apr 23, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 25, 2026
diff --git a/atom/config.py b/atom/config.py
@@ -1040,6 +1040,29 @@ def _set_cudagraph_sizes(self):
             elif len(cuda_graph_sizes) > 1:
                 self.graph_bs = cuda_graph_sizes
 
+    def _uses_sliding_window(self) -> bool:
+        """True iff the model uses sliding-window attention (global or interleaved).
+
+        Prefix caching's classical KV pool cannot restore the per-request SWA
+        buffer on a cache hit, so SWA models must run with it disabled.
+        """
+        hf = self.hf_config
+        # Global sliding_window field (Step-3.5=512, Gemma, Mistral, Qwen-SWA, ...).
+        sw = getattr(hf, "sliding_window", None)
+        if isinstance(sw, int) and not isinstance(sw, bool) and sw > 0:
+            return True
+        # Interleaved SWA via layer_types
+        # (Step-3.5: ['full_attention', 'sliding_attention', ...]).
+        layer_types = getattr(hf, "layer_types", None) or []
+        if any("sliding" in str(t) for t in layer_types):
+            return True
+        # DeepSeek-V4: model_type is remapped to deepseek_v3, so detect SWA via the
+        # preserved architectures name (kept as a fallback).
+        arches = getattr(hf, "architectures", None) or []
+        if any("DeepseekV4" in str(a) for a in arches):
+            return True
+        return False
+
     def __post_init__(self):
         if isinstance(self.compilation_config, dict):
             self.compilation_config = CompilationConfig(**self.compilation_config)
@@ -1163,16 +1186,32 @@ def __post_init__(self):
             v4_block_size = 128
             if self.kv_cache_block_size != v4_block_size:
                 self.kv_cache_block_size = v4_block_size
-            # TODO: V4's per-request SWA buffer cannot be restored from the classical
-            # KV pool on prefix cache hit, so disable prefix caching silently.
-            if self.enable_prefix_caching:
-                import logging
 
-                logging.getLogger(__name__).warning(
-                    "DeepSeek-V4 does not support prefix caching "
-                    "(SWA buffer is not cacheable); disabling automatically."
+        # SWA models cannot restore the per-request sliding-window KV buffer from the
+        # classical KV pool on a prefix-cache hit, so disable prefix caching for any
+        # sliding-window model (DeepSeek-V4, Step-3.5, ...). This generalizes the
+        # earlier DeepSeek-V4-only guard, which left Step-3.5 and other SWA models
+        # exposed once the enable_prefix_caching default changed from False to True.
+        # Non-SWA models are unaffected and keep the prefix-caching optimization.
+        if self._uses_sliding_window():
+            import logging
+
+            _log = logging.getLogger(__name__)
+            if self.enable_prefix_caching:
+                _log.warning(
+                    "Model uses sliding-window attention (SWA buffer is not "
+                    "cacheable); disabling prefix caching automatically."
                 )
                 self.enable_prefix_caching = False
+            if self.enable_chunked_prefill:
+                # Conservative: SWA correctness across chunked-prefill boundaries is
+                # unverified on GPU, so fall back to the previous safe default (off).
+                # TODO: re-enable after SWA + chunked-prefill GPU validation.
+                _log.warning(
+                    "Model uses sliding-window attention; disabling chunked "
+                    "prefill (SWA + chunked prefill unverified)."
+                )
+                self.enable_chunked_prefill = False
 
     def compute_hash(self) -> str:
         """

diff --git a/atom/examples/simple_inference.py b/atom/examples/simple_inference.py
@@ -58,7 +58,9 @@ def main():
     engine_args = EngineArgs.from_cli_args(args)
     llm = engine_args.create_engine()
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model, trust_remote_code=getattr(args, "trust_remote_code", False)
+    )
 
     sampling_params = SamplingParams(
         temperature=args.temperature, max_tokens=args.max_tokens
@@ -70,9 +72,6 @@ def main():
         for p in prompts
     ]
     print("This is prompts:", prompts)
-    # print("Warming up...")
-    # _ = llm.generate(["warmup"], sampling_params)
-    # print("Warm up done")
 
     print("\n" + "=" * 70)
     print("Starting profiling...")

diff --git a/atom/model_engine/model_runner.py b/atom/model_engine/model_runner.py
@@ -72,6 +72,7 @@
     "Qwen3_5MoeForConditionalGeneration": "atom.models.qwen3_5.Qwen3_5MoeMultimodalModel",
     "KimiK25ForConditionalGeneration": "atom.models.kimi_k25.KimiK25ForCausalLM",
     "MiniMaxM2ForCausalLM": "atom.models.minimax_m2.MiniMaxM2ForCausalLM",
+    "Step3p5ForCausalLM": "atom.models.step3p5.Step3p5ForCausalLM",
     "MiMoV2ForCausalLM": "atom.models.mimo_v2.MiMoV2ForCausalLM",
     "MiMoV2FlashForCausalLM": "atom.models.mimo_v2.MiMoV2ForCausalLM",
 }
@@ -1200,11 +1201,22 @@ def allocate_forward_vars(self):
     def _get_num_kv_heads(self):
         """Return the per-rank number of KV heads."""
         hf_config = self.config.hf_config
-        if hf_config.num_key_value_heads >= self.world_size:
-            assert hf_config.num_key_value_heads % self.world_size == 0
-            return hf_config.num_key_value_heads // self.world_size
+        num_kv_heads_cfg = getattr(
+            hf_config,
+            "num_key_value_heads",
+            getattr(hf_config, "num_attention_groups", None),
+        )
+        if num_kv_heads_cfg is None:
+            raise ValueError(
+                "Model config has neither 'num_key_value_heads' nor "
+                "'num_attention_groups'; cannot determine number of KV heads "
+                f"for {getattr(hf_config, 'architectures', hf_config)}"
+            )
+        if num_kv_heads_cfg >= self.world_size:
+            assert num_kv_heads_cfg % self.world_size == 0
+            return num_kv_heads_cfg // self.world_size
         else:
-            assert self.world_size % hf_config.num_key_value_heads == 0
+            assert self.world_size % num_kv_heads_cfg == 0
             return 1
 
     def _mrope_positions_view(self, num_tokens: int) -> torch.Tensor:
@@ -1453,11 +1465,22 @@ def allocate_kv_cache(self, num_kvcache_blocks):
         self.num_physical_kvcache_blocks = (
             num_kvcache_blocks * self.attn_metadata_builder.block_ratio
         )
-        if hf_config.num_key_value_heads >= self.world_size:
-            assert hf_config.num_key_value_heads % self.world_size == 0
-            num_kv_heads = hf_config.num_key_value_heads // self.world_size
+        num_kv_heads_cfg = getattr(
+            hf_config,
+            "num_key_value_heads",
+            getattr(hf_config, "num_attention_groups", None),
+        )
+        if num_kv_heads_cfg is None:
+            raise ValueError(
+                "Model config has neither 'num_key_value_heads' nor "
+                "'num_attention_groups'; cannot determine number of KV heads "
+                f"for {getattr(hf_config, 'architectures', hf_config)}"
+            )
+        if num_kv_heads_cfg >= self.world_size:
+            assert num_kv_heads_cfg % self.world_size == 0
+            num_kv_heads = num_kv_heads_cfg // self.world_size
         else:
-            assert self.world_size % hf_config.num_key_value_heads == 0
+            assert self.world_size % num_kv_heads_cfg == 0
             num_kv_heads = 1
         # Promote to self so attention builders' build_kv_cache_tensor()
         # hooks can access it without re-deriving from hf_config.

diff --git a/atom/model_loader/loader.py b/atom/model_loader/loader.py
@@ -478,12 +478,27 @@ def _submit(fn, *args):
                     maybe_matching_name,
                     f"{module_prefix}experts.{hf_config.n_routed_experts}.",
                 )
+            # Check fused expert format before packed_modules_mapping to avoid
+            # expert weights (e.g. moe.gate_proj) being incorrectly matched
+            # by packed_modules_mapping entries (e.g. gate_proj -> gate_up_proj).
+            if detect_fused_expert_fn is not None and not is_fused_expert:
+                if detect_fused_expert_fn(name):
+                    is_fused_expert = True
+                    if get_fused_expert_mapping_fn is not None:
+                        fused_expert_params_mapping = get_fused_expert_mapping_fn()
             for k in packed_modules_mapping:
                 # We handle the experts below in expert_params_mapping
                 if (
                     "mlp.experts." in name or "ffn.experts." in name
                 ) and name not in params_dict:
                     continue
+                # Skip fused expert weights — handled below in expert loading path
+                if (
+                    is_fused_expert
+                    and detect_fused_expert_fn is not None
+                    and detect_fused_expert_fn(name)
+                ):
+                    continue
                 if k in name:
                     packed_value = packed_modules_mapping[k]
                     # Handle both tuple (fuse parameter) and list (shard parameter)
@@ -556,7 +571,14 @@ def _submit(fn, *args):
                             )
 
                             if matched:
-                                loaded_weights_record.add(prefix + name)
+                                # Record the MAPPED param name (e.g.
+                                # moe.experts.w13_weight), not the ckpt name
+                                # (e.g. moe.gate_proj.weight): the post-load
+                                # verification below diffs against params_dict
+                                # keys (param names), so recording the ckpt name
+                                # makes fused-expert params (w13_weight/w2_weight)
+                                # falsely show up as "NOT loaded".
+                                loaded_weights_record.add(prefix + name_mapped)
                                 break
 
                         if matched:

diff --git a/atom/model_ops/attentions/aiter_attention.py b/atom/model_ops/attentions/aiter_attention.py
@@ -86,7 +86,18 @@ def __init__(
         else:
             max_qlen = 1
 
-        num_head_k = max(1, hf_config.num_key_value_heads // get_tp_group().world_size)
+        num_kv_heads_cfg = getattr(
+            hf_config,
+            "num_key_value_heads",
+            getattr(hf_config, "num_attention_groups", None),
+        )
+        if num_kv_heads_cfg is None:
+            raise ValueError(
+                "Model config has neither 'num_key_value_heads' nor "
+                "'num_attention_groups'; cannot determine number of KV heads "
+                f"for {getattr(hf_config, 'architectures', hf_config)}"
+            )
+        num_head_k = max(1, num_kv_heads_cfg // get_tp_group().world_size)
         (
             (work_meta_data_size, work_meta_data_type),
             (work_indptr_size, work_indptr_type),
@@ -236,9 +247,18 @@ def set_aiter_persistent_worker_buffers(self, bs: int):
         config = self.model_runner.config
         hf_config = config.hf_config
         num_query_heads = self.num_attention_heads
-        num_kv_heads = max(
-            1, hf_config.num_key_value_heads // get_tp_group().world_size
+        num_kv_heads_cfg = getattr(
+            hf_config,
+            "num_key_value_heads",
+            getattr(hf_config, "num_attention_groups", None),
         )
+        if num_kv_heads_cfg is None:
+            raise ValueError(
+                "Model config has neither 'num_key_value_heads' nor "
+                "'num_attention_groups'; cannot determine number of KV heads "
+                f"for {getattr(hf_config, 'architectures', hf_config)}"
+            )
+        num_kv_heads = max(1, num_kv_heads_cfg // get_tp_group().world_size)
         block_size = self.block_size
 
         var = self.model_runner.forward_vars
@@ -884,9 +904,18 @@ def _set_ubatch_pa_buffers(self, padded_bs, max_q_len, ubatch_idx):
         config = self.model_runner.config
         hf_config = config.hf_config
         num_query_heads = self.num_attention_heads
-        num_kv_heads = max(
-            1, hf_config.num_key_value_heads // get_tp_group().world_size
+        num_kv_heads_cfg = getattr(
+            hf_config,
+            "num_key_value_heads",
+            getattr(hf_config, "num_attention_groups", None),
         )
+        if num_kv_heads_cfg is None:
+            raise ValueError(
+                "Model config has neither 'num_key_value_heads' nor "
+                "'num_attention_groups'; cannot determine number of KV heads "
+                f"for {getattr(hf_config, 'architectures', hf_config)}"
+            )
+        num_kv_heads = max(1, num_kv_heads_cfg // get_tp_group().world_size)
         p = f"ub{ubatch_idx}_"
         var = self.model_runner.forward_vars
 

diff --git a/atom/model_ops/layernorm.py b/atom/model_ops/layernorm.py
@@ -667,6 +667,16 @@ def forward_cuda(
         x: torch.Tensor,
         residual: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        # Contiguity guard (regression fix): the aiter HIP fused-kernel path below
+        # uses x.view(), which needs view-compatible strides. QK-norm feeds a
+        # non-contiguous GQA slice here (step3p5 torch.split -> reshape keeps the
+        # qkv row stride, e.g. (1280,128,1) for an 8x128 q), so x.view(-1, head_dim)
+        # raises "Cannot view a tensor ...". Non-contiguous inputs fall back to the
+        # native implementation; contiguous callers keep the fast HIP kernel.
+        # Under Dynamo, FakeTensor.is_contiguous() resolves to a concrete bool from
+        # static strides, so this short-circuits at trace time before the .view().
+        if not x.is_contiguous():
+            return self.forward_native(x, residual)
         # Use the aiter HIP fused_qk_rmsnorm_group_quant kernel in no-quant mode
         # (q_out_scale=None) to perform Gemma RMSNorm + optional residual add.
         # Same math as the Triton kernel: out = rmsnorm(x [+ residual]) * (1 + w),