From 5c615f5a6547196cbd5321edc08d8059009b6a39 Mon Sep 17 00:00:00 2001
From: kmc6042 <kmc6042@g.skku.edu>
Date: Wed, 24 Jun 2026 22:38:31 +0900
Subject: [PATCH] Support SSD streaming for Q4_K routed experts on ROCm

The ROCm streaming MoE paths were gated to the IQ2_XXS/Q2_K expert quant
pair, so Q4_K expert GGUFs failed prefill with "missing compact selected
experts" and could not run under --ssd-streaming at all.

Route Q4_K through the quant-agnostic machinery instead of the IQ2-only
selected/split kernels:

- Prefill: allow the full-layer streaming path for Q4_K. It stages a whole
  layer's expert table contiguously and runs the standard matmul, so use it
  for any multi-token prefill since Q4_K has no batched selected-gather kernel.
- Decode: route Q4_K through the shared-overlap selected-load path and force
  the selected-expert loader to build a full contiguous compact buffer, since
  the split decode kernels only exist for the IQ2_XXS/Q2_K pair.

Also speed up Q4_K streaming by warming the routed-expert cache from the
popularity hotlist:

- Implement the previously stubbed ROCm seed_experts() as a real bulk
  sequential preload into the resident cache, which is far cheaper than the
  scattered first-touch random reads it replaces. Read failures release the
  resident cache so partially-filled entries are never served as hits.
- Allow the hotlist/prefill cache seed for Q4_K layers, and warm the cache at
  the start of decode-style prefill so short prompts benefit too.

On an AMD Ryzen AI MAX+ 395 (Strix Halo, gfx1151) with the 153 GiB Q4_K
DeepSeek-V4-Flash GGUF and 123 GiB RAM, this takes the model from failing to
start to producing correct output, and the preload cuts decode cache misses
roughly in half.

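Escape hatches: set DS4_ROCM_DISABLE_Q4_SELECTED_SHARED_OVERLAP (any value,
e.g. =1; only its presence is checked) to turn off the Q4_K shared-overlap
decode route, plus the existing --ssd-streaming-cold /
DS4_METAL_DISABLE_STREAMING_EXPERT_HOTLIST.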

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 ds4.c                                | 131 +++++++++++++++++++++++----
 ds4_gpu.h                            |   1 +
 rocm/ds4_rocm_current_api_compat.cuh | 116 +++++++++++++++++++++++-
 rocm/ds4_rocm_runtime.cuh            |  11 ++-
 4 files changed, 238 insertions(+), 21 deletions(-)

diff --git a/ds4.c b/ds4.c
index 640511eb0..f5eb39c21 100644
--- a/ds4.c
+++ b/ds4.c
@@ -11697,18 +11697,40 @@ static bool rocm_graph_stream_prefill_full_layer_enabled(
         const ds4_gpu_graph      *g,
         const ds4_layer_weights  *layer,
         uint32_t                  n_tokens) {
-    return g &&
-           g->ssd_streaming &&
-           !g->quality &&
-           layer &&
-           n_tokens >= DS4_ROCM_STREAM_PREFILL_FULL_LAYER_MIN_TOKENS &&
-           DS4_N_EXPERT_USED == 6 &&
-           layer->ffn_gate_exps &&
-           layer->ffn_up_exps &&
-           layer->ffn_down_exps &&
-           layer->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS &&
-           layer->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS &&
-           layer->ffn_down_exps->type == DS4_TENSOR_Q2_K;
+    /* Only streaming, non-quality graphs with all three expert tensors. */
+    if (!g || !g->ssd_streaming || g->quality) return false;
+    if (!layer) return false;
+    if (DS4_N_EXPERT_USED != 6) return false;
+    if (!layer->ffn_gate_exps || !layer->ffn_up_exps || !layer->ffn_down_exps) {
+        return false;
+    }
+
+    /*
+     * Q4_K routed experts: no batched selected-expert streaming kernel exists
+     * for them (only the IQ2_XXS gate / Q2_K down pair has one), so the
+     * quant-agnostic full-layer load, which stages the layer's whole expert
+     * table contiguously for the standard matmul, is their sole multi-token
+     * streaming route.  Hence any prefill of two or more tokens qualifies.
+     */
+    if (layer->ffn_gate_exps->type == DS4_TENSOR_Q4_K &&
+        layer->ffn_up_exps->type == DS4_TENSOR_Q4_K &&
+        layer->ffn_down_exps->type == DS4_TENSOR_Q4_K) {
+        return n_tokens >= 2u;
+    }
+
+    /*
+     * IQ2_XXS gate/up with Q2_K down keeps the higher token threshold because
+     * its selected-expert path streams shorter prefills more cheaply.
+     */
+    if (layer->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS &&
+        layer->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS &&
+        layer->ffn_down_exps->type == DS4_TENSOR_Q2_K) {
+        const uint32_t min_tokens = DS4_ROCM_STREAM_PREFILL_FULL_LAYER_MIN_TOKENS;
+        return n_tokens >= min_tokens;
+    }
+
+    /* Any other expert quant mix is not served by the full-layer path. */
+    return false;
 }
 
 static uint32_t rocm_graph_stream_prefill_full_layer_seed_tokens(void) {
@@ -13611,6 +13633,33 @@ static bool metal_graph_use_q4_selected_shared_overlap(void) {
     return metal_graph_env_flag("DS4_METAL_Q4_SELECTED_OVERLAP_SHARED", &cache);
 }
 
+#ifdef DS4_ROCM_BUILD
+/*
+ * Decide whether a layer's Q4_K routed experts should decode through the
+ * shared-overlap selected-load path on ROCm.  The split decode kernels cover
+ * only the IQ2_XXS/Q2_K pair, so Q4_K relies on this path together with the
+ * force-contiguous selected cache.  Setting
+ * DS4_ROCM_DISABLE_Q4_SELECTED_SHARED_OVERLAP (any value) turns it off.
+ */
+static bool metal_graph_use_rocm_q4_selected_shared_overlap(
+        const ds4_gpu_graph     *g,
+        const ds4_layer_weights *layer) {
+    if (!g || !g->ssd_streaming || g->quality) return false;
+    if (!layer) return false;
+    if (!layer->ffn_gate_exps || !layer->ffn_up_exps || !layer->ffn_down_exps) {
+        return false;
+    }
+    if (layer->ffn_gate_exps->type != DS4_TENSOR_Q4_K ||
+        layer->ffn_up_exps->type != DS4_TENSOR_Q4_K ||
+        layer->ffn_down_exps->type != DS4_TENSOR_Q4_K) {
+        return false;
+    }
+    if (DS4_N_EXPERT_USED != 6 || DS4_N_EXPERT < 128) return false;
+    /* Presence of the variable alone disables the path. */
+    return getenv("DS4_ROCM_DISABLE_Q4_SELECTED_SHARED_OVERLAP") == NULL;
+}
+#endif
+
 static bool metal_graph_use_cuda_selected_shared_overlap(const ds4_gpu_graph *g) {
 #if !defined(DS4_ROCM_BUILD) && !defined(DS4_NO_GPU) && !defined(__APPLE__)
     return g &&
@@ -15615,6 +15664,18 @@ static bool metal_graph_encode_decode_layer(
     const bool cuda_selected_shared_overlap =
         metal_graph_use_cuda_selected_shared_overlap(g) &&
         metal_graph_decode_cuda_selected_slots_expected(g, layer);
+#ifdef DS4_ROCM_BUILD
+    const bool rocm_q4_selected_shared_overlap =
+        metal_graph_use_rocm_q4_selected_shared_overlap(g, layer);
+    /*
+     * Q4_K has no split decode kernel on ROCm, so force the selected-expert
+     * loader to build a full contiguous compact buffer for these layers.
+     */
+    ds4_gpu_stream_set_selected_force_contiguous(
+        rocm_q4_selected_shared_overlap ? 1 : 0);
+#else
+    const bool rocm_q4_selected_shared_overlap = false;
+#endif
     const bool overlap_selected_shared =
         ok &&
         !decode_stage_profile &&
@@ -15623,12 +15684,14 @@ static bool metal_graph_encode_decode_layer(
         getenv("DS4_MOE_REPLAY_SELECTED_IDS") == NULL &&
         (q4_selected_shared_overlap ||
          iq2_selected_shared_overlap ||
-         cuda_selected_shared_overlap);
+         cuda_selected_shared_overlap ||
+         rocm_q4_selected_shared_overlap);
     const bool async_selected_load =
         overlap_selected_shared &&
         ((iq2_selected_shared_overlap &&
           metal_graph_use_iq2_selected_async_load(g)) ||
-         cuda_selected_shared_overlap);
+         cuda_selected_shared_overlap ||
+         rocm_q4_selected_shared_overlap);
     const bool selected_readahead_shared_delay =
         ok &&
         !overlap_selected_shared &&
@@ -19567,6 +19630,30 @@ static bool metal_graph_use_streaming_decode_prefill_range(
     return metal_graph_use_streaming_decode_prefill(g, weights, n_tokens);
 }
 
+/* Reports whether the streaming selected-expert cache handles this layer's
+ * routed-expert quant: the IQ2 check always applies, the Q4 check only on ROCm. */
+static bool metal_graph_decode_streaming_selected_slots_expected(
+        const ds4_gpu_graph     *g,
+        const ds4_layer_weights *layer) {
+    bool expected = metal_graph_decode_iq2_selected_slots_expected(g, layer);
+#ifdef DS4_ROCM_BUILD
+    if (!expected && layer && layer->ffn_gate_exps && layer->ffn_down_exps) {
+        /* The Q4 probe takes the gate and down tensor byte sizes. */
+        expected = metal_graph_decode_q4_selected_slots_expected(
+                g,
+                layer,
+                layer->ffn_gate_exps->bytes,
+                layer->ffn_down_exps->bytes);
+    }
+#endif
+    return expected;
+}
+
+static bool metal_graph_seed_streaming_expert_cache_from_hotlist(
+        ds4_gpu_graph     *g,
+        const ds4_model   *model,
+        const ds4_weights *weights);
+
 static bool metal_graph_prefill_decode_streaming_range(
         ds4_gpu_graph *g,
         const ds4_model       *model,
@@ -19588,6 +19675,16 @@ static bool metal_graph_prefill_decode_streaming_range(
         n_tokens > (uint32_t)prompt->len - start) return false;
     if (start == 0) {
         ds4_gpu_stream_expert_cache_reset_route_hotness();
+        /*
+         * Warm the routed-expert cache from the popularity hotlist before the
+         * decode-style prefill loads experts one token at a time.  The layer-major
+         * path seeds at its tail, but this short-prompt path otherwise starts cold;
+         * a bulk warm-up is far cheaper than scattered first-touch reads.
+         * Idempotent across turns.  A seeding failure fails this prefill range.
+         */
+        if (!metal_graph_seed_streaming_expert_cache_from_hotlist(g, model, weights)) {
+            return false;
+        }
     }
 
     const bool profile = getenv("DS4_METAL_GRAPH_PREFILL_PROFILE") != NULL;
@@ -19710,7 +19807,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_prefill(
     uint32_t seeded_rows = 0;
     for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
         const ds4_layer_weights *layer = &weights->layer[il];
-        if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue;
+        if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue;
 
         const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps);
         const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps);
@@ -19762,7 +19859,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_hotlist(
     uint32_t cache_budget = 0;
     for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
         const ds4_layer_weights *layer = &weights->layer[il];
-        if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue;
+        if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue;
 
         const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps);
         const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps);
@@ -19835,7 +19932,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_hotlist(
         const uint32_t n = counts[il];
         if (n == 0) continue;
         const ds4_layer_weights *layer = &weights->layer[il];
-        if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue;
+        if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue;
 
         const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps);
         const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps);
diff --git a/ds4_gpu.h b/ds4_gpu.h
index b58aca9bd..021539147 100644
--- a/ds4_gpu.h
+++ b/ds4_gpu.h
@@ -74,6 +74,7 @@ void ds4_gpu_set_quality(bool quality);
 void ds4_gpu_set_ssd_streaming(bool enabled);
 void ds4_gpu_set_streaming_expert_cache_budget(uint32_t experts);
 void ds4_gpu_set_streaming_expert_cache_expert_bytes(uint64_t bytes);
+void ds4_gpu_stream_set_selected_force_contiguous(int enabled);
 uint64_t ds4_gpu_recommended_working_set_size(void);
 uint32_t ds4_gpu_stream_expert_cache_configured_count(void);
 uint32_t ds4_gpu_stream_expert_cache_current_count(void);
diff --git a/rocm/ds4_rocm_current_api_compat.cuh b/rocm/ds4_rocm_current_api_compat.cuh
index 3fa7a0c29..a5322527b 100644
--- a/rocm/ds4_rocm_current_api_compat.cuh
+++ b/rocm/ds4_rocm_current_api_compat.cuh
@@ -174,6 +174,10 @@ extern "C" void ds4_gpu_set_streaming_expert_cache_budget(uint32_t experts) {
     g_stream_expert_cache_budget = experts;
 }
 
+extern "C" void ds4_gpu_stream_set_selected_force_contiguous(int enabled) {
+    g_stream_selected_force_contiguous = (enabled != 0);  /* stored as 0 or 1 */
+}
+
 extern "C" void ds4_gpu_set_streaming_expert_cache_expert_bytes(uint64_t bytes) {
     (void)bytes;
 }
@@ -322,15 +326,121 @@ extern "C" int ds4_gpu_stream_expert_cache_release_layer_cache(void) {
     return 1;
 }
 
+/*
+ * Popularity-based cache warm-up.  Preload the listed experts of one layer into
+ * the resident cache with batched file reads instead of leaving them to lazy
+ * first-touch loads during decode.  Best-effort and always reports success:
+ * invalid or already-resident ids are skipped, seeding stops when the cache is
+ * full, and a failed read/upload drops the whole resident cache for refill.
+ */
 extern "C" int ds4_gpu_stream_expert_cache_seed_experts(
         const ds4_gpu_stream_expert_table *table,
         const int32_t                     *expert_ids,
         const uint32_t                    *expert_priorities,
         uint32_t                           n_experts) {
-    (void)table;
-    (void)expert_ids;
     (void)expert_priorities;
-    (void)n_experts;
+    if (!g_ssd_streaming_mode || !table || !expert_ids || n_experts == 0) return 1;
+
+    const void *model_map = table->model_map;
+    const uint64_t model_size = table->model_size;
+    const uint32_t layer = table->layer;
+    const uint32_t n_total_expert = table->n_total_expert;
+    const uint64_t gate_offset = table->gate_offset;
+    const uint64_t up_offset = table->up_offset;
+    const uint64_t down_offset = table->down_offset;
+    const uint64_t gate_expert_bytes = table->gate_expert_bytes;
+    const uint64_t down_expert_bytes = table->down_expert_bytes;
+    if (n_total_expert == 0 || gate_expert_bytes == 0 || down_expert_bytes == 0) {
+        return 1;
+    }
+    /* Warm up only when reads go through the model file descriptor. */
+    const int use_fd = g_model_fd >= 0 &&
+        (g_model_fd_host_base == NULL || model_map == g_model_fd_host_base);
+    if (!use_fd || !cuda_stream_selected_ensure_stream()) return 1;
+
+    cuda_stream_read_job *jobs = (cuda_stream_read_job *)calloc(
+            DS4_ROCM_STREAM_READ_MAX_JOBS, sizeof(jobs[0]));
+    if (!jobs) return 1;  /* no scratch space: skip the warm-up */
+    uint32_t job_count = 0;
+
+    for (uint32_t i = 0; i < n_experts; i++) {
+        /* Stop once the resident cache holds the configured budget (a budget
+         * of 0 disables this cap); the rest is left to on-demand loading. */
+        if (g_stream_expert_cache_budget != 0 &&
+            g_stream_resident_experts.size() >= g_stream_expert_cache_budget) {
+            break;
+        }
+        const int32_t expert_id = expert_ids[i];
+        if (expert_id < 0 || (uint32_t)expert_id >= n_total_expert) continue;
+        if (cuda_stream_resident_find(model_map, layer, expert_id,
+                                      gate_offset, up_offset, down_offset,
+                                      gate_expert_bytes, down_expert_bytes) >= 0) {
+            continue;  /* already resident */
+        }
+
+        const uint64_t expert = (uint64_t)(uint32_t)expert_id;
+        uint64_t gate_rel = 0;
+        uint64_t down_rel = 0;
+        if (!cuda_u64_mul_checked(expert, gate_expert_bytes, &gate_rel) ||
+            !cuda_u64_mul_checked(expert, down_expert_bytes, &down_rel) ||
+            gate_offset > model_size ||
+            up_offset > model_size ||
+            down_offset > model_size ||
+            gate_rel > model_size - gate_offset ||
+            gate_rel > model_size - up_offset ||  /* up reuses gate's sizes */
+            down_rel > model_size - down_offset ||
+            gate_expert_bytes > model_size - gate_offset - gate_rel ||
+            gate_expert_bytes > model_size - up_offset - gate_rel ||
+            down_expert_bytes > model_size - down_offset - down_rel) {
+            continue;  /* expert slice would not fit inside the model */
+        }
+
+        const int idx = cuda_stream_resident_alloc(model_map, layer, expert_id,
+                                                   expert_ids, n_experts,
+                                                   gate_offset, up_offset, down_offset,
+                                                   gate_expert_bytes, down_expert_bytes);
+        if (idx < 0) break;  /* no entry (budget or device memory): stop seeding */
+        cuda_stream_resident_expert &entry = g_stream_resident_experts[(size_t)idx];
+
+        if (job_count + 3u > DS4_ROCM_STREAM_READ_MAX_JOBS) {  /* 3 jobs per expert */
+            const int flushed =
+                cuda_stream_read_jobs_parallel(jobs, job_count) &&
+                cuda_stream_selected_upload_read_jobs(jobs, job_count);
+            cuda_stream_read_jobs_free(jobs, job_count);
+            job_count = 0;
+            if (!flushed) {
+                /* Unfilled resident entries would later be served as cache hits
+                 * with garbage data; drop the resident cache so it refills
+                 * correctly on demand. */
+                cuda_stream_resident_cache_release();
+                free(jobs);
+                return 1;
+            }
+        }
+        jobs[job_count++] = {entry.gate, gate_offset + gate_rel, gate_expert_bytes,
+                             NULL, NULL, 0, 0, 0};
+        jobs[job_count++] = {entry.up, up_offset + gate_rel, gate_expert_bytes,
+                             NULL, NULL, 0, 0, 0};
+        jobs[job_count++] = {entry.down, down_offset + down_rel, down_expert_bytes,
+                             NULL, NULL, 0, 0, 0};
+    }
+
+    if (job_count != 0) {
+        const int flushed =
+            cuda_stream_read_jobs_parallel(jobs, job_count) &&
+            cuda_stream_selected_upload_read_jobs(jobs, job_count);
+        cuda_stream_read_jobs_free(jobs, job_count);
+        if (!flushed) {
+            cuda_stream_resident_cache_release();
+            free(jobs);
+            return 1;
+        }
+    }
+    if (cuda_stream_cache_stats_on()) {
+        g_stream_cache_stats.seed_calls++;
+        g_stream_cache_stats.seed_unique += n_experts;  /* NOTE(review): requested ids, not loaded */
+    }
+    free(jobs);
     return 1;
 }
 
diff --git a/rocm/ds4_rocm_runtime.cuh b/rocm/ds4_rocm_runtime.cuh
index 3bd786f8e..9e081ccfd 100644
--- a/rocm/ds4_rocm_runtime.cuh
+++ b/rocm/ds4_rocm_runtime.cuh
@@ -253,6 +253,14 @@ static cuda_stream_cache_stats g_stream_cache_stats;
 static int g_stream_cache_stats_enabled = -1;
 static int32_t g_routed_moe_selected_override[DS4_ROCM_N_EXPERT_USED];
 static uint32_t g_routed_moe_selected_override_n;
+/*
+ * When set, the single-token selected-expert loader always materializes a full
+ * contiguous compact expert buffer instead of deferring a mixed
+ * resident/missing set to the async split path.  The split decode kernels only
+ * exist for the IQ2_XXS/Q2_K quant pair, so Q4_K experts must use the
+ * contiguous path that feeds the standard Q4_K matmul.
+ */
+static int g_stream_selected_force_contiguous;
 static uint64_t g_stream_selected_stage_counter;
 static cudaEvent_t g_stream_selected_reuse_event;
 static int g_stream_selected_reuse_event_pending;
@@ -3057,7 +3065,8 @@ static int cuda_stream_selected_load(
         }
     }
 
-    if (use_fd && read_job_count != 0 && resident_mask != 0) {
+    if (use_fd && read_job_count != 0 && resident_mask != 0 &&
+        !g_stream_selected_force_contiguous) {
         g_stream_selected_pending.active = 1;
         g_stream_selected_pending.model_map = model_map;
         g_stream_selected_pending.layer = layer;