From 5c615f5a6547196cbd5321edc08d8059009b6a39 Mon Sep 17 00:00:00 2001 From: kmc6042 Date: Wed, 24 Jun 2026 22:38:31 +0900 Subject: [PATCH] Support SSD streaming for Q4_K routed experts on ROCm The ROCm streaming MoE paths were gated to the IQ2_XXS/Q2_K expert quant pair, so Q4_K expert GGUFs failed prefill with "missing compact selected experts" and could not run under --ssd-streaming at all. Route Q4_K through the quant-agnostic machinery instead of the IQ2-only selected/split kernels: - Prefill: allow the full-layer streaming path for Q4_K. It stages a whole layer's expert table contiguously and runs the standard matmul, so use it for any multi-token prefill since Q4_K has no batched selected-gather kernel. - Decode: route Q4_K through the shared-overlap selected-load path and force the selected-expert loader to build a full contiguous compact buffer, since the split decode kernels only exist for the IQ2_XXS/Q2_K pair. Also speed up Q4_K streaming by warming the routed-expert cache from the popularity hotlist: - Implement the previously stubbed ROCm seed_experts() as a real bulk sequential preload into the resident cache, which is far cheaper than the scattered first-touch random reads it replaces. Read failures release the resident cache so partially-filled entries are never served as hits. - Allow the hotlist/prefill cache seed for Q4_K layers, and warm the cache at the start of decode-style prefill so short prompts benefit too. On an AMD Ryzen AI MAX+ 395 (Strix Halo, gfx1151) with the 153 GiB Q4_K DeepSeek-V4-Flash GGUF and 123 GiB RAM, this takes the model from failing to start to producing correct output, and the preload cuts decode cache misses roughly in half. Escape hatches: DS4_ROCM_DISABLE_Q4_SELECTED_SHARED_OVERLAP=1 plus the existing --ssd-streaming-cold / DS4_METAL_DISABLE_STREAMING_EXPERT_HOTLIST. 
Co-Authored-By: Claude Opus 4.8 --- ds4.c | 131 +++++++++++++++++++++++---- ds4_gpu.h | 1 + rocm/ds4_rocm_current_api_compat.cuh | 116 +++++++++++++++++++++++- rocm/ds4_rocm_runtime.cuh | 11 ++- 4 files changed, 238 insertions(+), 21 deletions(-) diff --git a/ds4.c b/ds4.c index 640511eb0..f5eb39c21 100644 --- a/ds4.c +++ b/ds4.c @@ -11697,18 +11697,40 @@ static bool rocm_graph_stream_prefill_full_layer_enabled( const ds4_gpu_graph *g, const ds4_layer_weights *layer, uint32_t n_tokens) { - return g && - g->ssd_streaming && - !g->quality && - layer && - n_tokens >= DS4_ROCM_STREAM_PREFILL_FULL_LAYER_MIN_TOKENS && - DS4_N_EXPERT_USED == 6 && - layer->ffn_gate_exps && - layer->ffn_up_exps && - layer->ffn_down_exps && - layer->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS && - layer->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS && - layer->ffn_down_exps->type == DS4_TENSOR_Q2_K; + if (!g || + !g->ssd_streaming || + g->quality || + !layer || + DS4_N_EXPERT_USED != 6 || + !layer->ffn_gate_exps || + !layer->ffn_up_exps || + !layer->ffn_down_exps) { + return false; + } + const bool iq2_experts = + layer->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS && + layer->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS && + layer->ffn_down_exps->type == DS4_TENSOR_Q2_K; + /* + * Q4_K routed experts have no batched selected-expert streaming kernels + * (those exist only for the IQ2_XXS gate / Q2_K down quant pair). The + * full-layer load path is quant-agnostic: it stages the whole layer's + * expert table into a contiguous buffer and runs the standard matmul, which + * already supports Q4_K. It is therefore the only multi-token streaming + * route for Q4_K, so enable it for any multi-token prefill. IQ2 keeps the + * higher token threshold because its selected-expert path streams shorter + * prefills more cheaply. + * Returns true only once n_tokens reaches that per-quant minimum: 2 for + * Q4_K, DS4_ROCM_STREAM_PREFILL_FULL_LAYER_MIN_TOKENS for IQ2_XXS/Q2_K; any + * other expert quant combination returns false. 
+ */ + const bool q4k_experts = + layer->ffn_gate_exps->type == DS4_TENSOR_Q4_K && + layer->ffn_up_exps->type == DS4_TENSOR_Q4_K && + layer->ffn_down_exps->type == DS4_TENSOR_Q4_K; + if (!iq2_experts && !q4k_experts) { + return false; + } + const uint32_t min_tokens = + q4k_experts ? 2u : DS4_ROCM_STREAM_PREFILL_FULL_LAYER_MIN_TOKENS; + return n_tokens >= min_tokens; } static uint32_t rocm_graph_stream_prefill_full_layer_seed_tokens(void) { @@ -13611,6 +13633,33 @@ static bool metal_graph_use_q4_selected_shared_overlap(void) { return metal_graph_env_flag("DS4_METAL_Q4_SELECTED_OVERLAP_SHARED", &cache); } +#ifdef DS4_ROCM_BUILD +/* + * ROCm has no batched/split decode kernels for Q4_K routed experts (those exist + * only for the IQ2_XXS/Q2_K pair). Route Q4_K streaming decode through the same + * shared-overlap selected-load path IQ2 uses; the loader is quant-agnostic and, + * combined with the force-contiguous selected cache, feeds the standard Q4_K + * decode matmul. This is what makes Q4_K usable under SSD streaming on ROCm. 
+ */ +static bool metal_graph_use_rocm_q4_selected_shared_overlap( + const ds4_gpu_graph *g, + const ds4_layer_weights *layer) { + return g && + g->ssd_streaming && + !g->quality && + layer && + layer->ffn_gate_exps && + layer->ffn_up_exps && + layer->ffn_down_exps && + layer->ffn_gate_exps->type == DS4_TENSOR_Q4_K && + layer->ffn_up_exps->type == DS4_TENSOR_Q4_K && + layer->ffn_down_exps->type == DS4_TENSOR_Q4_K && + DS4_N_EXPERT_USED == 6 && + DS4_N_EXPERT >= 128 && + getenv("DS4_ROCM_DISABLE_Q4_SELECTED_SHARED_OVERLAP") == NULL; /* presence is what counts: setting the variable to any value, even an empty one, disables this path */ +} +#endif + static bool metal_graph_use_cuda_selected_shared_overlap(const ds4_gpu_graph *g) { #if !defined(DS4_ROCM_BUILD) && !defined(DS4_NO_GPU) && !defined(__APPLE__) return g && @@ -15615,6 +15664,18 @@ static bool metal_graph_encode_decode_layer( const bool cuda_selected_shared_overlap = metal_graph_use_cuda_selected_shared_overlap(g) && metal_graph_decode_cuda_selected_slots_expected(g, layer); +#ifdef DS4_ROCM_BUILD + const bool rocm_q4_selected_shared_overlap = + metal_graph_use_rocm_q4_selected_shared_overlap(g, layer); + /* + * Q4_K has no split decode kernel on ROCm, so force the selected-expert + * loader to build a full contiguous compact buffer for these layers. + * The flag is rewritten on every call, so a layer that fails the Q4_K + * check above resets it to 0. + */ + ds4_gpu_stream_set_selected_force_contiguous( + rocm_q4_selected_shared_overlap ? 
1 : 0); +#else + const bool rocm_q4_selected_shared_overlap = false; +#endif const bool overlap_selected_shared = ok && !decode_stage_profile && @@ -15623,12 +15684,14 @@ static bool metal_graph_encode_decode_layer( getenv("DS4_MOE_REPLAY_SELECTED_IDS") == NULL && (q4_selected_shared_overlap || iq2_selected_shared_overlap || - cuda_selected_shared_overlap); + cuda_selected_shared_overlap || + rocm_q4_selected_shared_overlap); const bool async_selected_load = overlap_selected_shared && ((iq2_selected_shared_overlap && metal_graph_use_iq2_selected_async_load(g)) || - cuda_selected_shared_overlap); + cuda_selected_shared_overlap || + rocm_q4_selected_shared_overlap); const bool selected_readahead_shared_delay = ok && !overlap_selected_shared && @@ -19567,6 +19630,30 @@ static bool metal_graph_use_streaming_decode_prefill_range( return metal_graph_use_streaming_decode_prefill(g, weights, n_tokens); } +/* True when a layer's routed experts use a quant that the streaming + * selected-expert cache can serve (IQ2_XXS/Q2_K everywhere; Q4_K on ROCm). 
+ */ +static bool metal_graph_decode_streaming_selected_slots_expected( + const ds4_gpu_graph *g, + const ds4_layer_weights *layer) { + if (metal_graph_decode_iq2_selected_slots_expected(g, layer)) return true; /* checked on every build, outside the ROCm-only section */ +#ifdef DS4_ROCM_BUILD + if (layer && layer->ffn_gate_exps && layer->ffn_down_exps && + metal_graph_decode_q4_selected_slots_expected( + g, + layer, + layer->ffn_gate_exps->bytes, + layer->ffn_down_exps->bytes)) { + return true; + } +#endif + return false; +} + +static bool metal_graph_seed_streaming_expert_cache_from_hotlist( + ds4_gpu_graph *g, + const ds4_model *model, + const ds4_weights *weights); /* forward declaration: the function is called below, ahead of its definition later in this file */ + static bool metal_graph_prefill_decode_streaming_range( ds4_gpu_graph *g, const ds4_model *model, @@ -19588,6 +19675,16 @@ static bool metal_graph_prefill_decode_streaming_range( n_tokens > (uint32_t)prompt->len - start) return false; if (start == 0) { ds4_gpu_stream_expert_cache_reset_route_hotness(); + /* + * Warm the routed-expert cache with the popularity hotlist before the + * decode-style prefill loads experts one token at a time. The layer-major + * prefill path does this at its tail, but the short-prompt decode path + * otherwise starts cold; a bulk sequential warm-up is far cheaper than the + * scattered first-touch reads it replaces. Idempotent across turns. + * NOTE(review): a false return from the seed call makes this prefill + * return false as well - confirm that a cache warm-up failure is meant + * to be fatal here rather than ignored. 
+ */ + if (!metal_graph_seed_streaming_expert_cache_from_hotlist(g, model, weights)) { + return false; + } } const bool profile = getenv("DS4_METAL_GRAPH_PREFILL_PROFILE") != NULL; @@ -19710,7 +19807,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_prefill( uint32_t seeded_rows = 0; for (uint32_t il = 0; il < DS4_N_LAYER; il++) { const ds4_layer_weights *layer = &weights->layer[il]; - if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue; + if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue; const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps); const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps); @@ -19762,7 +19859,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_hotlist( uint32_t cache_budget = 0; for (uint32_t il = 0; il < DS4_N_LAYER; il++) { const ds4_layer_weights *layer = &weights->layer[il]; - if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue; + if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue; const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps); const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps); @@ -19835,7 +19932,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_hotlist( const uint32_t n = counts[il]; if (n == 0) continue; const ds4_layer_weights *layer = &weights->layer[il]; - if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue; + if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue; const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps); const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps); diff --git a/ds4_gpu.h b/ds4_gpu.h index b58aca9bd..021539147 100644 --- a/ds4_gpu.h +++ b/ds4_gpu.h @@ -74,6 +74,7 @@ void ds4_gpu_set_quality(bool quality); void ds4_gpu_set_ssd_streaming(bool enabled); void 
ds4_gpu_set_streaming_expert_cache_budget(uint32_t experts); void ds4_gpu_set_streaming_expert_cache_expert_bytes(uint64_t bytes); +void ds4_gpu_stream_set_selected_force_contiguous(int enabled); uint64_t ds4_gpu_recommended_working_set_size(void); uint32_t ds4_gpu_stream_expert_cache_configured_count(void); uint32_t ds4_gpu_stream_expert_cache_current_count(void); diff --git a/rocm/ds4_rocm_current_api_compat.cuh b/rocm/ds4_rocm_current_api_compat.cuh index 3fa7a0c29..a5322527b 100644 --- a/rocm/ds4_rocm_current_api_compat.cuh +++ b/rocm/ds4_rocm_current_api_compat.cuh @@ -174,6 +174,10 @@ extern "C" void ds4_gpu_set_streaming_expert_cache_budget(uint32_t experts) { g_stream_expert_cache_budget = experts; } +extern "C" void ds4_gpu_stream_set_selected_force_contiguous(int enabled) { + g_stream_selected_force_contiguous = enabled ? 1 : 0; /* any non-zero argument is stored as 1 */ +} + extern "C" void ds4_gpu_set_streaming_expert_cache_expert_bytes(uint64_t bytes) { (void)bytes; } @@ -322,15 +326,121 @@ extern "C" int ds4_gpu_stream_expert_cache_release_layer_cache(void) { return 1; } +/* + * Popularity-based cache warm-up. Bulk-load the most popular experts of one + * layer into the resident cache via large sequential file reads, which are far + * cheaper than the scattered per-expert random reads that otherwise happen lazily + * on first-touch during decode. Best-effort: an expert whose id or byte range + * fails validation is skipped and left to be streamed on demand, and a failed + * bulk read or upload releases the resident cache instead of keeping unfilled + * entries. Every path returns 1, so callers always see success. 
+ */ extern "C" int ds4_gpu_stream_expert_cache_seed_experts( const ds4_gpu_stream_expert_table *table, const int32_t *expert_ids, const uint32_t *expert_priorities, uint32_t n_experts) { - (void)table; - (void)expert_ids; (void)expert_priorities; /* priorities are not consulted; ids are tried in array order */ - (void)n_experts; + if (!g_ssd_streaming_mode || !table || !expert_ids || n_experts == 0) return 1; + + const void *model_map = table->model_map; + const uint64_t model_size = table->model_size; + const uint32_t layer = table->layer; + const uint32_t n_total_expert = table->n_total_expert; + const uint64_t gate_offset = table->gate_offset; + const uint64_t up_offset = table->up_offset; + const uint64_t down_offset = table->down_offset; + const uint64_t gate_expert_bytes = table->gate_expert_bytes; + const uint64_t down_expert_bytes = table->down_expert_bytes; + if (n_total_expert == 0 || gate_expert_bytes == 0 || down_expert_bytes == 0) { + return 1; + } + /* Only the file-backed streaming path benefits from a bulk warm-up. */ + const int use_fd = g_model_fd >= 0 && + (g_model_fd_host_base == NULL || model_map == g_model_fd_host_base); + if (!use_fd || !cuda_stream_selected_ensure_stream()) return 1; + + cuda_stream_read_job *jobs = (cuda_stream_read_job *)calloc( + DS4_ROCM_STREAM_READ_MAX_JOBS, sizeof(jobs[0])); + if (!jobs) return 1; /* allocation failure is treated like any other skipped warm-up */ + uint32_t job_count = 0; + + for (uint32_t i = 0; i < n_experts; i++) { + /* Stop once the configured cache is full; let warm decode misses + * compete for the remaining slots through the normal LRU path. 
+ */ + if (g_stream_expert_cache_budget != 0 && + g_stream_resident_experts.size() >= g_stream_expert_cache_budget) { + break; + } + const int32_t expert_id = expert_ids[i]; + if (expert_id < 0 || (uint32_t)expert_id >= n_total_expert) continue; + if (cuda_stream_resident_find(model_map, layer, expert_id, + gate_offset, up_offset, down_offset, + gate_expert_bytes, down_expert_bytes) >= 0) { + continue; + } + + const uint64_t expert = (uint64_t)(uint32_t)expert_id; + uint64_t gate_rel = 0; /* expert_id * gate_expert_bytes; the up read below reuses this offset and size */ + uint64_t down_rel = 0; + if (!cuda_u64_mul_checked(expert, gate_expert_bytes, &gate_rel) || + !cuda_u64_mul_checked(expert, down_expert_bytes, &down_rel) || + gate_offset > model_size || + up_offset > model_size || + down_offset > model_size || + gate_rel > model_size - gate_offset || + gate_rel > model_size - up_offset || + down_rel > model_size - down_offset || + gate_expert_bytes > model_size - gate_offset - gate_rel || + gate_expert_bytes > model_size - up_offset - gate_rel || + down_expert_bytes > model_size - down_offset - down_rel) { + continue; + } + + const int idx = cuda_stream_resident_alloc(model_map, layer, expert_id, + expert_ids, n_experts, + gate_offset, up_offset, down_offset, + gate_expert_bytes, down_expert_bytes); + if (idx < 0) break; /* out of cache budget or device memory */ + cuda_stream_resident_expert &entry = g_stream_resident_experts[(size_t)idx]; + + if (job_count + 3u > DS4_ROCM_STREAM_READ_MAX_JOBS) { /* each expert queues three reads (gate, up, down); flush the batch first when they would not fit */ + const int flushed = + cuda_stream_read_jobs_parallel(jobs, job_count) && + cuda_stream_selected_upload_read_jobs(jobs, job_count); + cuda_stream_read_jobs_free(jobs, job_count); + job_count = 0; + if (!flushed) { + /* Unfilled resident entries would later be served as cache hits + * with garbage data; drop the resident cache so it refills + * correctly on demand. 
+ */ + cuda_stream_resident_cache_release(); + free(jobs); + return 1; + } + } + jobs[job_count++] = {entry.gate, gate_offset + gate_rel, gate_expert_bytes, + NULL, NULL, 0, 0, 0}; + jobs[job_count++] = {entry.up, up_offset + gate_rel, gate_expert_bytes, + NULL, NULL, 0, 0, 0}; + jobs[job_count++] = {entry.down, down_offset + down_rel, down_expert_bytes, + NULL, NULL, 0, 0, 0}; + } + + if (job_count != 0) { + const int flushed = + cuda_stream_read_jobs_parallel(jobs, job_count) && + cuda_stream_selected_upload_read_jobs(jobs, job_count); + cuda_stream_read_jobs_free(jobs, job_count); + if (!flushed) { + cuda_stream_resident_cache_release(); + free(jobs); + return 1; + } + } + if (cuda_stream_cache_stats_on()) { + g_stream_cache_stats.seed_calls++; + g_stream_cache_stats.seed_unique += n_experts; /* NOTE(review): adds the requested id count, including ids that were skipped, already resident, or cut off by the budget break - confirm this is the intended meaning of seed_unique */ + } + free(jobs); return 1; } diff --git a/rocm/ds4_rocm_runtime.cuh b/rocm/ds4_rocm_runtime.cuh index 3bd786f8e..9e081ccfd 100644 --- a/rocm/ds4_rocm_runtime.cuh +++ b/rocm/ds4_rocm_runtime.cuh @@ -253,6 +253,14 @@ static cuda_stream_cache_stats g_stream_cache_stats; static int g_stream_cache_stats_enabled = -1; static int32_t g_routed_moe_selected_override[DS4_ROCM_N_EXPERT_USED]; static uint32_t g_routed_moe_selected_override_n; +/* + * When set, the single-token selected-expert loader always materializes a full + * contiguous compact expert buffer instead of deferring a mixed + * resident/missing set to the async split path. The split decode kernels only + * exist for the IQ2_XXS/Q2_K quant pair, so Q4_K experts must use the + * contiguous path that feeds the standard Q4_K matmul. 
+ */ +static int g_stream_selected_force_contiguous; static uint64_t g_stream_selected_stage_counter; static cudaEvent_t g_stream_selected_reuse_event; static int g_stream_selected_reuse_event_pending; @@ -3057,7 +3065,8 @@ static int cuda_stream_selected_load( } } - if (use_fd && read_job_count != 0 && resident_mask != 0) { + if (use_fd && read_job_count != 0 && resident_mask != 0 && + !g_stream_selected_force_contiguous) { g_stream_selected_pending.active = 1; g_stream_selected_pending.model_map = model_map; g_stream_selected_pending.layer = layer;