Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 114 additions & 17 deletions ds4.c
Original file line number Diff line number Diff line change
Expand Up @@ -11697,18 +11697,40 @@ static bool rocm_graph_stream_prefill_full_layer_enabled(
const ds4_gpu_graph *g,
const ds4_layer_weights *layer,
uint32_t n_tokens) {
return g &&
g->ssd_streaming &&
!g->quality &&
layer &&
n_tokens >= DS4_ROCM_STREAM_PREFILL_FULL_LAYER_MIN_TOKENS &&
DS4_N_EXPERT_USED == 6 &&
layer->ffn_gate_exps &&
layer->ffn_up_exps &&
layer->ffn_down_exps &&
layer->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS &&
layer->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS &&
layer->ffn_down_exps->type == DS4_TENSOR_Q2_K;
if (!g ||
!g->ssd_streaming ||
g->quality ||
!layer ||
DS4_N_EXPERT_USED != 6 ||
!layer->ffn_gate_exps ||
!layer->ffn_up_exps ||
!layer->ffn_down_exps) {
return false;
}
const bool iq2_experts =
layer->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS &&
layer->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS &&
layer->ffn_down_exps->type == DS4_TENSOR_Q2_K;
/*
* Q4_K routed experts have no batched selected-expert streaming kernels
* (those exist only for the IQ2_XXS gate / Q2_K down quant pair). The
* full-layer load path is quant-agnostic: it stages the whole layer's
* expert table into a contiguous buffer and runs the standard matmul, which
* already supports Q4_K. It is therefore the only multi-token streaming
* route for Q4_K, so enable it for any multi-token prefill. IQ2 keeps the
* higher token threshold because its selected-expert path streams shorter
* prefills more cheaply.
*/
const bool q4k_experts =
layer->ffn_gate_exps->type == DS4_TENSOR_Q4_K &&
layer->ffn_up_exps->type == DS4_TENSOR_Q4_K &&
layer->ffn_down_exps->type == DS4_TENSOR_Q4_K;
if (!iq2_experts && !q4k_experts) {
return false;
}
const uint32_t min_tokens =
q4k_experts ? 2u : DS4_ROCM_STREAM_PREFILL_FULL_LAYER_MIN_TOKENS;
return n_tokens >= min_tokens;
}

static uint32_t rocm_graph_stream_prefill_full_layer_seed_tokens(void) {
Expand Down Expand Up @@ -13611,6 +13633,33 @@ static bool metal_graph_use_q4_selected_shared_overlap(void) {
return metal_graph_env_flag("DS4_METAL_Q4_SELECTED_OVERLAP_SHARED", &cache);
}

#ifdef DS4_ROCM_BUILD
/*
 * Decides whether a layer's Q4_K routed experts should take the shared-overlap
 * selected-load route during ROCm streaming decode.
 *
 * ROCm has no batched/split decode kernels for Q4_K routed experts (those exist
 * only for the IQ2_XXS/Q2_K pair), so Q4_K streaming decode is routed through
 * the same shared-overlap selected-load path IQ2 uses. The loader is
 * quant-agnostic and, combined with the force-contiguous selected cache, feeds
 * the standard Q4_K decode matmul.
 *
 * Returns true only when all of the following hold:
 *   - g is non-NULL, has ssd_streaming set, and does not have quality set;
 *   - layer is non-NULL and its gate/up/down routed-expert tensors are present;
 *   - all three tensors are DS4_TENSOR_Q4_K;
 *   - DS4_N_EXPERT_USED == 6 and DS4_N_EXPERT >= 128;
 *   - the DS4_ROCM_DISABLE_Q4_SELECTED_SHARED_OVERLAP environment variable is
 *     not set (any value, including an empty one, disables the route).
 */
static bool metal_graph_use_rocm_q4_selected_shared_overlap(
        const ds4_gpu_graph *g,
        const ds4_layer_weights *layer) {
    /* Graph-level gates: streaming on, quality mode off. */
    if (!g || !g->ssd_streaming || g->quality) {
        return false;
    }

    /* The tensors must exist before their types can be inspected. */
    if (!layer ||
        !layer->ffn_gate_exps ||
        !layer->ffn_up_exps ||
        !layer->ffn_down_exps) {
        return false;
    }

    const bool all_q4k =
        layer->ffn_gate_exps->type == DS4_TENSOR_Q4_K &&
        layer->ffn_up_exps->type == DS4_TENSOR_Q4_K &&
        layer->ffn_down_exps->type == DS4_TENSOR_Q4_K;
    if (!all_q4k) {
        return false;
    }

    /* Compile-time model-shape requirements. */
    if (DS4_N_EXPERT_USED != 6 || DS4_N_EXPERT < 128) {
        return false;
    }

    /* Opt-out switch: the mere presence of the variable disables the route. */
    return getenv("DS4_ROCM_DISABLE_Q4_SELECTED_SHARED_OVERLAP") == NULL;
}
Comment on lines +13644 to +13660
#endif

static bool metal_graph_use_cuda_selected_shared_overlap(const ds4_gpu_graph *g) {
#if !defined(DS4_ROCM_BUILD) && !defined(DS4_NO_GPU) && !defined(__APPLE__)
return g &&
Expand Down Expand Up @@ -15615,6 +15664,18 @@ static bool metal_graph_encode_decode_layer(
const bool cuda_selected_shared_overlap =
metal_graph_use_cuda_selected_shared_overlap(g) &&
metal_graph_decode_cuda_selected_slots_expected(g, layer);
#ifdef DS4_ROCM_BUILD
const bool rocm_q4_selected_shared_overlap =
metal_graph_use_rocm_q4_selected_shared_overlap(g, layer);
/*
* Q4_K has no split decode kernel on ROCm, so force the selected-expert
* loader to build a full contiguous compact buffer for these layers.
*/
ds4_gpu_stream_set_selected_force_contiguous(
rocm_q4_selected_shared_overlap ? 1 : 0);
#else
const bool rocm_q4_selected_shared_overlap = false;
#endif
const bool overlap_selected_shared =
ok &&
!decode_stage_profile &&
Expand All @@ -15623,12 +15684,14 @@ static bool metal_graph_encode_decode_layer(
getenv("DS4_MOE_REPLAY_SELECTED_IDS") == NULL &&
(q4_selected_shared_overlap ||
iq2_selected_shared_overlap ||
cuda_selected_shared_overlap);
cuda_selected_shared_overlap ||
rocm_q4_selected_shared_overlap);
const bool async_selected_load =
overlap_selected_shared &&
((iq2_selected_shared_overlap &&
metal_graph_use_iq2_selected_async_load(g)) ||
cuda_selected_shared_overlap);
cuda_selected_shared_overlap ||
rocm_q4_selected_shared_overlap);
const bool selected_readahead_shared_delay =
ok &&
!overlap_selected_shared &&
Expand Down Expand Up @@ -19567,6 +19630,30 @@ static bool metal_graph_use_streaming_decode_prefill_range(
return metal_graph_use_streaming_decode_prefill(g, weights, n_tokens);
}

/*
 * Reports whether a layer's routed experts use a quant that the streaming
 * selected-expert cache can serve.
 *
 * The IQ2 check (metal_graph_decode_iq2_selected_slots_expected) applies to
 * every build. On ROCm builds only, a layer that fails the IQ2 check is also
 * accepted when its gate and down routed-expert tensors are present and
 * metal_graph_decode_q4_selected_slots_expected() accepts it for those
 * tensors' byte sizes.
 */
static bool metal_graph_decode_streaming_selected_slots_expected(
        const ds4_gpu_graph *g,
        const ds4_layer_weights *layer) {
    bool expected = metal_graph_decode_iq2_selected_slots_expected(g, layer);
#ifdef DS4_ROCM_BUILD
    if (!expected) {
        /* The byte sizes are only read once both tensors are known to exist. */
        const bool has_gate_and_down =
            layer && layer->ffn_gate_exps && layer->ffn_down_exps;
        expected =
            has_gate_and_down &&
            metal_graph_decode_q4_selected_slots_expected(
                g,
                layer,
                layer->ffn_gate_exps->bytes,
                layer->ffn_down_exps->bytes);
    }
#endif
    return expected;
}

static bool metal_graph_seed_streaming_expert_cache_from_hotlist(
ds4_gpu_graph *g,
const ds4_model *model,
const ds4_weights *weights);

static bool metal_graph_prefill_decode_streaming_range(
ds4_gpu_graph *g,
const ds4_model *model,
Expand All @@ -19588,6 +19675,16 @@ static bool metal_graph_prefill_decode_streaming_range(
n_tokens > (uint32_t)prompt->len - start) return false;
if (start == 0) {
ds4_gpu_stream_expert_cache_reset_route_hotness();
/*
* Warm the routed-expert cache with the popularity hotlist before the
* decode-style prefill loads experts one token at a time. The layer-major
* prefill path does this at its tail, but the short-prompt decode path
* otherwise starts cold; a bulk sequential warm-up is far cheaper than the
* scattered first-touch reads it replaces. Idempotent across turns.
*/
if (!metal_graph_seed_streaming_expert_cache_from_hotlist(g, model, weights)) {
return false;
}
}

const bool profile = getenv("DS4_METAL_GRAPH_PREFILL_PROFILE") != NULL;
Expand Down Expand Up @@ -19710,7 +19807,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_prefill(
uint32_t seeded_rows = 0;
for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
const ds4_layer_weights *layer = &weights->layer[il];
if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue;
if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue;

const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps);
const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps);
Expand Down Expand Up @@ -19762,7 +19859,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_hotlist(
uint32_t cache_budget = 0;
for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
const ds4_layer_weights *layer = &weights->layer[il];
if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue;
if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue;

const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps);
const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps);
Expand Down Expand Up @@ -19835,7 +19932,7 @@ static bool metal_graph_seed_streaming_expert_cache_from_hotlist(
const uint32_t n = counts[il];
if (n == 0) continue;
const ds4_layer_weights *layer = &weights->layer[il];
if (!metal_graph_decode_iq2_selected_slots_expected(g, layer)) continue;
if (!metal_graph_decode_streaming_selected_slots_expected(g, layer)) continue;

const uint64_t gate_row_bytes = routed_expert_row_bytes(layer->ffn_gate_exps);
const uint64_t down_row_bytes = routed_expert_row_bytes(layer->ffn_down_exps);
Expand Down
1 change: 1 addition & 0 deletions ds4_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ void ds4_gpu_set_quality(bool quality);
void ds4_gpu_set_ssd_streaming(bool enabled);
void ds4_gpu_set_streaming_expert_cache_budget(uint32_t experts);
void ds4_gpu_set_streaming_expert_cache_expert_bytes(uint64_t bytes);
void ds4_gpu_stream_set_selected_force_contiguous(int enabled);
uint64_t ds4_gpu_recommended_working_set_size(void);
uint32_t ds4_gpu_stream_expert_cache_configured_count(void);
uint32_t ds4_gpu_stream_expert_cache_current_count(void);
Expand Down
116 changes: 113 additions & 3 deletions rocm/ds4_rocm_current_api_compat.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ extern "C" void ds4_gpu_set_streaming_expert_cache_budget(uint32_t experts) {
g_stream_expert_cache_budget = experts;
}

/*
 * Sets the g_stream_selected_force_contiguous flag. Any nonzero `enabled`
 * is stored as 1; zero is stored as 0.
 */
extern "C" void ds4_gpu_stream_set_selected_force_contiguous(int enabled) {
    const int normalized = (enabled != 0);
    g_stream_selected_force_contiguous = normalized;
}

/*
 * Intentional no-op in this backend: the per-expert byte size is accepted
 * and ignored.
 */
extern "C" void ds4_gpu_set_streaming_expert_cache_expert_bytes(uint64_t bytes) {
    static_cast<void>(bytes);
}
Expand Down Expand Up @@ -322,15 +326,121 @@ extern "C" int ds4_gpu_stream_expert_cache_release_layer_cache(void) {
return 1;
}

/*
* Popularity-based cache warm-up. Bulk-load the most popular experts of one
* layer into the resident cache via large sequential file reads, which are far
* cheaper than the scattered per-expert random reads that otherwise happen lazily
* on first-touch during decode. Best-effort: any per-expert failure just leaves
* that expert to be streamed on demand later, so this always reports success.
*/
extern "C" int ds4_gpu_stream_expert_cache_seed_experts(
const ds4_gpu_stream_expert_table *table,
const int32_t *expert_ids,
const uint32_t *expert_priorities,
uint32_t n_experts) {
(void)table;
(void)expert_ids;
(void)expert_priorities;
(void)n_experts;
if (!g_ssd_streaming_mode || !table || !expert_ids || n_experts == 0) return 1;

const void *model_map = table->model_map;
const uint64_t model_size = table->model_size;
const uint32_t layer = table->layer;
const uint32_t n_total_expert = table->n_total_expert;
const uint64_t gate_offset = table->gate_offset;
const uint64_t up_offset = table->up_offset;
const uint64_t down_offset = table->down_offset;
const uint64_t gate_expert_bytes = table->gate_expert_bytes;
const uint64_t down_expert_bytes = table->down_expert_bytes;
if (n_total_expert == 0 || gate_expert_bytes == 0 || down_expert_bytes == 0) {
return 1;
}
/* Only the file-backed streaming path benefits from a bulk warm-up. */
const int use_fd = g_model_fd >= 0 &&
(g_model_fd_host_base == NULL || model_map == g_model_fd_host_base);
if (!use_fd || !cuda_stream_selected_ensure_stream()) return 1;

cuda_stream_read_job *jobs = (cuda_stream_read_job *)calloc(
DS4_ROCM_STREAM_READ_MAX_JOBS, sizeof(jobs[0]));
if (!jobs) return 1;
uint32_t job_count = 0;

for (uint32_t i = 0; i < n_experts; i++) {
/* Stop once the configured cache is full; let warm decode misses
* compete for the remaining slots through the normal LRU path. */
if (g_stream_expert_cache_budget != 0 &&
g_stream_resident_experts.size() >= g_stream_expert_cache_budget) {
break;
}
const int32_t expert_id = expert_ids[i];
if (expert_id < 0 || (uint32_t)expert_id >= n_total_expert) continue;
if (cuda_stream_resident_find(model_map, layer, expert_id,
gate_offset, up_offset, down_offset,
gate_expert_bytes, down_expert_bytes) >= 0) {
continue;
}

const uint64_t expert = (uint64_t)(uint32_t)expert_id;
uint64_t gate_rel = 0;
uint64_t down_rel = 0;
if (!cuda_u64_mul_checked(expert, gate_expert_bytes, &gate_rel) ||
!cuda_u64_mul_checked(expert, down_expert_bytes, &down_rel) ||
gate_offset > model_size ||
up_offset > model_size ||
down_offset > model_size ||
gate_rel > model_size - gate_offset ||
gate_rel > model_size - up_offset ||
down_rel > model_size - down_offset ||
gate_expert_bytes > model_size - gate_offset - gate_rel ||
gate_expert_bytes > model_size - up_offset - gate_rel ||
down_expert_bytes > model_size - down_offset - down_rel) {
continue;
}

const int idx = cuda_stream_resident_alloc(model_map, layer, expert_id,
expert_ids, n_experts,
gate_offset, up_offset, down_offset,
gate_expert_bytes, down_expert_bytes);
if (idx < 0) break; /* out of cache budget or device memory */
cuda_stream_resident_expert &entry = g_stream_resident_experts[(size_t)idx];

if (job_count + 3u > DS4_ROCM_STREAM_READ_MAX_JOBS) {
const int flushed =
cuda_stream_read_jobs_parallel(jobs, job_count) &&
cuda_stream_selected_upload_read_jobs(jobs, job_count);
cuda_stream_read_jobs_free(jobs, job_count);
job_count = 0;
if (!flushed) {
/* Unfilled resident entries would later be served as cache hits
* with garbage data; drop the resident cache so it refills
* correctly on demand. */
cuda_stream_resident_cache_release();
free(jobs);
return 1;
}
}
jobs[job_count++] = {entry.gate, gate_offset + gate_rel, gate_expert_bytes,
NULL, NULL, 0, 0, 0};
jobs[job_count++] = {entry.up, up_offset + gate_rel, gate_expert_bytes,
NULL, NULL, 0, 0, 0};
jobs[job_count++] = {entry.down, down_offset + down_rel, down_expert_bytes,
NULL, NULL, 0, 0, 0};
}

if (job_count != 0) {
const int flushed =
cuda_stream_read_jobs_parallel(jobs, job_count) &&
cuda_stream_selected_upload_read_jobs(jobs, job_count);
cuda_stream_read_jobs_free(jobs, job_count);
if (!flushed) {
cuda_stream_resident_cache_release();
free(jobs);
return 1;
}
}
if (cuda_stream_cache_stats_on()) {
g_stream_cache_stats.seed_calls++;
g_stream_cache_stats.seed_unique += n_experts;
}
Comment on lines +420 to +442
free(jobs);
return 1;
}

Expand Down
11 changes: 10 additions & 1 deletion rocm/ds4_rocm_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,14 @@ static cuda_stream_cache_stats g_stream_cache_stats;
static int g_stream_cache_stats_enabled = -1;
static int32_t g_routed_moe_selected_override[DS4_ROCM_N_EXPERT_USED];
static uint32_t g_routed_moe_selected_override_n;
/*
* When set, the single-token selected-expert loader always materializes a full
* contiguous compact expert buffer instead of deferring a mixed
* resident/missing set to the async split path. The split decode kernels only
* exist for the IQ2_XXS/Q2_K quant pair, so Q4_K experts must use the
* contiguous path that feeds the standard Q4_K matmul.
*/
static int g_stream_selected_force_contiguous;
static uint64_t g_stream_selected_stage_counter;
static cudaEvent_t g_stream_selected_reuse_event;
static int g_stream_selected_reuse_event_pending;
Expand Down Expand Up @@ -3057,7 +3065,8 @@ static int cuda_stream_selected_load(
}
}

if (use_fd && read_job_count != 0 && resident_mask != 0) {
if (use_fd && read_job_count != 0 && resident_mask != 0 &&
!g_stream_selected_force_contiguous) {
g_stream_selected_pending.active = 1;
g_stream_selected_pending.model_map = model_map;
g_stream_selected_pending.layer = layer;
Expand Down