From 96d62ca6cdca01f6bcd54661291bb4154e317d0f Mon Sep 17 00:00:00 2001 From: alantsev Date: Wed, 24 Jun 2026 02:03:45 +1000 Subject: [PATCH] Fix ROCm Q8->F16 cache reserve starving session tensors on large models --- rocm/ds4_rocm_runtime.cuh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/rocm/ds4_rocm_runtime.cuh b/rocm/ds4_rocm_runtime.cuh index 3bd786f8e..ff144db9b 100644 --- a/rocm/ds4_rocm_runtime.cuh +++ b/rocm/ds4_rocm_runtime.cuh @@ -3558,14 +3558,17 @@ static uint64_t cuda_q8_f16_cache_reserve_bytes(uint64_t total_bytes) { if (g_ssd_streaming_mode) { return cuda_stream_resident_free_reserve_bytes(); } - if (total_bytes >= 112ull * 1024ull * 1024ull * 1024ull) { - return 512ull * 1048576ull; - } /* The expanded Q8->F16 cache is only an acceleration path. Keep enough - * device memory free for cuBLAS workspaces, transient graph buffers, and - * driver bookkeeping instead of letting optional cached weights consume the - * last few GiB on 96 GiB cards. */ + * device memory free for the session/context tensors, cuBLAS workspaces, + * and transient graph buffers allocated after model load, instead of + * letting optional cached weights consume the last few GiB. + * + * Do not shrink this to a sub-GiB reserve on large unified-memory machines - + * a tiny reserve lets the eager preload fill device memory down to a few + * hundred MiB and OOM at session creation. + * Loading an MTP model disables this cache and hides the issue. */ + const uint64_t min_reserve = 4096ull * 1048576ull; const uint64_t pct_reserve = total_bytes / 20u; /* 5% */ return pct_reserve > min_reserve ? pct_reserve : min_reserve;