From 96d62ca6cdca01f6bcd54661291bb4154e317d0f Mon Sep 17 00:00:00 2001
From: alantsev <alantsev@users.noreply.github.com>
Date: Wed, 24 Jun 2026 02:03:45 +1000
Subject: [PATCH] Fix ROCm Q8->F16 cache reserve starving session tensors on
 large models

---
 rocm/ds4_rocm_runtime.cuh | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/rocm/ds4_rocm_runtime.cuh b/rocm/ds4_rocm_runtime.cuh
index 3bd786f8e..ff144db9b 100644
--- a/rocm/ds4_rocm_runtime.cuh
+++ b/rocm/ds4_rocm_runtime.cuh
@@ -3558,14 +3558,17 @@ static uint64_t cuda_q8_f16_cache_reserve_bytes(uint64_t total_bytes) {
     if (g_ssd_streaming_mode) {
         return cuda_stream_resident_free_reserve_bytes();
     }
-    if (total_bytes >= 112ull * 1024ull * 1024ull * 1024ull) {
-        return 512ull * 1048576ull;
-    }
 
     /* The expanded Q8->F16 cache is only an acceleration path.  Keep enough
-     * device memory free for cuBLAS workspaces, transient graph buffers, and
-     * driver bookkeeping instead of letting optional cached weights consume the
-     * last few GiB on 96 GiB cards. */
+     * device memory free for the session/context tensors, cuBLAS workspaces,
+     * and transient graph buffers allocated after model load, instead of
+     * letting optional cached weights consume the last few GiB.
+     *
+     * Do not shrink this to a sub-GiB reserve on large unified-memory machines:
+     * a tiny reserve lets the eager preload consume device memory until only a
+     * few hundred MiB remain, and session creation then fails with OOM.
+     * Loading an MTP model disables this cache, which hides the issue. */
+
     const uint64_t min_reserve = 4096ull * 1048576ull;
     const uint64_t pct_reserve = total_bytes / 20u; /* 5% */
     return pct_reserve > min_reserve ? pct_reserve : min_reserve;