Commit fb40266

server: handle limiting maximum reasoning budget
1 parent fb615a2 commit fb40266

9 files changed: +228 -11 lines changed

common/arg.cpp

Lines changed: 12 additions & 2 deletions

@@ -2574,12 +2574,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--reasoning-budget"}, "N",
-        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        "controls the maximum number of thinking tokens allowed; -1 for unlimited, 0 to disable thinking, or a positive value to limit thinking tokens (default: -1)",
         [](common_params & params, int value) {
-            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            if (value < -1) { throw std::invalid_argument("invalid value: must be -1 (unlimited), 0 (disabled), or a positive number"); }
             params.reasoning_budget = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
+    add_opt(common_arg(
+        {"--reasoning-force-close-message"}, "STRING",
+        string_format(
+            "if specified, forces the model to close its reasoning/thoughts when generating this message (default: %s)\n",
+            params.reasoning_force_close_message.c_str()
+        ),
+        [](common_params & params, const std::string & value) {
+            params.reasoning_force_close_message = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_FORCE_CLOSE_MESSAGE"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
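
For reference, the flag now accepts three kinds of values instead of a strict -1/0 switch. A minimal standalone sketch (illustrative only, not part of the patch) that applies the same check and interpretation:

#include <iostream>
#include <stdexcept>
#include <string>

// Same semantics as the updated --reasoning-budget flag:
// -1 = unlimited, 0 = thinking disabled, N > 0 = cap thinking at N tokens.
static std::string describe_reasoning_budget(int value) {
    if (value < -1) {
        throw std::invalid_argument("invalid value: must be -1 (unlimited), 0 (disabled), or a positive number");
    }
    if (value == -1) { return "unlimited thinking"; }
    if (value ==  0) { return "thinking disabled"; }
    return "at most " + std::to_string(value) + " thinking tokens";
}

int main() {
    std::cout << describe_reasoning_budget(512) << std::endl; // "at most 512 thinking tokens"
    return 0;
}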

common/common.cpp

Lines changed: 8 additions & 0 deletions

@@ -1078,6 +1078,14 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     common_init_sampler_from_model(model, params.sampling);
 
+    // Allow models to override the forced reasoning close message via GGUF metadata
+    if (params.reasoning_force_close_message == COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE) {
+        char buf[512] = {0};
+        if (llama_model_meta_val_str(model, "tokenizer.ggml.reasoning_force_close_message", buf, sizeof(buf)) > 0) {
+            params.reasoning_force_close_message = buf;
+        }
+    }
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto cparams = common_context_params_to_llama(params);

common/common.h

Lines changed: 3 additions & 0 deletions

@@ -102,6 +102,8 @@ enum llama_example {
     LLAMA_EXAMPLE_COUNT,
 };
 
+inline constexpr const char * COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE = "... I now conclude my reasoning and will provide the final answer.";
+
 enum common_sampler_type {
     COMMON_SAMPLER_TYPE_NONE = 0,
     COMMON_SAMPLER_TYPE_DRY = 1,

@@ -466,6 +468,7 @@ struct common_params {
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
+    std::string reasoning_force_close_message = COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;

tools/server/README.md

Lines changed: 2 additions & 1 deletion

@@ -203,7 +203,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
 | `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
 | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
-| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
+| `--reasoning-budget N` | controls the maximum number of thinking tokens allowed; -1 for unlimited, 0 to disable thinking, or a positive value to limit thinking tokens. When the budget is exceeded, the server automatically injects a closing `</think>` and continues with the final answer. Individual OpenAI-compatible requests can override this value with `thinking_budget_tokens`. (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
+| `--reasoning-force-close-message STRING` | when the reasoning budget is exceeded, this message is appended to the current user message to signal the model to close any open thought tags. (default: '... I now conclude my reasoning and will provide the final answer.')<br/>(env: LLAMA_ARG_THINK_FORCE_CLOSE_MESSAGE) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
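
For reference, the per-request override mentioned above is read from the request body of the OpenAI-compatible endpoints. A hedged sketch of building such a body with nlohmann::json (the JSON library the server already uses); the model name, message content, and close message here are placeholders:

#include <nlohmann/json.hpp>
#include <iostream>

// Sketch of a request body that overrides the server-wide reasoning budget
// for a single OpenAI-compatible chat completion request.
int main() {
    nlohmann::json body;
    body["model"]    = "any"; // placeholder model name
    body["messages"] = nlohmann::json::array({
        {{"role", "user"}, {"content", "Explain quicksort briefly."}}
    });
    body["thinking_budget_tokens"]        = 256; // per-request cap on thinking tokens
    body["reasoning_force_close_message"] = "... wrapping up my reasoning now."; // optional per-request override

    std::cout << body.dump(2) << std::endl; // POST this as the payload to /v1/chat/completions
    return 0;
}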

tools/server/server-context.cpp

Lines changed: 125 additions & 2 deletions

@@ -18,6 +18,8 @@
 #include <memory>
 #include <unordered_set>
 #include <filesystem>
+#include <deque>
+#include <exception>
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)

@@ -47,6 +49,13 @@ enum server_state {
     SERVER_STATE_READY, // Server is ready and model is loaded
 };
 
+enum reasoning_state {
+    REASONING_STATE_NONE,
+    REASONING_STATE_REASONING,
+    REASONING_STATE_PENDING_FORCE_CLOSE,
+    REASONING_STATE_FINISHED,
+};
+
 static bool server_task_type_need_embd(server_task_type task_type) {
     switch (task_type) {
         case SERVER_TASK_TYPE_EMBEDDING:

@@ -113,6 +122,12 @@ struct server_slot {
     bool has_new_line = false;
     bool truncated = false;
 
+    // reasoning budget tracking
+    int32_t n_reasoning_tokens = 0; // number of tokens generated while in reasoning/thinking mode
+    reasoning_state reasoning = REASONING_STATE_NONE; // are we currently in reasoning mode
+    std::string reasoning_end_tag; // the closing tag to inject when budget is exceeded (e.g., "</think>")
+    std::deque<llama_token> forced_tokens; // tokens we must feed back to the model (e.g., forced </think>)
+
     stop_type stop;
 
     std::string stopping_word;

@@ -162,9 +177,11 @@ struct server_slot {
     size_t n_sent_text = 0; // number of sent text character
 
     int64_t t_start_process_prompt;
+    int64_t t_start_reasoning;
     int64_t t_start_generation;
 
     double t_prompt_processing; // ms
+    double t_reasoning_token_generation; // ms
     double t_token_generation; // ms
 
     std::function<void(int)> callback_on_release;

@@ -188,6 +205,13 @@ struct server_slot {
 
     drafted.clear();
     i_batch_dft.clear();
+
+    // reset reasoning budget tracking
+    n_reasoning_tokens = 0;
+    reasoning = REASONING_STATE_NONE;
+    reasoning_end_tag = "";
+    forced_tokens.clear();
+
     generated_tokens.clear();
     generated_token_probs.clear();
     json_schema = json();

@@ -372,15 +396,20 @@ struct server_slot {
     const double t_prompt = t_prompt_processing / n_prompt_tokens_processed;
     const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
 
+    const double t_reasoning = t_reasoning_token_generation / n_reasoning_tokens;
+    const double n_reasoning_second = 1e3 / t_reasoning_token_generation * n_reasoning_tokens;
+
     const double t_gen = t_token_generation / n_decoded;
     const double n_gen_second = 1e3 / t_token_generation * n_decoded;
 
     SLT_INF(*this,
         "\n"
         "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+        " reasoning time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
         " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
         " total time = %10.2f ms / %5d tokens\n",
         t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
+        t_reasoning_token_generation, n_reasoning_tokens, t_reasoning, n_reasoning_second,
         t_token_generation, n_decoded, t_gen, n_gen_second,
         t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
 

@@ -1079,6 +1108,13 @@ struct server_context_impl {
         ? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
         : SLOT_STATE_STARTED;
 
+    // Initialize reasoning tracking
+    slot.forced_tokens.clear();
+    slot.n_reasoning_tokens = 0;
+    slot.reasoning = REASONING_STATE_NONE;
+    slot.reasoning_end_tag.clear();
+
+
     SLT_INF(slot, "%s", "processing task\n");
 
     return true;

@@ -1154,6 +1190,85 @@ struct server_context_impl {
         SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.task->params.n_predict);
     }
 
+    const int32_t reasoning_budget = (slot.task ? slot.task->params.reasoning_budget : params_base.reasoning_budget);
+
+    // check reasoning budget limit
+    // Track reasoning tokens using the chat parser to detect reasoning segments consistently across formats
+    // When the budget is exceeded we enqueue the closing tag tokens so they get sent to the client
+    // and fed back into the model before continuing normal generation
+    if (slot.has_next_token && reasoning_budget > 0 && slot.reasoning != REASONING_STATE_FINISHED) {
+        const auto parsed_msg = common_chat_parse(
+            slot.generated_text,
+            /* is_partial = */ true,
+            slot.task->params.oaicompat_chat_syntax);
+        const auto & rstatus = parsed_msg.reasoning_status;
+
+        if (rstatus.active && slot.reasoning != REASONING_STATE_PENDING_FORCE_CLOSE) {
+            if (slot.reasoning != REASONING_STATE_REASONING) {
+                SLT_DBG(slot, "detected reasoning start via parser%s\n", "");
+                slot.reasoning = REASONING_STATE_REASONING;
+                slot.reasoning_end_tag = rstatus.end_tag;
+                slot.n_reasoning_tokens = 0;
+                slot.t_start_reasoning = ggml_time_us();
+            }
+        } else if (!rstatus.active && slot.reasoning == REASONING_STATE_REASONING) {
+            SLT_DBG(slot, "detected reasoning end '%s' via parser\n", rstatus.end_tag.c_str());
+            slot.reasoning = REASONING_STATE_FINISHED;
+            slot.t_reasoning_token_generation = (ggml_time_us() - slot.t_start_reasoning) / 1e3;
+        }
+
+        if (slot.reasoning == REASONING_STATE_REASONING) {
+            slot.n_reasoning_tokens++;
+
+            // Detect if we are in the middle of emitting a tool call this step.
+            // The parser sets tool_call_in_progress when it catches a partial exception
+            // while parsing tool calls, indicating incomplete tool call parsing.
+            // We also check for tool call diffs in this token as a fallback.
+            if (!parsed_msg.tool_call_in_progress && slot.n_reasoning_tokens >= reasoning_budget) {
+                SLT_INF(slot, "reasoning budget exceeded, forcing close with '%s', n_reasoning_tokens = %d, reasoning_budget = %d\n",
+                    slot.reasoning_end_tag.c_str(), slot.n_reasoning_tokens, reasoning_budget);
+
+                auto fail_close = [&](const char * reason) {
+                    SLT_WRN(slot, "failed to inject reasoning close tag (%s) -> stopping generation\n", reason);
+                    slot.stop = STOP_TYPE_LIMIT;
+                    slot.has_next_token = false;
+                };
+
+                if (slot.reasoning_end_tag.empty()) {
+                    fail_close("no closing tag detected");
+                } else {
+                    const std::string forced_message = slot.task->params.reasoning_force_close_message.empty()
+                        ? std::string(COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE)
+                        : slot.task->params.reasoning_force_close_message;
+                    const std::string forced_injection = forced_message + slot.reasoning_end_tag;
+
+                    llama_tokens closing_tokens;
+                    try {
+                        closing_tokens = common_tokenize(ctx, forced_injection, /*add_special=*/false, /*parse_special=*/true);
+                    } catch (const std::exception & err) {
+                        SLT_WRN(slot, "tokenization error while forcing reasoning close: %s\n", err.what());
+                        fail_close("tokenization error");
+                        closing_tokens.clear();
+                    }
+
+                    if (!closing_tokens.empty()) {
+                        slot.forced_tokens.insert(slot.forced_tokens.end(), closing_tokens.begin(), closing_tokens.end());
+                        slot.reasoning = REASONING_STATE_PENDING_FORCE_CLOSE;
+                    } else if (slot.has_next_token) {
+                        fail_close("closing tag produced no tokens");
+                    }
+                }
+            }
+        } else if (slot.reasoning == REASONING_STATE_PENDING_FORCE_CLOSE) {
+            // We've already scheduled the forced close, wait until it's done
+            if (slot.forced_tokens.empty()) {
+                SLT_DBG(slot, "completed forced reasoning close with '%s'\n", slot.reasoning_end_tag.c_str());
+                slot.reasoning = REASONING_STATE_FINISHED;
+                slot.t_reasoning_token_generation = (ggml_time_us() - slot.t_start_reasoning) / 1e3;
+            }
+        }
+    }
+
     if (slot.has_new_line) {
         // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
         if (slot.task->params.n_indent > 0) {

@@ -2484,7 +2599,15 @@
 
     const int tok_idx = slot.i_batch - i;
 
-    llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
+    const bool has_forced_token = !slot.forced_tokens.empty();
+    llama_token id = 0;
+
+    if (has_forced_token) {
+        id = slot.forced_tokens.front();
+        slot.forced_tokens.pop_front();
+    } else {
+        id = common_sampler_sample(slot.smpl, ctx, tok_idx);
+    }
 
     slot.i_batch = -1;
 

@@ -2522,7 +2645,7 @@
 
     // speculative decoding - main model sample and accept
     for (auto & slot : slots) {
-        if (slot.state != SLOT_STATE_GENERATING || slot.i_batch_dft.empty()) {
+        if (slot.state != SLOT_STATE_GENERATING || slot.i_batch_dft.empty() || !slot.forced_tokens.empty()) {
            continue;
        }
 
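Taken together, the additions above implement a small per-slot state machine: REASONING_STATE_NONE until the chat parser reports an open reasoning segment, REASONING_STATE_REASONING while thinking tokens are counted, REASONING_STATE_PENDING_FORCE_CLOSE once the budget is hit and the tokenized close message is queued in forced_tokens (where it bypasses sampling), and REASONING_STATE_FINISHED when the parser sees the reasoning end or the forced queue drains. A stripped-down sketch of that flow (illustrative only; the actual patch also handles tool calls, timings, and tokenization failures):

#include <cstdint>
#include <deque>

enum reasoning_state { NONE, REASONING, PENDING_FORCE_CLOSE, FINISHED };

struct reasoning_tracker {
    reasoning_state state = NONE;
    int32_t n_reasoning_tokens = 0;
    std::deque<int32_t> forced; // token ids queued for forced injection

    // Called once per generated token. 'thinking_open' is what the chat parser
    // reports for the text so far; 'close_tokens' is the tokenized close
    // message plus the end tag (e.g. "</think>").
    void step(bool thinking_open, int32_t budget, const std::deque<int32_t> & close_tokens) {
        switch (state) {
            case NONE:
                if (thinking_open) { state = REASONING; n_reasoning_tokens = 0; }
                break;
            case REASONING:
                if (!thinking_open) { state = FINISHED; break; }
                if (++n_reasoning_tokens >= budget) {
                    forced = close_tokens; // these bypass sampling until drained
                    state  = PENDING_FORCE_CLOSE;
                }
                break;
            case PENDING_FORCE_CLOSE:
                if (forced.empty()) { state = FINISHED; } // forced close fully emitted
                break;
            case FINISHED:
                break;
        }
    }
};

int main() {
    reasoning_tracker t;
    const std::deque<int32_t> close_tokens = {1, 2, 3}; // stand-in for tokenized close message + "</think>"
    t.step(true, /*budget=*/1, close_tokens); // reasoning segment opens
    t.step(true, /*budget=*/1, close_tokens); // budget hit -> forced close queued
    return t.state == PENDING_FORCE_CLOSE ? 0 : 1;
}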
tools/server/server-task.cpp

Lines changed: 6 additions & 1 deletion

@@ -130,13 +130,15 @@ json task_params::to_json(bool only_metrics) const {
     {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
     {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
     {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
+    {"reasoning_force_close_message", reasoning_force_close_message},
     {"samplers", samplers},
     {"speculative.n_max", speculative.n_max},
     {"speculative.n_min", speculative.n_min},
     {"speculative.p_min", speculative.p_min},
     {"timings_per_token", timings_per_token},
     {"post_sampling_probs", post_sampling_probs},
     {"lora", lora},
+    {"thinking_budget_tokens", reasoning_budget},
 };
 }
 

@@ -159,8 +161,8 @@ task_params server_task::params_from_json_cmpl(
     defaults.speculative = params_base.speculative;
     defaults.n_keep = params_base.n_keep;
     defaults.n_predict = params_base.n_predict;
-    defaults.n_cache_reuse = params_base.n_cache_reuse;
     defaults.antiprompt = params_base.antiprompt;
+    defaults.reasoning_force_close_message = params_base.reasoning_force_close_message;
 
     // enabling this will output extra debug information in the HTTP responses from the server
     params.verbose = params_base.verbosity > 9;

@@ -182,6 +184,9 @@
     params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
     params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
 
+    params.reasoning_budget = json_value(data, "thinking_budget_tokens", params_base.reasoning_budget);
+    params.reasoning_force_close_message = json_value(data, "reasoning_force_close_message", defaults.reasoning_force_close_message);
+
     params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
     params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
     params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
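
Note the precedence established here: a per-request thinking_budget_tokens falls back to the server-wide --reasoning-budget, and reasoning_force_close_message falls back to the default initialized from the command line (or GGUF metadata). A minimal sketch of that fallback pattern, using a simplified stand-in for the server's json_value() helper:

#include <nlohmann/json.hpp>
#include <cstdint>

using json = nlohmann::json;

// Simplified stand-in for the server's json_value() helper: return the field
// if present and non-null, otherwise the supplied default.
template <typename T>
static T json_value_or(const json & body, const char * key, const T & def) {
    return (body.contains(key) && !body.at(key).is_null()) ? body.at(key).get<T>() : def;
}

int main() {
    const int32_t server_default_budget = -1; // from --reasoning-budget
    const json request = {{"thinking_budget_tokens", 128}};

    // The per-request value wins; an absent field falls back to the server-wide setting.
    const int32_t effective = json_value_or(request, "thinking_budget_tokens", server_default_budget);
    return effective == 128 ? 0 : 1;
}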

tools/server/server-task.h

Lines changed: 7 additions & 5 deletions

@@ -72,11 +72,13 @@ struct task_params {
     struct common_params_speculative speculative;
 
     // response formatting
-    bool verbose = false;
-    task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
-    std::string oaicompat_model;
-    std::string oaicompat_cmpl_id;
-    common_chat_syntax oaicompat_chat_syntax;
+    bool               verbose  = false;
+    task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
+    std::string        oaicompat_model;
+    std::string        oaicompat_cmpl_id;
+    common_chat_syntax oaicompat_chat_syntax;
+    int32_t            reasoning_budget;
+    std::string        reasoning_force_close_message;
 
     // Embeddings
     int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
