@@ -34,11 +34,6 @@ using json = nlohmann::ordered_json;
3434
3535constexpr int HTTP_POLLING_SECONDS = 1 ;
3636
37- static const std::vector<std::pair<std::string, std::string>> kReasoningThinkMarkers = {
38- {" <think>" , " </think>" },
39- {" <|START_THINKING|>" , " <|END_THINKING|>" },
40- };
41-
4237// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
4338enum slot_state {
4439 SLOT_STATE_IDLE,
@@ -1109,15 +1104,8 @@ struct server_context_impl {
11091104 // Initialize reasoning tracking
11101105 slot.forced_tokens .clear ();
11111106 slot.n_reasoning_tokens = 0 ;
1112-
1113- const bool thinking_forced_open = slot.task ->params .oaicompat_chat_syntax .thinking_forced_open ;
1114- slot.reasoning = thinking_forced_open ? REASONING_STATE_REASONING : REASONING_STATE_NONE;
1115-
1116- if (thinking_forced_open) {
1117- slot.reasoning_end_tag = kReasoningThinkMarkers .front ().second ;
1118- } else {
1119- slot.reasoning_end_tag .clear ();
1120- }
1107+ slot.reasoning = REASONING_STATE_NONE;
1108+ slot.reasoning_end_tag .clear ();
11211109
11221110
11231111 SLT_INF (slot, " %s" , " processing task\n " );
@@ -1198,65 +1186,62 @@ struct server_context_impl {
11981186 const int32_t reasoning_budget = (slot.task ? slot.task ->params .reasoning_budget : params_base.reasoning_budget );
11991187
12001188 // check reasoning budget limit
1201- // Track reasoning tokens when we're inside thinking blocks (<think>...</think> or similar)
1189+ // Track reasoning tokens using the chat parser to detect reasoning segments consistently across formats
12021190 // When the budget is exceeded we enqueue the closing tag tokens so they get sent to the client
12031191 // and fed back into the model before continuing normal generation
12041192 if (slot.has_next_token && reasoning_budget > 0 && slot.reasoning != REASONING_STATE_FINISHED) {
1205- // Check if we've entered or exited reasoning mode
1206- if (slot.reasoning == REASONING_STATE_NONE) {
1207- for (const auto & [start_tag, end_tag] : kReasoningThinkMarkers ) {
1208- size_t start_pos = slot.generated_text .rfind (start_tag);
1209- if (start_pos != std::string::npos) {
1210- SLT_DBG (slot, " detected reasoning start with '%s'\n " , start_tag.c_str ());
1211- slot.reasoning = REASONING_STATE_REASONING;
1212- slot.reasoning_end_tag = end_tag;
1213- slot.n_reasoning_tokens = 0 ;
1214- break ;
1215- }
1216- }
1217- } else if (slot.reasoning == REASONING_STATE_REASONING) {
1218- size_t end_pos = slot.generated_text .rfind (slot.reasoning_end_tag );
1219- if (end_pos != std::string::npos) {
1220- SLT_DBG (slot, " detected reasoning end with '%s'\n " , slot.reasoning_end_tag .c_str ());
1221- slot.reasoning = REASONING_STATE_FINISHED;
1193+ std::vector<common_chat_msg_diff> reasoning_diffs;
1194+ const auto & parsed_msg = slot.update_chat_msg (reasoning_diffs, /* compute_diffs = */ false );
1195+ const auto & rstatus = parsed_msg.reasoning_status ;
1196+
1197+ if (rstatus.active && slot.reasoning != REASONING_STATE_PENDING_FORCE_CLOSE) {
1198+ if (slot.reasoning != REASONING_STATE_REASONING) {
1199+ SLT_DBG (slot, " detected reasoning start via parser%s\n " , " " );
1200+ slot.reasoning = REASONING_STATE_REASONING;
1201+ slot.reasoning_end_tag = rstatus.end_tag ;
12221202 slot.n_reasoning_tokens = 0 ;
1223- } else {
1224- // If actively reasoning (and we haven't already scheduled a forced close) count this token
1225- slot.n_reasoning_tokens ++;
1203+ }
1204+ } else if (!rstatus.active && slot.reasoning == REASONING_STATE_REASONING) {
1205+ SLT_DBG (slot, " detected reasoning end '%s' via parser\n " , rstatus.end_tag .c_str ());
1206+ slot.reasoning = REASONING_STATE_FINISHED;
1207+ slot.n_reasoning_tokens = 0 ;
1208+ }
12261209
1227- if (slot.n_reasoning_tokens >= reasoning_budget) {
1228- SLT_INF (slot, " reasoning budget exceeded, forcing close with '%s', n_reasoning_tokens = %d, reasoning_budget = %d\n " ,
1229- slot.reasoning_end_tag .c_str (), slot.n_reasoning_tokens , reasoning_budget);
1210+ if (slot.reasoning == REASONING_STATE_REASONING) {
1211+ slot.n_reasoning_tokens ++;
12301212
1231- auto fail_close = [&](const char * reason) {
1232- SLT_WRN (slot, " failed to inject reasoning close tag (%s) -> stopping generation\n " , reason);
1233- slot.stop = STOP_TYPE_LIMIT;
1234- slot.has_next_token = false ;
1235- };
1213+ if (slot.n_reasoning_tokens >= reasoning_budget) {
1214+ SLT_INF (slot, " reasoning budget exceeded, forcing close with '%s', n_reasoning_tokens = %d, reasoning_budget = %d\n " ,
1215+ slot.reasoning_end_tag .c_str (), slot.n_reasoning_tokens , reasoning_budget);
12361216
1237- if (slot.reasoning_end_tag .empty ()) {
1238- fail_close (" no closing tag detected" );
1239- } else {
1240- const std::string forced_message = slot.task ->params .reasoning_force_close_message .empty ()
1241- ? std::string (COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE)
1242- : slot.task ->params .reasoning_force_close_message ;
1243- const std::string forced_injection = forced_message + slot.reasoning_end_tag ;
1244-
1245- llama_tokens closing_tokens;
1246- try {
1247- closing_tokens = common_tokenize (ctx, forced_injection, /* add_special=*/ false , /* parse_special=*/ true );
1248- } catch (const std::exception & err) {
1249- SLT_WRN (slot, " tokenization error while forcing reasoning close: %s\n " , err.what ());
1250- fail_close (" tokenization error" );
1251- closing_tokens.clear ();
1252- }
1217+ auto fail_close = [&](const char * reason) {
1218+ SLT_WRN (slot, " failed to inject reasoning close tag (%s) -> stopping generation\n " , reason);
1219+ slot.stop = STOP_TYPE_LIMIT;
1220+ slot.has_next_token = false ;
1221+ };
12531222
1254- if (!closing_tokens.empty ()) {
1255- slot.forced_tokens .insert (slot.forced_tokens .end (), closing_tokens.begin (), closing_tokens.end ());
1256- slot.reasoning = REASONING_STATE_PENDING_FORCE_CLOSE;
1257- } else if (slot.has_next_token ) {
1258- fail_close (" closing tag produced no tokens" );
1259- }
1223+ if (slot.reasoning_end_tag .empty ()) {
1224+ fail_close (" no closing tag detected" );
1225+ } else {
1226+ const std::string forced_message = slot.task ->params .reasoning_force_close_message .empty ()
1227+ ? std::string (COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE)
1228+ : slot.task ->params .reasoning_force_close_message ;
1229+ const std::string forced_injection = forced_message + slot.reasoning_end_tag ;
1230+
1231+ llama_tokens closing_tokens;
1232+ try {
1233+ closing_tokens = common_tokenize (ctx, forced_injection, /* add_special=*/ false , /* parse_special=*/ true );
1234+ } catch (const std::exception & err) {
1235+ SLT_WRN (slot, " tokenization error while forcing reasoning close: %s\n " , err.what ());
1236+ fail_close (" tokenization error" );
1237+ closing_tokens.clear ();
1238+ }
1239+
1240+ if (!closing_tokens.empty ()) {
1241+ slot.forced_tokens .insert (slot.forced_tokens .end (), closing_tokens.begin (), closing_tokens.end ());
1242+ slot.reasoning = REASONING_STATE_PENDING_FORCE_CLOSE;
1243+ } else if (slot.has_next_token ) {
1244+ fail_close (" closing tag produced no tokens" );
12601245 }
12611246 }
12621247 }
0 commit comments