Skip to content

Commit 9c82848

Browse files
committed
use reasoning state from chat-parser, thus removing duplicated logic
1 parent cd9165a commit 9c82848

File tree

6 files changed

+103
-65
lines changed

6 files changed

+103
-65
lines changed

common/chat-parser-xml-toolcall.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,9 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
705705

706706
// Parse content
707707
bool reasoning_unclosed = builder.syntax().thinking_forced_open;
708+
if (reasoning_unclosed) {
709+
builder.mark_reasoning_active(end_think);
710+
}
708711
std::string unclosed_reasoning_content("");
709712
for (;;) {
710713
auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
@@ -730,6 +733,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
730733
}
731734
} else {
732735
reasoning_unclosed = false;
736+
builder.mark_reasoning_closed();
733737
std::string reasoning_content;
734738
if (pos == std::string::npos) {
735739
reasoning_content = std::move(content);
@@ -766,13 +770,15 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
766770
bool toolcall_in_think = false;
767771
for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
768772
if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
773+
builder.mark_reasoning_active(end_think);
769774
if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
770775
auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
771776
builder.add_reasoning_content(reasoning_content);
772777
think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
773778
} else {
774779
think_start = think_end + end_think.size() - 1;
775780
}
781+
builder.mark_reasoning_closed();
776782
} else {
777783
// This <tool_call> start is in thinking block, skip this tool call
778784
// This <tool_call> start is in thinking block
@@ -782,6 +788,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
782788
unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
783789
}
784790
reasoning_unclosed = true;
791+
builder.mark_reasoning_active(end_think);
785792
content.resize(think_start);
786793
toolcall_in_think = true;
787794
}

common/chat-parser.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,20 @@ void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_
156156
result_.reasoning_content += reasoning_content;
157157
}
158158

159+
// Marks the parsed message as being inside a reasoning block: sets both
// reasoning_status.detected and reasoning_status.active on result_.
// end_tag: closing tag to record in reasoning_status.end_tag.
void common_chat_msg_parser::mark_reasoning_active(const std::string & end_tag) {
160+
result_.reasoning_status.detected = true;
161+
result_.reasoning_status.active = true;
162+
// An empty end_tag does not overwrite a closing tag recorded by an earlier call.
if (!end_tag.empty()) {
163+
result_.reasoning_status.end_tag = end_tag;
164+
}
165+
}
166+
167+
// Marks the current reasoning block as closed by clearing reasoning_status.active.
// Only acts when a reasoning block was previously detected; detected and end_tag
// are left untouched so callers can still see that reasoning occurred.
void common_chat_msg_parser::mark_reasoning_closed() {
168+
if (result_.reasoning_status.detected) {
169+
result_.reasoning_status.active = false;
170+
}
171+
}
172+
159173
bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
160174
if (name.empty()) {
161175
return false;
@@ -329,11 +343,13 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
329343
const size_t saved_pos = pos_;
330344
const size_t saved_content_size = result_.content.size();
331345
const size_t saved_reasoning_size = result_.reasoning_content.size();
346+
const auto saved_reasoning_status = result_.reasoning_status;
332347

333348
auto restore_state = [&]() {
334349
move_to(saved_pos);
335350
result_.content.resize(saved_content_size);
336351
result_.reasoning_content.resize(saved_reasoning_size);
352+
result_.reasoning_status = saved_reasoning_status;
337353
};
338354

339355
// Allow leading whitespace to be preserved as content when reasoning is present at the start
@@ -370,9 +386,11 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
370386
if (whitespace_end > pos_) {
371387
add_content(input_.substr(pos_, whitespace_end - pos_));
372388
}
389+
mark_reasoning_active(end_think);
373390
set_reasoning_prefix(cursor);
374391
cursor += start_think.size();
375392
} else if (syntax_.thinking_forced_open) {
393+
mark_reasoning_active(end_think);
376394
cursor = whitespace_end;
377395
} else {
378396
restore_state();
@@ -398,8 +416,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
398416

399417
if (end_pos > cursor) {
400418
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
419+
mark_reasoning_closed();
401420
} else {
402421
handle_reasoning("", /* closed */ true);
422+
mark_reasoning_closed();
403423
}
404424

405425
cursor = end_pos + end_think.size();
@@ -420,6 +440,7 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
420440
move_to(input_.size());
421441
return true;
422442
}
443+
mark_reasoning_active(end_think);
423444
set_reasoning_prefix(cursor);
424445
cursor += start_think.size();
425446
continue;

common/chat-parser.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ class common_chat_msg_parser {
5656
// Appends to the result.reasoning_content field
5757
void add_reasoning_content(const std::string & reasoning_content);
5858

59+
// Track reasoning status to expose start/end markers to callers
60+
void mark_reasoning_active(const std::string & end_tag);
61+
void mark_reasoning_closed();
62+
5963
// Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
6064
bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
6165

common/chat.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,19 @@ struct common_chat_tool_call {
2222
}
2323
};
2424

25+
// Reasoning-block tracking state for a chat message.
// All fields default to "no reasoning seen" (false / empty string).
struct common_chat_reasoning_status {
26+
bool detected = false; // a reasoning block start was observed
27+
bool active = false; // we are currently inside a reasoning block (not closed yet)
28+
std::string end_tag; // closing tag to use when forcing a close
29+
30+
// Field-by-field equality: detected, active and end_tag must all match.
bool operator==(const common_chat_reasoning_status & other) const {
31+
return detected == other.detected && active == other.active && end_tag == other.end_tag;
32+
}
33+
// Defined as the negation of operator==.
bool operator!=(const common_chat_reasoning_status & other) const {
34+
return !(*this == other);
35+
}
36+
};
37+
2538
struct common_chat_msg_content_part {
2639
std::string type;
2740
std::string text;
@@ -37,6 +50,7 @@ struct common_chat_msg {
3750
std::vector<common_chat_msg_content_part> content_parts;
3851
std::vector<common_chat_tool_call> tool_calls;
3952
std::string reasoning_content;
53+
common_chat_reasoning_status reasoning_status;
4054
std::string tool_name;
4155
std::string tool_call_id;
4256

@@ -63,6 +77,7 @@ struct common_chat_msg {
6377
&& content_parts == other.content_parts
6478
&& tool_calls == other.tool_calls
6579
&& reasoning_content == other.reasoning_content
80+
&& reasoning_status == other.reasoning_status
6681
&& tool_name == other.tool_name
6782
&& tool_call_id == other.tool_call_id;
6883
}

tests/test-chat-parser.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ static void test_reasoning() {
119119
auto msg = common_chat_parse(input, false, syntax);
120120
assert_equals(variant, std::string("Pense"), msg.reasoning_content);
121121
assert_equals(variant, std::string("Bonjour"), msg.content);
122+
assert_equals(variant, true, msg.reasoning_status.detected);
123+
assert_equals(variant, false, msg.reasoning_status.active);
124+
assert_equals(variant, std::string("</think>"), msg.reasoning_status.end_tag);
122125
}
123126
{
124127
const std::string variant("llama_3_inline_think");
@@ -133,6 +136,9 @@ static void test_reasoning() {
133136
auto msg = common_chat_parse(input, false, syntax);
134137
assert_equals(variant, std::string("Plan"), msg.reasoning_content);
135138
assert_equals(variant, std::string("Réponse"), msg.content);
139+
assert_equals(variant, true, msg.reasoning_status.detected);
140+
assert_equals(variant, false, msg.reasoning_status.active);
141+
assert_equals(variant, std::string("</think>"), msg.reasoning_status.end_tag);
136142
}
137143
// Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
138144
{

tools/server/server-context.cpp

Lines changed: 50 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,6 @@ using json = nlohmann::ordered_json;
3434

3535
constexpr int HTTP_POLLING_SECONDS = 1;
3636

37-
static const std::vector<std::pair<std::string, std::string>> kReasoningThinkMarkers = {
38-
{"<think>", "</think>"},
39-
{"<|START_THINKING|>", "<|END_THINKING|>"},
40-
};
41-
4237
// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
4338
enum slot_state {
4439
SLOT_STATE_IDLE,
@@ -1109,15 +1104,8 @@ struct server_context_impl {
11091104
// Initialize reasoning tracking
11101105
slot.forced_tokens.clear();
11111106
slot.n_reasoning_tokens = 0;
1112-
1113-
const bool thinking_forced_open = slot.task->params.oaicompat_chat_syntax.thinking_forced_open;
1114-
slot.reasoning = thinking_forced_open ? REASONING_STATE_REASONING : REASONING_STATE_NONE;
1115-
1116-
if (thinking_forced_open) {
1117-
slot.reasoning_end_tag = kReasoningThinkMarkers.front().second;
1118-
} else {
1119-
slot.reasoning_end_tag.clear();
1120-
}
1107+
slot.reasoning = REASONING_STATE_NONE;
1108+
slot.reasoning_end_tag.clear();
11211109

11221110

11231111
SLT_INF(slot, "%s", "processing task\n");
@@ -1198,65 +1186,62 @@ struct server_context_impl {
11981186
const int32_t reasoning_budget = (slot.task ? slot.task->params.reasoning_budget : params_base.reasoning_budget);
11991187

12001188
// check reasoning budget limit
1201-
// Track reasoning tokens when we're inside thinking blocks (<think>...</think> or similar)
1189+
// Track reasoning tokens using the chat parser to detect reasoning segments consistently across formats
12021190
// When the budget is exceeded we enqueue the closing tag tokens so they get sent to the client
12031191
// and fed back into the model before continuing normal generation
12041192
if (slot.has_next_token && reasoning_budget > 0 && slot.reasoning != REASONING_STATE_FINISHED) {
1205-
// Check if we've entered or exited reasoning mode
1206-
if (slot.reasoning == REASONING_STATE_NONE) {
1207-
for (const auto & [start_tag, end_tag] : kReasoningThinkMarkers) {
1208-
size_t start_pos = slot.generated_text.rfind(start_tag);
1209-
if (start_pos != std::string::npos) {
1210-
SLT_DBG(slot, "detected reasoning start with '%s'\n", start_tag.c_str());
1211-
slot.reasoning = REASONING_STATE_REASONING;
1212-
slot.reasoning_end_tag = end_tag;
1213-
slot.n_reasoning_tokens = 0;
1214-
break;
1215-
}
1216-
}
1217-
} else if (slot.reasoning == REASONING_STATE_REASONING) {
1218-
size_t end_pos = slot.generated_text.rfind(slot.reasoning_end_tag);
1219-
if (end_pos != std::string::npos) {
1220-
SLT_DBG(slot, "detected reasoning end with '%s'\n", slot.reasoning_end_tag.c_str());
1221-
slot.reasoning = REASONING_STATE_FINISHED;
1193+
std::vector<common_chat_msg_diff> reasoning_diffs;
1194+
const auto & parsed_msg = slot.update_chat_msg(reasoning_diffs, /* compute_diffs = */ false);
1195+
const auto & rstatus = parsed_msg.reasoning_status;
1196+
1197+
if (rstatus.active && slot.reasoning != REASONING_STATE_PENDING_FORCE_CLOSE) {
1198+
if (slot.reasoning != REASONING_STATE_REASONING) {
1199+
SLT_DBG(slot, "detected reasoning start via parser%s\n", "");
1200+
slot.reasoning = REASONING_STATE_REASONING;
1201+
slot.reasoning_end_tag = rstatus.end_tag;
12221202
slot.n_reasoning_tokens = 0;
1223-
} else {
1224-
// If actively reasoning (and we haven't already scheduled a forced close) count this token
1225-
slot.n_reasoning_tokens++;
1203+
}
1204+
} else if (!rstatus.active && slot.reasoning == REASONING_STATE_REASONING) {
1205+
SLT_DBG(slot, "detected reasoning end '%s' via parser\n", rstatus.end_tag.c_str());
1206+
slot.reasoning = REASONING_STATE_FINISHED;
1207+
slot.n_reasoning_tokens = 0;
1208+
}
12261209

1227-
if (slot.n_reasoning_tokens >= reasoning_budget) {
1228-
SLT_INF(slot, "reasoning budget exceeded, forcing close with '%s', n_reasoning_tokens = %d, reasoning_budget = %d\n",
1229-
slot.reasoning_end_tag.c_str(), slot.n_reasoning_tokens, reasoning_budget);
1210+
if (slot.reasoning == REASONING_STATE_REASONING) {
1211+
slot.n_reasoning_tokens++;
12301212

1231-
auto fail_close = [&](const char * reason) {
1232-
SLT_WRN(slot, "failed to inject reasoning close tag (%s) -> stopping generation\n", reason);
1233-
slot.stop = STOP_TYPE_LIMIT;
1234-
slot.has_next_token = false;
1235-
};
1213+
if (slot.n_reasoning_tokens >= reasoning_budget) {
1214+
SLT_INF(slot, "reasoning budget exceeded, forcing close with '%s', n_reasoning_tokens = %d, reasoning_budget = %d\n",
1215+
slot.reasoning_end_tag.c_str(), slot.n_reasoning_tokens, reasoning_budget);
12361216

1237-
if (slot.reasoning_end_tag.empty()) {
1238-
fail_close("no closing tag detected");
1239-
} else {
1240-
const std::string forced_message = slot.task->params.reasoning_force_close_message.empty()
1241-
? std::string(COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE)
1242-
: slot.task->params.reasoning_force_close_message;
1243-
const std::string forced_injection = forced_message + slot.reasoning_end_tag;
1244-
1245-
llama_tokens closing_tokens;
1246-
try {
1247-
closing_tokens = common_tokenize(ctx, forced_injection, /*add_special=*/false, /*parse_special=*/true);
1248-
} catch (const std::exception & err) {
1249-
SLT_WRN(slot, "tokenization error while forcing reasoning close: %s\n", err.what());
1250-
fail_close("tokenization error");
1251-
closing_tokens.clear();
1252-
}
1217+
auto fail_close = [&](const char * reason) {
1218+
SLT_WRN(slot, "failed to inject reasoning close tag (%s) -> stopping generation\n", reason);
1219+
slot.stop = STOP_TYPE_LIMIT;
1220+
slot.has_next_token = false;
1221+
};
12531222

1254-
if (!closing_tokens.empty()) {
1255-
slot.forced_tokens.insert(slot.forced_tokens.end(), closing_tokens.begin(), closing_tokens.end());
1256-
slot.reasoning = REASONING_STATE_PENDING_FORCE_CLOSE;
1257-
} else if (slot.has_next_token) {
1258-
fail_close("closing tag produced no tokens");
1259-
}
1223+
if (slot.reasoning_end_tag.empty()) {
1224+
fail_close("no closing tag detected");
1225+
} else {
1226+
const std::string forced_message = slot.task->params.reasoning_force_close_message.empty()
1227+
? std::string(COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE)
1228+
: slot.task->params.reasoning_force_close_message;
1229+
const std::string forced_injection = forced_message + slot.reasoning_end_tag;
1230+
1231+
llama_tokens closing_tokens;
1232+
try {
1233+
closing_tokens = common_tokenize(ctx, forced_injection, /*add_special=*/false, /*parse_special=*/true);
1234+
} catch (const std::exception & err) {
1235+
SLT_WRN(slot, "tokenization error while forcing reasoning close: %s\n", err.what());
1236+
fail_close("tokenization error");
1237+
closing_tokens.clear();
1238+
}
1239+
1240+
if (!closing_tokens.empty()) {
1241+
slot.forced_tokens.insert(slot.forced_tokens.end(), closing_tokens.begin(), closing_tokens.end());
1242+
slot.reasoning = REASONING_STATE_PENDING_FORCE_CLOSE;
1243+
} else if (slot.has_next_token) {
1244+
fail_close("closing tag produced no tokens");
12601245
}
12611246
}
12621247
}

0 commit comments

Comments
 (0)