Skip to content

Commit 28cf17e

Browse files
committed
log how much time we spent in reasoning
1 parent e3619c3 commit 28cf17e

File tree

1 file changed: +10 additions, −2 deletions

tools/server/server-context.cpp

Lines changed: 10 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -177,9 +177,11 @@ struct server_slot {
177177
size_t n_sent_text = 0; // number of sent text character
178178

179179
int64_t t_start_process_prompt;
180+
int64_t t_start_reasoning;
180181
int64_t t_start_generation;
181182

182183
double t_prompt_processing; // ms
184+
double t_reasoning_token_generation; // ms
183185
double t_token_generation; // ms
184186

185187
std::function<void(int)> callback_on_release;
@@ -394,15 +396,20 @@ struct server_slot {
394396
const double t_prompt = t_prompt_processing / n_prompt_tokens_processed;
395397
const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
396398

399+
const double t_reasoning = t_reasoning_token_generation / n_reasoning_tokens;
400+
const double n_reasoning_second = 1e3 / t_reasoning_token_generation * n_reasoning_tokens;
401+
397402
const double t_gen = t_token_generation / n_decoded;
398403
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
399404

400405
SLT_INF(*this,
401406
"\n"
402407
"prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
408+
" reasoning time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
403409
" eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
404410
" total time = %10.2f ms / %5d tokens\n",
405411
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
412+
t_reasoning_token_generation, n_reasoning_tokens, t_reasoning, n_reasoning_second,
406413
t_token_generation, n_decoded, t_gen, n_gen_second,
407414
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
408415

@@ -1200,11 +1207,12 @@ struct server_context_impl {
12001207
slot.reasoning = REASONING_STATE_REASONING;
12011208
slot.reasoning_end_tag = rstatus.end_tag;
12021209
slot.n_reasoning_tokens = 0;
1210+
slot.t_start_reasoning = ggml_time_us();
12031211
}
12041212
} else if (!rstatus.active && slot.reasoning == REASONING_STATE_REASONING) {
12051213
SLT_DBG(slot, "detected reasoning end '%s' via parser\n", rstatus.end_tag.c_str());
12061214
slot.reasoning = REASONING_STATE_FINISHED;
1207-
slot.n_reasoning_tokens = 0;
1215+
slot.t_reasoning_token_generation = (ggml_time_us() - slot.t_start_reasoning) / 1e3;
12081216
}
12091217

12101218
if (slot.reasoning == REASONING_STATE_REASONING) {
@@ -1254,7 +1262,7 @@ struct server_context_impl {
12541262
if (slot.forced_tokens.empty()) {
12551263
SLT_DBG(slot, "completed forced reasoning close with '%s'\n", slot.reasoning_end_tag.c_str());
12561264
slot.reasoning = REASONING_STATE_FINISHED;
1257-
slot.n_reasoning_tokens = 0;
1265+
slot.t_reasoning_token_generation = (ggml_time_us() - slot.t_start_reasoning) / 1e3;
12581266
}
12591267
}
12601268
}

0 commit comments

Comments (0)