Commit 6257e2f

chore(deps): bump llama-cpp to 96776405a17034dcfd53d3ddf5d142d34bdbb657 (#3793)
This also adapts the backend to the upstream changes.

Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 65ca754 commit 6257e2f

File tree

2 files changed: 39 additions & 39 deletions

Makefile

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=0e9f760eb12546704ef8fa72577bc1a3ffe1bc04
+CPPLLAMA_VERSION?=96776405a17034dcfd53d3ddf5d142d34bdbb657
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp

backend/cpp/llama/grpc-server.cpp

Lines changed: 38 additions & 38 deletions
@@ -113,15 +113,15 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
     }
     return ret;
 }
 
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -203,8 +203,8 @@ struct llama_client_slot
     std::string stopping_word;
 
     // sampling
-    struct gpt_sampler_params sparams;
-    gpt_sampler *ctx_sampling = nullptr;
+    struct common_sampler_params sparams;
+    common_sampler *ctx_sampling = nullptr;
 
     int32_t ga_i = 0; // group-attention state
     int32_t ga_n = 1; // group-attention factor
@@ -257,7 +257,7 @@ struct llama_client_slot
         images.clear();
     }
 
-    bool has_budget(gpt_params &global_params) {
+    bool has_budget(common_params &global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1)
         {
             return true; // limitless
@@ -398,7 +398,7 @@ struct llama_server_context
 
     clip_ctx *clp_ctx = nullptr;
 
-    gpt_params params;
+    common_params params;
 
     llama_batch batch;
 
@@ -441,7 +441,7 @@ struct llama_server_context
         }
     }
 
-    bool load_model(const gpt_params &params_)
+    bool load_model(const common_params &params_)
     {
         params = params_;
         if (!params.mmproj.empty()) {
@@ -458,9 +458,9 @@ struct llama_server_context
             }
         }
 
-        llama_init_result llama_init = llama_init_from_gpt_params(params);
-        model = llama_init.model;
-        ctx = llama_init.context;
+        common_init_result common_init = common_init_from_params(params);
+        model = common_init.model;
+        ctx = common_init.context;
         if (model == nullptr)
         {
             LOG_ERR("unable to load model: %s", params.model.c_str());
@@ -578,12 +578,12 @@ struct llama_server_context
                     std::vector<llama_token> p;
                     if (first)
                     {
-                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
                     }
                     else
                     {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                 }
@@ -600,7 +600,7 @@ struct llama_server_context
         else
        {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
@@ -629,7 +629,7 @@ struct llama_server_context
 
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
-        gpt_sampler_params default_sparams;
+        common_sampler_params default_sparams;
 
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
@@ -769,7 +769,7 @@ struct llama_server_context
                     }
                     else if (el[0].is_string())
                     {
-                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                         for (auto tok : toks)
                         {
                             slot->sparams.logit_bias.push_back({tok, bias});
@@ -801,7 +801,7 @@ struct llama_server_context
                     sampler_names.emplace_back(name);
                 }
             }
-            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+            slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
         }
         else
         {
@@ -885,9 +885,9 @@ struct llama_server_context
 
        if (slot->ctx_sampling != nullptr)
        {
-            gpt_sampler_free(slot->ctx_sampling);
+            common_sampler_free(slot->ctx_sampling);
        }
-        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
+        slot->ctx_sampling = common_sampler_init(model, slot->sparams);
        //llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;
 
@@ -914,13 +914,13 @@ struct llama_server_context
        system_tokens.clear();
 
        if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);
 
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
 
            for (int i = 0; i < (int)system_tokens.size(); ++i)
            {
-                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+                common_batch_add(batch, system_tokens[i], i, { 0 }, false);
            }
 
            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
@@ -1009,7 +1009,7 @@ struct llama_server_context
 
    bool process_token(completion_token_output &result, llama_client_slot &slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = common_token_to_piece(ctx, result.tok);
        slot.sampled = result.tok;
 
        // search stop word and delete it
@@ -1160,7 +1160,7 @@ struct llama_server_context
        samplers.reserve(slot.sparams.samplers.size());
        for (const auto & sampler : slot.sparams.samplers)
        {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers.emplace_back(common_sampler_type_to_str(sampler));
        }
 
        return json {
@@ -1216,7 +1216,7 @@ struct llama_server_context
        if (slot.sparams.n_probs > 0)
        {
            std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
            size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
            size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
            if (probs_pos < probs_stop_pos)
@@ -1268,7 +1268,7 @@ struct llama_server_context
            std::vector<completion_token_output> probs = {};
            if (!slot.params.stream && slot.stopped_word)
            {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
+                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
                probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
            }
            else
@@ -1408,7 +1408,7 @@ struct llama_server_context
            }
            image_idx++;
 
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
 
            // append prefix of next image
            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
@@ -1418,7 +1418,7 @@ struct llama_server_context
            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
            for (int i = 0; i < (int) append_tokens.size(); ++i)
            {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                slot.n_past += 1;
            }
        }
@@ -1550,7 +1550,7 @@ struct llama_server_context
            update_system_prompt();
        }
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
        if (all_slots_are_idle)
        {
@@ -1628,7 +1628,7 @@ struct llama_server_context
 
            // TODO: we always have to take into account the "system_tokens"
            // this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
            slot.n_past += 1;
        }
 
@@ -1722,7 +1722,7 @@ struct llama_server_context
 
                if (!slot.params.cache_prompt)
                {
-                    gpt_sampler_reset(slot.ctx_sampling);
+                    common_sampler_reset(slot.ctx_sampling);
 
                    slot.n_past = 0;
                    slot.n_past_se = 0;
@@ -1734,7 +1734,7 @@ struct llama_server_context
                    // push the prompt into the sampling context (do not apply grammar)
                    for (auto &token : prompt_tokens)
                    {
-                        gpt_sampler_accept(slot.ctx_sampling, token, false);
+                        common_sampler_accept(slot.ctx_sampling, token, false);
                    }
 
                    slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1826,7 +1826,7 @@ struct llama_server_context
                            ga_i += ga_w/ga_n;
                        }
                    }
-                    llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                    common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
                    slot_npast++;
                }
 
@@ -1943,9 +1943,9 @@ struct llama_server_context
            }
 
            completion_token_output result;
-            const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+            const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
 
-            gpt_sampler_accept(slot.ctx_sampling, id, true);
+            common_sampler_accept(slot.ctx_sampling, id, true);
 
            slot.n_decoded += 1;
            if (slot.n_decoded == 1)
@@ -1956,7 +1956,7 @@ struct llama_server_context
            }
 
            result.tok = id;
-            const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
+            const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);
 
            for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                result.probs.push_back({
@@ -2009,7 +2009,7 @@ static json format_partial_response(
 struct token_translator
 {
     llama_context * ctx;
-    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
+    std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); }
     std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };
 
@@ -2203,7 +2203,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 // }
 
 static void params_parse(const backend::ModelOptions* request,
-                               gpt_params & params) {
+                               common_params & params) {
 
     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
 
@@ -2311,7 +2311,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
    grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
        // Implement LoadModel RPC
-        gpt_params params;
+        common_params params;
        params_parse(request, params);
 
        llama_backend_init();
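
Below is a minimal standalone sketch of how the renamed llama.cpp "common" API adopted by this diff fits together end to end. The header names, the model path, and the batch/sampling loop are assumptions for illustration only; the individual calls (common_init_from_params, common_tokenize, common_sampler_init/sample/accept, common_batch_clear/add, common_token_to_piece) are the ones grpc-server.cpp switches to above.

// Sketch only (not part of the commit): the gpt_* -> common_* API after the bump.
// Header names and the model path are assumptions; the calls mirror the diff above.
#include "common.h"    // common_params, common_init_result, common_tokenize, common_batch_*
#include "sampling.h"  // common_sampler_params, common_sampler_*
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    llama_backend_init();

    common_params params;                       // was: gpt_params
    params.model = "/path/to/model.gguf";       // placeholder path
    params.n_ctx = 2048;

    // was: llama_init_result / llama_init_from_gpt_params
    common_init_result init = common_init_from_params(params);
    llama_model   * model = init.model;
    llama_context * ctx   = init.context;
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "unable to load model: %s\n", params.model.c_str());
        return 1;
    }

    // was: gpt_sampler_params / gpt_sampler_init
    common_sampler_params sparams;
    common_sampler * smpl = common_sampler_init(model, sparams);

    // was: ::llama_tokenize
    const std::vector<llama_token> prompt_tokens = common_tokenize(ctx, "Hello", true);

    // was: llama_batch_clear / llama_batch_add
    llama_batch batch = llama_batch_init(512, 0, 1);
    common_batch_clear(batch);
    for (size_t i = 0; i < prompt_tokens.size(); ++i) {
        // request logits only for the last prompt token
        common_batch_add(batch, prompt_tokens[i], (llama_pos) i, { 0 }, i + 1 == prompt_tokens.size());
    }
    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "decode failed\n");
        return 1;
    }

    int n_past = (int) prompt_tokens.size();
    for (int i = 0; i < 16; ++i) {
        // was: gpt_sampler_sample / gpt_sampler_accept / llama_token_to_piece
        const llama_token id = common_sampler_sample(smpl, ctx, batch.n_tokens - 1);
        common_sampler_accept(smpl, id, true);
        printf("%s", common_token_to_piece(ctx, id).c_str());

        // feed the sampled token back in for the next step
        common_batch_clear(batch);
        common_batch_add(batch, id, n_past++, { 0 }, true);
        llama_decode(ctx, batch);
    }
    printf("\n");

    common_sampler_free(smpl);                  // was: gpt_sampler_free
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}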

0 commit comments

Comments
 (0)