Commit 6257e2f

chore(deps): bump llama-cpp to 96776405a17034dcfd53d3ddf5d142d34bdbb657 (#3793)
This also adapts the backend to the upstream changes.

Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 65ca754 commit 6257e2f

File tree

2 files changed: 39 additions & 39 deletions

Makefile

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=0e9f760eb12546704ef8fa72577bc1a3ffe1bc04
+CPPLLAMA_VERSION?=96776405a17034dcfd53d3ddf5d142d34bdbb657
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp

backend/cpp/llama/grpc-server.cpp

Lines changed: 38 additions & 38 deletions
@@ -113,15 +113,15 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
     }
     return ret;
 }
 
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -203,8 +203,8 @@ struct llama_client_slot
     std::string stopping_word;
 
     // sampling
-    struct gpt_sampler_params sparams;
-    gpt_sampler *ctx_sampling = nullptr;
+    struct common_sampler_params sparams;
+    common_sampler *ctx_sampling = nullptr;
 
     int32_t ga_i = 0; // group-attention state
     int32_t ga_n = 1; // group-attention factor
@@ -257,7 +257,7 @@ struct llama_client_slot
         images.clear();
     }
 
-    bool has_budget(gpt_params &global_params) {
+    bool has_budget(common_params &global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1)
         {
             return true; // limitless
@@ -398,7 +398,7 @@ struct llama_server_context
 
     clip_ctx *clp_ctx = nullptr;
 
-    gpt_params params;
+    common_params params;
 
     llama_batch batch;
 
@@ -441,7 +441,7 @@ struct llama_server_context
         }
     }
 
-    bool load_model(const gpt_params &params_)
+    bool load_model(const common_params &params_)
     {
         params = params_;
         if (!params.mmproj.empty()) {
@@ -458,9 +458,9 @@ struct llama_server_context
             }
         }
 
-        llama_init_result llama_init = llama_init_from_gpt_params(params);
-        model = llama_init.model;
-        ctx = llama_init.context;
+        common_init_result common_init = common_init_from_params(params);
+        model = common_init.model;
+        ctx = common_init.context;
         if (model == nullptr)
         {
             LOG_ERR("unable to load model: %s", params.model.c_str());
@@ -578,12 +578,12 @@ struct llama_server_context
                     std::vector<llama_token> p;
                     if (first)
                     {
-                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
                     }
                     else
                     {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                 }
@@ -600,7 +600,7 @@ struct llama_server_context
         else
        {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
@@ -629,7 +629,7 @@ struct llama_server_context
 
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
-        gpt_sampler_params default_sparams;
+        common_sampler_params default_sparams;
 
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
@@ -769,7 +769,7 @@ struct llama_server_context
                     }
                     else if (el[0].is_string())
                     {
-                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                         for (auto tok : toks)
                         {
                             slot->sparams.logit_bias.push_back({tok, bias});
@@ -801,7 +801,7 @@ struct llama_server_context
                     sampler_names.emplace_back(name);
                 }
             }
-            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+            slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
         }
         else
         {
@@ -885,9 +885,9 @@ struct llama_server_context
 
        if (slot->ctx_sampling != nullptr)
        {
-            gpt_sampler_free(slot->ctx_sampling);
+            common_sampler_free(slot->ctx_sampling);
        }
-        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
+        slot->ctx_sampling = common_sampler_init(model, slot->sparams);
        //llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;
 
@@ -914,13 +914,13 @@ struct llama_server_context
        system_tokens.clear();
 
        if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);
 
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
 
            for (int i = 0; i < (int)system_tokens.size(); ++i)
            {
-                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+                common_batch_add(batch, system_tokens[i], i, { 0 }, false);
            }
 
            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
@@ -1009,7 +1009,7 @@ struct llama_server_context
 
    bool process_token(completion_token_output &result, llama_client_slot &slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = common_token_to_piece(ctx, result.tok);
        slot.sampled = result.tok;
 
        // search stop word and delete it
@@ -1160,7 +1160,7 @@ struct llama_server_context
        samplers.reserve(slot.sparams.samplers.size());
        for (const auto & sampler : slot.sparams.samplers)
        {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers.emplace_back(common_sampler_type_to_str(sampler));
        }
 
        return json {
@@ -1216,7 +1216,7 @@ struct llama_server_context
        if (slot.sparams.n_probs > 0)
        {
            std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
            size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
            size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
            if (probs_pos < probs_stop_pos)
@@ -1268,7 +1268,7 @@ struct llama_server_context
            std::vector<completion_token_output> probs = {};
            if (!slot.params.stream && slot.stopped_word)
            {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
+                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
                probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
            }
            else
@@ -1408,7 +1408,7 @@ struct llama_server_context
            }
            image_idx++;
 
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
 
            // append prefix of next image
            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
@@ -1418,7 +1418,7 @@ struct llama_server_context
            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
            for (int i = 0; i < (int) append_tokens.size(); ++i)
            {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                slot.n_past += 1;
            }
        }
@@ -1550,7 +1550,7 @@ struct llama_server_context
            update_system_prompt();
        }
 
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
        if (all_slots_are_idle)
        {
@@ -1628,7 +1628,7 @@ struct llama_server_context
 
            // TODO: we always have to take into account the "system_tokens"
            // this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
            slot.n_past += 1;
        }
 
@@ -1722,7 +1722,7 @@ struct llama_server_context
 
                if (!slot.params.cache_prompt)
                {
-                    gpt_sampler_reset(slot.ctx_sampling);
+                    common_sampler_reset(slot.ctx_sampling);
 
                    slot.n_past = 0;
                    slot.n_past_se = 0;
@@ -1734,7 +1734,7 @@ struct llama_server_context
                    // push the prompt into the sampling context (do not apply grammar)
                    for (auto &token : prompt_tokens)
                    {
-                        gpt_sampler_accept(slot.ctx_sampling, token, false);
+                        common_sampler_accept(slot.ctx_sampling, token, false);
                    }
 
                    slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1826,7 +1826,7 @@ struct llama_server_context
                            ga_i += ga_w/ga_n;
                        }
                    }
-                    llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                    common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
                    slot_npast++;
                }
 
@@ -1943,9 +1943,9 @@ struct llama_server_context
            }
 
            completion_token_output result;
-            const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+            const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
 
-            gpt_sampler_accept(slot.ctx_sampling, id, true);
+            common_sampler_accept(slot.ctx_sampling, id, true);
 
            slot.n_decoded += 1;
            if (slot.n_decoded == 1)
@@ -1956,7 +1956,7 @@ struct llama_server_context
            }
 
            result.tok = id;
-            const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
+            const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);
 
            for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                result.probs.push_back({
@@ -2009,7 +2009,7 @@ static json format_partial_response(
 struct token_translator
 {
     llama_context * ctx;
-    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
+    std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); }
     std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };
 
@@ -2203,7 +2203,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 // }
 
 static void params_parse(const backend::ModelOptions* request,
-                               gpt_params & params) {
+                               common_params & params) {
 
     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
 
@@ -2311,7 +2311,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
    grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
        // Implement LoadModel RPC
-        gpt_params params;
+        common_params params;
        params_parse(request, params);
 
        llama_backend_init();
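
Below is a minimal standalone sketch of how the renamed llama.cpp "common" API adopted by this diff fits together end to end. The header names, the model path, and the batch/sampling loop are assumptions for illustration only; the individual calls (common_init_from_params, common_tokenize, common_sampler_init/sample/accept, common_batch_clear/add, common_token_to_piece) are the ones grpc-server.cpp switches to above.

// Sketch only (not part of the commit): the gpt_* -> common_* API after the bump.
// Header names and the model path are assumptions; the calls mirror the diff above.
#include "common.h"    // common_params, common_init_result, common_tokenize, common_batch_*
#include "sampling.h"  // common_sampler_params, common_sampler_*
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    llama_backend_init();

    common_params params;                       // was: gpt_params
    params.model = "/path/to/model.gguf";       // placeholder path
    params.n_ctx = 2048;

    // was: llama_init_result / llama_init_from_gpt_params
    common_init_result init = common_init_from_params(params);
    llama_model   * model = init.model;
    llama_context * ctx   = init.context;
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "unable to load model: %s\n", params.model.c_str());
        return 1;
    }

    // was: gpt_sampler_params / gpt_sampler_init
    common_sampler_params sparams;
    common_sampler * smpl = common_sampler_init(model, sparams);

    // was: ::llama_tokenize
    const std::vector<llama_token> prompt_tokens = common_tokenize(ctx, "Hello", true);

    // was: llama_batch_clear / llama_batch_add
    llama_batch batch = llama_batch_init(512, 0, 1);
    common_batch_clear(batch);
    for (size_t i = 0; i < prompt_tokens.size(); ++i) {
        // request logits only for the last prompt token
        common_batch_add(batch, prompt_tokens[i], (llama_pos) i, { 0 }, i + 1 == prompt_tokens.size());
    }
    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "decode failed\n");
        return 1;
    }

    int n_past = (int) prompt_tokens.size();
    for (int i = 0; i < 16; ++i) {
        // was: gpt_sampler_sample / gpt_sampler_accept / llama_token_to_piece
        const llama_token id = common_sampler_sample(smpl, ctx, batch.n_tokens - 1);
        common_sampler_accept(smpl, id, true);
        printf("%s", common_token_to_piece(ctx, id).c_str());

        // feed the sampled token back in for the next step
        common_batch_clear(batch);
        common_batch_add(batch, id, n_past++, { 0 }, true);
        llama_decode(ctx, batch);
    }
    printf("\n");

    common_sampler_free(smpl);                  // was: gpt_sampler_free
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}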

0 commit comments

Comments
 (0)