@@ -113,15 +113,15 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
     std::string ret;
     for (; begin != end; ++begin)
     {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
     }
     return ret;
 }

 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
     // if the size is 1 and first bit is 1, meaning it's a partial character
     // (size > 1 meaning it's already a known token)
     if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -203,8 +203,8 @@ struct llama_client_slot
     std::string stopping_word;

     // sampling
-    struct gpt_sampler_params sparams;
-    gpt_sampler *ctx_sampling = nullptr;
+    struct common_sampler_params sparams;
+    common_sampler *ctx_sampling = nullptr;

     int32_t ga_i = 0; // group-attention state
     int32_t ga_n = 1; // group-attention factor
@@ -257,7 +257,7 @@ struct llama_client_slot
         images.clear();
     }

-    bool has_budget(gpt_params &global_params) {
+    bool has_budget(common_params &global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1)
         {
             return true; // limitless
@@ -398,7 +398,7 @@ struct llama_server_context

     clip_ctx *clp_ctx = nullptr;

-    gpt_params params;
+    common_params params;

     llama_batch batch;

@@ -441,7 +441,7 @@ struct llama_server_context
         }
     }

-    bool load_model(const gpt_params &params_)
+    bool load_model(const common_params &params_)
     {
         params = params_;
         if (!params.mmproj.empty()) {
@@ -458,9 +458,9 @@ struct llama_server_context
             }
         }

-        llama_init_result llama_init = llama_init_from_gpt_params(params);
-        model = llama_init.model;
-        ctx = llama_init.context;
+        common_init_result common_init = common_init_from_params(params);
+        model = common_init.model;
+        ctx = common_init.context;
         if (model == nullptr)
         {
             LOG_ERR("unable to load model: %s", params.model.c_str());
@@ -578,12 +578,12 @@ struct llama_server_context
                     std::vector<llama_token> p;
                     if (first)
                     {
-                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
                     }
                     else
                     {
-                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                 }
@@ -600,7 +600,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }

         return prompt_tokens;
@@ -629,7 +629,7 @@ struct llama_server_context

     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
-        gpt_sampler_params default_sparams;
+        common_sampler_params default_sparams;

         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
@@ -769,7 +769,7 @@ struct llama_server_context
                     }
                     else if (el[0].is_string())
                     {
-                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                         for (auto tok : toks)
                         {
                             slot->sparams.logit_bias.push_back({tok, bias});
@@ -801,7 +801,7 @@ struct llama_server_context
                     sampler_names.emplace_back(name);
                 }
             }
-            slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+            slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
         }
         else
         {
@@ -885,9 +885,9 @@ struct llama_server_context

         if (slot->ctx_sampling != nullptr)
         {
-            gpt_sampler_free(slot->ctx_sampling);
+            common_sampler_free(slot->ctx_sampling);
         }
-        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
+        slot->ctx_sampling = common_sampler_init(model, slot->sparams);
         // llama_set_rng_seed(ctx, slot->params.seed);
         slot->command = LOAD_PROMPT;

@@ -914,13 +914,13 @@ struct llama_server_context
         system_tokens.clear();

         if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);

-            llama_batch_clear(batch);
+            common_batch_clear(batch);

             for (int i = 0; i < (int)system_tokens.size(); ++i)
             {
-                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+                common_batch_add(batch, system_tokens[i], i, { 0 }, false);
             }

             for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
@@ -1009,7 +1009,7 @@ struct llama_server_context

     bool process_token(completion_token_output &result, llama_client_slot &slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = common_token_to_piece(ctx, result.tok);
         slot.sampled = result.tok;

         // search stop word and delete it
@@ -1160,7 +1160,7 @@ struct llama_server_context
         samplers.reserve(slot.sparams.samplers.size());
         for (const auto & sampler : slot.sparams.samplers)
         {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers.emplace_back(common_sampler_type_to_str(sampler));
         }

         return json {
@@ -1216,7 +1216,7 @@ struct llama_server_context
         if (slot.sparams.n_probs > 0)
         {
             std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
             size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
             if (probs_pos < probs_stop_pos)
@@ -1268,7 +1268,7 @@ struct llama_server_context
         std::vector<completion_token_output> probs = {};
         if (!slot.params.stream && slot.stopped_word)
         {
-            const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
+            const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
             probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
         }
         else
@@ -1408,7 +1408,7 @@ struct llama_server_context
             }
             image_idx++;

-            llama_batch_clear(batch);
+            common_batch_clear(batch);

             // append prefix of next image
             const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
@@ -1418,7 +1418,7 @@ struct llama_server_context
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                 slot.n_past += 1;
             }
         }
@@ -1550,7 +1550,7 @@ struct llama_server_context
             update_system_prompt();
         }

-        llama_batch_clear(batch);
+        common_batch_clear(batch);

         if (all_slots_are_idle)
         {
@@ -1628,7 +1628,7 @@ struct llama_server_context

             // TODO: we always have to take into account the "system_tokens"
             //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
             slot.n_past += 1;
         }

@@ -1722,7 +1722,7 @@ struct llama_server_context

                 if (!slot.params.cache_prompt)
                 {
-                    gpt_sampler_reset(slot.ctx_sampling);
+                    common_sampler_reset(slot.ctx_sampling);

                     slot.n_past = 0;
                     slot.n_past_se = 0;
@@ -1734,7 +1734,7 @@ struct llama_server_context
                     // push the prompt into the sampling context (do not apply grammar)
                     for (auto &token : prompt_tokens)
                     {
-                        gpt_sampler_accept(slot.ctx_sampling, token, false);
+                        common_sampler_accept(slot.ctx_sampling, token, false);
                     }

                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1826,7 +1826,7 @@ struct llama_server_context
                             ga_i += ga_w/ga_n;
                         }
                     }
-                    llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id}, false);
+                    common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id}, false);
                     slot_npast++;
                 }

@@ -1943,9 +1943,9 @@ struct llama_server_context
             }

             completion_token_output result;
-            const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+            const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

-            gpt_sampler_accept(slot.ctx_sampling, id, true);
+            common_sampler_accept(slot.ctx_sampling, id, true);

             slot.n_decoded += 1;
             if (slot.n_decoded == 1)
@@ -1956,7 +1956,7 @@ struct llama_server_context
             }

             result.tok = id;
-            const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
+            const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);

             for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                 result.probs.push_back({
@@ -2009,7 +2009,7 @@ static json format_partial_response(
 struct token_translator
 {
     llama_context * ctx;
-    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
+    std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); }
     std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };

@@ -2203,7 +2203,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 // }

 static void params_parse(const backend::ModelOptions* request,
-                         gpt_params & params) {
+                         common_params & params) {

     // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809

@@ -2311,7 +2311,7 @@ class BackendServiceImpl final : public backend::Backend::Service {

     grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
         // Implement LoadModel RPC
-        gpt_params params;
+        common_params params;
         params_parse(request, params);

         llama_backend_init();