
Commit 0ca55b6

cont : make deepseek2 consistent
1 parent 59b9e36 commit 0ca55b6

File tree

2 files changed: +7 -1 lines changed


src/llama-model.cpp

Lines changed: 5 additions & 0 deletions
@@ -1635,7 +1635,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         // that have no expert_gating_func model parameter set
         hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
     }
+
     ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+    if (hparams.rope_yarn_log_mul == 0.1f) {
+        LLAMA_LOG_WARN("%s: detected old-style YaRN RoPE scaling - overriding to 1.0f\n", __func__);
+        hparams.rope_yarn_log_mul = 1.0f;
+    }
 
     // (optional) temperature tuning - used by mistral-large
     ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
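Read on its own, the new check is a compatibility heuristic: per the warning text and the formula change below, DeepSeek-2 GGUFs converted before this commit appear to store rope_yarn_log_mul with the 0.1 factor already baked in, so a stored value of exactly 0.1f is treated as old-style metadata and normalized to 1.0f (the 0.1 factor is then applied in the graph build instead). A minimal standalone sketch of just that heuristic; the helper name and plain stderr logging are illustrative, not llama.cpp API:

    #include <cstdio>

    // Illustrative stand-in for the loader check above (not llama.cpp code):
    // a stored value of exactly 0.1f is assumed to come from an old-style GGUF,
    // where the 0.1 factor was baked into the metadata, and is normalized to 1.0f.
    static float normalize_rope_yarn_log_mul(float stored) {
        if (stored == 0.1f) {
            std::fprintf(stderr, "detected old-style YaRN RoPE scaling - overriding to 1.0f\n");
            return 1.0f;
        }
        return stored;
    }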

src/models/deepseek2.cpp

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
 
     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945
+    const float mscale = attn_factor * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
     const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
 
     ggml_tensor * cur;
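Taken together with the loader change above, the refactor should be behavior-preserving for old-style files: the old formula used the stored 0.1f directly, while the new one normalizes the stored value to 1.0f and multiplies by an explicit 0.1f. A self-contained check of that equivalence (values are illustrative, not taken from any model):

    // Standalone sketch, not llama.cpp code: verify old vs. new mscale agree
    // for an old-style GGUF that stored rope_yarn_log_mul = 0.1f.
    #include <cassert>
    #include <cmath>

    int main() {
        const float attn_factor = 1.0f;   // illustrative
        const float freq_scale  = 0.25f;  // illustrative YaRN frequency scale

        // Old behavior: the stored 0.1f was used directly in the formula.
        const float old_mscale = attn_factor * (1.0f + 0.1f * std::log(1.0f / freq_scale));

        // New behavior: the loader rewrites the stored 0.1f to 1.0f and the
        // 0.1f factor now lives in the formula itself.
        const float new_log_mul = 1.0f;
        const float new_mscale  = attn_factor * (1.0f + 0.1f * new_log_mul * std::log(1.0f / freq_scale));

        assert(std::fabs(old_mscale - new_mscale) < 1e-6f);  // identical result
        return 0;
    }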
