
Commit 4084e7f

Passes long and short factors for phi3+ models using longrope (#3375)
In the canonical HF implementation of Phi3+ models, the longrope embedding uses both the long and short factors, selecting between them based on sequence length; see https://github.com/huggingface/transformers/blob/7b325cd573e40bbb12951b8446176c96e8b1afaa/src/transformers/modeling_rope_utils.py#L521. To achieve this in MLC, we need to pass both the long and short factors to the KV cache creation. The TVM side of this patch is apache/tvm#18422.
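As a rough sketch of the selection behavior described above (illustrative only; pick_longrope_factors is a hypothetical helper, not the actual transformers function):

def pick_longrope_factors(seq_len, original_max_position_embeddings, rope_scaling):
    # Long factors apply once the sequence exceeds the model's original
    # training context; otherwise the short factors apply. This mirrors
    # the selection in HF's modeling_rope_utils.py linked above.
    if seq_len > original_max_position_embeddings:
        return rope_scaling["long_factor"]
    return rope_scaling["short_factor"]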
Parent: 7b15b19

2 files changed: +6 additions, −2 deletions

python/mlc_llm/model/phi3/phi3_model.py

Lines changed: 3 additions & 1 deletion
@@ -238,7 +238,9 @@ def __init__(self, config: Phi3Config) -> None:
         self.rope_scaling = config.rope_scaling
         self.rope_theta = config.position_embedding_base
         self.rope_ext_factors = (
-            config.rope_scaling["long_factor"] if config.rope_scaling is not None else None
+            (config.rope_scaling["long_factor"] + config.rope_scaling["short_factor"])
+            if config.rope_scaling is not None
+            else None
         )
         self.tensor_parallel_shards = config.tensor_parallel_shards
         self.partial_rotary_factor = config.partial_rotary_factor
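With this change, rope_ext_factors carries both lists back-to-back, long factors first. A hypothetical sketch of how a consumer could split the flat list back into its halves (the actual consumption happens on the TVM side, apache/tvm#18422):

ext_factors = rope_scaling["long_factor"] + rope_scaling["short_factor"]
half = len(ext_factors) // 2          # both lists have equal length in Phi3-style longrope configs
long_factor = ext_factors[:half]      # first half: long factors
short_factor = ext_factors[half:]     # second half: short factors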

python/mlc_llm/model/phi3v/phi3v_model.py

Lines changed: 3 additions & 1 deletion
@@ -143,7 +143,9 @@ def __init__(self, config: Phi3VConfig) -> None:
         self.rope_scaling = config.rope_scaling
         self.rope_theta = config.position_embedding_base
         self.rope_ext_factors = (
-            config.rope_scaling["long_factor"] if config.rope_scaling is not None else None
+            (config.rope_scaling["long_factor"] + config.rope_scaling["short_factor"])
+            if config.rope_scaling is not None
+            else None
         )
         self.tensor_parallel_shards = config.tensor_parallel_shards
         self.dtype = "float32"
