From 1208c11e146cd8efeb9dba428ccb8e6cc38e62ec Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Wed, 19 Nov 2025 15:01:38 -0800
Subject: [PATCH 1/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      | 26 ++++++++++++-------
 .../compute_context_length/llama4_cb.py       |  8 ++++++
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 16a809c96..8551cbbc5 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -869,7 +869,7 @@ def __init__(
         self,
         model: nn.Module,
         continuous_batching: bool = False,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -902,7 +902,7 @@ def __init__(
         self.input_shapes, self.output_names = None, None
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
         """
         Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path.
 
@@ -932,7 +932,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Option
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -1565,7 +1565,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal
     def __init__(
         self,
         model: nn.Module,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -1615,7 +1615,6 @@ def __init__(
     def from_pretrained(
         cls,
         pretrained_model_name_or_path,
-        qaic_config: Optional[dict] = None,
         *args,
         **kwargs,
     ):
@@ -1646,6 +1645,7 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         from transformers import AutoConfig
 
@@ -1657,7 +1657,7 @@ def from_pretrained(
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -1773,6 +1773,7 @@ def compile(
         output_names = self.model.get_output_names()
 
         # if ccl_enabled is True read Compute-Context-Length lists
+        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
                 logger.warning(
@@ -2154,7 +2155,7 @@ def __new__(
         model: nn.Module,
         kv_offload: Optional[bool] = True,
         continuous_batching: bool = False,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2178,10 +2179,10 @@ def __new__(
         """
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(
-                model, continuous_batching, qaic_config=qaic_config, **kwargs
+                model, continuous_batching, ccl_enabled=ccl_enabled, **kwargs
             )
         else:
-            return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs)
+            return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs)
 
     @classmethod
     @with_replaced_quantizers
@@ -2231,6 +2232,7 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
         return cls(
@@ -2238,7 +2240,7 @@ def from_pretrained(
             kv_offload=kv_offload,
             continuous_batching=continuous_batching,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -2289,6 +2291,7 @@ def __init__(
         model: nn.Module,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2428,6 +2431,7 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kv_offload = kwargs.pop("kv_offload", None)
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
@@ -2450,6 +2454,7 @@ def from_pretrained(
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -2872,6 +2877,7 @@ def compile(
         """
 
         # if ccl_enabled is True read Compute-Context-Length lists
+        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
                 logger.warning(
diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py
index ffbbff67f..1adfd89b6 100644
--- a/examples/performance/compute_context_length/llama4_cb.py
+++ b/examples/performance/compute_context_length/llama4_cb.py
@@ -41,9 +41,13 @@
     kv_offload=True,
     config=config,
     continuous_batching=True,
+<<<<<<< HEAD
     qaic_config={
         "ccl_enabled": True,
     },
+=======
+    ccl_enabled=True,
+>>>>>>> d58736d (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 
 qeff_model.compile(
@@ -68,9 +72,13 @@
     attn_implementation="eager",
     kv_offload=True,
     config=config,
+<<<<<<< HEAD
     qaic_config={
         "ccl_enabled": True,
     },
+=======
+    ccl_enabled=True,
+>>>>>>> d58736d (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 
 qeff_model.compile(

From ab0a10fda67ec66008068bea0eb368797968f94b Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Wed, 19 Nov 2025 16:17:21 -0800
Subject: [PATCH 2/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 examples/performance/compute_context_length/qwen2_5_vl_cb.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
index fc330e14e..39fbf6319 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
@@ -84,7 +84,11 @@
     processor=processor,
     images=image_urls,
     generation_len=100,
+<<<<<<< HEAD
     device_ids=[0, 1, 2, 3],
+=======
+    device_ids=[28, 29, 30, 31],
+>>>>>>> da18659 (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))

From 53a843f1bf5beb9303aa62e460337b2e57581365 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Wed, 19 Nov 2025 22:06:06 -0800
Subject: [PATCH 3/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 QEfficient/transformers/models/modeling_auto.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 8551cbbc5..d87cc65f4 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1773,7 +1773,6 @@ def compile(
         output_names = self.model.get_output_names()
 
         # if ccl_enabled is True read Compute-Context-Length lists
-        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
                 logger.warning(
@@ -2877,7 +2876,6 @@ def compile(
         """
 
         # if ccl_enabled is True read Compute-Context-Length lists
-        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
                 logger.warning(

From 7d5fa64db645da6d2ddf5f71555f0d327fb33249 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Sun, 23 Nov 2025 17:49:03 -0800
Subject: [PATCH 4/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 examples/performance/compute_context_length/qwen3moe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py
index b53a28362..8d53e68b5 100644
--- a/examples/performance/compute_context_length/qwen3moe.py
+++ b/examples/performance/compute_context_length/qwen3moe.py
@@ -49,6 +49,5 @@
     comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
     comp_ctx_lengths_decode=comp_ctx_lengths_decode,
 )
-
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)

From 179cd87f328c5017c41fe3d1c365ff9209b5508e Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Mon, 1 Dec 2025 16:42:42 -0800
Subject: [PATCH 5/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 examples/performance/compute_context_length/qwen2_5_vl_cb.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
index 39fbf6319..fc330e14e 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
@@ -84,11 +84,7 @@
     processor=processor,
     images=image_urls,
     generation_len=100,
-<<<<<<< HEAD
     device_ids=[0, 1, 2, 3],
-=======
-    device_ids=[28, 29, 30, 31],
->>>>>>> da18659 (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))

From 11741446955ba6565c1766626a80e4eb81ec416b Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Tue, 2 Dec 2025 14:46:12 -0800
Subject: [PATCH 6/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      | 24 ++++++++-----------
 .../compute_context_length/llama4_cb.py       |  8 -------
 .../compute_context_length/molmo.py           |  2 +-
 3 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index d87cc65f4..16a809c96 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -869,7 +869,7 @@ def __init__(
         self,
         model: nn.Module,
         continuous_batching: bool = False,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -902,7 +902,7 @@ def __init__(
         self.input_shapes, self.output_names = None, None
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs):
         """
         Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path.
 
@@ -932,7 +932,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -1565,7 +1565,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal
     def __init__(
         self,
         model: nn.Module,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -1615,6 +1615,7 @@ def __init__(
     def from_pretrained(
         cls,
         pretrained_model_name_or_path,
+        qaic_config: Optional[dict] = None,
         *args,
         **kwargs,
     ):
@@ -1645,7 +1646,6 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         from transformers import AutoConfig
 
@@ -1657,7 +1657,7 @@ def from_pretrained(
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -2154,7 +2154,7 @@ def __new__(
         model: nn.Module,
         kv_offload: Optional[bool] = True,
         continuous_batching: bool = False,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -2178,10 +2178,10 @@ def __new__(
         """
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(
-                model, continuous_batching, ccl_enabled=ccl_enabled, **kwargs
+                model, continuous_batching, qaic_config=qaic_config, **kwargs
             )
         else:
-            return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs)
+            return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs)
 
     @classmethod
     @with_replaced_quantizers
@@ -2231,7 +2231,6 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
         return cls(
@@ -2238,7 +2238,7 @@ def from_pretrained(
             kv_offload=kv_offload,
             continuous_batching=continuous_batching,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -2290,7 +2289,6 @@ def __init__(
         model: nn.Module,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
-        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2430,7 +2428,6 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kv_offload = kwargs.pop("kv_offload", None)
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
@@ -2453,7 +2450,6 @@ def from_pretrained(
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py
index 1adfd89b6..ffbbff67f 100644
--- a/examples/performance/compute_context_length/llama4_cb.py
+++ b/examples/performance/compute_context_length/llama4_cb.py
@@ -41,13 +41,9 @@
     kv_offload=True,
     config=config,
     continuous_batching=True,
-<<<<<<< HEAD
     qaic_config={
         "ccl_enabled": True,
     },
-=======
-    ccl_enabled=True,
->>>>>>> d58736d (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 
 qeff_model.compile(
@@ -72,13 +68,9 @@
     attn_implementation="eager",
     kv_offload=True,
     config=config,
-<<<<<<< HEAD
     qaic_config={
         "ccl_enabled": True,
     },
-=======
-    ccl_enabled=True,
->>>>>>> d58736d (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 
 qeff_model.compile(
diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py
index b5f1f50e6..6ee272710 100644
--- a/examples/performance/compute_context_length/molmo.py
+++ b/examples/performance/compute_context_length/molmo.py
@@ -33,7 +33,7 @@
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
     model_id,
-    kv_offload=True,
+    kv_offload=False,
     trust_remote_code=True,
     config=config,
     qaic_config={

From 8673d2ce5f63cd53b64aec5b15b6f694c4e89864 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Tue, 2 Dec 2025 14:50:40 -0800
Subject: [PATCH 7/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 examples/performance/compute_context_length/molmo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py
index 6ee272710..b5f1f50e6 100644
--- a/examples/performance/compute_context_length/molmo.py
+++ b/examples/performance/compute_context_length/molmo.py
@@ -33,7 +33,7 @@
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
     model_id,
-    kv_offload=False,
+    kv_offload=True,
     trust_remote_code=True,
     config=config,
     qaic_config={

From 2788e6ec12155344ec1aea9dff902b5c74be6b32 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Tue, 9 Dec 2025 18:27:26 -0800
Subject: [PATCH 8/9] Add automatic CCL list generation for prefill and decode
 when user does not provide lists

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      |  39 ++-
 QEfficient/utils/check_ccl_specializations.py | 240 +++++++++++++++---
 .../compute_context_length/README.md          |  22 ++
 .../compute_context_length/basic_inference.py |  10 +-
 .../compute_context_length/gemma3.py          |   8 +-
 .../compute_context_length/gpt_oss.py         |   7 +-
 .../compute_context_length/granite_vision.py  |   1 +
 .../compute_context_length/internvl.py        |   1 +
 .../compute_context_length/llama4.py          |   8 +-
 .../compute_context_length/llama4_cb.py       |  10 +-
 .../llama4_multi_image.py                     |   8 +-
 .../compute_context_length/mistral3.py        |   1 +
 .../compute_context_length/molmo.py           |   8 +-
 .../compute_context_length/qwen2_5_vl.py      |   8 +-
 .../compute_context_length/qwen2_5_vl_cb.py   |   8 +-
 .../compute_context_length/qwen3moe.py        |   8 +-
 .../compute_context_length/vlm_inference.py   |   8 +-
 17 files changed, 295 insertions(+), 100 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 16a809c96..c6312e595 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1126,17 +1126,14 @@ def compile(
 
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
-        # For supporting VLLM and Disaggregated with CCL
-        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
@@ -1774,17 +1771,14 @@ def compile(
 
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
-        # For supporting VLLM and Disaggregated with CCL
-        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
@@ -2873,16 +2867,13 @@ def compile(
 
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
-        # For supporting VLLM and Disaggregated with CCL
-        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
                 if isinstance(comp_ctx_lengths_prefill, str):
                     import ast
 
@@ -2897,7 +2888,7 @@ def compile(
                 self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
                 self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
 
-                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
         # --- Validation ---
diff --git a/QEfficient/utils/check_ccl_specializations.py b/QEfficient/utils/check_ccl_specializations.py
index 0d6a078f6..b06f83e33 100644
--- a/QEfficient/utils/check_ccl_specializations.py
+++ b/QEfficient/utils/check_ccl_specializations.py
@@ -5,40 +5,210 @@
 #
 # -----------------------------------------------------------------------------
 
+from typing import List, Optional, Tuple
+
+
+def next_multiple_of_1024(n: int) -> int:
+    """Ceil 'n' to the next multiple of 1024."""
+    if n <= 0:
+        return 0
+    return ((n + 1023) // 1024) * 1024
+
+
+def floor_to_1000(n: int) -> int:
+    """Floor 'n' to the nearest lower multiple of 1000."""
+    if n <= 0:
+        return 0
+    return (n // 1000) * 1000
+
+
+def is_power_of_two(n: int) -> bool:
+    """Return True if n is a power of two (n>0 and n&(n-1)==0)."""
+    return n > 0 and (n & (n - 1)) == 0
+
+
+def build_doubling_sequence(start: int, limit: int, max_elements: int, force_last: Optional[int] = None) -> List[int]:
+    """
+    Build an increasing sequence starting at 'start', doubling each step,
+    not exceeding 'limit', with total length <= max_elements.
+    If 'force_last' is provided, ensure the last element equals 'force_last'
+    (replacing/appending as needed), even if it exceeds 'limit'.
+    """
+    if max_elements <= 0:
+        return []
+
+    # If start is already beyond limit, return [force_last or limit] as a single element.
+    if start > limit:
+        seq = [force_last if force_last is not None else limit]
+        return seq[:max_elements]
+
+    seq: List[int] = []
+    val = start
+
+    while val <= limit and len(seq) < max_elements:
+        seq.append(val)
+        next_val = val * 2
+        if next_val > limit or len(seq) >= max_elements:
+            break
+        val = next_val
+
+    # Add/replace last element if a 'force_last' is requested
+    if force_last is not None:
+        if len(seq) == 0:
+            seq = [force_last]
+        elif seq[-1] != force_last:
+            if len(seq) < max_elements:
+                seq.append(force_last)
+            else:
+                seq[-1] = force_last
+
+    # Deduplicate while preserving order
+    dedup = []
+    seen = set()
+    for x in seq:
+        if x not in seen:
+            dedup.append(x)
+            seen.add(x)
+    return dedup[:max_elements]
+
+
+def Automatic_CCL_Generation(
+    CL: int,
+    prefill_seq_len: int,
+    comp_ctx_lengths_prefill: Optional[List[int]] = None,
+    comp_ctx_lengths_decode: Optional[List[int]] = None,
+) -> Tuple[List[int], List[int], int]:
+    """
+    Automatic Compute-Context-Length Lists Generation
+
+    Purpose:
+        Compute decode and prefill ccl lists based on an input context
+        length (CL), prefill sequence length, and optional pre-specified lists.
+    """
+
+    if CL <= 0:
+        mapped_CL = next_multiple_of_1024(max(CL, 1))
+        # For non-positive CL, minimal identical sequences
+        seq = [mapped_CL]
+        return seq, seq, mapped_CL
+
+    mapped_CL = next_multiple_of_1024(CL)
+
+    # Tiered starts
+    if mapped_CL <= 4096:
+        seq = [mapped_CL]
+        return seq, seq, mapped_CL
+    elif mapped_CL <= 32768:
+        decode_start, prefill_start = 4096, 4000
+    elif mapped_CL <= 65536:
+        decode_start, prefill_start = 8192, 8000
+    elif mapped_CL <= 131072:
+        decode_start, prefill_start = 16384, 16000
+    else:
+        decode_start, prefill_start = 16384, 16000
+
+    # If prefill_seq_len > 1:
+    if prefill_seq_len > 1:
+        # Passthrough if either provided
+        if comp_ctx_lengths_decode is not None or comp_ctx_lengths_prefill is not None:
+            return (
+                comp_ctx_lengths_decode if comp_ctx_lengths_decode is not None else [],
+                comp_ctx_lengths_prefill if comp_ctx_lengths_prefill is not None else [],
+                mapped_CL,
+            )
+
+        max_elems = 5
+
+        # Decode: ensure last = mapped_CL
+        decode = build_doubling_sequence(
+            start=decode_start,
+            limit=mapped_CL,
+            max_elements=max_elems,
+            force_last=mapped_CL,
+        )
+
+        # Prefill:
+        if is_power_of_two(CL):
+            # Strict doubling, limit = CL, no forced non-doubling last
+            prefill = build_doubling_sequence(
+                start=prefill_start,
+                limit=CL,
+                max_elements=max_elems,
+                force_last=None,
+            )
+        else:
+            prefill_last = floor_to_1000(mapped_CL)
+            prefill = build_doubling_sequence(
+                start=prefill_start,
+                limit=CL,
+                max_elements=max_elems,
+                force_last=prefill_last,
+            )
+
+        return prefill, decode, mapped_CL
+
+    # prefill_seq_len == 1 → identical lists
+    else:
+        max_elems = 10
+        grid_cap = 2097152  # upper cap for doubling grid
+
+        if mapped_CL < 4096:
+            seq = [mapped_CL]
+        else:
+            seq = build_doubling_sequence(
+                start=4096,
+                limit=min(mapped_CL, grid_cap),
+                max_elements=max_elems,
+                force_last=mapped_CL,  # identical lists end at mapped_CL
+            )
+        return seq, seq, mapped_CL
+
 
 def process_ccl_specializations(ccl_prefill, ccl_decode, ctx_len, prefill_seq_len):
-    if ccl_prefill is None or ccl_decode is None:
-        return None, None
-
-    if ctx_len is None:
-        raise TypeError("`ctx_len` is required when loading the model with CCL.")
-
-    if prefill_seq_len == 1:
-        # both prefill and decode ccl can share the same specializations since prefill_seq_len=1. So, a sorted union of both lists can be used for both of them.
-        ccl_union_all = sorted(set(ccl_prefill + ccl_decode))
-        ccl_union_all = [min(x, ctx_len) for x in ccl_union_all]
-        return ccl_union_all, ccl_union_all
-
-    # Step 1: Cap values to ctx_len
-    ccl_prefill = [min(x, ctx_len) for x in ccl_prefill]
-    ccl_decode = [min(x, ctx_len) for x in ccl_decode]
-
-    # Step 2: Remove duplicates within each list
-    ccl_prefill = list(set(ccl_prefill))
-    ccl_decode = list(set(ccl_decode))
-
-    # Step 3: Ensure no overlap between ccl_prefill and ccl_decode
-    updated_prefill = []
-    for val in ccl_prefill:
-        while val in ccl_decode or val in updated_prefill:
-            val -= 1
-            if val < 0:
-                break  # Prevent negative values
-        if val >= 0:
-            updated_prefill.append(val)
-
-    # Step 4: Sort both lists
-    updated_prefill.sort()
-    ccl_decode.sort()
-
-    return updated_prefill, ccl_decode
+    # Automatic CCL generation: If both ccl_prefill and ccl_decode are None,
+    # generate optimized context length lists for prefill and decode based on ctx_len
+    if ccl_prefill is None and ccl_decode is None:
+        ccl_prefill, ccl_decode, ctx_len = Automatic_CCL_Generation(ctx_len, prefill_seq_len, ccl_prefill, ccl_decode)
+    else:
+        if prefill_seq_len == 1:
+            if ccl_prefill is not None and ccl_decode is not None:
+                # both prefill and decode ccl can share the same specializations since prefill_seq_len=1. So, a sorted union of both lists can be used for both of them.
+                ccl_union_all = sorted(set(ccl_prefill + ccl_decode))
+                ccl_union_all = [min(x, ctx_len) for x in ccl_union_all]
+                ccl_prefill = ccl_union_all
+                ccl_decode = ccl_union_all
+        else:
+            # Step 1: Cap values to ctx_len
+            ccl_prefill = [min(x, ctx_len) for x in ccl_prefill] if ccl_prefill is not None else None
+            ccl_decode = [min(x, ctx_len) for x in ccl_decode] if ccl_decode is not None else None
+
+            # Step 2: Remove duplicates within each list
+            ccl_prefill = list(set(ccl_prefill)) if ccl_prefill is not None else None
+            ccl_decode = list(set(ccl_decode)) if ccl_decode is not None else None
+
+            if ccl_prefill is None or ccl_decode is None:
+                if ccl_prefill:
+                    ccl_prefill.sort()
+                if ccl_decode:
+                    ccl_decode.sort()
+            else:
+                # Step 3: Ensure no overlap between ccl_prefill and ccl_decode
+                tmp_prefill = ccl_prefill
+                ccl_prefill = []
+                for val in tmp_prefill:
+                    while val in ccl_decode or val in ccl_prefill:
+                        val -= 1
+                        if val < 0:
+                            break  # Prevent negative values
+                    if val >= 0:
+                        ccl_prefill.append(val)
+
+                # Step 4: Sort both lists
+                ccl_prefill.sort()
+                ccl_decode.sort()
+
+    print("CCL Configuration:")
+    print(f"  - Prefill context lengths: {ccl_prefill}")
+    print(f"  - Decode context lengths: {ccl_decode}")
+    print(f"  - Max context length: {ctx_len}")
+    return ccl_prefill, ccl_decode, ctx_len
diff --git a/examples/performance/compute_context_length/README.md b/examples/performance/compute_context_length/README.md
index 9f1d29b9a..2115251e2 100644
--- a/examples/performance/compute_context_length/README.md
+++ b/examples/performance/compute_context_length/README.md
@@ -37,11 +37,22 @@ python basic_inference.py \
     --model-name meta-llama/Llama-3.2-1B \
     --prompt "Hello, how are you?" \
     --ctx-len 1024 \
+    --ccl-enabled \
     --comp-ctx-lengths-prefill "256,500" \
     --comp-ctx-lengths-decode "512,1024" \
     --generation-len 100
 ```
 
+To generate the CCL lists automatically, pass only the `--ccl-enabled` flag and omit the CCL list arguments:
+```bash
+python basic_inference.py \
+    --model-name meta-llama/Llama-3.2-1B \
+    --prompt "Hello, how are you?" \
+    --ctx-len 1024 \
+    --ccl-enabled \
+    --generation-len 100
+```
+
 ### Vision-Language Models
 
 Run VLM inference with CCL:
@@ -55,11 +66,22 @@ python vlm_inference.py \
     --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \
     --query "Describe this image" \
     --image-url "https://..." \
+    --ccl-enabled \
     --comp-ctx-lengths-prefill "4096" \
     --comp-ctx-lengths-decode "6144,8192" \
     --ctx-len 8192
 ```
 
+To generate the CCL lists automatically, pass only the `--ccl-enabled` flag and omit the CCL list arguments:
+```bash
+python vlm_inference.py \
+    --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \
+    --query "Describe this image" \
+    --image-url "https://..." \
+    --ccl-enabled \
+    --ctx-len 8192
+```
+
 ## Available Examples
 
 ### Text-Only Models
diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py
index 4533c47e8..6e8c045fb 100644
--- a/examples/performance/compute_context_length/basic_inference.py
+++ b/examples/performance/compute_context_length/basic_inference.py
@@ -54,13 +54,13 @@ def main():
     parser.add_argument(
         "--comp-ctx-lengths-prefill",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="256,500",
+        default=None,
         help="Comma-separated list of context lengths for prefill phase (e.g., '256,500')",
     )
     parser.add_argument(
         "--comp-ctx-lengths-decode",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="512,1024",
+        default=None,
         help="Comma-separated list of context lengths for decode phase (e.g., '512,1024')",
     )
     parser.add_argument(
@@ -107,11 +107,7 @@ def main():
     args = parser.parse_args()
 
     print(f"Loading model: {args.model_name}")
-    print("CCL Configuration:")
-    print(f"  - Prefill context lengths: {args.comp_ctx_lengths_prefill}")
-    print(f"  - Decode context lengths: {args.comp_ctx_lengths_decode}")
-    print(f"  - Max context length: {args.ctx_len}")
-    print(f"  - Continuous batching: {args.continuous_batching}")
+    print(f"Continuous batching: {args.continuous_batching}")
 
     # Load model with CCL configuration
     model = QEFFAutoModelForCausalLM.from_pretrained(
diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py
index d9672b9e3..1dcec5c81 100644
--- a/examples/performance/compute_context_length/gemma3.py
+++ b/examples/performance/compute_context_length/gemma3.py
@@ -21,14 +21,16 @@
 processor = AutoProcessor.from_pretrained(model_id)
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [3072]
 comp_ctx_lengths_decode = [4096, ctx_len]
 
@@ -40,7 +42,7 @@
     attn_implementation="eager",
     kv_offload=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 
diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py
index 39a5d48ed..92bef9148 100644
--- a/examples/performance/compute_context_length/gpt_oss.py
+++ b/examples/performance/compute_context_length/gpt_oss.py
@@ -12,16 +12,17 @@
 model_id = "openai/gpt-oss-20b"  # weights are not required to convert to fp32
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 4096
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists.
-# Set the list of ccl during prefilling and decoding processes
 comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [1024, ctx_len]
 
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
diff --git a/examples/performance/compute_context_length/granite_vision.py b/examples/performance/compute_context_length/granite_vision.py
index 6dd38395c..ef5dc3a51 100644
--- a/examples/performance/compute_context_length/granite_vision.py
+++ b/examples/performance/compute_context_length/granite_vision.py
@@ -98,6 +98,7 @@ def run_model(
     num_devices = 4
     ctx_len = 8192
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [5500]
     comp_ctx_lengths_decode = [6144, ctx_len]
 
diff --git a/examples/performance/compute_context_length/internvl.py b/examples/performance/compute_context_length/internvl.py
index 19bcf4bc1..02e965e0d 100644
--- a/examples/performance/compute_context_length/internvl.py
+++ b/examples/performance/compute_context_length/internvl.py
@@ -263,6 +263,7 @@ def run_intern_on_aic(
 
     ctx_len = 8192
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [4096]
     comp_ctx_lengths_decode = [6144, ctx_len]
 
diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py
index 8cdbd70a1..a867e1bd3 100644
--- a/examples/performance/compute_context_length/llama4.py
+++ b/examples/performance/compute_context_length/llama4.py
@@ -18,14 +18,16 @@
 config.vision_config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [3072]
 # Set the list of ccl during decoding process
@@ -37,7 +39,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py
index ffbbff67f..f97160693 100644
--- a/examples/performance/compute_context_length/llama4_cb.py
+++ b/examples/performance/compute_context_length/llama4_cb.py
@@ -20,14 +20,16 @@
 processor = AutoProcessor.from_pretrained(model_id)
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 4096
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [3072]
 # Set the list of ccl during decoding process
@@ -42,7 +44,7 @@
     config=config,
     continuous_batching=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 
@@ -69,7 +71,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 
diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py
index fd513fe45..314aa49b3 100644
--- a/examples/performance/compute_context_length/llama4_multi_image.py
+++ b/examples/performance/compute_context_length/llama4_multi_image.py
@@ -18,14 +18,16 @@
 config.vision_config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [5376]
 # Set the list of ccl during decoding process
@@ -37,7 +39,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/mistral3.py b/examples/performance/compute_context_length/mistral3.py
index 3763fbcde..a773ddfd9 100644
--- a/examples/performance/compute_context_length/mistral3.py
+++ b/examples/performance/compute_context_length/mistral3.py
@@ -101,6 +101,7 @@ def run_model(
     num_cores = 16
     num_devices = 4
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [4096]
     comp_ctx_lengths_decode = [6144, ctx_len]
 
diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py
index b5f1f50e6..8d773f5fe 100644
--- a/examples/performance/compute_context_length/molmo.py
+++ b/examples/performance/compute_context_length/molmo.py
@@ -19,15 +19,17 @@
 # config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 # load the model
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [3072]  # None #
 comp_ctx_lengths_decode = [4096, 8192]  # None #
 
@@ -37,7 +39,7 @@
     trust_remote_code=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
diff --git a/examples/performance/compute_context_length/qwen2_5_vl.py b/examples/performance/compute_context_length/qwen2_5_vl.py
index 20960b6a9..5a6818930 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl.py
@@ -23,14 +23,16 @@
 config.text_config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [4096]  # None #
 comp_ctx_lengths_decode = [6144, ctx_len]  # None #
 
@@ -40,7 +42,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
index fc330e14e..c247a1e58 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
@@ -20,14 +20,16 @@
 config.text_config.num_hidden_layers = 4
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [4096]
 comp_ctx_lengths_decode = [6144, ctx_len]
 
@@ -38,7 +40,7 @@
     config=config,
     continuous_batching=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py
index 8d53e68b5..93849fa5a 100644
--- a/examples/performance/compute_context_length/qwen3moe.py
+++ b/examples/performance/compute_context_length/qwen3moe.py
@@ -17,15 +17,17 @@
 """
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 1024
 prefill_seq_len = 1
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations.
 comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [256, 512, ctx_len]
 
@@ -33,7 +35,7 @@
     model_name,
     continuous_batching=False,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 
diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py
index 876daa3e6..294632fe3 100644
--- a/examples/performance/compute_context_length/vlm_inference.py
+++ b/examples/performance/compute_context_length/vlm_inference.py
@@ -58,10 +58,6 @@ def run_model(
     """
     print(f"Loading model: {model_name}")
     print(f"KV offload (Dual QPC mode): {kv_offload}")
-    print("CCL Configuration:")
-    print(f"  - Prefill context lengths: {comp_ctx_lengths_prefill}")
-    print(f"  - Decode context lengths: {comp_ctx_lengths_decode}")
-    print(f"  - Max context length: {ctx_len}")
 
     ## STEP 1: Load the Processor and Model
 
@@ -186,13 +182,13 @@ def main():
     parser.add_argument(
         "--comp-ctx-lengths-prefill",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="4096",
+        default=None,
         help="Comma-separated list of context lengths for prefill phase (e.g., '4096')",
     )
     parser.add_argument(
         "--comp-ctx-lengths-decode",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="6144,8192",
+        default=None,
         help="Comma-separated list of context lengths for decode phase (e.g., '6144,8192')",
     )
     parser.add_argument(

From b769fc0ceb9d14fb257fcc85d23eab7dc1a855b3 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Fri, 12 Dec 2025 16:56:09 -0800
Subject: [PATCH 9/9] Add automatic CCL list generation for prefill and decode
 when user does not provide lists

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      |   6 +-
 QEfficient/utils/check_ccl_specializations.py | 227 +++++++++---------
 2 files changed, 122 insertions(+), 111 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index c6312e595..d6c4a5e65 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1127,7 +1127,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
                 self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
@@ -1772,7 +1772,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
                 self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
@@ -2868,7 +2868,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
From b769fc0ceb9d14fb257fcc85d23eab7dc1a855b3 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Fri, 12 Dec 2025 16:56:09 -0800
Subject: [PATCH 9/9] Add automatic CCL list generation for prefill and decode when the user does not provide lists

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      |   6 +-
 QEfficient/utils/check_ccl_specializations.py | 227 +++++++++---------
 2 files changed, 122 insertions(+), 111 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index c6312e595..d6c4a5e65 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1127,7 +1127,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
@@ -1772,7 +1772,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
@@ -2868,7 +2868,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
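The helper that compile() now delegates to is rewritten below. For orientation, a
quick check of what it produces (assuming the patched module is importable and
that next_multiple_of_1024 is an identity on exact multiples of 1024; the expected
values follow from the rules spelled out in the docstring below):

    from QEfficient.utils.check_ccl_specializations import automatic_ccl_generation

    # ctx_len=8192, prefill_seq_len>1: band index 0, so decode starts at 4096 and
    # prefill at 4000; 8192 is a power of two, so prefill doubles strictly.
    prefill, decode, mapped_cl = automatic_ccl_generation(8192, prefill_seq_len=128)
    print(prefill, decode, mapped_cl)  # expected: [4000, 8000] [4096, 8192] 8192

    # ctx_len=1024, prefill_seq_len=1: mapped_cl <= 4096, so both lists collapse.
    prefill, decode, mapped_cl = automatic_ccl_generation(1024, prefill_seq_len=1)
    print(prefill, decode, mapped_cl)  # expected: [1024] [1024] 1024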
+ """ + if max_elements <= 0: + return [] + if not sorted_seq: + return [last_value][:max_elements] + if sorted_seq[-1] != last_value: + if len(sorted_seq) < max_elements: + sorted_seq.append(last_value) + else: + sorted_seq[-1] = last_value + return sorted_seq[:max_elements] -def Automatic_CCL_Generation( - CL: int, +def automatic_ccl_generation( + ctx_len: int, prefill_seq_len: int, comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, @@ -82,93 +84,102 @@ def Automatic_CCL_Generation( Automatic Compute-Context-Length Lists Generation Purpose: - Compute decode and prefill ccl lists based on an input context - length (CL), prefill sequence length, and optional pre-specified lists. + Compute decode and prefill CCL lists based on an input context length (CL), + prefill sequence length, and optional pre-specified lists. + + High-level rules (unchanged from your finalized logic): + - prefill_seq_len > 1: + * If either list is provided, pass them through unchanged. + * decode: doubles from tiered start; MUST end at mapped_CL (last forced to mapped_CL). + * prefill: + • If CL is power of two: STRICT doubling from tiered start, bounded by CL (no forced non-doubling last). + • Else: doubles from tiered start, bounded by CL, and last element = floor_to_1000(mapped_CL). + * Max 5 elements per list. + - prefill_seq_len == 1: + * decode and prefill are IDENTICAL. + * start at 4096, double up to 10 elements. + * upper grid cap computed dynamically (start * 2^(max_elements-1)); last = mapped_CL. + * If mapped_CL < 4096, both lists are [mapped_CL]. """ - - if CL <= 0: - mapped_CL = next_multiple_of_1024(max(CL, 1)) - # For non-positive CL, minimal identical sequences - seq = [mapped_CL] - return seq, seq, mapped_CL - - mapped_CL = next_multiple_of_1024(CL) - - # Tiered starts - if mapped_CL <= 4096: - seq = [mapped_CL] - return seq, seq, mapped_CL - elif mapped_CL <= 32768: - decode_start, prefill_start = 4096, 4000 - elif mapped_CL <= 65536: - decode_start, prefill_start = 8192, 8000 - elif mapped_CL <= 131072: - decode_start, prefill_start = 16384, 16000 - else: - decode_start, prefill_start = 16384, 16000 - - # If prefill_seq_len > 1: + # Handle non-positive CL + if ctx_len <= 0: + mapped_cl = next_multiple_of_1024(1) + seq = [mapped_cl] + return seq, seq, mapped_cl + + mapped_cl = next_multiple_of_1024(ctx_len) + + # Early small-ctx_len case for identical lists + if mapped_cl <= 4096: + seq = [mapped_cl] + return seq, seq, mapped_cl + + # Compute tier starts via band index (no hard-coded chain) + idx = band_index_from_mapped_cl(mapped_cl) + decode_start = 4096 << idx # 4096, 8192, 16384 + PREFILL_STARTS = {0: 4000, 1: 8000, 2: 16000} + prefill_start = PREFILL_STARTS[idx] + + # Branch: prefill_seq_len > 1 if prefill_seq_len > 1: # Passthrough if either provided if comp_ctx_lengths_decode is not None or comp_ctx_lengths_prefill is not None: return ( - comp_ctx_lengths_decode if comp_ctx_lengths_decode is not None else [], comp_ctx_lengths_prefill if comp_ctx_lengths_prefill is not None else [], - mapped_CL, + comp_ctx_lengths_decode if comp_ctx_lengths_decode is not None else [], + mapped_cl, ) + # Due to limitations in the number of specializations during compilation, we set the maximum number of elements in comp_ctx_lengths_decode and comp_ctx_lengths_prefill lists to 5. 
         max_elems = 5

-        # Decode: ensure last = mapped_CL
-        decode = build_doubling_sequence(
-            start=decode_start,
-            limit=mapped_CL,
-            max_elements=max_elems,
-            force_last=mapped_CL,
-        )
-
-        # Prefill:
-        if is_power_of_two(CL):
-            # Strict doubling, limit = CL, no forced non-doubling last
-            prefill = build_doubling_sequence(
-                start=prefill_start,
-                limit=CL,
-                max_elements=max_elems,
-                force_last=None,
-            )
+        # ---- Decode: strict doubling up to mapped_cl, then enforce last = mapped_cl
+        decode_set = build_doubling_set(start=decode_start, limit=mapped_cl, max_elements=max_elems)
+        decode_list = sorted(decode_set)
+        decode_list = ensure_last(decode_list, last_value=mapped_cl, max_elements=max_elems)
+
+        # ---- Prefill:
+        if is_power_of_two(ctx_len):
+            # STRICT doubling only, bounded by ctx_len; do NOT force a non-doubling last
+            prefill_set = build_doubling_set(start=prefill_start, limit=ctx_len, max_elements=max_elems)
+            prefill_list = sorted(prefill_set)[:max_elems]
         else:
-            prefill_last = floor_to_1000(mapped_CL)
-            prefill = build_doubling_sequence(
-                start=prefill_start,
-                limit=CL,
-                max_elements=max_elems,
-                force_last=prefill_last,
-            )
+            # Doubles bounded by ctx_len, but the last element must equal floor_to_1000(mapped_cl)
+            prefill_last = floor_to_1000(mapped_cl)
+            prefill_set = build_doubling_set(start=prefill_start, limit=ctx_len, max_elements=max_elems)
+            prefill_list = sorted(prefill_set)
+            prefill_list = ensure_last(prefill_list, last_value=prefill_last, max_elements=max_elems)

-        return prefill, decode, mapped_CL
+        # NOTE: the return order is prefill first, then decode
+        return prefill_list, decode_list, mapped_cl

-    # UPDATED: prefill_seq_len == 1 → identical lists
+    # Branch: prefill_seq_len == 1 → identical lists
     else:
+        # When prefill_seq_len=1 (e.g., in MoE models), prefill and decode can share the
+        # same specializations, so the CCL lists can be twice as long.
+        # Due to limitations in the number of specializations during compilation, we cap
+        # comp_ctx_lengths_decode and comp_ctx_lengths_prefill at 10 elements each.
         max_elems = 10
-        grid_cap = 2097152  # upper cap for doubling grid
+        start_identical = 4096

-        if mapped_CL < 4096:
-            seq = [mapped_CL]
-        else:
-            seq = build_doubling_sequence(
-                start=4096,
-                limit=min(mapped_CL, grid_cap),
-                max_elements=max_elems,
-                force_last=mapped_CL,  # identical lists end at mapped_CL
-            )
-        return seq, seq, mapped_CL
+        if mapped_cl < start_identical:
+            seq = [mapped_cl]
+            return seq, seq, mapped_cl
+
+        # Dynamic grid cap: start * 2^(max_elems - 1)
+        grid_cap = start_identical * (1 << (max_elems - 1))
+        limit = min(mapped_cl, grid_cap)
+
+        seq_set = build_doubling_set(start=start_identical, limit=limit, max_elements=max_elems)
+        seq_list = sorted(seq_set)
+        seq_list = ensure_last(seq_list, last_value=mapped_cl, max_elements=max_elems)
+
+        return seq_list, seq_list, mapped_cl


 def process_ccl_specializations(ccl_prefill, ccl_decode, ctx_len, prefill_seq_len):
     # Automatic CCL generation: If both ccl_prefill and ccl_decode are None,
     # generate optimized context length lists for prefill and decode based on ctx_len
     if ccl_prefill is None and ccl_decode is None:
-        ccl_prefill, ccl_decode, ctx_len = Automatic_CCL_Generation(ctx_len, prefill_seq_len, ccl_prefill, ccl_decode)
+        ccl_prefill, ccl_decode, ctx_len = automatic_ccl_generation(ctx_len, prefill_seq_len, ccl_prefill, ccl_decode)
     else:
         if prefill_seq_len == 1:
             if ccl_prefill is not None and ccl_decode is not None: