From 1208c11e146cd8efeb9dba428ccb8e6cc38e62ec Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Wed, 19 Nov 2025 15:01:38 -0800
Subject: [PATCH 1/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      | 26 ++++++++++++-------
 .../compute_context_length/llama4_cb.py       |  8 ++++++
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 16a809c96..8551cbbc5 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -869,7 +869,7 @@ def __init__(
         self,
         model: nn.Module,
         continuous_batching: bool = False,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -902,7 +902,7 @@ def __init__(
         self.input_shapes, self.output_names = None, None
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
         """
         Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path.
 
@@ -932,7 +932,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Option
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -1565,7 +1565,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal
     def __init__(
         self,
         model: nn.Module,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -1615,7 +1615,6 @@ def __init__(
     def from_pretrained(
         cls,
         pretrained_model_name_or_path,
-        qaic_config: Optional[dict] = None,
         *args,
         **kwargs,
     ):
@@ -1646,6 +1645,7 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         from transformers import AutoConfig
 
@@ -1657,7 +1657,7 @@ def from_pretrained(
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -1773,6 +1773,7 @@ def compile(
         output_names = self.model.get_output_names()
 
         # if ccl_enabled is True read Compute-Context-Length lists
+        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
                 logger.warning(
@@ -2154,7 +2155,7 @@ def __new__(
         model: nn.Module,
         kv_offload: Optional[bool] = True,
         continuous_batching: bool = False,
-        qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2178,10 +2179,10 @@ def __new__(
         """
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(
-                model, continuous_batching, qaic_config=qaic_config, **kwargs
+                model, continuous_batching, ccl_enabled=ccl_enabled, **kwargs
             )
         else:
-            return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs)
+            return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs)
 
     @classmethod
     @with_replaced_quantizers
@@ -2231,6 +2232,7 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
         return cls(
@@ -2238,7 +2240,7 @@ def from_pretrained(
             kv_offload=kv_offload,
             continuous_batching=continuous_batching,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            qaic_config=qaic_config,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -2289,6 +2291,7 @@ def __init__(
         model: nn.Module,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
+        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2428,6 +2431,7 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kv_offload = kwargs.pop("kv_offload", None)
+        ccl_enabled = kwargs.pop("ccl_enabled", None)
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
@@ -2450,6 +2454,7 @@ def from_pretrained(
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
+            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
@@ -2872,6 +2877,7 @@ def compile(
         """
 
         # if ccl_enabled is True read Compute-Context-Length lists
+        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
                 logger.warning(
diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py
index ffbbff67f..1adfd89b6 100644
--- a/examples/performance/compute_context_length/llama4_cb.py
+++ b/examples/performance/compute_context_length/llama4_cb.py
@@ -41,9 +41,13 @@
     kv_offload=True,
     config=config,
     continuous_batching=True,
+<<<<<<< HEAD
     qaic_config={
         "ccl_enabled": True,
     },
+=======
+    ccl_enabled=True,
+>>>>>>> d58736d (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 
 qeff_model.compile(
@@ -68,9 +72,13 @@
     attn_implementation="eager",
     kv_offload=True,
     config=config,
+<<<<<<< HEAD
     qaic_config={
         "ccl_enabled": True,
     },
+=======
+    ccl_enabled=True,
+>>>>>>> d58736d (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 
 qeff_model.compile(

From ab0a10fda67ec66008068bea0eb368797968f94b Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Wed, 19 Nov 2025 16:17:21 -0800
Subject: [PATCH 2/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 examples/performance/compute_context_length/qwen2_5_vl_cb.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
index fc330e14e..39fbf6319 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
@@ -84,7 +84,11 @@
     processor=processor,
     images=image_urls,
     generation_len=100,
+<<<<<<< HEAD
     device_ids=[0, 1, 2, 3],
+=======
+    device_ids=[28, 29, 30, 31],
+>>>>>>> da18659 (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))

From 53a843f1bf5beb9303aa62e460337b2e57581365 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Wed, 19 Nov 2025 22:06:06 -0800
Subject: [PATCH 3/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 QEfficient/transformers/models/modeling_auto.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 8551cbbc5..d87cc65f4 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1773,7 +1773,6 @@ def compile(
         output_names = self.model.get_output_names()
 
         # if ccl_enabled is True read Compute-Context-Length lists
-        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
                 logger.warning(
@@ -2877,7 +2876,6 @@ def compile(
         """
 
         # if ccl_enabled is True read Compute-Context-Length lists
-        self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = None, None
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
                 logger.warning(

From 7d5fa64db645da6d2ddf5f71555f0d327fb33249 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Sun, 23 Nov 2025 17:49:03 -0800
Subject: [PATCH 4/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 examples/performance/compute_context_length/qwen3moe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py
index b53a28362..8d53e68b5 100644
--- a/examples/performance/compute_context_length/qwen3moe.py
+++ b/examples/performance/compute_context_length/qwen3moe.py
@@ -49,6 +49,5 @@
     comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
     comp_ctx_lengths_decode=comp_ctx_lengths_decode,
 )
-
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)

From 179cd87f328c5017c41fe3d1c365ff9209b5508e Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Mon, 1 Dec 2025 16:42:42 -0800
Subject: [PATCH 5/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 examples/performance/compute_context_length/qwen2_5_vl_cb.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
index 39fbf6319..fc330e14e 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
@@ -84,11 +84,7 @@
     processor=processor,
     images=image_urls,
     generation_len=100,
-<<<<<<< HEAD
     device_ids=[0, 1, 2, 3],
-=======
-    device_ids=[28, 29, 30, 31],
->>>>>>> da18659 (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 print(output.generated_ids)
 print(tokenizer.batch_decode(output.generated_ids))

From 11741446955ba6565c1766626a80e4eb81ec416b Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Tue, 2 Dec 2025 14:46:12 -0800
Subject: [PATCH 6/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      | 24 ++++++++-----------
 .../compute_context_length/llama4_cb.py       |  8 -------
 .../compute_context_length/molmo.py           |  2 +-
 3 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index d87cc65f4..16a809c96 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -869,7 +869,7 @@ def __init__(
         self,
         model: nn.Module,
         continuous_batching: bool = False,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -902,7 +902,7 @@ def __init__(
         self.input_shapes, self.output_names = None, None
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path: str, qaic_config: Optional[dict] = None, **kwargs):
         """
         Load a QEfficient multimodal model for dual QPC from a pretrained HuggingFace model or local path.
 
@@ -932,7 +932,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -1565,7 +1565,7 @@ class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, Multimodal
     def __init__(
         self,
         model: nn.Module,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -1615,6 +1615,7 @@ def __init__(
     def from_pretrained(
         cls,
         pretrained_model_name_or_path,
+        qaic_config: Optional[dict] = None,
         *args,
         **kwargs,
     ):
@@ -1645,7 +1646,6 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         from transformers import AutoConfig
 
@@ -1657,7 +1657,7 @@ def from_pretrained(
         return cls(
             model,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -2154,7 +2154,7 @@ def __new__(
         model: nn.Module,
         kv_offload: Optional[bool] = True,
         continuous_batching: bool = False,
-        ccl_enabled: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         """
@@ -2178,10 +2178,10 @@ def __new__(
         """
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(
-                model, continuous_batching, ccl_enabled=ccl_enabled, **kwargs
+                model, continuous_batching, qaic_config=qaic_config, **kwargs
             )
         else:
-            return _QEFFAutoModelForImageTextToTextSingleQPC(model, ccl_enabled=ccl_enabled, **kwargs)
+            return _QEFFAutoModelForImageTextToTextSingleQPC(model, qaic_config=qaic_config, **kwargs)
 
     @classmethod
     @with_replaced_quantizers
@@ -2231,7 +2231,6 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
 
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
         return cls(
@@ -2238,7 +2238,7 @@ def from_pretrained(
             kv_offload=kv_offload,
             continuous_batching=continuous_batching,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
+            qaic_config=qaic_config,
             **kwargs,
         )
 
@@ -2290,7 +2289,6 @@ def __init__(
         model: nn.Module,
         continuous_batching: bool = False,
         qaic_config: Optional[dict] = None,
-        ccl_enabled: bool = False,
         **kwargs,
     ):
         """
@@ -2430,7 +2428,6 @@ def from_pretrained(
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kv_offload = kwargs.pop("kv_offload", None)
-        ccl_enabled = kwargs.pop("ccl_enabled", None)
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
@@ -2453,7 +2450,6 @@ def from_pretrained(
             continuous_batching=continuous_batching,
             qaic_config=qaic_config,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            ccl_enabled=ccl_enabled,
             **kwargs,
         )
 
diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py
index 1adfd89b6..ffbbff67f 100644
--- a/examples/performance/compute_context_length/llama4_cb.py
+++ b/examples/performance/compute_context_length/llama4_cb.py
@@ -41,13 +41,9 @@
     kv_offload=True,
     config=config,
     continuous_batching=True,
-<<<<<<< HEAD
     qaic_config={
         "ccl_enabled": True,
     },
-=======
-    ccl_enabled=True,
->>>>>>> d58736d (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 
 qeff_model.compile(
@@ -72,13 +68,9 @@
     attn_implementation="eager",
     kv_offload=True,
     config=config,
-<<<<<<< HEAD
     qaic_config={
         "ccl_enabled": True,
     },
-=======
-    ccl_enabled=True,
->>>>>>> d58736d (Adding ccl_enabled flag during model loading and passing CCL lists during compilation process)
 )
 
 qeff_model.compile(
diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py
index b5f1f50e6..6ee272710 100644
--- a/examples/performance/compute_context_length/molmo.py
+++ b/examples/performance/compute_context_length/molmo.py
@@ -33,7 +33,7 @@
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
     model_id,
-    kv_offload=True,
+    kv_offload=False,
     trust_remote_code=True,
     config=config,
     qaic_config={

From 8673d2ce5f63cd53b64aec5b15b6f694c4e89864 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Tue, 2 Dec 2025 14:50:40 -0800
Subject: [PATCH 7/9] Adding ccl_enabled flag during model loading and passing
 CCL lists during compilation process

Signed-off-by: Vahid Janfaza
---
 examples/performance/compute_context_length/molmo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py
index 6ee272710..b5f1f50e6 100644
--- a/examples/performance/compute_context_length/molmo.py
+++ b/examples/performance/compute_context_length/molmo.py
@@ -33,7 +33,7 @@
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
     model_id,
-    kv_offload=False,
+    kv_offload=True,
     trust_remote_code=True,
     config=config,
     qaic_config={

From 2788e6ec12155344ec1aea9dff902b5c74be6b32 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Tue, 9 Dec 2025 18:27:26 -0800
Subject: [PATCH 8/9] Add automatic CCL list generation for prefill and decode
 when user does not provide lists

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      |  39 ++-
 QEfficient/utils/check_ccl_specializations.py | 240 +++++++++++++++---
 .../compute_context_length/README.md          |  22 ++
 .../compute_context_length/basic_inference.py |  10 +-
 .../compute_context_length/gemma3.py          |   8 +-
 .../compute_context_length/gpt_oss.py         |   7 +-
 .../compute_context_length/granite_vision.py  |   1 +
 .../compute_context_length/internvl.py        |   1 +
 .../compute_context_length/llama4.py          |   8 +-
 .../compute_context_length/llama4_cb.py       |  10 +-
 .../llama4_multi_image.py                     |   8 +-
 .../compute_context_length/mistral3.py        |   1 +
 .../compute_context_length/molmo.py           |   8 +-
 .../compute_context_length/qwen2_5_vl.py      |   8 +-
 .../compute_context_length/qwen2_5_vl_cb.py   |   8 +-
 .../compute_context_length/qwen3moe.py        |   8 +-
 .../compute_context_length/vlm_inference.py   |   8 +-
 17 files changed, 295 insertions(+), 100 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 16a809c96..c6312e595 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1126,17 +1126,14 @@ def compile(
 
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
-        # For supporting VLLM and Disaggregated with CCL
-        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
@@ -1774,17 +1771,14 @@ def compile(
 
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
-        # For supporting VLLM and Disaggregated with CCL
-        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
@@ -2873,16 +2867,13 @@ def compile(
 
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
 
-        # For supporting VLLM and Disaggregated with CCL
-        if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
                 if isinstance(comp_ctx_lengths_prefill, str):
                     import ast
 
@@ -2897,7 +2888,7 @@ def compile(
                 self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
                 self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
 
-                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
         # --- Validation ---
diff --git a/QEfficient/utils/check_ccl_specializations.py b/QEfficient/utils/check_ccl_specializations.py
index 0d6a078f6..b06f83e33 100644
--- a/QEfficient/utils/check_ccl_specializations.py
+++ b/QEfficient/utils/check_ccl_specializations.py
@@ -5,40 +5,210 @@
 #
 # -----------------------------------------------------------------------------
 
+from typing import List, Optional, Tuple
+
+
+def next_multiple_of_1024(n: int) -> int:
+    """Ceil 'n' to the next multiple of 1024."""
+    if n <= 0:
+        return 0
+    return ((n + 1023) // 1024) * 1024
+
+
+def floor_to_1000(n: int) -> int:
+    """Floor 'n' to the nearest lower multiple of 1000."""
+    if n <= 0:
+        return 0
+    return (n // 1000) * 1000
+
+
+def is_power_of_two(n: int) -> bool:
+    """Return True if n is a power of two (n>0 and n&(n-1)==0)."""
+    return n > 0 and (n & (n - 1)) == 0
+
+
+def build_doubling_sequence(start: int, limit: int, max_elements: int, force_last: Optional[int] = None) -> List[int]:
+    """
+    Build an increasing sequence starting at 'start', doubling each step,
+    not exceeding 'limit', with total length <= max_elements.
+    If 'force_last' is provided, ensure the last element equals 'force_last'
+    (replacing/appending as needed), even if it exceeds 'limit'.
+    """
+    if max_elements <= 0:
+        return []
+
+    # If start is already beyond limit, return [force_last or limit] as a single element.
+    if start > limit:
+        seq = [force_last if force_last is not None else limit]
+        return seq[:max_elements]
+
+    seq: List[int] = []
+    val = start
+
+    while val <= limit and len(seq) < max_elements:
+        seq.append(val)
+        next_val = val * 2
+        if next_val > limit or len(seq) >= max_elements:
+            break
+        val = next_val
+
+    # Add/replace last element if a 'force_last' is requested
+    if force_last is not None:
+        if len(seq) == 0:
+            seq = [force_last]
+        elif seq[-1] != force_last:
+            if len(seq) < max_elements:
+                seq.append(force_last)
+            else:
+                seq[-1] = force_last
+
+    # Deduplicate while preserving order
+    dedup = []
+    seen = set()
+    for x in seq:
+        if x not in seen:
+            dedup.append(x)
+            seen.add(x)
+    return dedup[:max_elements]
+
+
+def Automatic_CCL_Generation(
+    CL: int,
+    prefill_seq_len: int,
+    comp_ctx_lengths_prefill: Optional[List[int]] = None,
+    comp_ctx_lengths_decode: Optional[List[int]] = None,
+) -> Tuple[List[int], List[int], int]:
+    """
+    Automatic Compute-Context-Length Lists Generation
+
+    Purpose:
+        Compute decode and prefill ccl lists based on an input context
+        length (CL), prefill sequence length, and optional pre-specified lists.
+    """
+
+    if CL <= 0:
+        mapped_CL = next_multiple_of_1024(max(CL, 1))
+        # For non-positive CL, minimal identical sequences
+        seq = [mapped_CL]
+        return seq, seq, mapped_CL
+
+    mapped_CL = next_multiple_of_1024(CL)
+
+    # Tiered starts
+    if mapped_CL <= 4096:
+        seq = [mapped_CL]
+        return seq, seq, mapped_CL
+    elif mapped_CL <= 32768:
+        decode_start, prefill_start = 4096, 4000
+    elif mapped_CL <= 65536:
+        decode_start, prefill_start = 8192, 8000
+    elif mapped_CL <= 131072:
+        decode_start, prefill_start = 16384, 16000
+    else:
+        decode_start, prefill_start = 16384, 16000
+
+    # If prefill_seq_len > 1:
+    if prefill_seq_len > 1:
+        # Passthrough if either provided
+        if comp_ctx_lengths_decode is not None or comp_ctx_lengths_prefill is not None:
+            return (
+                comp_ctx_lengths_decode if comp_ctx_lengths_decode is not None else [],
+                comp_ctx_lengths_prefill if comp_ctx_lengths_prefill is not None else [],
+                mapped_CL,
+            )
+
+        max_elems = 5
+
+        # Decode: ensure last = mapped_CL
+        decode = build_doubling_sequence(
+            start=decode_start,
+            limit=mapped_CL,
+            max_elements=max_elems,
+            force_last=mapped_CL,
+        )
+
+        # Prefill:
+        if is_power_of_two(CL):
+            # Strict doubling, limit = CL, no forced non-doubling last
+            prefill = build_doubling_sequence(
+                start=prefill_start,
+                limit=CL,
+                max_elements=max_elems,
+                force_last=None,
+            )
+        else:
+            prefill_last = floor_to_1000(mapped_CL)
+            prefill = build_doubling_sequence(
+                start=prefill_start,
+                limit=CL,
+                max_elements=max_elems,
+                force_last=prefill_last,
+            )
+
+        return prefill, decode, mapped_CL
+
+    # prefill_seq_len == 1 → identical lists
+    else:
+        max_elems = 10
+        grid_cap = 2097152  # upper cap for doubling grid
+
+        if mapped_CL < 4096:
+            seq = [mapped_CL]
+        else:
+            seq = build_doubling_sequence(
+                start=4096,
+                limit=min(mapped_CL, grid_cap),
+                max_elements=max_elems,
+                force_last=mapped_CL,  # identical lists end at mapped_CL
+            )
+        return seq, seq, mapped_CL
+
 
 def process_ccl_specializations(ccl_prefill, ccl_decode, ctx_len, prefill_seq_len):
-    if ccl_prefill is None or ccl_decode is None:
-        return None, None
-
-    if ctx_len is None:
-        raise TypeError("`ctx_len` is required when loading the model with CCL.")
-
-    if prefill_seq_len == 1:
-        # both prefill and decode ccl can share the same specializations since prefill_seq_len=1. So, a sorted union of both lists can be used for both of them.
-        ccl_union_all = sorted(set(ccl_prefill + ccl_decode))
-        ccl_union_all = [min(x, ctx_len) for x in ccl_union_all]
-        return ccl_union_all, ccl_union_all
-
-    # Step 1: Cap values to ctx_len
-    ccl_prefill = [min(x, ctx_len) for x in ccl_prefill]
-    ccl_decode = [min(x, ctx_len) for x in ccl_decode]
-
-    # Step 2: Remove duplicates within each list
-    ccl_prefill = list(set(ccl_prefill))
-    ccl_decode = list(set(ccl_decode))
-
-    # Step 3: Ensure no overlap between ccl_prefill and ccl_decode
-    updated_prefill = []
-    for val in ccl_prefill:
-        while val in ccl_decode or val in updated_prefill:
-            val -= 1
-            if val < 0:
-                break  # Prevent negative values
-        if val >= 0:
-            updated_prefill.append(val)
-
-    # Step 4: Sort both lists
-    updated_prefill.sort()
-    ccl_decode.sort()
-
-    return updated_prefill, ccl_decode
+    # Automatic CCL generation: If both ccl_prefill and ccl_decode are None,
+    # generate optimized context length lists for prefill and decode based on ctx_len
+    if ccl_prefill is None and ccl_decode is None:
+        ccl_prefill, ccl_decode, ctx_len = Automatic_CCL_Generation(ctx_len, prefill_seq_len, ccl_prefill, ccl_decode)
+    else:
+        if prefill_seq_len == 1:
+            if ccl_prefill is not None and ccl_decode is not None:
+                # both prefill and decode ccl can share the same specializations since prefill_seq_len=1. So, a sorted union of both lists can be used for both of them.
+                ccl_union_all = sorted(set(ccl_prefill + ccl_decode))
+                ccl_union_all = [min(x, ctx_len) for x in ccl_union_all]
+                ccl_prefill = ccl_union_all
+                ccl_decode = ccl_union_all
+        else:
+            # Step 1: Cap values to ctx_len
+            ccl_prefill = [min(x, ctx_len) for x in ccl_prefill] if ccl_prefill is not None else None
+            ccl_decode = [min(x, ctx_len) for x in ccl_decode] if ccl_decode is not None else None
+
+            # Step 2: Remove duplicates within each list
+            ccl_prefill = list(set(ccl_prefill)) if ccl_prefill is not None else None
+            ccl_decode = list(set(ccl_decode)) if ccl_decode is not None else None
+
+            if ccl_prefill is None or ccl_decode is None:
+                if ccl_prefill:
+                    ccl_prefill.sort()
+                if ccl_decode:
+                    ccl_decode.sort()
+            else:
+                # Step 3: Ensure no overlap between ccl_prefill and ccl_decode
+                tmp_prefill = ccl_prefill
+                ccl_prefill = []
+                for val in tmp_prefill:
+                    while val in ccl_decode or val in ccl_prefill:
+                        val -= 1
+                        if val < 0:
+                            break  # Prevent negative values
+                    if val >= 0:
+                        ccl_prefill.append(val)
+
+                # Step 4: Sort both lists
+                ccl_prefill.sort()
+                ccl_decode.sort()
+
+    print("CCL Configuration:")
+    print(f"  - Prefill context lengths: {ccl_prefill}")
+    print(f"  - Decode context lengths: {ccl_decode}")
+    print(f"  - Max context length: {ctx_len}")
+    return ccl_prefill, ccl_decode, ctx_len
diff --git a/examples/performance/compute_context_length/README.md b/examples/performance/compute_context_length/README.md
index 9f1d29b9a..2115251e2 100644
--- a/examples/performance/compute_context_length/README.md
+++ b/examples/performance/compute_context_length/README.md
@@ -37,11 +37,22 @@ python basic_inference.py \
     --model-name meta-llama/Llama-3.2-1B \
     --prompt "Hello, how are you?" \
     --ctx-len 1024 \
+    --ccl-enabled \
     --comp-ctx-lengths-prefill "256,500" \
     --comp-ctx-lengths-decode "512,1024" \
     --generation-len 100
 ```
 
+To generate the CCL lists automatically, pass only the `--ccl-enabled` flag and omit the CCL list arguments:
+```bash
+python basic_inference.py \
+    --model-name meta-llama/Llama-3.2-1B \
+    --prompt "Hello, how are you?" \
+    --ctx-len 1024 \
+    --ccl-enabled \
+    --generation-len 100
+```
+
 ### Vision-Language Models
 
 Run VLM inference with CCL:
@@ -55,11 +66,22 @@ python vlm_inference.py \
     --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \
     --query "Describe this image" \
     --image-url "https://..." \
+    --ccl-enabled \
     --comp-ctx-lengths-prefill "4096" \
     --comp-ctx-lengths-decode "6144,8192" \
     --ctx-len 8192
 ```
 
+To generate the CCL lists automatically, pass only the `--ccl-enabled` flag and omit the CCL list arguments:
+```bash
+python vlm_inference.py \
+    --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \
+    --query "Describe this image" \
+    --image-url "https://..." \
+    --ccl-enabled \
+    --ctx-len 8192
+```
+
 ## Available Examples
 
 ### Text-Only Models
diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py
index 4533c47e8..6e8c045fb 100644
--- a/examples/performance/compute_context_length/basic_inference.py
+++ b/examples/performance/compute_context_length/basic_inference.py
@@ -54,13 +54,13 @@ def main():
     parser.add_argument(
         "--comp-ctx-lengths-prefill",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="256,500",
+        default=None,
         help="Comma-separated list of context lengths for prefill phase (e.g., '256,500')",
     )
     parser.add_argument(
         "--comp-ctx-lengths-decode",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="512,1024",
+        default=None,
         help="Comma-separated list of context lengths for decode phase (e.g., '512,1024')",
     )
     parser.add_argument(
@@ -107,11 +107,7 @@ def main():
     args = parser.parse_args()
 
     print(f"Loading model: {args.model_name}")
-    print("CCL Configuration:")
-    print(f"  - Prefill context lengths: {args.comp_ctx_lengths_prefill}")
-    print(f"  - Decode context lengths: {args.comp_ctx_lengths_decode}")
-    print(f"  - Max context length: {args.ctx_len}")
-    print(f"  - Continuous batching: {args.continuous_batching}")
+    print(f"Continuous batching: {args.continuous_batching}")
 
     # Load model with CCL configuration
     model = QEFFAutoModelForCausalLM.from_pretrained(
diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py
index d9672b9e3..1dcec5c81 100644
--- a/examples/performance/compute_context_length/gemma3.py
+++ b/examples/performance/compute_context_length/gemma3.py
@@ -21,14 +21,16 @@
 processor = AutoProcessor.from_pretrained(model_id)
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [3072]
 comp_ctx_lengths_decode = [4096, ctx_len]
 
@@ -40,7 +42,7 @@
     attn_implementation="eager",
     kv_offload=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 
diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py
index 39a5d48ed..92bef9148 100644
--- a/examples/performance/compute_context_length/gpt_oss.py
+++ b/examples/performance/compute_context_length/gpt_oss.py
@@ -12,16 +12,17 @@
 model_id = "openai/gpt-oss-20b"  # weights are not required to convert to fp32
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 4096
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists.
-# Set the list of ccl during prefilling and decoding processes
 comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [1024, ctx_len]
 
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
diff --git a/examples/performance/compute_context_length/granite_vision.py b/examples/performance/compute_context_length/granite_vision.py
index 6dd38395c..ef5dc3a51 100644
--- a/examples/performance/compute_context_length/granite_vision.py
+++ b/examples/performance/compute_context_length/granite_vision.py
@@ -98,6 +98,7 @@ def run_model(
     num_devices = 4
     ctx_len = 8192
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [5500]
     comp_ctx_lengths_decode = [6144, ctx_len]
 
diff --git a/examples/performance/compute_context_length/internvl.py b/examples/performance/compute_context_length/internvl.py
index 19bcf4bc1..02e965e0d 100644
--- a/examples/performance/compute_context_length/internvl.py
+++ b/examples/performance/compute_context_length/internvl.py
@@ -263,6 +263,7 @@ def run_intern_on_aic(
 
     ctx_len = 8192
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [4096]
     comp_ctx_lengths_decode = [6144, ctx_len]
 
diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py
index 8cdbd70a1..a867e1bd3 100644
--- a/examples/performance/compute_context_length/llama4.py
+++ b/examples/performance/compute_context_length/llama4.py
@@ -18,14 +18,16 @@
 config.vision_config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [3072]
 # Set the list of ccl during decoding process
@@ -37,7 +39,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py
index ffbbff67f..f97160693 100644
--- a/examples/performance/compute_context_length/llama4_cb.py
+++ b/examples/performance/compute_context_length/llama4_cb.py
@@ -20,14 +20,16 @@
 processor = AutoProcessor.from_pretrained(model_id)
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 4096
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [3072]
 # Set the list of ccl during decoding process
@@ -42,7 +44,7 @@
     config=config,
     continuous_batching=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 
@@ -69,7 +71,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 
diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py
index fd513fe45..314aa49b3 100644
--- a/examples/performance/compute_context_length/llama4_multi_image.py
+++ b/examples/performance/compute_context_length/llama4_multi_image.py
@@ -18,14 +18,16 @@
 config.vision_config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [5376]
 # Set the list of ccl during decoding process
@@ -37,7 +39,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/mistral3.py b/examples/performance/compute_context_length/mistral3.py
index 3763fbcde..a773ddfd9 100644
--- a/examples/performance/compute_context_length/mistral3.py
+++ b/examples/performance/compute_context_length/mistral3.py
@@ -101,6 +101,7 @@ def run_model(
     num_cores = 16
     num_devices = 4
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [4096]
     comp_ctx_lengths_decode = [6144, ctx_len]
 
diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py
index b5f1f50e6..8d773f5fe 100644
--- a/examples/performance/compute_context_length/molmo.py
+++ b/examples/performance/compute_context_length/molmo.py
@@ -19,15 +19,17 @@
 # config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 # load the model
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [3072]  # None #
 comp_ctx_lengths_decode = [4096, 8192]  # None #
 
@@ -37,7 +39,7 @@
     trust_remote_code=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
diff --git a/examples/performance/compute_context_length/qwen2_5_vl.py b/examples/performance/compute_context_length/qwen2_5_vl.py
index 20960b6a9..5a6818930 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl.py
@@ -23,14 +23,16 @@
 config.text_config.num_hidden_layers = 2
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [4096]  # None #
 comp_ctx_lengths_decode = [6144, ctx_len]  # None #
 
@@ -40,7 +42,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
index fc330e14e..c247a1e58 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
@@ -20,14 +20,16 @@
 config.text_config.num_hidden_layers = 4
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [4096]
 comp_ctx_lengths_decode = [6144, ctx_len]
 
@@ -38,7 +40,7 @@
     config=config,
     continuous_batching=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py
index 8d53e68b5..93849fa5a 100644
--- a/examples/performance/compute_context_length/qwen3moe.py
+++ b/examples/performance/compute_context_length/qwen3moe.py
@@ -17,15 +17,17 @@
 """
 
 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.
 
 ctx_len = 1024
 prefill_seq_len = 1
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations.
 comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [256, 512, ctx_len]
 
@@ -33,7 +35,7 @@
     model_name,
     continuous_batching=False,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 
diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py
index 876daa3e6..294632fe3 100644
--- a/examples/performance/compute_context_length/vlm_inference.py
+++ b/examples/performance/compute_context_length/vlm_inference.py
@@ -58,10 +58,6 @@ def run_model(
     """
     print(f"Loading model: {model_name}")
     print(f"KV offload (Dual QPC mode): {kv_offload}")
-    print("CCL Configuration:")
-    print(f"  - Prefill context lengths: {comp_ctx_lengths_prefill}")
-    print(f"  - Decode context lengths: {comp_ctx_lengths_decode}")
-    print(f"  - Max context length: {ctx_len}")
 
     ## STEP 1: Load the Processor and Model
 
@@ -186,13 +182,13 @@ def main():
     parser.add_argument(
         "--comp-ctx-lengths-prefill",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="4096",
+        default=None,
         help="Comma-separated list of context lengths for prefill phase (e.g., '4096')",
     )
     parser.add_argument(
         "--comp-ctx-lengths-decode",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="6144,8192",
+        default=None,
         help="Comma-separated list of context lengths for decode phase (e.g., '6144,8192')",
     )
     parser.add_argument(

From b769fc0ceb9d14fb257fcc85d23eab7dc1a855b3 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Fri, 12 Dec 2025 16:56:09 -0800
Subject: [PATCH 9/9] Add automatic CCL list generation for prefill and decode
 when user does not provide lists

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      |   6 +-
 QEfficient/utils/check_ccl_specializations.py | 227 +++++++++---------
 2 files changed, 122 insertions(+), 111 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index c6312e595..d6c4a5e65 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1127,7 +1127,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
                 self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
@@ -1772,7 +1772,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
                 self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
@@ -2868,7 +2868,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
From b769fc0ceb9d14fb257fcc85d23eab7dc1a855b3 Mon Sep 17 00:00:00 2001
From: Vahid Janfaza
Date: Fri, 12 Dec 2025 16:56:09 -0800
Subject: [PATCH 9/9] Add automatic CCL list generation for prefill and decode when the user does not provide lists

Signed-off-by: Vahid Janfaza
---
 .../transformers/models/modeling_auto.py      |   6 +-
 QEfficient/utils/check_ccl_specializations.py | 227 +++++++++---------
 2 files changed, 122 insertions(+), 111 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index c6312e595..d6c4a5e65 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1127,7 +1127,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
@@ -1772,7 +1772,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
@@ -2868,7 +2868,7 @@ def compile(
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
             if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
-                print("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
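The helper that compile() now delegates to is rewritten below. For orientation, a
quick check of what it produces (assuming the patched module is importable and
that next_multiple_of_1024 is an identity on exact multiples of 1024; the expected
values follow from the rules spelled out in the docstring below):

    from QEfficient.utils.check_ccl_specializations import automatic_ccl_generation

    # ctx_len=8192, prefill_seq_len>1: band index 0, so decode starts at 4096 and
    # prefill at 4000; 8192 is a power of two, so prefill doubles strictly.
    prefill, decode, mapped_cl = automatic_ccl_generation(8192, prefill_seq_len=128)
    print(prefill, decode, mapped_cl)  # expected: [4000, 8000] [4096, 8192] 8192

    # ctx_len=1024, prefill_seq_len=1: mapped_cl <= 4096, so both lists collapse.
    prefill, decode, mapped_cl = automatic_ccl_generation(1024, prefill_seq_len=1)
    print(prefill, decode, mapped_cl)  # expected: [1024] [1024] 1024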
+ """ + if max_elements <= 0: + return [] + if not sorted_seq: + return [last_value][:max_elements] + if sorted_seq[-1] != last_value: + if len(sorted_seq) < max_elements: + sorted_seq.append(last_value) + else: + sorted_seq[-1] = last_value + return sorted_seq[:max_elements] -def Automatic_CCL_Generation( - CL: int, +def automatic_ccl_generation( + ctx_len: int, prefill_seq_len: int, comp_ctx_lengths_prefill: Optional[List[int]] = None, comp_ctx_lengths_decode: Optional[List[int]] = None, @@ -82,93 +84,102 @@ def Automatic_CCL_Generation( Automatic Compute-Context-Length Lists Generation Purpose: - Compute decode and prefill ccl lists based on an input context - length (CL), prefill sequence length, and optional pre-specified lists. + Compute decode and prefill CCL lists based on an input context length (CL), + prefill sequence length, and optional pre-specified lists. + + High-level rules (unchanged from your finalized logic): + - prefill_seq_len > 1: + * If either list is provided, pass them through unchanged. + * decode: doubles from tiered start; MUST end at mapped_CL (last forced to mapped_CL). + * prefill: + • If CL is power of two: STRICT doubling from tiered start, bounded by CL (no forced non-doubling last). + • Else: doubles from tiered start, bounded by CL, and last element = floor_to_1000(mapped_CL). + * Max 5 elements per list. + - prefill_seq_len == 1: + * decode and prefill are IDENTICAL. + * start at 4096, double up to 10 elements. + * upper grid cap computed dynamically (start * 2^(max_elements-1)); last = mapped_CL. + * If mapped_CL < 4096, both lists are [mapped_CL]. """ - - if CL <= 0: - mapped_CL = next_multiple_of_1024(max(CL, 1)) - # For non-positive CL, minimal identical sequences - seq = [mapped_CL] - return seq, seq, mapped_CL - - mapped_CL = next_multiple_of_1024(CL) - - # Tiered starts - if mapped_CL <= 4096: - seq = [mapped_CL] - return seq, seq, mapped_CL - elif mapped_CL <= 32768: - decode_start, prefill_start = 4096, 4000 - elif mapped_CL <= 65536: - decode_start, prefill_start = 8192, 8000 - elif mapped_CL <= 131072: - decode_start, prefill_start = 16384, 16000 - else: - decode_start, prefill_start = 16384, 16000 - - # If prefill_seq_len > 1: + # Handle non-positive CL + if ctx_len <= 0: + mapped_cl = next_multiple_of_1024(1) + seq = [mapped_cl] + return seq, seq, mapped_cl + + mapped_cl = next_multiple_of_1024(ctx_len) + + # Early small-ctx_len case for identical lists + if mapped_cl <= 4096: + seq = [mapped_cl] + return seq, seq, mapped_cl + + # Compute tier starts via band index (no hard-coded chain) + idx = band_index_from_mapped_cl(mapped_cl) + decode_start = 4096 << idx # 4096, 8192, 16384 + PREFILL_STARTS = {0: 4000, 1: 8000, 2: 16000} + prefill_start = PREFILL_STARTS[idx] + + # Branch: prefill_seq_len > 1 if prefill_seq_len > 1: # Passthrough if either provided if comp_ctx_lengths_decode is not None or comp_ctx_lengths_prefill is not None: return ( - comp_ctx_lengths_decode if comp_ctx_lengths_decode is not None else [], comp_ctx_lengths_prefill if comp_ctx_lengths_prefill is not None else [], - mapped_CL, + comp_ctx_lengths_decode if comp_ctx_lengths_decode is not None else [], + mapped_cl, ) + # Due to limitations in the number of specializations during compilation, we set the maximum number of elements in comp_ctx_lengths_decode and comp_ctx_lengths_prefill lists to 5. 
         max_elems = 5

-        # Decode: ensure last = mapped_CL
-        decode = build_doubling_sequence(
-            start=decode_start,
-            limit=mapped_CL,
-            max_elements=max_elems,
-            force_last=mapped_CL,
-        )
-
-        # Prefill:
-        if is_power_of_two(CL):
-            # Strict doubling, limit = CL, no forced non-doubling last
-            prefill = build_doubling_sequence(
-                start=prefill_start,
-                limit=CL,
-                max_elements=max_elems,
-                force_last=None,
-            )
+        # ---- Decode: strict doubling up to mapped_cl, then enforce last = mapped_cl
+        decode_set = build_doubling_set(start=decode_start, limit=mapped_cl, max_elements=max_elems)
+        decode_list = sorted(decode_set)
+        decode_list = ensure_last(decode_list, last_value=mapped_cl, max_elements=max_elems)
+
+        # ---- Prefill:
+        if is_power_of_two(ctx_len):
+            # STRICT doubling only, bounded by ctx_len; do NOT force a non-doubling last
+            prefill_set = build_doubling_set(start=prefill_start, limit=ctx_len, max_elements=max_elems)
+            prefill_list = sorted(prefill_set)[:max_elems]
         else:
-            prefill_last = floor_to_1000(mapped_CL)
-            prefill = build_doubling_sequence(
-                start=prefill_start,
-                limit=CL,
-                max_elements=max_elems,
-                force_last=prefill_last,
-            )
+            # Doubles bounded by ctx_len, but the last element must equal floor_to_1000(mapped_cl)
+            prefill_last = floor_to_1000(mapped_cl)
+            prefill_set = build_doubling_set(start=prefill_start, limit=ctx_len, max_elements=max_elems)
+            prefill_list = sorted(prefill_set)
+            prefill_list = ensure_last(prefill_list, last_value=prefill_last, max_elements=max_elems)

-        return prefill, decode, mapped_CL
+        # NOTE: the return order is prefill first, then decode
+        return prefill_list, decode_list, mapped_cl

-    # UPDATED: prefill_seq_len == 1 → identical lists
+    # Branch: prefill_seq_len == 1 → identical lists
     else:
+        # When prefill_seq_len=1 (e.g., in MoE models), prefill and decode can share the
+        # same specializations, so the CCL lists can be twice as long.
+        # Due to limitations in the number of specializations during compilation, we cap
+        # comp_ctx_lengths_decode and comp_ctx_lengths_prefill at 10 elements each.
         max_elems = 10
-        grid_cap = 2097152  # upper cap for doubling grid
+        start_identical = 4096

-        if mapped_CL < 4096:
-            seq = [mapped_CL]
-        else:
-            seq = build_doubling_sequence(
-                start=4096,
-                limit=min(mapped_CL, grid_cap),
-                max_elements=max_elems,
-                force_last=mapped_CL,  # identical lists end at mapped_CL
-            )
-        return seq, seq, mapped_CL
+        if mapped_cl < start_identical:
+            seq = [mapped_cl]
+            return seq, seq, mapped_cl
+
+        # Dynamic grid cap: start * 2^(max_elems - 1)
+        grid_cap = start_identical * (1 << (max_elems - 1))
+        limit = min(mapped_cl, grid_cap)
+
+        seq_set = build_doubling_set(start=start_identical, limit=limit, max_elements=max_elems)
+        seq_list = sorted(seq_set)
+        seq_list = ensure_last(seq_list, last_value=mapped_cl, max_elements=max_elems)
+
+        return seq_list, seq_list, mapped_cl


 def process_ccl_specializations(ccl_prefill, ccl_decode, ctx_len, prefill_seq_len):
     # Automatic CCL generation: If both ccl_prefill and ccl_decode are None,
     # generate optimized context length lists for prefill and decode based on ctx_len
     if ccl_prefill is None and ccl_decode is None:
-        ccl_prefill, ccl_decode, ctx_len = Automatic_CCL_Generation(ctx_len, prefill_seq_len, ccl_prefill, ccl_decode)
+        ccl_prefill, ccl_decode, ctx_len = automatic_ccl_generation(ctx_len, prefill_seq_len, ccl_prefill, ccl_decode)
     else:
         if prefill_seq_len == 1:
             if ccl_prefill is not None and ccl_decode is not None: