diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 16a809c96..d6c4a5e65 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1126,17 +1126,14 @@ def compile(

         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
-            # For supporting VLLM and Disaggregated with CCL
-            if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
-                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
@@ -1774,17 +1771,14 @@ def compile(

         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
-            # For supporting VLLM and Disaggregated with CCL
-            if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
-                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
@@ -2873,16 +2867,13 @@ def compile(

         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
-            if comp_ctx_lengths_prefill is None or comp_ctx_lengths_decode is None:
-                logger.warning(
-                    "Please set comp_ctx_lengths_prefill and comp_ctx_lengths_decode with a proper list of context lengths. Using non-CCL default model."
-                )
-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            if comp_ctx_lengths_prefill is None and comp_ctx_lengths_decode is None:
+                logger.info("Auto-generating CCL-prefill and CCL-decode lists based on Context Length (CL).")
+                self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                     comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len
                 )
-            # For supporting VLLM and Disaggregated with CCL
-            if comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
+            elif comp_ctx_lengths_prefill is not None or comp_ctx_lengths_decode is not None:
                 if isinstance(comp_ctx_lengths_prefill, str):
                     import ast
@@ -2897,7 +2888,7 @@ def compile(
                 self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
                 self.comp_ctx_lengths_decode = comp_ctx_lengths_decode

-            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
+            self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len = process_ccl_specializations(
                 self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len, prefill_seq_len
             )
             # --- Validation ---
+ """ + if max_elements <= 0: + return [] + if not sorted_seq: + return [last_value][:max_elements] + if sorted_seq[-1] != last_value: + if len(sorted_seq) < max_elements: + sorted_seq.append(last_value) + else: + sorted_seq[-1] = last_value + return sorted_seq[:max_elements] + + +def automatic_ccl_generation( + ctx_len: int, + prefill_seq_len: int, + comp_ctx_lengths_prefill: Optional[List[int]] = None, + comp_ctx_lengths_decode: Optional[List[int]] = None, +) -> Tuple[List[int], List[int], int]: + """ + Automatic Compute-Context-Length Lists Generation + + Purpose: + Compute decode and prefill CCL lists based on an input context length (CL), + prefill sequence length, and optional pre-specified lists. + + High-level rules (unchanged from your finalized logic): + - prefill_seq_len > 1: + * If either list is provided, pass them through unchanged. + * decode: doubles from tiered start; MUST end at mapped_CL (last forced to mapped_CL). + * prefill: + • If CL is power of two: STRICT doubling from tiered start, bounded by CL (no forced non-doubling last). + • Else: doubles from tiered start, bounded by CL, and last element = floor_to_1000(mapped_CL). + * Max 5 elements per list. + - prefill_seq_len == 1: + * decode and prefill are IDENTICAL. + * start at 4096, double up to 10 elements. + * upper grid cap computed dynamically (start * 2^(max_elements-1)); last = mapped_CL. + * If mapped_CL < 4096, both lists are [mapped_CL]. + """ + # Handle non-positive CL + if ctx_len <= 0: + mapped_cl = next_multiple_of_1024(1) + seq = [mapped_cl] + return seq, seq, mapped_cl + + mapped_cl = next_multiple_of_1024(ctx_len) + + # Early small-ctx_len case for identical lists + if mapped_cl <= 4096: + seq = [mapped_cl] + return seq, seq, mapped_cl + + # Compute tier starts via band index (no hard-coded chain) + idx = band_index_from_mapped_cl(mapped_cl) + decode_start = 4096 << idx # 4096, 8192, 16384 + PREFILL_STARTS = {0: 4000, 1: 8000, 2: 16000} + prefill_start = PREFILL_STARTS[idx] + + # Branch: prefill_seq_len > 1 + if prefill_seq_len > 1: + # Passthrough if either provided + if comp_ctx_lengths_decode is not None or comp_ctx_lengths_prefill is not None: + return ( + comp_ctx_lengths_prefill if comp_ctx_lengths_prefill is not None else [], + comp_ctx_lengths_decode if comp_ctx_lengths_decode is not None else [], + mapped_cl, + ) + + # Due to limitations in the number of specializations during compilation, we set the maximum number of elements in comp_ctx_lengths_decode and comp_ctx_lengths_prefill lists to 5. 
+def automatic_ccl_generation(
+    ctx_len: int,
+    prefill_seq_len: int,
+    comp_ctx_lengths_prefill: Optional[List[int]] = None,
+    comp_ctx_lengths_decode: Optional[List[int]] = None,
+) -> Tuple[List[int], List[int], int]:
+    """
+    Automatic Compute-Context-Length Lists Generation
+
+    Purpose:
+        Compute decode and prefill CCL lists based on an input context length (CL),
+        prefill sequence length, and optional pre-specified lists.
+
+    High-level rules:
+    - prefill_seq_len > 1:
+        * If either list is provided, pass them through unchanged.
+        * decode: doubles from tiered start; MUST end at mapped_CL (last forced to mapped_CL).
+        * prefill:
+            • If CL is a power of two: STRICT doubling from tiered start, bounded by CL (no forced non-doubling last).
+            • Else: doubles from tiered start, bounded by CL, and last element = floor_to_1000(mapped_CL).
+        * Max 5 elements per list.
+    - prefill_seq_len == 1:
+        * decode and prefill are IDENTICAL.
+        * start at 4096, double up to 10 elements.
+        * upper grid cap computed dynamically (start * 2^(max_elements-1)); last = mapped_CL.
+        * If mapped_CL <= 4096, both lists are [mapped_CL].
+    """
+    # Handle non-positive CL
+    if ctx_len <= 0:
+        mapped_cl = next_multiple_of_1024(1)
+        seq = [mapped_cl]
+        return seq, seq, mapped_cl
+
+    mapped_cl = next_multiple_of_1024(ctx_len)
+
+    # Early small-ctx_len case for identical lists
+    if mapped_cl <= 4096:
+        seq = [mapped_cl]
+        return seq, seq, mapped_cl
+
+    # Compute tier starts via band index (no hard-coded chain)
+    idx = band_index_from_mapped_cl(mapped_cl)
+    decode_start = 4096 << idx  # 4096, 8192, 16384
+    PREFILL_STARTS = {0: 4000, 1: 8000, 2: 16000}
+    prefill_start = PREFILL_STARTS[idx]
+
+    # Branch: prefill_seq_len > 1
+    if prefill_seq_len > 1:
+        # Pass the lists through unchanged if either one was provided
+        if comp_ctx_lengths_decode is not None or comp_ctx_lengths_prefill is not None:
+            return (
+                comp_ctx_lengths_prefill if comp_ctx_lengths_prefill is not None else [],
+                comp_ctx_lengths_decode if comp_ctx_lengths_decode is not None else [],
+                mapped_cl,
+            )
+
+        # Due to limitations on the number of specializations during compilation, the
+        # comp_ctx_lengths_decode and comp_ctx_lengths_prefill lists hold at most 5 elements.
+        max_elems = 5
+
+        # ---- Decode: strict doubling up to mapped_cl, then enforce last = mapped_cl
+        decode_set = build_doubling_set(start=decode_start, limit=mapped_cl, max_elements=max_elems)
+        decode_list = sorted(decode_set)
+        decode_list = ensure_last(decode_list, last_value=mapped_cl, max_elements=max_elems)
+
+        # ---- Prefill:
+        if is_power_of_two(ctx_len):
+            # STRICT doubling only, bounded by ctx_len; do NOT force a non-doubling last
+            prefill_set = build_doubling_set(start=prefill_start, limit=ctx_len, max_elements=max_elems)
+            prefill_list = sorted(prefill_set)[:max_elems]
+        else:
+            # Doubles bounded by ctx_len, but last must equal floor_to_1000(mapped_cl)
+            prefill_last = floor_to_1000(mapped_cl)
+            prefill_set = build_doubling_set(start=prefill_start, limit=ctx_len, max_elements=max_elems)
+            prefill_list = sorted(prefill_set)
+            prefill_list = ensure_last(prefill_list, last_value=prefill_last, max_elements=max_elems)
+
+        # NOTE: return order is the prefill list first, then the decode list
+        return prefill_list, decode_list, mapped_cl
+
+    # Branch: prefill_seq_len == 1 → identical lists
+    else:
+        # When prefill_seq_len=1 (e.g., in MoE models), prefill and decode can use the same
+        # specializations, so the CCL lists can be twice as long. Due to limitations on the
+        # number of specializations during compilation, the maximum list length is 10.
+        max_elems = 10
+        start_identical = 4096
+
+        if mapped_cl < start_identical:
+            seq = [mapped_cl]
+            return seq, seq, mapped_cl
+
+        # Dynamic grid cap: start * 2^(max_elems - 1)
+        grid_cap = start_identical * (1 << (max_elems - 1))
+        limit = min(mapped_cl, grid_cap)
+
+        seq_set = build_doubling_set(start=start_identical, limit=limit, max_elements=max_elems)
+        seq_list = sorted(seq_set)
+        seq_list = ensure_last(seq_list, last_value=mapped_cl, max_elements=max_elems)
+
+        return seq_list, seq_list, mapped_cl
+
+
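To make the tiering concrete, here is a worked trace of `automatic_ccl_generation` for a non-power-of-two and a power-of-two context length; the expected values follow from the rules above:

```python
from QEfficient.utils.check_ccl_specializations import automatic_ccl_generation

# ctx_len=10000 maps to 10240 (next multiple of 1024), which falls in band 0,
# so decode doubles from 4096 and prefill doubles from 4000.
prefill, decode, mapped_cl = automatic_ccl_generation(ctx_len=10000, prefill_seq_len=128)

# Decode ends at the mapped context length; prefill, since 10000 is not a power
# of two, ends at floor_to_1000(10240) = 10000.
assert mapped_cl == 10240
assert decode == [4096, 8192, 10240]
assert prefill == [4000, 8000, 10000]

# For a power-of-two ctx_len, prefill is strict doubling only (no forced last):
prefill_pow2, decode_pow2, _ = automatic_ccl_generation(ctx_len=8192, prefill_seq_len=128)
assert prefill_pow2 == [4000, 8000]
assert decode_pow2 == [4096, 8192]
```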
 def process_ccl_specializations(ccl_prefill, ccl_decode, ctx_len, prefill_seq_len):
-    if ccl_prefill is None or ccl_decode is None:
-        return None, None
-
-    if ctx_len is None:
-        raise TypeError("`ctx_len` is required when loading the model with CCL.")
-
-    if prefill_seq_len == 1:
-        # both prefill and decode ccl can share the same specializations since prefill_seq_len=1. So, a sorted union of both lists can be used for both of them.
-        ccl_union_all = sorted(set(ccl_prefill + ccl_decode))
-        ccl_union_all = [min(x, ctx_len) for x in ccl_union_all]
-        return ccl_union_all, ccl_union_all
-
-    # Step 1: Cap values to ctx_len
-    ccl_prefill = [min(x, ctx_len) for x in ccl_prefill]
-    ccl_decode = [min(x, ctx_len) for x in ccl_decode]
-
-    # Step 2: Remove duplicates within each list
-    ccl_prefill = list(set(ccl_prefill))
-    ccl_decode = list(set(ccl_decode))
-
-    # Step 3: Ensure no overlap between ccl_prefill and ccl_decode
-    updated_prefill = []
-    for val in ccl_prefill:
-        while val in ccl_decode or val in updated_prefill:
-            val -= 1
-            if val < 0:
-                break  # Prevent negative values
-        if val >= 0:
-            updated_prefill.append(val)
-
-    # Step 4: Sort both lists
-    updated_prefill.sort()
-    ccl_decode.sort()
-
-    return updated_prefill, ccl_decode
+    # Automatic CCL generation: if both ccl_prefill and ccl_decode are None,
+    # generate optimized context length lists for prefill and decode based on ctx_len
+    if ccl_prefill is None and ccl_decode is None:
+        ccl_prefill, ccl_decode, ctx_len = automatic_ccl_generation(ctx_len, prefill_seq_len, ccl_prefill, ccl_decode)
+    else:
+        if prefill_seq_len == 1:
+            if ccl_prefill is not None and ccl_decode is not None:
+                # both prefill and decode ccl can share the same specializations since prefill_seq_len=1. So, a sorted union of both lists can be used for both of them.
+                ccl_union_all = sorted(set(ccl_prefill + ccl_decode))
+                ccl_union_all = [min(x, ctx_len) for x in ccl_union_all]
+                ccl_prefill = ccl_union_all
+                ccl_decode = ccl_union_all
+        else:
+            # Step 1: Cap values to ctx_len
+            ccl_prefill = [min(x, ctx_len) for x in ccl_prefill] if ccl_prefill is not None else None
+            ccl_decode = [min(x, ctx_len) for x in ccl_decode] if ccl_decode is not None else None
+
+            # Step 2: Remove duplicates within each list
+            ccl_prefill = list(set(ccl_prefill)) if ccl_prefill is not None else None
+            ccl_decode = list(set(ccl_decode)) if ccl_decode is not None else None
+
+            if ccl_prefill is None or ccl_decode is None:
+                if ccl_prefill:
+                    ccl_prefill.sort()
+                if ccl_decode:
+                    ccl_decode.sort()
+            else:
+                # Step 3: Ensure no overlap between ccl_prefill and ccl_decode
+                tmp_prefill = ccl_prefill
+                ccl_prefill = []
+                for val in tmp_prefill:
+                    while val in ccl_decode or val in ccl_prefill:
+                        val -= 1
+                        if val < 0:
+                            break  # Prevent negative values
+                    if val >= 0:
+                        ccl_prefill.append(val)
+
+                # Step 4: Sort both lists
+                ccl_prefill.sort()
+                ccl_decode.sort()
+
+    print("CCL Configuration:")
+    print(f" - Prefill context lengths: {ccl_prefill}")
+    print(f" - Decode context lengths: {ccl_decode}")
+    print(f" - Max context length: {ctx_len}")
+    return ccl_prefill, ccl_decode, ctx_len
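The user-supplied path keeps its old behavior: cap to ctx_len, deduplicate, then nudge prefill values downward until the two lists are disjoint. A small trace of the overlap handling:

```python
from QEfficient.utils.check_ccl_specializations import process_ccl_specializations

# The lists overlap at 6144; the prefill copy is decremented to 6143 so the
# prefill and decode specializations stay disjoint, and both come back sorted.
prefill, decode, ctx_len = process_ccl_specializations(
    ccl_prefill=[4096, 6144],
    ccl_decode=[6144, 8192],
    ctx_len=8192,
    prefill_seq_len=128,
)
assert prefill == [4096, 6143]
assert decode == [6144, 8192]
assert ctx_len == 8192  # unchanged when the caller supplies the lists
```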
diff --git a/examples/performance/compute_context_length/README.md b/examples/performance/compute_context_length/README.md
index 9f1d29b9a..2115251e2 100644
--- a/examples/performance/compute_context_length/README.md
+++ b/examples/performance/compute_context_length/README.md
@@ -37,11 +37,22 @@ python basic_inference.py \
     --model-name meta-llama/Llama-3.2-1B \
     --prompt "Hello, how are you?" \
     --ctx-len 1024 \
+    --ccl-enabled \
     --comp-ctx-lengths-prefill "256,500" \
     --comp-ctx-lengths-decode "512,1024" \
     --generation-len 100
 ```

+# For automatic CCL list generation, simply omit the CCL lists and pass only the --ccl-enabled flag
+```bash
+python basic_inference.py \
+    --model-name meta-llama/Llama-3.2-1B \
+    --prompt "Hello, how are you?" \
+    --ctx-len 1024 \
+    --ccl-enabled \
+    --generation-len 100
+```
+
 ### Vision-Language Models

 Run VLM inference with CCL:
@@ -55,11 +66,22 @@ python vlm_inference.py \
     --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \
     --query "Describe this image" \
     --image-url "https://..." \
+    --ccl-enabled \
     --comp-ctx-lengths-prefill "4096" \
     --comp-ctx-lengths-decode "6144,8192" \
     --ctx-len 8192
 ```

+# For automatic CCL list generation, simply omit the CCL lists and pass only the --ccl-enabled flag
+```bash
+python vlm_inference.py \
+    --model-name meta-llama/Llama-3.2-11B-Vision-Instruct \
+    --query "Describe this image" \
+    --image-url "https://..." \
+    --ccl-enabled \
+    --ctx-len 8192
+```
+
 ## Available Examples

 ### Text-Only Models
diff --git a/examples/performance/compute_context_length/basic_inference.py b/examples/performance/compute_context_length/basic_inference.py
index 4533c47e8..6e8c045fb 100644
--- a/examples/performance/compute_context_length/basic_inference.py
+++ b/examples/performance/compute_context_length/basic_inference.py
@@ -54,13 +54,13 @@ def main():
     parser.add_argument(
         "--comp-ctx-lengths-prefill",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="256,500",
+        default=None,
         help="Comma-separated list of context lengths for prefill phase (e.g., '256,500')",
     )
     parser.add_argument(
         "--comp-ctx-lengths-decode",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="512,1024",
+        default=None,
         help="Comma-separated list of context lengths for decode phase (e.g., '512,1024')",
     )
     parser.add_argument(
@@ -107,11 +107,7 @@ def main():
     args = parser.parse_args()

     print(f"Loading model: {args.model_name}")
-    print("CCL Configuration:")
-    print(f" - Prefill context lengths: {args.comp_ctx_lengths_prefill}")
-    print(f" - Decode context lengths: {args.comp_ctx_lengths_decode}")
-    print(f" - Max context length: {args.ctx_len}")
-    print(f" - Continuous batching: {args.continuous_batching}")
+    print(f"Continuous batching: {args.continuous_batching}")

     # Load model with CCL configuration
     model = QEFFAutoModelForCausalLM.from_pretrained(
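Putting the pieces together, the automatic path needs only the ccl_enabled flag at load time. A minimal end-to-end sketch; the compile settings below (cores, devices, prefill length) are illustrative placeholders, not prescribed values:

```python
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM

model_name = "meta-llama/Llama-3.2-1B"

# ccl_enabled without explicit lists: compile() receives both lists as None and
# process_ccl_specializations falls through to automatic_ccl_generation.
model = QEFFAutoModelForCausalLM.from_pretrained(
    model_name,
    qaic_config={"ccl_enabled": True},
)
model.compile(
    prefill_seq_len=128,  # placeholder compile settings; tune for your target
    ctx_len=8192,
    num_cores=16,
    num_devices=1,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model.generate(prompts=["Hello, how are you?"], tokenizer=tokenizer)
```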
diff --git a/examples/performance/compute_context_length/gemma3.py b/examples/performance/compute_context_length/gemma3.py
index d9672b9e3..1dcec5c81 100644
--- a/examples/performance/compute_context_length/gemma3.py
+++ b/examples/performance/compute_context_length/gemma3.py
@@ -21,14 +21,16 @@
 processor = AutoProcessor.from_pretrained(model_id)

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [3072]
 comp_ctx_lengths_decode = [4096, ctx_len]

@@ -40,7 +42,7 @@
     attn_implementation="eager",
     kv_offload=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
diff --git a/examples/performance/compute_context_length/gpt_oss.py b/examples/performance/compute_context_length/gpt_oss.py
index 39a5d48ed..92bef9148 100644
--- a/examples/performance/compute_context_length/gpt_oss.py
+++ b/examples/performance/compute_context_length/gpt_oss.py
@@ -12,16 +12,17 @@
 model_id = "openai/gpt-oss-20b"  # weights are not required to convert to fp32

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 ctx_len = 4096
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # In moe models like gpt-oss, since prefill_seq_len=1 both comp_ctx_lengths_prefill and comp_ctx_lengths_decode can share similar lists.
-# Set the list of ccl during prefilling and decoding processes
 comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [1024, ctx_len]

 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
diff --git a/examples/performance/compute_context_length/granite_vision.py b/examples/performance/compute_context_length/granite_vision.py
index 6dd38395c..ef5dc3a51 100644
--- a/examples/performance/compute_context_length/granite_vision.py
+++ b/examples/performance/compute_context_length/granite_vision.py
@@ -98,6 +98,7 @@ def run_model(
     num_devices = 4
     ctx_len = 8192
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [5500]
     comp_ctx_lengths_decode = [6144, ctx_len]
diff --git a/examples/performance/compute_context_length/internvl.py b/examples/performance/compute_context_length/internvl.py
index 19bcf4bc1..02e965e0d 100644
--- a/examples/performance/compute_context_length/internvl.py
+++ b/examples/performance/compute_context_length/internvl.py
@@ -263,6 +263,7 @@ def run_intern_on_aic(

     ctx_len = 8192
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [4096]
     comp_ctx_lengths_decode = [6144, ctx_len]
diff --git a/examples/performance/compute_context_length/llama4.py b/examples/performance/compute_context_length/llama4.py
index 8cdbd70a1..a867e1bd3 100644
--- a/examples/performance/compute_context_length/llama4.py
+++ b/examples/performance/compute_context_length/llama4.py
@@ -18,14 +18,16 @@
 config.vision_config.num_hidden_layers = 2

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [3072]
 # Set the list of ccl during decoding process
@@ -37,7 +39,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/llama4_cb.py b/examples/performance/compute_context_length/llama4_cb.py
index ffbbff67f..f97160693 100644
--- a/examples/performance/compute_context_length/llama4_cb.py
+++ b/examples/performance/compute_context_length/llama4_cb.py
@@ -20,14 +20,16 @@
 processor = AutoProcessor.from_pretrained(model_id)

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 ctx_len = 4096
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [3072]
 # Set the list of ccl during decoding process
@@ -42,7 +44,7 @@
     config=config,
     continuous_batching=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
@@ -69,7 +71,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
diff --git a/examples/performance/compute_context_length/llama4_multi_image.py b/examples/performance/compute_context_length/llama4_multi_image.py
index fd513fe45..314aa49b3 100644
--- a/examples/performance/compute_context_length/llama4_multi_image.py
+++ b/examples/performance/compute_context_length/llama4_multi_image.py
@@ -18,14 +18,16 @@
 config.vision_config.num_hidden_layers = 2

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # Set the list of ccl during prefilling process
 comp_ctx_lengths_prefill = [5376]
 # Set the list of ccl during decoding process
@@ -37,7 +39,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/mistral3.py b/examples/performance/compute_context_length/mistral3.py
index 3763fbcde..a773ddfd9 100644
--- a/examples/performance/compute_context_length/mistral3.py
+++ b/examples/performance/compute_context_length/mistral3.py
@@ -101,6 +101,7 @@ def run_model(
     num_cores = 16
     num_devices = 4
     ccl_enabled = True
+    # Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding. If both are None, the lists will be generated automatically based on the context length.
     comp_ctx_lengths_prefill = [4096]
     comp_ctx_lengths_decode = [6144, ctx_len]
diff --git a/examples/performance/compute_context_length/molmo.py b/examples/performance/compute_context_length/molmo.py
index b5f1f50e6..8d773f5fe 100644
--- a/examples/performance/compute_context_length/molmo.py
+++ b/examples/performance/compute_context_length/molmo.py
@@ -19,15 +19,17 @@
 # config.num_hidden_layers = 2

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 # load the model
 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [3072]  # None #
 comp_ctx_lengths_decode = [4096, 8192]  # None #

@@ -37,7 +39,7 @@
     trust_remote_code=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
diff --git a/examples/performance/compute_context_length/qwen2_5_vl.py b/examples/performance/compute_context_length/qwen2_5_vl.py
index 20960b6a9..5a6818930 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl.py
@@ -23,14 +23,16 @@
 config.text_config.num_hidden_layers = 2

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [4096]  # None #
 comp_ctx_lengths_decode = [6144, ctx_len]  # None #

@@ -40,7 +42,7 @@
     kv_offload=True,
     config=config,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/qwen2_5_vl_cb.py b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
index fc330e14e..c247a1e58 100644
--- a/examples/performance/compute_context_length/qwen2_5_vl_cb.py
+++ b/examples/performance/compute_context_length/qwen2_5_vl_cb.py
@@ -20,14 +20,16 @@
 config.text_config.num_hidden_layers = 4

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 ctx_len = 8192
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 comp_ctx_lengths_prefill = [4096]
 comp_ctx_lengths_decode = [6144, ctx_len]

@@ -38,7 +40,7 @@
     config=config,
     continuous_batching=True,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
diff --git a/examples/performance/compute_context_length/qwen3moe.py b/examples/performance/compute_context_length/qwen3moe.py
index b53a28362..93849fa5a 100644
--- a/examples/performance/compute_context_length/qwen3moe.py
+++ b/examples/performance/compute_context_length/qwen3moe.py
@@ -17,15 +17,17 @@
 """

 ## Activate Compute-Context-Length (CCL) feature by setting ccl_enabled=True when loading the model with from_pretrained().
-## Use the optional comp_ctx_lengths argument to provide two lists of context lengths for the prefilling and decoding processes. If comp_ctx_lengths=None, the model will run with its default context length.
+## Use the optional comp_ctx_lengths_prefill and comp_ctx_lengths_decode to provide two lists of context lengths for the prefilling and decoding processes. If both are None, the lists will be generated automatically based on the context length.
 ## - The first list, comp_ctx_lengths_prefill, defines the compute-context-length values for the prefilling process.
 ## -- The process starts with the first value in the list and gradually increases the context length based on the position_id of the current prompt chunk.
 ## - The second list, comp_ctx_lengths_decode, defines the compute-context-length values for the decoding process.
 ## -- During decoding, the model selects an appropriate context length from the list based on the input prompt length and cache index.
-## -- It starts from the correct value in the list and increases the context length dynamically when the cache index exceeds the current threshold.
+## -- It starts from the correct value in the list and increases the context length dynamically when the generated token's cache index exceeds the current CCL value.

 ctx_len = 1024
 prefill_seq_len = 1
+ccl_enabled = True
+# Two optional lists, comp_ctx_lengths_prefill and comp_ctx_lengths_decode, define CCL values for prefilling and decoding.
 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same ccl specializations.
 comp_ctx_lengths_prefill = comp_ctx_lengths_decode = [256, 512, ctx_len]

@@ -33,7 +35,7 @@
     model_name,
     continuous_batching=False,
     qaic_config={
-        "ccl_enabled": True,
+        "ccl_enabled": ccl_enabled,
     },
 )

@@ -49,6 +51,5 @@
     comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
     comp_ctx_lengths_decode=comp_ctx_lengths_decode,
 )
-
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)
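For the prefill_seq_len=1 case exercised by the MoE examples above, the generator returns one shared list for both phases. A trace under the same module-path assumption as earlier:

```python
from QEfficient.utils.check_ccl_specializations import automatic_ccl_generation

# With prefill_seq_len=1 (MoE-style compilation), both phases share one list,
# doubling from 4096 up to the mapped context length (at most 10 entries).
prefill, decode, mapped_cl = automatic_ccl_generation(ctx_len=32768, prefill_seq_len=1)
assert prefill == decode == [4096, 8192, 16384, 32768]

# Small contexts collapse to a single entry at the mapped context length.
prefill, decode, mapped_cl = automatic_ccl_generation(ctx_len=1024, prefill_seq_len=1)
assert prefill == decode == [1024]
```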
diff --git a/examples/performance/compute_context_length/vlm_inference.py b/examples/performance/compute_context_length/vlm_inference.py
index 876daa3e6..294632fe3 100644
--- a/examples/performance/compute_context_length/vlm_inference.py
+++ b/examples/performance/compute_context_length/vlm_inference.py
@@ -58,10 +58,6 @@ def run_model(
     """
     print(f"Loading model: {model_name}")
     print(f"KV offload (Dual QPC mode): {kv_offload}")
-    print("CCL Configuration:")
-    print(f" - Prefill context lengths: {comp_ctx_lengths_prefill}")
-    print(f" - Decode context lengths: {comp_ctx_lengths_decode}")
-    print(f" - Max context length: {ctx_len}")

     ## STEP 1: Load the Processor and Model

@@ -186,13 +182,13 @@ def main():
     parser.add_argument(
         "--comp-ctx-lengths-prefill",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="4096",
+        default=None,
         help="Comma-separated list of context lengths for prefill phase (e.g., '4096')",
     )
     parser.add_argument(
         "--comp-ctx-lengths-decode",
         type=lambda x: [int(i) for i in x.split(",")],
-        default="6144,8192",
+        default=None,
         help="Comma-separated list of context lengths for decode phase (e.g., '6144,8192')",
     )
     parser.add_argument(