Merged
51 commits
af0e6a7
[QEff]: Add gpt_oss
vbaddi Aug 6, 2025
2d442eb
nit: update modeling and make transform uniform
vbaddi Aug 7, 2025
ab8cc9c
apirunner change
ochougul Aug 7, 2025
e7ecc19
added test along with simplified Hybridcache
ochougul Aug 7, 2025
a583265
added test assert
ochougul Aug 7, 2025
dc2cc2a
nit: update test gpt file
vbaddi Aug 8, 2025
f8dac17
nit: update modeling with new decode moe forward
vbaddi Aug 11, 2025
99815cf
nit: separate gate, up projections for MoE
vbaddi Aug 20, 2025
4948397
nit: remove test file and add sample test in config
Oct 15, 2025
bde09c7
Enable CB for GptOssModel
mamtsing Nov 3, 2025
3fe07a8
Fix tests
mamtsing Nov 4, 2025
3fa01df
Address review comments
mamtsing Nov 4, 2025
4f910e0
prefill only changes for gpt-oss
ochougul Nov 4, 2025
88f9f75
fixed mapping
ochougul Nov 5, 2025
aac4be0
added test
ochougul Nov 6, 2025
1d7220a
added test
ochougul Nov 6, 2025
51316d5
made example not ugly
ochougul Nov 6, 2025
e6e2969
fixed tests
ochougul Nov 6, 2025
2334056
fixed tests
ochougul Nov 6, 2025
0c98397
added new test and fixed failing tests
ochougul Nov 7, 2025
ef4d751
fixed tests
ochougul Nov 10, 2025
d2d55de
fixed kv cache shape
ochougul Nov 10, 2025
23bb9ab
fixed self.onnx_path issue in modeling_qeff
ochougul Nov 11, 2025
a51ef91
added ffn blocking and num blocks env variables
ochougul Nov 13, 2025
a829a05
include num_ffn_blocks in hash
ochougul Nov 17, 2025
eb8c7c3
fixed dynamic range in case of subfunc issue and nonmatching ctx, pre…
ochougul Nov 18, 2025
f6c320e
added swa optimization for reducing MACCs using less KV
ochougul Nov 18, 2025
69a696d
added opt swa to hash
ochougul Nov 24, 2025
50c9b7f
lint and format
ochougul Nov 24, 2025
a53f7bb
enabled chunking
ochougul Nov 26, 2025
ff1d05b
added ChunkedPrefillMLP block; fixed passing prefill_only flag and en…
ochougul Dec 1, 2025
80571aa
added disagg mode example for chunking mode
ochougul Dec 2, 2025
c403ba7
fixed the kwargs passing to build_decode_specialization
ochougul Dec 2, 2025
3defe4c
pushed latest changes with chunking enabled for prefill along with re…
ochougul Dec 8, 2025
dc546ae
added support for prefix caching for gpt-oss
ochougul Dec 8, 2025
3b777e8
removed error
ochougul Dec 9, 2025
ba77602
added errors for prefill-only mode
ochougul Dec 9, 2025
0680508
fix decode-only model
ochougul Dec 10, 2025
be5ef75
fixed CB for decode-only model
ochougul Dec 10, 2025
cc3bb0b
created readme
ochougul Dec 10, 2025
efd671a
rebased and made setup_onnx_sub explicit
ochougul Dec 10, 2025
86733cc
linting error
ochougul Dec 10, 2025
d46c9d0
fixed use_onnx_subfunc
ochougul Dec 11, 2025
82caac6
fixed tests
ochougul Dec 11, 2025
65f93b1
linter
ochougul Dec 11, 2025
edbc7e8
added missing marker
ochougul Dec 11, 2025
4270d2c
pushed tests fix
ochougul Dec 11, 2025
85b23cd
fixed flux pipeline
ochougul Dec 11, 2025
c78ec66
tests fixed
ochougul Dec 11, 2025
502d289
Fix CI error for PL=1
mamtsing Dec 14, 2025
49bb40b
Merge branch 'main' into prefill+decode_gpt_oss
quic-mamta Dec 14, 2025
23 changes: 15 additions & 8 deletions QEfficient/__init__.py
@@ -6,7 +6,17 @@
# -----------------------------------------------------------------------------

import os
import warnings

# ----------------------------------------------------------------------------- #
# For faster downloads via hf_transfer
# This code is put above import statements as this needs to be executed before
# hf_transfer is imported (will happen on line 15 via leading imports)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# DO NOT ADD ANY CODE ABOVE THIS LINE
# Please contact maintainers if you must edit this file above this line.
# ----------------------------------------------------------------------------- #
# Placeholder for all non-transformer models registered in QEfficient
import warnings # noqa: I001

import QEfficient.utils.model_registery # noqa: F401
from QEfficient.base import (
@@ -26,6 +36,10 @@
from QEfficient.utils import custom_format_warning
from QEfficient.utils.logging_utils import logger

# custom warning for the better logging experience
warnings.formatwarning = custom_format_warning


# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter
__all__ = [
@@ -42,14 +56,7 @@
"QEFFCommonLoader",
"QEffFluxPipeline",
]
# For faster downloads via hf_transfer
# This code is put above import statements as this needs to be executed before
# hf_transfer is imported (will happen on line 15 via leading imports)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# Placeholder for all non-transformer models registered in QEfficient

# custom warning for the better logging experience
warnings.formatwarning = custom_format_warning

# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"
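The relocated block at the top of this diff encodes an import-order constraint: huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER once, at import time, so setting the variable after the imports (as the old placement at the bottom of the file did) had no effect. A minimal sketch of the pattern, assuming a standard huggingface_hub install with hf_transfer available:

import os

# Flags that a library reads at import time must be set before that
# library is first imported.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # must come first

# huggingface_hub captures the flag into a constant when this import runs
from huggingface_hub import snapshot_download

# Downloads now go through hf_transfer (if the package is installed)
snapshot_download("gpt2", allow_patterns=["config.json"])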
82 changes: 70 additions & 12 deletions QEfficient/base/modeling_qeff.py
@@ -60,6 +60,7 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None:
super().__init__()
self.model = model
self.hash_params = create_model_params(self, **kwargs)
self.prefill_onnx_path: Optional[str] = None
self.onnx_path: Optional[str] = None
self.qpc_path: Optional[str] = None
self.qpc_session: Optional[QAICInferenceSession] = None
@@ -204,10 +205,11 @@ def _export(
example_inputs: Dict[str, torch.Tensor],
output_names: List[str],
dynamic_axes: Dict[str, Dict[int, str]],
export_kwargs: Optional[Dict[str, any]] = None,
onnx_transform_kwargs: Optional[Dict[str, any]] = None,
export_dir: Optional[str] = None,
offload_pt_weights: bool = True,
prefill_only: Optional[bool] = False,
**export_kwargs,
) -> str:
"""
Export the PyTorch model to ONNX and apply ONNX transforms
@@ -232,11 +234,16 @@
instance using from_pretrained() for re-export.

"""
# TODO: Hack for retain_full_kv, handle this outside
export_kwargs.pop("retain_full_kv", None)
onnx_path = export_dir / f"{self.model_name}.onnx"

# Return early if ONNX already exists
if onnx_path.is_file():
self.onnx_path = onnx_path
if prefill_only:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
return onnx_path

# check if the model is in meta state or weights are offloaded
@@ -272,9 +279,6 @@
input_names.append(param)

try:
# Export to ONNX
export_kwargs = {} if export_kwargs is None else export_kwargs

torch.onnx.export(
self.model,
(example_inputs,),
@@ -318,9 +322,42 @@
finally:
shutil.rmtree(tmp_onnx_dir, ignore_errors=True)

self.onnx_path = onnx_path
if prefill_only:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
return onnx_path

def get_onnx_path(
self,
prefill_only: Optional[bool] = False,
enable_chunking: Optional[bool] = False,
specializations: Optional[List[Dict[str, int]]] = None,
offload_pt_weights: Optional[bool] = True,
use_onnx_subfunctions: Optional[bool] = False,
retain_full_kv: Optional[bool] = False,
):
kwargs = {
"offload_pt_weights": offload_pt_weights,
"use_onnx_subfunctions": use_onnx_subfunctions,
"retain_full_kv": retain_full_kv,
}
if prefill_only:
if self.prefill_onnx_path is None:
kwargs.update(
{
"prefill_only": prefill_only,
"prefill_seq_len": specializations[0].get("seq_len"),
"enable_chunking": enable_chunking,
}
)
self.export(**kwargs)
return self.prefill_onnx_path
else:
if self.onnx_path is None:
self.export(**kwargs)
return self.onnx_path

@dump_qconfig
def _compile(
self,
@@ -335,6 +372,10 @@
enable_qnn: Optional[bool] = False,
qnn_config: Optional[str] = None,
use_onnx_subfunctions: bool = False,
prefill_only: Optional[str] = None,
offload_pt_weights: Optional[bool] = True,
enable_chunking: Optional[bool] = False,
retain_full_kv: Optional[bool] = None,
**compiler_options,
) -> str:
"""
@@ -360,11 +401,18 @@

For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored.
"""

if onnx_path is None and self.onnx_path is None:
self.export(use_onnx_subfunctions=use_onnx_subfunctions)

onnx_path = Path(onnx_path or self.onnx_path)
onnx_path = Path(
onnx_path
if onnx_path
else self.get_onnx_path(
prefill_only,
enable_chunking,
specializations,
offload_pt_weights,
use_onnx_subfunctions,
retain_full_kv,
)
)
compile_dir = Path(compile_dir or onnx_path.parent)
qpc_path = compile_dir / "qpc"
if not onnx_path.is_file():
@@ -426,6 +474,7 @@
"mdp_ts_num_devices": mdp_ts_num_devices,
"mdp_ts_json": mdp_ts_json,
"num_speculative_tokens": num_speculative_tokens,
"prefill_only": prefill_only,
}
compile_hash = hash_dict_params(compile_hash_params)

@@ -465,6 +514,16 @@

command.append(f"-aic-binary-dir={qpc_path}")
logger.info(f"Running compiler: {' '.join(command)}")
if use_onnx_subfunctions:

class FeatureNotAvailableError(Exception):
pass

exec_command = f'QAIC_COMPILER_OPTS_UNSUPPORTED="-loader-inline-all=0" {" ".join(command)}'
raise FeatureNotAvailableError(
"ONNX graph is exported with subfunctions, assert version of apps SDK should be used for compiling this model."
+ f"\nRun following command manually with assert compiler:\n{exec_command}"
)
try:
subprocess.run(command, capture_output=True, check=True)
except subprocess.CalledProcessError as e:
@@ -485,5 +544,4 @@ def _compile(
logger.info("Hashed parameters exported successfully.")

self.qpc_path = qpc_path

return qpc_path
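The new get_onnx_path helper caches one ONNX artifact per mode: prefill_onnx_path for prefill-only exports, onnx_path for the regular graph, each exported lazily on first use. A simplified, self-contained sketch of that caching pattern (illustrative names, not the library API):

from typing import Optional


class OnnxArtifactCache:
    """Illustrative stand-in for the per-mode caching in get_onnx_path()."""

    def __init__(self) -> None:
        self.prefill_onnx_path: Optional[str] = None  # prefill-only graph
        self.onnx_path: Optional[str] = None          # regular graph

    def get_onnx_path(self, prefill_only: bool = False, **export_kwargs) -> str:
        # Each mode owns its own artifact; export runs at most once per mode.
        if prefill_only:
            if self.prefill_onnx_path is None:
                self.prefill_onnx_path = self._export(True, **export_kwargs)
            return self.prefill_onnx_path
        if self.onnx_path is None:
            self.onnx_path = self._export(False, **export_kwargs)
        return self.onnx_path

    def _export(self, prefill_only: bool, **export_kwargs) -> str:
        # Stand-in for QEFFBaseModel._export(); one distinct path per mode.
        return "model_prefill.onnx" if prefill_only else "model.onnx"


cache = OnnxArtifactCache()
assert cache.get_onnx_path(prefill_only=True) == "model_prefill.onnx"
assert cache.get_onnx_path() == "model.onnx"

Note also that _compile now refuses to invoke the standard compiler when use_onnx_subfunctions is set: it raises and prints the manual command (with -loader-inline-all=0) to run against the assert-build apps SDK, and prefill_only joins the compile hash so prefill and decode QPCs never collide.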
4 changes: 2 additions & 2 deletions QEfficient/base/onnx_transforms.py
@@ -95,12 +95,12 @@ class CustomOpTransform(BaseOnnxTransform):
"CtxScatterFunc3D": (CtxScatterFunc3D, CtxScatter3D),
"CtxGatherFunc": (CtxGatherFunc, CtxGather),
"CtxGatherFunc3D": (CtxGatherFunc3D, CtxGather3D),
"CtxScatterFuncCB": (CtxScatterFuncCB, CtxScatterCB),
"CtxScatterFuncCB3D": (CtxScatterFuncCB3D, CtxScatterCB3D),
"CtxGatherFuncCB": (CtxGatherFuncCB, CtxGatherCB),
"CtxGatherFuncCB3D": (CtxGatherFuncCB3D, CtxGatherCB3D),
"CtxGatherFuncBlockedKV": (CtxGatherFuncBlockedKV, CtxGatherBlockedKV),
"CtxGatherFuncBlockedKVCB": (CtxGatherFuncBlockedKVCB, CtxGatherBlockedKVCB),
"CtxScatterFuncCB": (CtxScatterFuncCB, CtxScatterCB),
"CtxGatherFuncCB": (CtxGatherFuncCB, CtxGatherCB),
}

@classmethod
1 change: 1 addition & 0 deletions QEfficient/customop/ctx_scatter_gather.py
@@ -136,6 +136,7 @@ class CtxGatherFunc(torch.autograd.Function):
def forward(data: torch.Tensor, ctx_indices: torch.Tensor, comp_ctx_len: int):
batch_indices = torch.arange(data.shape[0]).view(-1, 1, 1)
head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
ctx_indices = torch.where(ctx_indices == torch.iinfo(torch.int32).max, 0, ctx_indices)
return data[batch_indices, head_indices, ctx_indices]

@staticmethod
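The added torch.where guards the gather against the int32-max sentinel used for padded context positions: sentinel indices are redirected to index 0 so the advanced indexing stays in bounds (whatever is gathered there is masked out downstream). A minimal sketch with toy shapes:

import torch

INT32_MAX = torch.iinfo(torch.int32).max  # sentinel for padded positions

# data: (batch, heads, ctx_len, head_dim)
data = torch.arange(2 * 1 * 4 * 3, dtype=torch.float32).view(2, 1, 4, 3)
# ctx_indices: (batch, heads, comp_ctx_len); last slot is padding
ctx_indices = torch.tensor([[[0, 2, INT32_MAX]],
                            [[1, 3, INT32_MAX]]])

batch_indices = torch.arange(data.shape[0]).view(-1, 1, 1)
head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
# Clamp the sentinel to a valid index before gathering
ctx_indices = torch.where(ctx_indices == INT32_MAX, 0, ctx_indices)
out = data[batch_indices, head_indices, ctx_indices]
print(out.shape)  # torch.Size([2, 1, 3, 3])

The continuous-batching variant in the next file applies the same guard but clamps any index >= data.shape[2], which covers the sentinel as well as indices beyond the allocated cache length.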
1 change: 1 addition & 0 deletions QEfficient/customop/ctx_scatter_gather_cb.py
@@ -126,6 +126,7 @@ class CtxGatherFuncCB(torch.autograd.Function):
def forward(data: torch.Tensor, batch_index: torch.Tensor, ctx_indices: torch.Tensor, comp_ctx_len: int):
batch_indices = batch_index.view(-1, 1, 1)
head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
ctx_indices = torch.where(ctx_indices >= data.shape[2], 0, ctx_indices)
return data[batch_indices, head_indices, ctx_indices]

@staticmethod
16 changes: 8 additions & 8 deletions QEfficient/diffusers/pipelines/pipeline_module.py
@@ -102,7 +102,7 @@ def export(
output_names: List[str],
dynamic_axes: Dict,
export_dir: str = None,
export_kwargs: Dict = None,
export_kwargs: Dict = {},
) -> str:
"""
Export the text encoder model to ONNX format.
@@ -122,7 +122,7 @@
output_names=output_names,
dynamic_axes=dynamic_axes,
export_dir=export_dir,
export_kwargs=export_kwargs,
**export_kwargs,
)

def compile(self, specializations: List[Dict], **compiler_options) -> None:
@@ -179,7 +179,7 @@ def export(
output_names: List[str],
dynamic_axes: Dict,
export_dir: str = None,
export_kwargs: Dict = None,
export_kwargs: Dict = {},
) -> str:
"""
Export the UNet model to ONNX format.
@@ -199,7 +199,7 @@
output_names=output_names,
dynamic_axes=dynamic_axes,
export_dir=export_dir,
export_kwargs=export_kwargs,
**export_kwargs,
)

def compile(self, specializations: List[Dict], **compiler_options) -> None:
@@ -292,7 +292,7 @@ def export(
output_names: List[str],
dynamic_axes: Dict,
export_dir: str = None,
export_kwargs: Dict = None,
export_kwargs: Dict = {},
) -> str:
"""
Export the VAE model to ONNX format.
@@ -312,7 +312,7 @@
output_names=output_names,
dynamic_axes=dynamic_axes,
export_dir=export_dir,
export_kwargs=export_kwargs,
**export_kwargs,
)

def compile(self, specializations: List[Dict], **compiler_options) -> None:
@@ -438,7 +438,7 @@ def export(
output_names: List[str],
dynamic_axes: Dict,
export_dir: str = None,
export_kwargs: Dict = None,
export_kwargs: Dict = {},
use_onnx_subfunctions: bool = False,
) -> str:
"""
Expand Down Expand Up @@ -466,8 +466,8 @@ def export(
output_names=output_names,
dynamic_axes=dynamic_axes,
export_dir=export_dir,
export_kwargs=export_kwargs,
offload_pt_weights=False, # As weights are needed with AdaLN changes
**export_kwargs,
)

def compile(self, specializations: List[Dict], **compiler_options) -> None:
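The pipeline-module changes above are all the same mechanical migration: extra torch.onnx.export options now reach _export as plain keyword arguments via **export_kwargs instead of a dict threaded through a dedicated export_kwargs= parameter. A minimal sketch of the two calling styles (hypothetical function, not the library API):

from typing import Any, Dict, List


def _export(example_inputs: Dict, output_names: List, dynamic_axes: Dict,
            **export_kwargs: Any) -> Dict:
    # Options such as do_constant_folding=False arrive as ordinary kwargs
    # and would be forwarded straight into torch.onnx.export(...).
    return dict(export_kwargs)


# Old convention: thread a dict through, then splat it at the call site
old_style = _export({}, [], {}, **{"do_constant_folding": False})
# New convention: the caller writes the option inline
new_style = _export({}, [], {}, do_constant_folding=False)
assert old_style == new_style == {"do_constant_folding": False}

One side effect worth noting: the default also changed from None to {}. A shared mutable default is safe here only as long as the export path never mutates the dict it receives.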
6 changes: 3 additions & 3 deletions QEfficient/peft/auto.py
@@ -253,7 +253,7 @@ def from_pretrained(cls, pretrained_name_or_path: str, *args, **kwargs):
obj = cls._from_pretrained(pretrained_name_or_path, *args, **kwargs)
return obj

def export(self, export_dir: Optional[str] = None, use_onnx_subfunctions: bool = False) -> str:
def export(self, export_dir: Optional[str] = None, **kwargs) -> str:
"""
Export the model with the active adapter to ONNX format.
@@ -291,10 +291,10 @@ def export(self, export_dir: Optional[str] = None, use_onnx_subfunctions: bool =
example_inputs,
output_names,
dynamic_axes,
export_kwargs={"do_constant_folding": False}, # To avoid merging adapter weights with base weights
do_constant_folding=False, # To avoid merging adapter weights with base weights
onnx_transform_kwargs={"adapter_name": self.model.active_adapter},
export_dir=export_dir,
use_onnx_subfunctions=use_onnx_subfunctions,
**kwargs,
)

def compile(
4 changes: 2 additions & 2 deletions QEfficient/peft/lora/auto.py
@@ -327,7 +327,7 @@ def _init_adapter_model(self):
# load_weight to model
self._load_adapter_weights_to_model()

def export(self, export_dir: Optional[str] = None, use_onnx_subfunctions: bool = False) -> str:
def export(self, export_dir: Optional[str] = None, **kwargs) -> str:
"""
Export the model with all loaded adapters to ONNX format using ``torch.onnx.export``.
@@ -387,7 +387,7 @@ def export(self, export_dir: Optional[str] = None, use_onnx_subfunctions: bool =
output_names,
dynamic_axes,
export_dir=export_dir,
use_onnx_subfunctions=use_onnx_subfunctions,
**kwargs,
)

def generate(