Merged
51 commits
af0e6a7
[QEff]: Add gpt_oss
vbaddi Aug 6, 2025
2d442eb
nit: update modeling and make transform uniform
vbaddi Aug 7, 2025
ab8cc9c
apirunner change
ochougul Aug 7, 2025
e7ecc19
added test along with simplified Hybridcache
ochougul Aug 7, 2025
a583265
added test assert
ochougul Aug 7, 2025
dc2cc2a
nit: update test gpt file
vbaddi Aug 8, 2025
f8dac17
nit: update modeling with new decode moe forward
vbaddi Aug 11, 2025
99815cf
nit: separate gate, up projections for MoE
vbaddi Aug 20, 2025
4948397
nit: remove test file and add sample test in config
Oct 15, 2025
bde09c7
Enable CB for GptOssModel
mamtsing Nov 3, 2025
3fe07a8
Fix tests
mamtsing Nov 4, 2025
3fa01df
Address review comments
mamtsing Nov 4, 2025
4f910e0
prefill only changes for gpt-oss
ochougul Nov 4, 2025
88f9f75
fixed mapping
ochougul Nov 5, 2025
aac4be0
added test
ochougul Nov 6, 2025
1d7220a
added test
ochougul Nov 6, 2025
51316d5
made example not ugly
ochougul Nov 6, 2025
e6e2969
fixed tests
ochougul Nov 6, 2025
2334056
fixed tests
ochougul Nov 6, 2025
0c98397
added new test and fixed failing tests
ochougul Nov 7, 2025
ef4d751
fixed tests
ochougul Nov 10, 2025
d2d55de
fixed kv cache shape
ochougul Nov 10, 2025
23bb9ab
fixed self.onnx_path issue in modeling_qeff
ochougul Nov 11, 2025
a51ef91
added ffn blocking and num blocks env variables
ochougul Nov 13, 2025
a829a05
include num_ffn_blocks in hash
ochougul Nov 17, 2025
eb8c7c3
fixed dynamic range in case of subfunc issue and nonmatching ctx, pre…
ochougul Nov 18, 2025
f6c320e
added swa optimization for reducing MACCs using less KV
ochougul Nov 18, 2025
69a696d
added opt swa to hash
ochougul Nov 24, 2025
50c9b7f
lint and format
ochougul Nov 24, 2025
a53f7bb
enabled chunking
ochougul Nov 26, 2025
ff1d05b
added ChunkedPrefillMLP block; fixed passing prefill_only flag and en…
ochougul Dec 1, 2025
80571aa
added disagg mode example for chunking mode
ochougul Dec 2, 2025
c403ba7
fixed the kwargs passing to build_decode_specialization
ochougul Dec 2, 2025
3defe4c
pushed latest changes with chunking enabled for prefill along with re…
ochougul Dec 8, 2025
dc546ae
added support for prefix caching for gpt-oss
ochougul Dec 8, 2025
3b777e8
removed error
ochougul Dec 9, 2025
ba77602
added errors for prefill-only mode
ochougul Dec 9, 2025
0680508
fix decode-only model
ochougul Dec 10, 2025
be5ef75
fixed CB for decode-only model
ochougul Dec 10, 2025
cc3bb0b
created readme
ochougul Dec 10, 2025
efd671a
rebased and made setup_onnx_sub explicit
ochougul Dec 10, 2025
86733cc
linting error
ochougul Dec 10, 2025
d46c9d0
fixed use_onnx_subfunc
ochougul Dec 11, 2025
82caac6
fixed tests
ochougul Dec 11, 2025
65f93b1
linter
ochougul Dec 11, 2025
edbc7e8
added missing marker
ochougul Dec 11, 2025
4270d2c
pushed tests fix
ochougul Dec 11, 2025
85b23cd
fixed flux pipeline
ochougul Dec 11, 2025
c78ec66
tests fixed
ochougul Dec 11, 2025
502d289
Fix CI error for PL=1
mamtsing Dec 14, 2025
49bb40b
Merge branch 'main' into prefill+decode_gpt_oss
quic-mamta Dec 14, 2025
23 changes: 15 additions & 8 deletions QEfficient/__init__.py
@@ -6,7 +6,17 @@
# -----------------------------------------------------------------------------

import os
import warnings

# ----------------------------------------------------------------------------- #
# For faster downloads via hf_transfer
# This code is put above import statements as this needs to be executed before
# hf_transfer is imported (will happen on line 15 via leading imports)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# DO NOT ADD ANY CODE ABOVE THIS LINE
# Please contact maintainers if you must edit this file above this line.
# ----------------------------------------------------------------------------- #
# Placeholder for all non-transformer models registered in QEfficient
import warnings # noqa: I001

import QEfficient.utils.model_registery # noqa: F401
from QEfficient.base import (
@@ -26,6 +36,10 @@
from QEfficient.utils import custom_format_warning
from QEfficient.utils.logging_utils import logger

# custom warning for the better logging experience
warnings.formatwarning = custom_format_warning


# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter
__all__ = [
@@ -42,14 +56,7 @@
"QEFFCommonLoader",
"QEffFluxPipeline",
]
# For faster downloads via hf_transfer
# This code is put above import statements as this needs to be executed before
# hf_transfer is imported (will happen on line 15 via leading imports)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# Placeholder for all non-transformer models registered in QEfficient

# custom warning for the better logging experience
warnings.formatwarning = custom_format_warning

# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"
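The relocated block at the top of this diff encodes an import-order constraint: huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER once, at import time, so setting the variable after the imports (as the old placement at the bottom of the file did) had no effect. A minimal sketch of the pattern, assuming a standard huggingface_hub install with hf_transfer available:

import os

# Flags that a library reads at import time must be set before that
# library is first imported.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # must come first

# huggingface_hub captures the flag into a constant when this import runs
from huggingface_hub import snapshot_download

# Downloads now go through hf_transfer (if the package is installed)
snapshot_download("gpt2", allow_patterns=["config.json"])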
82 changes: 70 additions & 12 deletions QEfficient/base/modeling_qeff.py
@@ -60,6 +60,7 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None:
super().__init__()
self.model = model
self.hash_params = create_model_params(self, **kwargs)
self.prefill_onnx_path: Optional[str] = None
self.onnx_path: Optional[str] = None
self.qpc_path: Optional[str] = None
self.qpc_session: Optional[QAICInferenceSession] = None
@@ -204,10 +205,11 @@ def _export(
example_inputs: Dict[str, torch.Tensor],
output_names: List[str],
dynamic_axes: Dict[str, Dict[int, str]],
export_kwargs: Optional[Dict[str, any]] = None,
onnx_transform_kwargs: Optional[Dict[str, any]] = None,
export_dir: Optional[str] = None,
offload_pt_weights: bool = True,
prefill_only: Optional[bool] = False,
**export_kwargs,
) -> str:
"""
Export the PyTorch model to ONNX and apply ONNX transforms
@@ -232,11 +234,16 @@
instance using from_pretrained() for re-export.

"""
# TODO: Hack for retain_full_kv, handle this outside
export_kwargs.pop("retain_full_kv", None)
onnx_path = export_dir / f"{self.model_name}.onnx"

# Return early if ONNX already exists
if onnx_path.is_file():
self.onnx_path = onnx_path
if prefill_only:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
return onnx_path

# check if the model is in meta state or weights are offloaded
@@ -272,9 +279,6 @@
input_names.append(param)

try:
# Export to ONNX
export_kwargs = {} if export_kwargs is None else export_kwargs

torch.onnx.export(
self.model,
(example_inputs,),
@@ -318,9 +322,42 @@
finally:
shutil.rmtree(tmp_onnx_dir, ignore_errors=True)

self.onnx_path = onnx_path
if prefill_only:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
return onnx_path

def get_onnx_path(
self,
prefill_only: Optional[bool] = False,
enable_chunking: Optional[bool] = False,
specializations: Optional[List[Dict[str, int]]] = None,
offload_pt_weights: Optional[bool] = True,
use_onnx_subfunctions: Optional[bool] = False,
retain_full_kv: Optional[bool] = False,
):
kwargs = {
"offload_pt_weights": offload_pt_weights,
"use_onnx_subfunctions": use_onnx_subfunctions,
"retain_full_kv": retain_full_kv,
}
if prefill_only:
if self.prefill_onnx_path is None:
kwargs.update(
{
"prefill_only": prefill_only,
"prefill_seq_len": specializations[0].get("seq_len"),
"enable_chunking": enable_chunking,
}
)
self.export(**kwargs)
return self.prefill_onnx_path
else:
if self.onnx_path is None:
self.export(**kwargs)
return self.onnx_path

@dump_qconfig
def _compile(
self,
@@ -335,6 +372,10 @@
enable_qnn: Optional[bool] = False,
qnn_config: Optional[str] = None,
use_onnx_subfunctions: bool = False,
prefill_only: Optional[str] = None,
offload_pt_weights: Optional[bool] = True,
enable_chunking: Optional[bool] = False,
retain_full_kv: Optional[bool] = None,
**compiler_options,
) -> str:
"""
@@ -360,11 +401,18 @@

For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored.
"""

if onnx_path is None and self.onnx_path is None:
self.export(use_onnx_subfunctions=use_onnx_subfunctions)

onnx_path = Path(onnx_path or self.onnx_path)
onnx_path = Path(
onnx_path
if onnx_path
else self.get_onnx_path(
prefill_only,
enable_chunking,
specializations,
offload_pt_weights,
use_onnx_subfunctions,
retain_full_kv,
)
)
compile_dir = Path(compile_dir or onnx_path.parent)
qpc_path = compile_dir / "qpc"
if not onnx_path.is_file():
@@ -426,6 +474,7 @@
"mdp_ts_num_devices": mdp_ts_num_devices,
"mdp_ts_json": mdp_ts_json,
"num_speculative_tokens": num_speculative_tokens,
"prefill_only": prefill_only,
}
compile_hash = hash_dict_params(compile_hash_params)

@@ -465,6 +514,16 @@

command.append(f"-aic-binary-dir={qpc_path}")
logger.info(f"Running compiler: {' '.join(command)}")
if use_onnx_subfunctions:

class FeatureNotAvailableError(Exception):
pass

exec_command = f'QAIC_COMPILER_OPTS_UNSUPPORTED="-loader-inline-all=0" {" ".join(command)}'
raise FeatureNotAvailableError(
"ONNX graph is exported with subfunctions, assert version of apps SDK should be used for compiling this model."
+ f"\nRun following command manually with assert compiler:\n{exec_command}"
)
try:
subprocess.run(command, capture_output=True, check=True)
except subprocess.CalledProcessError as e:
@@ -485,5 +544,4 @@ def _compile(
logger.info("Hashed parameters exported successfully.")

self.qpc_path = qpc_path

return qpc_path
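The new get_onnx_path helper caches one ONNX artifact per mode: prefill_onnx_path for prefill-only exports, onnx_path for the regular graph, each exported lazily on first use. A simplified, self-contained sketch of that caching pattern (illustrative names, not the library API):

from typing import Optional


class OnnxArtifactCache:
    """Illustrative stand-in for the per-mode caching in get_onnx_path()."""

    def __init__(self) -> None:
        self.prefill_onnx_path: Optional[str] = None  # prefill-only graph
        self.onnx_path: Optional[str] = None          # regular graph

    def get_onnx_path(self, prefill_only: bool = False, **export_kwargs) -> str:
        # Each mode owns its own artifact; export runs at most once per mode.
        if prefill_only:
            if self.prefill_onnx_path is None:
                self.prefill_onnx_path = self._export(True, **export_kwargs)
            return self.prefill_onnx_path
        if self.onnx_path is None:
            self.onnx_path = self._export(False, **export_kwargs)
        return self.onnx_path

    def _export(self, prefill_only: bool, **export_kwargs) -> str:
        # Stand-in for QEFFBaseModel._export(); one distinct path per mode.
        return "model_prefill.onnx" if prefill_only else "model.onnx"


cache = OnnxArtifactCache()
assert cache.get_onnx_path(prefill_only=True) == "model_prefill.onnx"
assert cache.get_onnx_path() == "model.onnx"

Note also that _compile now refuses to invoke the standard compiler when use_onnx_subfunctions is set: it raises and prints the manual command (with -loader-inline-all=0) to run against the assert-build apps SDK, and prefill_only joins the compile hash so prefill and decode QPCs never collide.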
4 changes: 2 additions & 2 deletions QEfficient/base/onnx_transforms.py
@@ -95,12 +95,12 @@ class CustomOpTransform(BaseOnnxTransform):
"CtxScatterFunc3D": (CtxScatterFunc3D, CtxScatter3D),
"CtxGatherFunc": (CtxGatherFunc, CtxGather),
"CtxGatherFunc3D": (CtxGatherFunc3D, CtxGather3D),
"CtxScatterFuncCB": (CtxScatterFuncCB, CtxScatterCB),
"CtxScatterFuncCB3D": (CtxScatterFuncCB3D, CtxScatterCB3D),
"CtxGatherFuncCB": (CtxGatherFuncCB, CtxGatherCB),
"CtxGatherFuncCB3D": (CtxGatherFuncCB3D, CtxGatherCB3D),
"CtxGatherFuncBlockedKV": (CtxGatherFuncBlockedKV, CtxGatherBlockedKV),
"CtxGatherFuncBlockedKVCB": (CtxGatherFuncBlockedKVCB, CtxGatherBlockedKVCB),
"CtxScatterFuncCB": (CtxScatterFuncCB, CtxScatterCB),
"CtxGatherFuncCB": (CtxGatherFuncCB, CtxGatherCB),
}

@classmethod
1 change: 1 addition & 0 deletions QEfficient/customop/ctx_scatter_gather.py
@@ -136,6 +136,7 @@ class CtxGatherFunc(torch.autograd.Function):
def forward(data: torch.Tensor, ctx_indices: torch.Tensor, comp_ctx_len: int):
batch_indices = torch.arange(data.shape[0]).view(-1, 1, 1)
head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
ctx_indices = torch.where(ctx_indices == torch.iinfo(torch.int32).max, 0, ctx_indices)
return data[batch_indices, head_indices, ctx_indices]

@staticmethod
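The added torch.where guards the gather against the int32-max sentinel used for padded context positions: sentinel indices are redirected to index 0 so the advanced indexing stays in bounds (whatever is gathered there is masked out downstream). A minimal sketch with toy shapes:

import torch

INT32_MAX = torch.iinfo(torch.int32).max  # sentinel for padded positions

# data: (batch, heads, ctx_len, head_dim)
data = torch.arange(2 * 1 * 4 * 3, dtype=torch.float32).view(2, 1, 4, 3)
# ctx_indices: (batch, heads, comp_ctx_len); last slot is padding
ctx_indices = torch.tensor([[[0, 2, INT32_MAX]],
                            [[1, 3, INT32_MAX]]])

batch_indices = torch.arange(data.shape[0]).view(-1, 1, 1)
head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
# Clamp the sentinel to a valid index before gathering
ctx_indices = torch.where(ctx_indices == INT32_MAX, 0, ctx_indices)
out = data[batch_indices, head_indices, ctx_indices]
print(out.shape)  # torch.Size([2, 1, 3, 3])

The continuous-batching variant in the next file applies the same guard but clamps any index >= data.shape[2], which covers the sentinel as well as indices beyond the allocated cache length.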
1 change: 1 addition & 0 deletions QEfficient/customop/ctx_scatter_gather_cb.py
@@ -126,6 +126,7 @@ class CtxGatherFuncCB(torch.autograd.Function):
def forward(data: torch.Tensor, batch_index: torch.Tensor, ctx_indices: torch.Tensor, comp_ctx_len: int):
batch_indices = batch_index.view(-1, 1, 1)
head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
ctx_indices = torch.where(ctx_indices >= data.shape[2], 0, ctx_indices)
return data[batch_indices, head_indices, ctx_indices]

@staticmethod
16 changes: 8 additions & 8 deletions QEfficient/diffusers/pipelines/pipeline_module.py
@@ -102,7 +102,7 @@ def export(
output_names: List[str],
dynamic_axes: Dict,
export_dir: str = None,
export_kwargs: Dict = None,
export_kwargs: Dict = {},
) -> str:
"""
Export the text encoder model to ONNX format.
@@ -122,7 +122,7 @@
output_names=output_names,
dynamic_axes=dynamic_axes,
export_dir=export_dir,
export_kwargs=export_kwargs,
**export_kwargs,
)

def compile(self, specializations: List[Dict], **compiler_options) -> None:
@@ -179,7 +179,7 @@ def export(
output_names: List[str],
dynamic_axes: Dict,
export_dir: str = None,
export_kwargs: Dict = None,
export_kwargs: Dict = {},
) -> str:
"""
Export the UNet model to ONNX format.
@@ -199,7 +199,7 @@
output_names=output_names,
dynamic_axes=dynamic_axes,
export_dir=export_dir,
export_kwargs=export_kwargs,
**export_kwargs,
)

def compile(self, specializations: List[Dict], **compiler_options) -> None:
@@ -292,7 +292,7 @@ def export(
output_names: List[str],
dynamic_axes: Dict,
export_dir: str = None,
export_kwargs: Dict = None,
export_kwargs: Dict = {},
) -> str:
"""
Export the VAE model to ONNX format.
@@ -312,7 +312,7 @@
output_names=output_names,
dynamic_axes=dynamic_axes,
export_dir=export_dir,
export_kwargs=export_kwargs,
**export_kwargs,
)

def compile(self, specializations: List[Dict], **compiler_options) -> None:
@@ -438,7 +438,7 @@ def export(
output_names: List[str],
dynamic_axes: Dict,
export_dir: str = None,
export_kwargs: Dict = None,
export_kwargs: Dict = {},
use_onnx_subfunctions: bool = False,
) -> str:
"""
Expand Down Expand Up @@ -466,8 +466,8 @@ def export(
output_names=output_names,
dynamic_axes=dynamic_axes,
export_dir=export_dir,
export_kwargs=export_kwargs,
offload_pt_weights=False, # As weights are needed with AdaLN changes
**export_kwargs,
)

def compile(self, specializations: List[Dict], **compiler_options) -> None:
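The pipeline-module changes above are all the same mechanical migration: extra torch.onnx.export options now reach _export as plain keyword arguments via **export_kwargs instead of a dict threaded through a dedicated export_kwargs= parameter. A minimal sketch of the two calling styles (hypothetical function, not the library API):

from typing import Any, Dict, List


def _export(example_inputs: Dict, output_names: List, dynamic_axes: Dict,
            **export_kwargs: Any) -> Dict:
    # Options such as do_constant_folding=False arrive as ordinary kwargs
    # and would be forwarded straight into torch.onnx.export(...).
    return dict(export_kwargs)


# Old convention: thread a dict through, then splat it at the call site
old_style = _export({}, [], {}, **{"do_constant_folding": False})
# New convention: the caller writes the option inline
new_style = _export({}, [], {}, do_constant_folding=False)
assert old_style == new_style == {"do_constant_folding": False}

One side effect worth noting: the default also changed from None to {}. A shared mutable default is safe here only as long as the export path never mutates the dict it receives.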
6 changes: 3 additions & 3 deletions QEfficient/peft/auto.py
@@ -253,7 +253,7 @@ def from_pretrained(cls, pretrained_name_or_path: str, *args, **kwargs):
obj = cls._from_pretrained(pretrained_name_or_path, *args, **kwargs)
return obj

def export(self, export_dir: Optional[str] = None, use_onnx_subfunctions: bool = False) -> str:
def export(self, export_dir: Optional[str] = None, **kwargs) -> str:
"""
Export the model with the active adapter to ONNX format.
@@ -291,10 +291,10 @@ def export(self, export_dir: Optional[str] = None, use_onnx_subfunctions: bool =
example_inputs,
output_names,
dynamic_axes,
export_kwargs={"do_constant_folding": False}, # To avoid merging adapter weights with base weights
do_constant_folding=False, # To avoid merging adapter weights with base weights
onnx_transform_kwargs={"adapter_name": self.model.active_adapter},
export_dir=export_dir,
use_onnx_subfunctions=use_onnx_subfunctions,
**kwargs,
)

def compile(
4 changes: 2 additions & 2 deletions QEfficient/peft/lora/auto.py
@@ -327,7 +327,7 @@ def _init_adapter_model(self):
# load_weight to model
self._load_adapter_weights_to_model()

def export(self, export_dir: Optional[str] = None, use_onnx_subfunctions: bool = False) -> str:
def export(self, export_dir: Optional[str] = None, **kwargs) -> str:
"""
Export the model with all loaded adapters to ONNX format using ``torch.onnx.export``.
@@ -387,7 +387,7 @@ def export(self, export_dir: Optional[str] = None, use_onnx_subfunctions: bool =
output_names,
dynamic_axes,
export_dir=export_dir,
use_onnx_subfunctions=use_onnx_subfunctions,
**kwargs,
)

def generate(