Changes from all commits (42 commits)
e05f05a
Diffusers support (#604)
quic-amitraj Dec 9, 2025
a2d91cc
Stage-1 export debug
quic-amitraj Jul 10, 2025
b47884a
Stage-2 Export initial working version done
quic-amitraj Jul 16, 2025
f4a4784
Stage-3 compilation work is in progress
quic-amitraj Jul 17, 2025
1189dc9
Stage-4 Working pipeline with wrong output
quic-amitraj Jul 23, 2025
712f18b
Testing
quic-amitraj Jul 30, 2025
f6f4235
Testing
quic-amitraj Jul 30, 2025
a0d8b2c
Working sd3-turbo
quic-amitraj Aug 3, 2025
8e4fbd1
Working with cleaned code
quic-amitraj Aug 5, 2025
3f3c14a
Working with cleaned code
quic-amitraj Aug 5, 2025
beeddfa
Working with vae_included
quic-amitraj Aug 6, 2025
14db9b6
Fix-1
quic-amitraj Aug 8, 2025
096cb91
Fix-2
quic-amitraj Aug 10, 2025
c2c86d7
Fix-3
quic-amitraj Aug 13, 2025
e7a0f65
Added readme for diffusers
quic-amitraj Aug 14, 2025
89c250c
Code cleanup
quic-amitraj Aug 14, 2025
2d31d34
Code cleanup-2
quic-amitraj Aug 15, 2025
539f425
Minor fix
quic-amitraj Aug 20, 2025
3dd8f37
Added Support of flux
quic-amitraj Sep 19, 2025
4edcee6
Updated seq_len of flux transformers
tv-karthikeya Sep 24, 2025
759a1b7
Removing SD3, adding small fix for flux model hash
tv-karthikeya Oct 9, 2025
11c59be
adding device id support for flux for all stages
tv-karthikeya Oct 9, 2025
93890b6
[WIP] Adding support for custom height, width
tv-karthikeya Nov 3, 2025
7708282
Flux support with Custom config
quic-amitraj Nov 4, 2025
77e4527
Added OnnxfunctionTransform and code cleanup while modifying compile …
quic-amitraj Nov 4, 2025
8e2dc5c
Compile fix
quic-amitraj Nov 5, 2025
ad3bba7
Modification of Pipeline-1
Nov 6, 2025
f4fea00
Modification of Pipeline-2
Nov 7, 2025
6d8718e
Update readme for diffusers
Nov 10, 2025
6a733be
Added support of output dataclass
Nov 11, 2025
ddf5bc5
Replaced output dict with dataclass to make it more user friendly
Nov 11, 2025
e037478
Rebased with main and fixed some issues
Nov 12, 2025
318e0df
Code cleaning and removed redundant code
Nov 13, 2025
de503b8
Code cleaning and removed redundant code-2
Nov 13, 2025
2be94c6
Added tqdm for export and compile
Nov 13, 2025
24e0b26
Parallel compilation and onnx subfunction is added
Nov 14, 2025
e3e26f3
Onboarding Qwen Image
qcdipankar Nov 19, 2025
f2eb650
Onboarding Qwen Image
qcdipankar Nov 19, 2025
3b905c1
Moved Rotary Embed inside
qcdipankar Nov 19, 2025
e11a7fb
Cleaning and rebase with diffusers
qcdipankar Dec 10, 2025
a8d87d2
Subfunction fixes for KV cache transform (#655)
abhishek-singh591 Dec 10, 2025
76a0ccb
Merge branch 'config_support_diffusers' into qwen_image_pipeline
qcdipankar Dec 10, 2025
5 changes: 4 additions & 1 deletion QEfficient/__init__.py
@@ -19,6 +19,7 @@
)
from QEfficient.compile.compile_helper import compile
from QEfficient.diffusers.pipelines.flux.pipeline_flux import QEffFluxPipeline
from QEfficient.diffusers.pipelines.qwen_image.pipeline_qwenimage import QEFFQwenImagePipeline
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
@@ -41,6 +42,8 @@
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
"QEffFluxPipeline",
"QEFFQwenImagePipeline",

]
# For faster downloads via hf_transfer
# This code is put above import statements as this needs to be executed before
@@ -70,4 +73,4 @@ def check_qaic_sdk():


if not check_qaic_sdk():
logger.warning("QAIC SDK is not installed, eager mode features won't be available!")
logger.warning("QAIC SDK is not installed, eager mode features won't be available!")
4 changes: 4 additions & 0 deletions QEfficient/base/modeling_qeff.py
@@ -141,7 +141,11 @@
"""
Get the model configuration as a dictionary.

Returns:
Dict: The configuration dictionary of the underlying HuggingFace model
"""
return self.model.config.__dict__
This is an abstract property that must be implemented by all subclasses.

Check failure on line 148 in QEfficient/base/modeling_qeff.py (GitHub Actions / lint): Ruff invalid-syntax, reported at nine columns of line 148 (cols 20-70): Simple statements must be separated by newlines or semicolons.
Typically returns: self.model.config.__dict__

Check failure on line 149 in QEfficient/base/modeling_qeff.py (GitHub Actions / lint): Ruff invalid-syntax at 148:81: Expected an identifier.

Returns:
6 changes: 6 additions & 0 deletions QEfficient/base/onnx_transforms.py
@@ -19,16 +19,20 @@
from QEfficient.customop.ctx_scatter_gather import (
CtxGather,
CtxGather3D,
CtxGatherBlockedKV,
CtxGatherFunc,
CtxGatherFunc3D,
CtxGatherFuncBlockedKV,
CtxScatter,
CtxScatter3D,
CtxScatterFunc,
CtxScatterFunc3D,
)
from QEfficient.customop.ctx_scatter_gather_cb import (
CtxGatherBlockedKVCB,
CtxGatherCB,
CtxGatherCB3D,
CtxGatherFuncBlockedKVCB,
CtxGatherFuncCB,
CtxGatherFuncCB3D,
CtxScatterCB,
@@ -95,6 +99,8 @@ class CustomOpTransform(BaseOnnxTransform):
"CtxScatterFuncCB3D": (CtxScatterFuncCB3D, CtxScatterCB3D),
"CtxGatherFuncCB": (CtxGatherFuncCB, CtxGatherCB),
"CtxGatherFuncCB3D": (CtxGatherFuncCB3D, CtxGatherCB3D),
"CtxGatherFuncBlockedKV": (CtxGatherFuncBlockedKV, CtxGatherBlockedKV),
"CtxGatherFuncBlockedKVCB": (CtxGatherFuncBlockedKVCB, CtxGatherBlockedKVCB),
}

@classmethod
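The two BlockedKV entries registered above add the blocked-KV gather variants to the same name-to-op table CustomOpTransform already uses for the other CtxGather/CtxScatter functions. As a rough illustration of the gather semantics such an op implements (a toy sketch of the indexing only, not the real ONNX custom-function implementation; all tensor names here are made up):

import torch

# Toy KV cache with layout (batch, heads, ctx_len, head_dim).
kv_cache = torch.randn(1, 2, 8, 4)
# Positions to read for the current decode step.
position_ids = torch.tensor([0, 1, 2, 3])

# A context gather picks cached rows by position id along the context axis.
gathered = kv_cache[:, :, position_ids, :]
assert gathered.shape == (1, 2, 4, 4)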
75 changes: 75 additions & 0 deletions QEfficient/diffusers/models/attention.py
@@ -0,0 +1,75 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------

import torch
from diffusers.models.attention import JointTransformerBlock, _chunked_feed_forward


class QEffJointTransformerBlock(JointTransformerBlock):
def forward(
self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor
):
if self.use_dual_attention:
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
hidden_states, emb=temb
)
else:
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

if self.context_pre_only:
norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
else:
norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
encoder_hidden_states, emb=temb
)

# Attention.
attn_output, context_attn_output = self.attn(
hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
)

# Process attention outputs for the `hidden_states`.
attn_output = gate_msa.unsqueeze(1) * attn_output
hidden_states = hidden_states + attn_output

if self.use_dual_attention:
attn_output2 = self.attn2(hidden_states=norm_hidden_states2)
attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
hidden_states = hidden_states + attn_output2

norm_hidden_states = self.norm2(hidden_states)
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
if self._chunk_size is not None:
# "feed_forward_chunk_size" can be used to save memory
ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
else:
# Blocked feed-forward over the sequence (upstream call: self.ff(norm_hidden_states))
ff_output = self.ff(norm_hidden_states, block_size=4096)
ff_output = gate_mlp.unsqueeze(1) * ff_output

hidden_states = hidden_states + ff_output

# Process attention outputs for the `encoder_hidden_states`.
if self.context_pre_only:
encoder_hidden_states = None
else:
context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
encoder_hidden_states = encoder_hidden_states + context_attn_output

norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
if self._chunk_size is not None:
# "feed_forward_chunk_size" can be used to save memory
context_ff_output = _chunked_feed_forward(
self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
)
else:
# Blocked feed-forward for the context stream (upstream call: self.ff_context(norm_encoder_hidden_states))
context_ff_output = self.ff_context(norm_encoder_hidden_states, block_size=333)
encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output

return encoder_hidden_states, hidden_states
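The block_size arguments above route the MLPs through a blocked feed-forward that processes the sequence in fixed-size chunks to bound peak activation memory; because the feed-forward is applied position-wise, chunking along the sequence axis does not change the result. A minimal equivalence check with a toy MLP (assumption: the QEff FeedForward's block_size chunks the sequence dimension in this same way):

import torch
import torch.nn as nn

ff = nn.Sequential(nn.Linear(16, 64), nn.GELU(), nn.Linear(64, 16))
x = torch.randn(2, 10, 16)  # (batch, seq_len, dim)

full = ff(x)  # one-shot feed-forward

# Blocked: run each sequence chunk separately and concatenate.
blocked = torch.cat([ff(chunk) for chunk in x.split(4, dim=1)], dim=1)

assert torch.allclose(full, blocked, atol=1e-5)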
158 changes: 158 additions & 0 deletions QEfficient/diffusers/models/attention_processor.py
@@ -0,0 +1,158 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------

from typing import Optional

import torch
from diffusers.models.attention_processor import Attention, JointAttnProcessor2_0

from QEfficient.diffusers.models.transformers.transformer_qwenimage import QEffQwenDoubleStreamAttnProcessor2_0


class QEffAttention(Attention):
def __qeff_init__(self):
# Install the QEff double-stream attention processor and set the query
# tile size used by the blocked-attention path.
processor = QEffQwenDoubleStreamAttnProcessor2_0()
self.processor = processor
processor.query_block_size = 64

def get_attention_scores(
self, query: torch.Tensor, key: torch.Tensor, attention_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
dtype = query.dtype
if self.upcast_attention:
query = query.float()
key = key.float()

if attention_mask is None:
baddbmm_input = torch.empty(
query.shape[0], query.shape[1], key.shape[2], dtype=query.dtype, device=query.device
)
beta = 0
else:
baddbmm_input = attention_mask
beta = 1

attention_scores = torch.baddbmm(
baddbmm_input,
query,
key,
beta=beta,
alpha=self.scale,
)
del baddbmm_input

if self.upcast_softmax:
attention_scores = attention_scores.float()

attention_probs = attention_scores.softmax(dim=-1)
del attention_scores

attention_probs = attention_probs.to(dtype)

return attention_probs


class QEffJointAttnProcessor2_0(JointAttnProcessor2_0):
def __call__(
self,
attn: QEffAttention,
hidden_states: torch.FloatTensor,
encoder_hidden_states: torch.FloatTensor = None,
attention_mask: Optional[torch.FloatTensor] = None,
*args,
**kwargs,
) -> torch.FloatTensor:
residual = hidden_states

batch_size = hidden_states.shape[0]

# `sample` projections.
query = attn.to_q(hidden_states)
key = attn.to_k(hidden_states)
value = attn.to_v(hidden_states)

inner_dim = key.shape[-1]
head_dim = inner_dim // attn.heads

query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

if attn.norm_q is not None:
query = attn.norm_q(query)
if attn.norm_k is not None:
key = attn.norm_k(key)

# `context` projections.
if encoder_hidden_states is not None:
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
batch_size, -1, attn.heads, head_dim
).transpose(1, 2)
encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
batch_size, -1, attn.heads, head_dim
).transpose(1, 2)
encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
batch_size, -1, attn.heads, head_dim
).transpose(1, 2)

if attn.norm_added_q is not None:
encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
if attn.norm_added_k is not None:
encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)

query = torch.cat([query, encoder_hidden_states_query_proj], dim=2)
key = torch.cat([key, encoder_hidden_states_key_proj], dim=2)
value = torch.cat([value, encoder_hidden_states_value_proj], dim=2)

query = query.reshape(-1, query.shape[-2], query.shape[-1])
key = key.reshape(-1, key.shape[-2], key.shape[-1])
value = value.reshape(-1, value.shape[-2], value.shape[-1])

# pre-transpose the key
key = key.transpose(-1, -2)
if query.size(-2) != value.size(-2): # cross-attention, use regular attention
# QKV done in single block
attention_probs = attn.get_attention_scores(query, key, attention_mask)
hidden_states = torch.bmm(attention_probs, value)
else: # self-attention, use blocked attention
# QKV done with block-attention (a la FlashAttentionV2)
query_block_size = self.query_block_size
query_seq_len = query.size(-2)
num_blocks = (query_seq_len + query_block_size - 1) // query_block_size
for qidx in range(num_blocks):
query_block = query[:, qidx * query_block_size : (qidx + 1) * query_block_size, :]
attention_probs = attn.get_attention_scores(query_block, key, attention_mask)
hidden_states_block = torch.bmm(attention_probs, value)
if qidx == 0:
hidden_states = hidden_states_block
else:
hidden_states = torch.cat((hidden_states, hidden_states_block), -2)
hidden_states = attn.batch_to_head_dim(hidden_states)

if encoder_hidden_states is not None:
# Split the attention outputs.
hidden_states, encoder_hidden_states = (
hidden_states[:, : residual.shape[1]],
hidden_states[:, residual.shape[1] :],
)
if not attn.context_pre_only:
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

# linear proj
hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)

if encoder_hidden_states is not None:
return hidden_states, encoder_hidden_states
else:
return hidden_states
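The self-attention branch above tiles the queries into blocks of query_block_size, FlashAttention-style, so each score matrix stays within a fixed working-set size; since the softmax is taken row-wise over the full key axis, the per-block outputs concatenate to exactly the full-attention result. A minimal numerical check of that equivalence (plain PyTorch, independent of the QEff classes):

import torch

torch.manual_seed(0)
q = torch.randn(3, 10, 8)  # (batch*heads, q_len, head_dim)
k = torch.randn(3, 12, 8)
v = torch.randn(3, 12, 8)
scale = 8 ** -0.5

full = torch.softmax(q @ k.transpose(-1, -2) * scale, dim=-1) @ v

# Tile the queries; keys/values stay whole, so each row's softmax is unchanged.
outs = []
for i in range(0, q.size(1), 4):
    qb = q[:, i : i + 4, :]
    outs.append(torch.softmax(qb @ k.transpose(-1, -2) * scale, dim=-1) @ v)
blocked = torch.cat(outs, dim=1)

assert torch.allclose(full, blocked, atol=1e-6)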
6 changes: 6 additions & 0 deletions QEfficient/diffusers/models/autoencoders/__init__.py
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------
31 changes: 31 additions & 0 deletions QEfficient/diffusers/models/autoencoders/autoencoder_kl.py
@@ -0,0 +1,31 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# ----------------------------------------------------------------------------

import torch
from diffusers import AutoencoderKL


class QEffAutoencoderKL(AutoencoderKL):
def encode(self, x: torch.Tensor, return_dict: bool = True):
"""
Encode a batch of images into latents.

Args:
x (`torch.Tensor`): Input batch of images.
return_dict (`bool`, *optional*, defaults to `True`):
Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

Returns:
The latent representations of the encoded images. If `return_dict` is True, a
[`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
"""
if self.use_slicing and x.shape[0] > 1:
encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
h = torch.cat(encoded_slices)
else:
h = self._encode(x)
return h
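use_slicing trades throughput for peak memory: the batch is encoded one image at a time and the latents concatenated, which is equivalent because the encoder treats batch items independently. A toy check with a stand-in encoder (the real work happens in AutoencoderKL._encode):

import torch
import torch.nn as nn

encoder = nn.Conv2d(3, 4, kernel_size=3, padding=1)  # stand-in for _encode
x = torch.randn(4, 3, 32, 32)

h_full = encoder(x)  # whole batch at once
h_sliced = torch.cat([encoder(x_slice) for x_slice in x.split(1)])  # sliced

assert torch.allclose(h_full, h_sliced, atol=1e-6)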