
Commit 09f6f80

Author: Amit Raj (committed)
Handled: 1. Issue when exporting multiple times 2. Meta device error after first export 3. Hash changing after each export
Signed-off-by: Amit Raj <[email protected]>
1 parent b2ec576 commit 09f6f80

File tree

11 files changed: +16, -111 lines


QEfficient/base/modeling_qeff.py

Lines changed: 0 additions & 1 deletion
@@ -201,7 +201,6 @@ def _export(
         onnx_transform_kwargs: Optional[Dict[str, any]] = None,
         export_dir: Optional[str] = None,
         offload_pt_weights: bool = True,
-        use_onnx_subfunctions: bool = False,
     ) -> str:
         """
         Export the PyTorch model to ONNX and apply ONNX transforms

QEfficient/diffusers/models/attention.py

Lines changed: 0 additions & 2 deletions
@@ -47,7 +47,6 @@ def forward(
             # "feed_forward_chunk_size" can be used to save memory
             ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
         else:
-            # ff_output = self.ff(norm_hidden_states)
             ff_output = self.ff(norm_hidden_states, block_size=4096)
         ff_output = gate_mlp.unsqueeze(1) * ff_output

@@ -68,7 +67,6 @@ def forward(
                 self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
             )
         else:
-            # context_ff_output = self.ff_context(norm_encoder_hidden_states)
             context_ff_output = self.ff_context(norm_encoder_hidden_states, block_size=333)
         encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
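
For readers unfamiliar with the pattern above: the block_size argument (4096 for hidden states, 333 for the context branch) belongs to QEfficient's patched feed-forward, and the helper below is only a minimal sketch of the general idea, assuming a plain MLP: split the sequence dimension into fixed-size chunks so peak activation memory stays bounded. blocked_feed_forward and the toy MLP are hypothetical names, not the repo's implementation.

import torch
import torch.nn as nn

def blocked_feed_forward(ff: nn.Module, x: torch.Tensor, block_size: int) -> torch.Tensor:
    # x: (batch, seq_len, dim). Run the MLP on chunks of at most `block_size` tokens
    # along the sequence dimension, so intermediate activations never span the full sequence.
    outputs = [ff(chunk) for chunk in torch.split(x, block_size, dim=1)]
    return torch.cat(outputs, dim=1)

# Usage sketch with a toy MLP standing in for the block's feed-forward.
ff = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
x = torch.randn(1, 10_000, 64)
assert blocked_feed_forward(ff, x, block_size=4096).shape == x.shape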

QEfficient/diffusers/models/normalization.py

Lines changed: 0 additions & 11 deletions
@@ -14,17 +14,9 @@ class QEffAdaLayerNormZero(AdaLayerNormZero):
     def forward(
         self,
         x: torch.Tensor,
-        timestep: Optional[torch.Tensor] = None,
-        class_labels: Optional[torch.LongTensor] = None,
-        hidden_dtype: Optional[torch.dtype] = None,
         shift_msa: Optional[torch.Tensor] = None,
         scale_msa: Optional[torch.Tensor] = None,
-        # emb: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        # if self.emb is not None:
-        #     emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)
-        #     emb = self.linear(self.silu(emb))
-        #     shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
         x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
         return x

@@ -36,15 +28,12 @@ def forward(
         scale_msa: Optional[torch.Tensor] = None,
         shift_msa: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        # shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=1)
         x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
         return x


 class QEffAdaLayerNormContinuous(AdaLayerNormContinuous):
     def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
-        # convert back to the original dtype in case `conditioning_embedding` is upcasted to float32 (needed for hunyuanDiT)
-        # emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
         emb = conditioning_embedding
         scale, shift = torch.chunk(emb, 2, dim=1)
         x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
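
The reworked signatures above take shift_msa/scale_msa as precomputed inputs instead of deriving them from timestep/class_labels inside the norm module. A minimal sketch of the modulation that the retained line performs, with toy shapes and a standalone LayerNorm assumed for illustration:

import torch

batch, seq_len, dim = 2, 16, 64
x = torch.randn(batch, seq_len, dim)
shift_msa = torch.randn(batch, dim)   # precomputed outside the norm module
scale_msa = torch.randn(batch, dim)

norm = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

# Same modulation as the retained line: normalize, then scale and shift per batch element.
out = norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
assert out.shape == (batch, seq_len, dim)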

QEfficient/diffusers/models/transformers/transformer_flux.py

Lines changed: 0 additions & 73 deletions
@@ -9,7 +9,6 @@

 import numpy as np
 import torch
-import torch.nn as nn
 from diffusers.models.attention_dispatch import dispatch_attention_fn
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.models.transformers.transformer_flux import (

@@ -21,11 +20,6 @@
     _get_qkv_projections,
 )

-from QEfficient.diffusers.models.normalization import (
-    QEffAdaLayerNormZero,
-    QEffAdaLayerNormZeroSingle,
-)
-

 def qeff_apply_rotary_emb(
     x: torch.Tensor, freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]]

@@ -120,24 +114,6 @@ def __qeff_init__(self):


 class QEffFluxSingleTransformerBlock(FluxSingleTransformerBlock):
-    def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
-        super().__init__(dim, num_attention_heads, attention_head_dim, mlp_ratio)
-        self.mlp_hidden_dim = int(dim * mlp_ratio)
-        self.norm = QEffAdaLayerNormZeroSingle(dim)
-        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
-        self.act_mlp = nn.GELU(approximate="tanh")
-        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
-        self.attn = QEffFluxAttention(
-            query_dim=dim,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=dim,
-            bias=True,
-            processor=QEffFluxAttnProcessor(),
-            eps=1e-6,
-            pre_only=True,
-        )
-
     def forward(
         self,
         hidden_states: torch.Tensor,

@@ -163,33 +139,12 @@ def forward(
         gate = gate.unsqueeze(1)
         hidden_states = gate * self.proj_out(hidden_states)
         hidden_states = residual + hidden_states
-        # if hidden_states.dtype == torch.float16:
-        hidden_states = hidden_states.clip(-65504, 65504)

         encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:]
         return encoder_hidden_states, hidden_states


 class QEffFluxTransformerBlock(FluxTransformerBlock):
-    def __init__(
-        self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
-    ):
-        super().__init__(dim, num_attention_heads, attention_head_dim)
-
-        self.norm1 = QEffAdaLayerNormZero(dim)
-        self.norm1_context = QEffAdaLayerNormZero(dim)
-        self.attn = QEffFluxAttention(
-            query_dim=dim,
-            added_kv_proj_dim=dim,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=dim,
-            context_pre_only=False,
-            bias=True,
-            processor=QEffFluxAttnProcessor(),
-            eps=eps,
-        )
-
     def forward(
         self,
         hidden_states: torch.Tensor,

@@ -395,31 +350,3 @@ def forward(
             return (output,)

         return Transformer2DModelOutput(sample=output)
-
-
-class QEffFluxTransformer2DModelOF(QEffFluxTransformer2DModel):
-    def __qeff_init__(self):
-        self.transformer_blocks = nn.ModuleList()
-        self._block_classes = set()
-
-        for _ in range(self.config.num_layers):
-            BlockClass = QEffFluxTransformerBlock
-            block = BlockClass(
-                dim=self.inner_dim,
-                num_attention_heads=self.config.num_attention_heads,
-                attention_head_dim=self.config.attention_head_dim,
-            )
-            self.transformer_blocks.append(block)
-            self._block_classes.add(BlockClass)
-
-        self.single_transformer_blocks = nn.ModuleList()
-
-        for _ in range(self.config.num_single_layers):
-            SingleBlockClass = QEffFluxSingleTransformerBlock
-            single_block = SingleBlockClass(
-                dim=self.inner_dim,
-                num_attention_heads=self.config.num_attention_heads,
-                attention_head_dim=self.config.attention_head_dim,
-            )
-            self.single_transformer_blocks.append(single_block)
-            self._block_classes.add(SingleBlockClass)
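
Context for the deleted constructors: both QEff block classes now define only forward overrides, so they no longer build submodules of their own. Assuming the patched classes are applied by retargeting the class of an already-built diffusers block rather than constructing a new one (an assumption about the transform mechanism, not something this diff states), the loaded weights stay attached instead of being re-created. A minimal sketch with hypothetical Block/QEffBlock names:

import torch.nn as nn

class Block(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

class QEffBlock(Block):
    # Only overrides behaviour; defines no new parameters of its own.
    def forward(self, x):
        return self.proj(x).clip(-65504, 65504)

original = Block(8)
weight_before = original.proj.weight.data.clone()

# Retargeting the class keeps the existing (already loaded) parameters...
original.__class__ = QEffBlock
assert (original.proj.weight.data == weight_before).all()

# ...whereas constructing a new QEffBlock would reinitialize them from scratch.
fresh = QEffBlock(8)
assert not (fresh.proj.weight.data == weight_before).all()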

QEfficient/diffusers/pipelines/pipeline_module.py

Lines changed: 4 additions & 0 deletions
@@ -485,12 +485,16 @@ def export(
         if use_onnx_subfunctions:
             export_kwargs = {"export_modules_as_functions": {QEffFluxTransformerBlock, QEffFluxSingleTransformerBlock}}

+        # Sort _use_default_values in config to ensure consistent hash generation during export
+        self.model.config["_use_default_values"].sort()
+
         return self._export(
             example_inputs=inputs,
             output_names=output_names,
             dynamic_axes=dynamic_axes,
             export_dir=export_dir,
             export_kwargs=export_kwargs,
+            offload_pt_weights=False,  # As weights are needed with AdaLN changes
         )

     def compile(self, specializations: List[Dict], **compiler_options) -> None:
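
Why the sort matters: _use_default_values is stored without a guaranteed order in the diffusers config, so two exports of the same model can serialize it differently and therefore hash differently. A minimal sketch of the failure mode and the fix; the sha256-over-JSON helper below is an illustrative assumption, not necessarily what create_export_hash does internally.

import hashlib
import json

def config_hash(cfg: dict) -> str:
    # Stable only if every value serializes deterministically.
    return hashlib.sha256(json.dumps(cfg, sort_keys=True).encode()).hexdigest()

run_a = {"_use_default_values": ["attention_head_dim", "num_layers"]}
run_b = {"_use_default_values": ["num_layers", "attention_head_dim"]}  # same entries, different order

assert config_hash(run_a) != config_hash(run_b)   # hash drifts between exports

for cfg in (run_a, run_b):
    cfg["_use_default_values"].sort()              # the fix applied before _export
assert config_hash(run_a) == config_hash(run_b)    # now stable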

QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 0 additions & 9 deletions
@@ -198,10 +198,7 @@
 )
 from transformers.models.t5.modeling_t5 import (
     T5Attention,
-    T5LayerCrossAttention,
-    T5LayerFF,
     T5LayerNorm,
-    T5LayerSelfAttention,
 )
 from transformers.models.whisper.modeling_whisper import (
     WhisperAttention,

@@ -425,10 +422,7 @@
 )
 from QEfficient.transformers.models.t5.modeling_t5 import (
     QEffT5Attention,
-    QEffT5LayerCrossAttention,
-    QEffT5LayerFF,
     QEffT5LayerNorm,
-    QEffT5LayerSelfAttention,
 )
 from QEfficient.transformers.models.whisper.modeling_whisper import (
     QEffWhisperAttention,

@@ -824,9 +818,6 @@ class KVCacheExternalModuleMapperTransform(ExternalModuleMapperTransform):
 class T5ModelTransform(ModuleMappingTransform):
     # supported architectures
     _module_mapping = {
-        T5LayerFF: QEffT5LayerFF,
-        T5LayerSelfAttention: QEffT5LayerSelfAttention,
-        T5LayerCrossAttention: QEffT5LayerCrossAttention,
         T5Attention: QEffT5Attention,
         T5LayerNorm: QEffT5LayerNorm,
     }

QEfficient/utils/_utils.py

Lines changed: 0 additions & 9 deletions
@@ -530,15 +530,6 @@ def create_model_params(qeff_model, **kwargs) -> Dict:
     """
     model_params = copy.deepcopy(kwargs)
     model_params = {k: v for k, v in model_params.items() if k in KWARGS_INCLUSION_LIST}
-
-    # TODO: Refactor this configuration handling to occur during export phase
-    # This is necessary because diffusion models have a different way to change number of layers
-    # that isn't properly considered in the current implementation
-    model_params["config"] = (
-        qeff_model.model.config.to_diff_dict()
-        if hasattr(qeff_model.model.config, "to_diff_dict")
-        else qeff_model.model.config
-    )
     model_params["peft_config"] = getattr(qeff_model.model, "active_peft_config", None)
     model_params["applied_transform_names"] = qeff_model._transform_names()
     return model_params

QEfficient/utils/export_utils.py

Lines changed: 9 additions & 0 deletions
@@ -122,6 +122,15 @@ def _generate_export_hash(qeff_model, args, kwargs, func):
     bound_args.apply_defaults()
     all_args = bound_args.arguments

+    # Use the model's current configuration for hashing to ensure any post-load modifications are captured
+    qeff_model.hash_params = {
+        "model_config": (
+            qeff_model.model.config.to_diff_dict()
+            if hasattr(qeff_model.model.config, "to_diff_dict")
+            else qeff_model.model.config
+        ),
+    }
+
     # Generate hash from relevant parameters
     export_hash, filtered_hash_params = create_export_hash(
         model_params=qeff_model.hash_params,
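
The added block rebuilds hash_params["model_config"] from the model's live config at export time instead of a snapshot taken at load time, so post-load changes (for example trimming the number of layers for a test model) are reflected in the hash. A rough sketch of the intent; DummyConfig and the hashing helper are assumptions for illustration, not the repo's code.

import hashlib
import json

class DummyConfig(dict):
    # Stand-in for a diffusers/transformers config; real configs may expose to_diff_dict().
    def to_diff_dict(self):
        return dict(self)

def export_hash(config) -> str:
    cfg = config.to_diff_dict() if hasattr(config, "to_diff_dict") else config
    return hashlib.sha256(json.dumps(cfg, sort_keys=True).encode()).hexdigest()

config = DummyConfig(num_layers=19, num_single_layers=38)
h_full = export_hash(config)

config["num_layers"] = 2          # post-load modification, e.g. a trimmed test model
h_trimmed = export_hash(config)

# Hashing the live config distinguishes the two exports; a load-time snapshot would not.
assert h_full != h_trimmed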

QEfficient/utils/hash_utils.py

Lines changed: 1 addition & 4 deletions
@@ -14,10 +14,7 @@

 def json_serializable(obj):
     if isinstance(obj, set):
-        return [cls.__name__ if isinstance(cls, type) else str(cls) for cls in obj]
-    # Handle objects with to_dict() method (e.g., transformers config objects)
-    if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")):
-        return obj.to_dict()
+        return sorted([cls.__name__ if isinstance(cls, type) else str(cls) for cls in obj])
     raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
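
Reasoning behind the surviving line: a Python set has no guaranteed iteration order, so serializing it unsorted makes the export hash differ from run to run; sorted() gives it a canonical order. A small sketch of how such a hook is typically passed to json.dumps (the calling code is an assumption; only json_serializable itself comes from this diff).

import json

def json_serializable(obj):
    if isinstance(obj, set):
        # Sorting gives the set a canonical order, so the JSON text (and any
        # hash derived from it) is identical across processes.
        return sorted([cls.__name__ if isinstance(cls, type) else str(cls) for cls in obj])
    raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")

class QEffFluxTransformerBlock: ...
class QEffFluxSingleTransformerBlock: ...

export_kwargs = {"export_modules_as_functions": {QEffFluxTransformerBlock, QEffFluxSingleTransformerBlock}}
print(json.dumps(export_kwargs, default=json_serializable))
# {"export_modules_as_functions": ["QEffFluxSingleTransformerBlock", "QEffFluxTransformerBlock"]}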

examples/diffusers/flux/flux_1_schnell.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
     max_sequence_length=256,
     generator=torch.manual_seed(42),
     parallel_compile=True,
-    use_onnx_subfunctions=True,
+    use_onnx_subfunctions=False,
 )

 # Extract the generated image from the output
