
Commit 58a3f8c

fix test failure of speculative_generation on xpu (#42052)
* fix test failure of speculative_generation on xpu

  Signed-off-by: Wang, Yi A <[email protected]>

* code refine

  Signed-off-by: Wang, Yi A <[email protected]>

* address review comment

  Signed-off-by: Wang, Yi A <[email protected]>

---------

Signed-off-by: Wang, Yi A <[email protected]>
1 parent fcea1e1 commit 58a3f8c

File tree

2 files changed: +45, -10 lines changed

src/transformers/masking_utils.py

Lines changed: 44 additions & 9 deletions
@@ -177,6 +177,42 @@ def prepare_padding_mask(
     return local_padding_mask
 
 
+def _can_skip_causal_mask_xpu(
+    padding_mask: Optional[torch.Tensor],
+    query_length: int,
+    kv_length: int,
+    local_attention_size: Optional[int],
+) -> bool:
+    """
+    XPU-specific logic for determining if we can skip causal mask creation.
+
+    For XPU devices, we have special handling:
+    - Single query tokens (query_length == 1) use the same logic as CUDA
+    - Multi-query tokens can skip if padding_mask is provided and correctly structured:
+      the mask must have all True values in the query window and all False after
+    """
+    if is_tracing(padding_mask):
+        return False
+
+    # Check local attention constraint (same as CUDA)
+    if local_attention_size is not None and kv_length >= local_attention_size:
+        return False
+
+    if padding_mask is None:
+        # Without padding mask, can skip if single query token or full causal attention
+        return query_length == 1 or kv_length == query_length
+
+    # XPU allows skipping under additional conditions when padding_mask is provided
+    if query_length == 1:
+        # Single query token: skip only if no padding tokens present
+        return padding_mask.all()
+
+    # XPU-specific: check if query window is all True and rest is all False
+    # This allows XPU to optimize the 1st token in static cache
+    return padding_mask[:, :query_length].all() and not padding_mask[:, query_length:].any()
+
+
 def _ignore_causal_mask_sdpa(
     padding_mask: Optional[torch.Tensor],
     query_length: int,
@@ -197,25 +233,24 @@ def _ignore_causal_mask_sdpa(
         mask_indices += kv_offset
         padding_mask = padding_mask[:, mask_indices]
 
+    if _is_torch_xpu_available:
+        # XPU devices have special handling for mask skipping:
+        # - Single query tokens use the same logic as CUDA
+        # - Multi-query tokens can skip if padding_mask is provided and correctly structured
+        #   (all True in query window, all False after)
+        return _can_skip_causal_mask_xpu(padding_mask, query_length, kv_length, local_attention_size)
     # When using `torch.export` or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is
     # hard-coded to the forward. If a user exports a model with query_length > 1, the exported model will hard-code `is_causal=True`
     # which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). Thus, we only set
     # `ignore_causal_mask = True` if we are not tracing
     if (
         not is_tracing(padding_mask)
         # only cases when lower and upper diags are the same, see https://github.com/pytorch/pytorch/issues/108108
-        and (query_length == 1 or (kv_length == query_length or _is_torch_xpu_available))
+        and (query_length == 1 or kv_length == query_length)
         # in this case we need to add special patterns to the mask so cannot be skipped otherwise
         and (local_attention_size is None or kv_length < local_attention_size)
         # In this case, we need to add padding to the mask, so cannot be skipped otherwise
-        and (
-            padding_mask is None
-            or (
-                padding_mask.all()
-                if not _is_torch_xpu_available or query_length == 1
-                else padding_mask[:, :query_length].all()
-            )
-        )
+        and (padding_mask is None or padding_mask.all())
     ):
         return True
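
For readers skimming the diff: the sketch below is not part of the commit. It restates the multi-query skip condition from _can_skip_causal_mask_xpu as a standalone snippet; the helper name can_skip_for_xpu_multi_query and the sample masks are hypothetical, chosen only to show which padding layouts let XPU drop the causal mask.

    import torch

    # Hypothetical restatement of the XPU multi-query skip condition above;
    # the helper name and the sample masks are illustrative, not from the commit.
    def can_skip_for_xpu_multi_query(padding_mask: torch.Tensor, query_length: int) -> bool:
        # Skip only when the query window is all True and everything after it is
        # all False, i.e. the first prefill step into a right-padded static cache.
        return bool(padding_mask[:, :query_length].all() and not padding_mask[:, query_length:].any())

    # A 5-token prompt written into a static cache of length 8: mask can be skipped.
    mask = torch.tensor([[True, True, True, True, True, False, False, False]])
    print(can_skip_for_xpu_multi_query(mask, query_length=5))  # True

    # A padding hole inside the query window forces real mask construction.
    mask = torch.tensor([[True, False, True, True, True, False, False, False]])
    print(can_skip_for_xpu_multi_query(mask, query_length=5))  # False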

tests/models/qwen3/test_modeling_qwen3.py

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ def test_model_600m_long_prompt_sdpa(self):
     def test_speculative_generation(self):
         EXPECTED_TEXT_COMPLETIONS = Expectations(
             {
-                ("xpu", 3): "My favourite condiment is 100% peanut butter. I love it so much that I can't help but use it",
+                ("xpu", 3): "My favourite condiment is 100% beef and comes in a 12 oz. jar. It is sold in",
                 ("cuda", 7): "My favourite condiment is 100% natural. It's a little spicy and a little sweet, but it's the",
                 ("cuda", 8): "My favourite condiment is 100% beef, 100% beef, 100% beef.",
             }
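
For context on why only the ("xpu", 3) entry changes: a minimal usage sketch, assuming the Expectations helper from transformers.testing_utils and its get_expectation() method, which resolves to the entry matching the device the test runs on. The expected strings are taken from the diff above.

    from transformers.testing_utils import Expectations

    # Keys are (device_type, major_version) pairs; get_expectation() is assumed
    # to return the entry matching the current machine, so each accelerator
    # family carries its own expected completion.
    expectations = Expectations(
        {
            ("xpu", 3): "My favourite condiment is 100% beef and comes in a 12 oz. jar. It is sold in",
            ("cuda", 7): "My favourite condiment is 100% natural. It's a little spicy and a little sweet, but it's the",
            ("cuda", 8): "My favourite condiment is 100% beef, 100% beef, 100% beef.",
        }
    )
    expected_text = expectations.get_expectation()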
