     check_numerics,
     flatten_items,
     get_or_none,
+    maybe_shard,
     save_and_offload_only_these_names_regex,
     shapes,
     split_prng_key,
-    with_sharding_constraint,
 )


@@ -1560,18 +1560,18 @@ class Config(BaseLayer.Config):
         logit_sink: Optional[bool] = None

         # Partition spec for query ([batch, seq, q_heads, head_dim]) after input projections.
-        q_partition_spec: Optional[PartitionSpec] = None
+        q_partition_spec: Optional[Sequence[Union[str, Sequence[str], None]]] = None

         # Partition spec for key ([batch, seq, kv_heads, head_dim]) after input projections.
         # Follows `q_partition_spec` if None.
-        k_partition_spec: Optional[PartitionSpec] = None
+        k_partition_spec: Optional[Sequence[Union[str, Sequence[str], None]]] = None

         # Partition spec for value ([batch, seq, kv_heads, head_dim]) after input projections.
         # Follows `q_partition_spec` if None.
-        v_partition_spec: Optional[PartitionSpec] = None
+        v_partition_spec: Optional[Sequence[Union[str, Sequence[str], None]]] = None

         # Partition spec for output ([batch, seq, hidden_dim]) after output projections.
-        o_partition_spec: Optional[PartitionSpec] = None
+        o_partition_spec: Optional[Sequence[Union[str, Sequence[str], None]]] = None

     def __init__(self, cfg: Config, *, parent: Module):
         super().__init__(cfg, parent=parent)
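Note that the spec fields now take a plain sequence of mesh axis names (one entry per tensor dimension) rather than a `jax.sharding.PartitionSpec`. For illustration only, with a hypothetical `attn_cfg` and axis names matching the defaults used by `set_attention_partition_specs` later in this diff:

# Illustrative sketch: shard q/k/v activations [batch, seq, heads, head_dim] and the
# output activation [batch, seq, hidden_dim] over batch/seq/model mesh axes.
attn_cfg.q_partition_spec = (("data", "fsdp"), "seq", "model", None)
attn_cfg.o_partition_spec = (("data", "fsdp"), "seq", "model")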
@@ -1736,12 +1736,9 @@ def _forward_for_mode(
             time_step = cached_states["time_step"]
             query_positions = query_positions + time_step[:, None]  # [batch, steps]
         q_proj, k_proj, v_proj = self.i_proj(query, query_positions=query_positions, **kv_kwargs)
-        if cfg.q_partition_spec:
-            q_proj = with_sharding_constraint(q_proj, cfg.q_partition_spec)
-        if cfg.q_partition_spec or cfg.k_partition_spec:
-            k_proj = with_sharding_constraint(k_proj, cfg.k_partition_spec or cfg.q_partition_spec)
-        if cfg.q_partition_spec or cfg.v_partition_spec:
-            v_proj = with_sharding_constraint(v_proj, cfg.v_partition_spec or cfg.q_partition_spec)
+        q_proj = maybe_shard(q_proj, cfg.q_partition_spec)
+        k_proj = maybe_shard(k_proj, cfg.k_partition_spec or cfg.q_partition_spec)
+        v_proj = maybe_shard(v_proj, cfg.v_partition_spec or cfg.q_partition_spec)

         if cfg.scale_kv_before_cache_update:
             if has_external_kv_state:
@@ -1844,8 +1841,7 @@ def _forward_for_mode(

         # [batch, target_length, output_dim].
         o_proj = self.o_proj(context)
-        if cfg.o_partition_spec:
-            o_proj = with_sharding_constraint(o_proj, cfg.o_partition_spec)
+        o_proj = maybe_shard(o_proj, cfg.o_partition_spec)
         outputs = self._remat_name(o_proj, "o_proj")
         self._add_tensor_stats("o_proj_outputs", outputs)
         return_aux = return_aux or set()
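The q/k/v/o constraints above now route through `maybe_shard` instead of per-tensor `if spec: with_sharding_constraint(...)` checks. A minimal sketch of the assumed behavior of such a helper; the real `maybe_shard` is imported from `axlearn.common.utils` and may differ in details (e.g. how it handles meshes or `PartitionSpec` inputs):

from typing import Optional, Sequence, Union

import jax
from jax.sharding import PartitionSpec


def maybe_shard(
    x: jax.Array,
    partition_spec: Optional[Sequence[Union[str, Sequence[str], None]]],
) -> jax.Array:
    # Skip the constraint entirely when no spec is configured.
    if partition_spec is None:
        return x
    # Constrain the intermediate activation; intended to run under jit with a mesh in scope.
    return jax.lax.with_sharding_constraint(x, PartitionSpec(*partition_spec))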
@@ -3608,15 +3604,17 @@ def extend_step(
 def set_attention_partition_specs(
     cfg: MultiheadAttention.Config,
     *,
+    batch_axis_names: Union[str, Sequence[str]] = ("data", "fsdp"),
     fsdp_axis_names: Union[str, Sequence[str]] = "fsdp",
     tp_axis_names: Union[str, Sequence[str]] = "model",
+    seq_axis_names: Union[str, Sequence[str]] = "seq",
+    set_attn_activation_specs: bool = False,
 ):
     """Sets `cfg` to shard attention weights over both fsdp and tp axes.

     Args:
         cfg: A MultiheadAttention layer config to apply sharding spec to.
-        fsdp_axis_names: Axis name(s) over which we shard fully-sharded-data-parallel tensors.
-        tp_axis_names: Axis name(s) over which we shard tensor-parallel tensors.
+        **kwargs: See `set_double_shard_weights_config`.
     """
     # Shard weights.
     input_linear_cfg = cfg.input_linear
@@ -3625,6 +3623,10 @@ def set_attention_partition_specs(
     input_linear_cfg.layer.param_partition_spec = (fsdp_axis_names, tp_axis_names, None)
     cfg.output_linear.param_partition_spec = (fsdp_axis_names, tp_axis_names, None)

+    if set_attn_activation_specs:
+        cfg.q_partition_spec = (batch_axis_names, seq_axis_names, tp_axis_names, None)
+        cfg.o_partition_spec = (batch_axis_names, seq_axis_names, tp_axis_names)
+

 def set_feed_forward_partition_specs(
     cfg: TransformerFeedForwardLayer.Config,
@@ -3638,10 +3640,7 @@ def set_feed_forward_partition_specs(

     Args:
         cfg: A TransformerFeedForwardLayer layer config to apply sharding spec to.
-        batch_axis_names: Axis name(s) over which we shard the batch dimension of output tensors.
-        fsdp_axis_names: Axis name(s) over which we shard fully-sharded-data-parallel tensors.
-        tp_axis_names: Axis name(s) over which we shard tensor-parallel tensors.
-        seq_axis_names: Axis name(s) over which we shard sequence-parallel tensors.
+        **kwargs: See `set_double_shard_weights_config`.
     """
     # Shard weights.
     cfg.linear1.param_partition_spec = (fsdp_axis_names, tp_axis_names)
@@ -3658,6 +3657,7 @@ def set_double_shard_weights_config(
     fsdp_axis_names: Union[str, Sequence[str]] = "fsdp",
     tp_axis_names: Union[str, Sequence[str]] = "model",
     seq_axis_names: Union[str, Sequence[str]] = "seq",
+    set_attn_activation_specs: bool = False,
 ):
     """Sets `cfg` to shard FFN and attention weights over both fsdp and tp axes.

@@ -3667,32 +3667,35 @@ def set_double_shard_weights_config(
         fsdp_axis_names: Axis name(s) over which we shard fully-sharded-data-parallel tensors.
         tp_axis_names: Axis name(s) over which we shard tensor-parallel tensors.
         seq_axis_names: Axis name(s) over which we shard sequence-parallel tensors.
+        set_attn_activation_specs: Whether to also set activation partition specs for the
+            q/k/v/o projections. This may be required for some complex sharding cases.
     """

     # pytype: disable=attribute-error
     if not isinstance(cfg, Sequence):
         cfg = [cfg]

+    axis_names = dict(
+        batch_axis_names=batch_axis_names,
+        fsdp_axis_names=fsdp_axis_names,
+        tp_axis_names=tp_axis_names,
+        seq_axis_names=seq_axis_names,
+    )
+
     for layer_cfg in cfg:
         set_attention_partition_specs(
             layer_cfg.self_attention.attention,
-            fsdp_axis_names=fsdp_axis_names,
-            tp_axis_names=tp_axis_names,
+            set_attn_activation_specs=set_attn_activation_specs,
+            **axis_names,
         )
         if layer_cfg.cross_attention is not None:
             set_attention_partition_specs(
                 layer_cfg.cross_attention.attention,
-                fsdp_axis_names=fsdp_axis_names,
-                tp_axis_names=tp_axis_names,
+                set_attn_activation_specs=set_attn_activation_specs,
+                **axis_names,
             )
         if isinstance(layer_cfg.feed_forward, TransformerFeedForwardLayer.Config):
-            set_feed_forward_partition_specs(
-                layer_cfg.feed_forward,
-                batch_axis_names=batch_axis_names,
-                fsdp_axis_names=fsdp_axis_names,
-                tp_axis_names=tp_axis_names,
-                seq_axis_names=seq_axis_names,
-            )
+            set_feed_forward_partition_specs(layer_cfg.feed_forward, **axis_names)
     # pytype: enable=attribute-error


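As a usage sketch, with `stacked_layer_cfgs` as a hypothetical placeholder for a sequence of transformer layer configs (not defined in this diff); the axis names are the defaults from the signature above:

set_double_shard_weights_config(
    stacked_layer_cfgs,
    batch_axis_names=("data", "fsdp"),
    fsdp_axis_names="fsdp",
    tp_axis_names="model",
    seq_axis_names="seq",
    # Also constrain q/o projection activations via the new attention activation specs.
    set_attn_activation_specs=True,
)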