Rename live_step_len parameter to unpadded_len for clarity

changlan · changlan · commit a0144986dc6e · 2025-08-04T17:30:05.000-07:00
- Rename live_step_len -> unpadded_len across attention and KV cache modules
- Update documentation to clarify that unpadded_len specifies the number of non-padding tokens per sequence, with actual behavior depending on KV cache implementation
- Fix pre-existing pylint error in rattention.py where rla_output was used before assignment
- Update all test files to use the new parameter name

The new name better reflects the parameter's purpose: indicating the number of non-padding tokens in each sequence, rather than the ambiguous "live step length".

Implementation behavior varies by KV cache type:
- Standard KVCache: ignores the parameter
- SlidingWindowKVCache: uses it for sequence masking
- PagedKVCache: ignores the parameter

GitOrigin-RevId: 5b0d848
diff --git a/axlearn/common/attention.py b/axlearn/common/attention.py
@@ -46,14 +46,16 @@
 
 TODO(apghml) Convert everything to take an instance of BaseAttentionBias rather than a Tensor.
 
-On `live_step_len`:
-* An int tensor of shape [batch], indicating the valid step length in the given inputs.
-* We assume that live steps must be contiguous at the beginning. So once
-    `live_step_len < max_step_len` for a sequence, the remaining `max_step_len - live_step_len`
-    part is considered padding.
-* During prefill, `time_step == live_step_len`.
+On `unpadded_len`:
+* An int tensor of shape [batch], indicating the number of non-padding tokens in each sequence.
+* Non-padding tokens are assumed to be contiguous at the beginning of each sequence.
+  For a sequence with `unpadded_len[i] < sequence_length`, tokens at positions
+  `unpadded_len[i]:` are considered padding and should be ignored.
+* During prefill, `time_step == unpadded_len` since we process exactly the non-padding tokens.
+* Depending on the KV cache implementation, this parameter is either ignored or used to mask
+  out padding tokens, which may enable optimizations that skip computation on padding.
 
-TODO (dhwang2): Replace `time_step` argument with `live_step_len` to reduce cognitive complexity.
+TODO (dhwang2): Replace `time_step` argument with `unpadded_len` to reduce cognitive complexity.
 
 On `segment_ids`:
 * A tensor of shape [batch, target_length] with values in [0, num_segments].
@@ -1669,7 +1671,7 @@ def _forward_for_mode(
         key: Optional[Tensor] = None,
         value: Optional[Tensor] = None,
         kv_state: Optional[KVState] = None,
-        live_step_len: Optional[Tensor] = None,
+        unpadded_len: Optional[Tensor] = None,
         attention_logit_biases: Union[None, Tensor, BaseAttentionBias] = None,
         segment_ids: Optional[Tensor] = None,
         query_positions: Optional[Tensor] = None,
@@ -1688,7 +1690,7 @@ def _forward_for_mode(
             key:   An optional Tensor of shape [batch, source_length, source_dim].
             value: An optional Tensor of shape [batch, source_length, source_dim].
             kv_state: An optional KVState. If specified, both `key` and `value` should be None.
-            live_step_len: An optional Tensor of shape [batch]. Please refer to ``On live_step_len``
+            unpadded_len: An optional Tensor of shape [batch]. Please refer to ``On unpadded_len``
                 in the file docstring for details.
             attention_logit_biases: See ``On attention logit biases`` in the file comments.
             segment_ids: See ``On segment_ids`` in the file comments.
@@ -1768,7 +1770,7 @@ def _forward_for_mode(
             )
         elif mode in (ForwardMode.EXTEND_STEP, ForwardMode.INIT_STATES):
             assert cached_states is not None
-            step_len = live_step_len if live_step_len is not None else q_proj.shape[1]
+            step_len = unpadded_len if unpadded_len is not None else q_proj.shape[1]
             new_cached_states = dict(time_step=time_step + step_len)
             if not has_external_kv_state:
                 # In prefill, init_states already called self.kv_cache.init_states.
@@ -1778,7 +1780,7 @@ def _forward_for_mode(
                         k_proj=k_proj,
                         v_proj=v_proj,
                         key_positions=query_positions,
-                        live_step_len=live_step_len,
+                        unpadded_len=unpadded_len,
                         page_pool=page_pool,
                     )
                 if mode == ForwardMode.EXTEND_STEP:
@@ -2057,7 +2059,7 @@ def init_states(
             query=query,
             key=key,
             value=value,
-            live_step_len=time_step,
+            unpadded_len=time_step,
             cached_states=init_states,
             kv_state=kv_state,
             attention_logit_biases=attention_logit_biases,
diff --git a/axlearn/common/kv_cache/base_kv_cache.py b/axlearn/common/kv_cache/base_kv_cache.py
@@ -83,7 +83,7 @@ def extend_step(
         k_proj: Tensor,
         v_proj: Tensor,
         key_positions: Tensor,
-        live_step_len: Optional[Tensor] = None,
+        unpadded_len: Optional[Tensor] = None,
         page_pool: Optional[Nested[Tensor]] = None,
     ) -> tuple[Nested[Tensor], Output]:
         """Updates the KV cache per extend step.
@@ -97,8 +97,10 @@ def extend_step(
             k_proj: A Tensor of shape [batch, step_length, num_kv_heads, per_head_dim].
             v_proj: A Tensor of shape [batch, step_length, num_kv_heads, per_head_dim].
             key_positions: An optional Tensor of shape [1|batch, step_length].
-            live_step_len: An optional Tensor of shape [batch]. See file-level docstring of
-                `attention.py`
+            unpadded_len: An optional Tensor of shape [batch]. Specifies the number of
+                non-padding tokens per sequence. When provided, only the first `unpadded_len[i]`
+                tokens of sequence `i` are considered valid for caching. The actual behavior
+                depends on the specific KV cache implementation.
             page_pool: See file-level docstring of `attention.py`.
 
         Returns:
diff --git a/axlearn/common/kv_cache/kv_cache.py b/axlearn/common/kv_cache/kv_cache.py
@@ -37,15 +37,15 @@ def extend_step(
         k_proj: Tensor,
         v_proj: Tensor,
         key_positions: Tensor,
-        live_step_len: Optional[Tensor] = None,
+        unpadded_len: Optional[Tensor] = None,
         page_pool: Optional[Nested[Tensor]] = None,
     ) -> tuple[Nested[Tensor], BaseKVCache.Output]:
-        # TODO(dhwang2): By returning only the valid portions of the KV (by live_step_len),
-        # the attention complexity can be reduced from O(max_len²) to O(live_step_len²), especially
+        # TODO(dhwang2): By returning only the valid portions of the KV (by unpadded_len),
+        # the attention complexity can be reduced from O(max_len²) to O(unpadded_len²), especially
         # in prefill.
-        # The remaining part after `live_step_len` is considered padding.
+        # The remaining part after `unpadded_len` is considered padding.
         assert page_pool is None
-        del live_step_len
+        del unpadded_len
         if k_proj.shape != v_proj.shape:
             raise ValueError(f"{k_proj.shape=} != {v_proj.shape=}")
         if k_proj.shape[1] != key_positions.shape[1]:
@@ -101,7 +101,7 @@ def update_single(cached_kv_slice, kv_proj_slice, time_idx):
         # [B, S, N, H]
         k_proj = jnp.einsum("bnhs->bsnh", cached_key)
         v_proj = jnp.einsum("bnhs->bsnh", cached_value)
-        # Currently, the part larger than live_step_len is also being overwritten in the KV cache,
+        # Currently, the part larger than unpadded_len is also being overwritten in the KV cache,
         # and this part is filtered out by the causal mask through key_positions.
         key_positions = jnp.arange(k_proj.shape[1])[None]  # [1, source_length]
         return updated_state, self.Output(k_proj=k_proj, v_proj=v_proj, key_positions=key_positions)
diff --git a/axlearn/common/kv_cache/kv_cache_test.py b/axlearn/common/kv_cache/kv_cache_test.py
@@ -17,9 +17,9 @@ class KVCacheTest(TestCase):
         cached_kv_length=[8],
         time_step_value=[2, 4],
         cache_dtype=[None, jnp.bfloat16],
-        live_step_len=[-1, 2, 4],
+        unpadded_len=[-1, 2, 4],
     )
-    def test_kv_cache(self, cached_kv_length, time_step_value, cache_dtype, live_step_len):
+    def test_kv_cache(self, cached_kv_length, time_step_value, cache_dtype, unpadded_len):
         test_layer = (
             KVCache.default_config()
             .set(name="ref", cache_dtype=cache_dtype)
@@ -33,12 +33,12 @@ def test_kv_cache(self, cached_kv_length, time_step_value, cache_dtype, live_ste
         k_proj = jax.random.normal(prng_key, shape=step_shape)
         v_proj = jax.random.normal(prng_key, shape=step_shape)
         key_positions = jnp.arange(step_len)[None] + time_step_value
-        if live_step_len < 0:
+        if unpadded_len < 0:
             valid_step_len = step_len
-            live_step_len = None
+            unpadded_len = None
         else:
-            valid_step_len = live_step_len
-            live_step_len = jnp.full([batch], fill_value=live_step_len, dtype=jnp.int32)
+            valid_step_len = unpadded_len
+            unpadded_len = jnp.full([batch], fill_value=unpadded_len, dtype=jnp.int32)
 
         kv_shape = KVCache.Shape(batch, cached_kv_length, heads, dim)
         test_states = test_layer.init_states(kv_shape, dtype=k_proj.dtype)
@@ -49,7 +49,7 @@ def test_kv_cache(self, cached_kv_length, time_step_value, cache_dtype, live_ste
             k_proj=k_proj,
             v_proj=v_proj,
             key_positions=key_positions,
-            live_step_len=live_step_len,
+            unpadded_len=unpadded_len,
         )
 
         def check(input_kv, output_kv):
@@ -65,7 +65,7 @@ def check(input_kv, output_kv):
         check(v_proj, test_output.v_proj)
         key_positions = jnp.arange(cached_kv_length)[None]
         assert_allclose(test_output.key_positions, key_positions)
-        # Currently, the part larger than live_step_len is also being overwritten in the KV cache.
+        # Currently, the part larger than unpadded_len is also being overwritten in the KV cache.
         # TODO(dhwang2): remove this check when KVCache updates only valid part.
         assert_allclose(
             test_output.k_proj[:, time_step_value : time_step_value + step_len],
diff --git a/axlearn/common/kv_cache/paged_kv_cache.py b/axlearn/common/kv_cache/paged_kv_cache.py
@@ -132,7 +132,7 @@ def extend_step(
         k_proj: Tensor,
         v_proj: Tensor,
         key_positions: Tensor,
-        live_step_len: Optional[Tensor] = None,
+        unpadded_len: Optional[Tensor] = None,
         page_pool: Optional[Nested[Tensor]] = None,
     ) -> tuple[Nested[Tensor], KVCache.Output]:
         """Extend the cache with the new key and value.
@@ -159,7 +159,7 @@ def extend_step(
                     k_pages = k_pages.at[k, actual_page_idx, page_offset].set(k_proj[i, j, k, :])
                     v_pages = v_pages.at[k, actual_page_idx, page_offset].set(v_proj[i, j, k, :])
         """
-        del live_step_len
+        del unpadded_len
 
         if k_proj.shape != v_proj.shape:
             raise ValueError(f"{k_proj.shape=} != {v_proj.shape=}")
diff --git a/axlearn/common/kv_cache/paged_kv_cache_test.py b/axlearn/common/kv_cache/paged_kv_cache_test.py
@@ -103,8 +103,8 @@ def test_paged_kv_cache(
             v_proj = jax.random.normal(prng_key, shape=step_shape, dtype=cache_dtype)
             key_positions = jnp.full((batch, 1), time_step_value, dtype=jnp.int32)
 
-            # TODO(xiyou): consider live_step_len when it's supported
-            live_step_len = None
+            # TODO(xiyou): consider unpadded_len when it's supported
+            unpadded_len = None
 
             kv_shape = KVCache.Shape(batch, max_pages_each_request * page_size, heads, dim)
             ref_states = ref_layer.init_states(kv_shape, dtype=k_proj.dtype)
@@ -131,22 +131,22 @@ def test_paged_kv_cache(
 
             @partial(jax.jit, static_argnums=(0,))
             def jit_extend_step(
-                layer: KVCache, test_states, k_proj, v_proj, key_positions, live_step_len
+                layer: KVCache, test_states, k_proj, v_proj, key_positions, unpadded_len
             ):
                 _, test_output = layer.extend_step(
                     test_states,
                     k_proj=k_proj,
                     v_proj=v_proj,
                     key_positions=key_positions,
-                    live_step_len=live_step_len,
+                    unpadded_len=unpadded_len,
                 )
                 return test_output
 
             ref_out: KVState = jit_extend_step(
-                ref_layer, ref_states, k_proj, v_proj, key_positions, live_step_len
+                ref_layer, ref_states, k_proj, v_proj, key_positions, unpadded_len
             )
             test_out: KVState = jit_extend_step(
-                test_layer, test_states, k_proj, v_proj, key_positions, live_step_len
+                test_layer, test_states, k_proj, v_proj, key_positions, unpadded_len
             )
 
             test_k_proj = reconstruct_kv(page_indices, test_out.k_proj)
diff --git a/axlearn/common/kv_cache/sliding_window_kv_cache.py b/axlearn/common/kv_cache/sliding_window_kv_cache.py
@@ -52,7 +52,7 @@ def extend_step(
         k_proj: Tensor,
         v_proj: Tensor,
         key_positions: Tensor,
-        live_step_len: Optional[Tensor] = None,
+        unpadded_len: Optional[Tensor] = None,
         page_pool: Optional[Nested[Tensor]] = None,
     ) -> tuple[Nested[Tensor], BaseKVCache.Output]:
         """Updates the sliding window KV cache per extend step.
@@ -62,8 +62,10 @@ def extend_step(
             k_proj: A Tensor of shape [batch, step_length, num_kv_heads, per_head_dim].
             v_proj: A Tensor of shape [batch, step_length, num_kv_heads, per_head_dim].
             key_positions: An optional Tensor of shape [1|batch, step_length].
-            live_step_len: An optional Tensor of shape [batch]. Please refer to ``On live_step_len``
-                in the file docstring for details.
+            unpadded_len: An optional Tensor of shape [batch]. Specifies the number of
+                non-padding tokens per sequence. When provided, only the first `unpadded_len[i]`
+                tokens of sequence `i` are considered valid and will be cached. Padding tokens
+                are masked out and marked as invalid positions.
 
         Returns:
             A tuple (updated_state, output):
@@ -81,10 +83,10 @@ def extend_step(
 
         # [1|batch, step_length] -> [batch, step_length]
         key_positions = jnp.broadcast_to(key_positions, (batch, step_len))
-        if live_step_len is not None:
-            if live_step_len.shape[0] != batch:
-                raise ValueError(f"{live_step_len.shape=} must be [{batch}].")
-            steps = live_step_len
+        if unpadded_len is not None:
+            if unpadded_len.shape[0] != batch:
+                raise ValueError(f"{unpadded_len.shape=} must be [{batch}].")
+            steps = unpadded_len
             seq_mask = sequence_mask(lengths=steps, max_len=step_len, dtype=key_positions.dtype)
             # update_single rolls key_positions, so mark invalid positions.
             key_positions = jnp.where(seq_mask, key_positions, self._invaild_position())
diff --git a/axlearn/common/kv_cache/sliding_window_kv_cache_test.py b/axlearn/common/kv_cache/sliding_window_kv_cache_test.py
@@ -13,8 +13,8 @@
 class SlidingWindowKVCacheTest(TestCase):
     """Tests SlidingWindowKVCache."""
 
-    @parameterized.product(cached_kv_length=[8], time_step_value=[2, 4, 6], live_step_len=[None, 2])
-    def test_sliding_window_kv_cache(self, cached_kv_length, time_step_value, live_step_len):
+    @parameterized.product(cached_kv_length=[8], time_step_value=[2, 4, 6], unpadded_len=[None, 2])
+    def test_sliding_window_kv_cache(self, cached_kv_length, time_step_value, unpadded_len):
         test_layer = (
             SlidingWindowKVCache.default_config()
             .set(name="ref", cached_kv_length=cached_kv_length)
@@ -29,16 +29,16 @@ def test_sliding_window_kv_cache(self, cached_kv_length, time_step_value, live_s
         k_proj = jax.random.normal(prng_key, shape=step_shape)
         v_proj = jax.random.normal(prng_key, shape=step_shape)
         key_positions = jnp.arange(step_len)[None] + time_step_value
-        valid_out_len = live_step_len or step_len
-        live_step_len = (
-            jnp.full([batch], fill_value=live_step_len) if live_step_len is not None else None
+        valid_out_len = unpadded_len or step_len
+        unpadded_len = (
+            jnp.full([batch], fill_value=unpadded_len) if unpadded_len is not None else None
         )
         _, test_output = test_layer.extend_step(
             test_states,
             k_proj=k_proj,
             v_proj=v_proj,
             key_positions=key_positions,
-            live_step_len=live_step_len,
+            unpadded_len=unpadded_len,
         )
         kv_shape = (2, cached_kv_length + step_len, 2, 2)
         self.assertEqual(test_output.key_positions.shape, kv_shape[:2])
diff --git a/axlearn/common/rattention/rattention.py b/axlearn/common/rattention/rattention.py
@@ -600,7 +600,7 @@ def _forward_for_mode(
         key: Optional[Tensor] = None,
         value: Optional[Tensor] = None,
         kv_state: Optional[KVState] = None,
-        live_step_len: Optional[int] = None,
+        unpadded_len: Optional[int] = None,
         attention_logit_biases: Union[None, Tensor, BaseAttentionBias] = None,
         segment_ids: Optional[Tensor] = None,
         query_positions: Optional[Tensor] = None,
@@ -616,7 +616,7 @@ def _forward_for_mode(
         k_proj/v_proj as kv_state to the output.
 
         Notes on intermediate variables:
-            * live_step_len vs time_step: time_step denotes the starting point where live_step_len
+            * unpadded_len vs time_step: time_step denotes the starting point whereas unpadded_len
               denotes the length of progression.
             * k_proj/v_proj vs full_k_proj/full_v_proj: the former could be single token during
               extend_step whereas the latter always means the kv for the whole sequence. Residual_la
@@ -651,6 +651,8 @@ def _forward_for_mode(
 
             if cfg.residual_la is not None:
                 rla_output = self.residual_la(query, i_proj_output)
+            else:
+                rla_output = None
             new_cached_states = {}
         else:
             if kv_state is None:
@@ -661,7 +663,7 @@ def _forward_for_mode(
                         k_proj=k_proj,
                         v_proj=v_proj,
                         key_positions=query_positions,
-                        live_step_len=live_step_len,
+                        unpadded_len=unpadded_len,
                         page_pool=page_pool,
                     )
                     if mode == ForwardMode.EXTEND_STEP:
@@ -690,14 +692,14 @@ def _forward_for_mode(
             else:
                 if mode == ForwardMode.INIT_STATES:
                     rla_state, rla_output = self.residual_la.init_states(
-                        query, (q_proj, full_k_proj, full_v_proj), live_step_len
+                        query, (q_proj, full_k_proj, full_v_proj), unpadded_len
                     )
                 else:
                     rla_state, rla_output = self.residual_la.extend_step(
                         cached_states["rla_state"], query, (q_proj, full_k_proj, full_v_proj)
                     )
 
-            step_len = live_step_len if live_step_len is not None else query.shape[1]
+            step_len = unpadded_len if unpadded_len is not None else query.shape[1]
             new_time_step = time_step + step_len
             new_cached_states = dict(
                 swa_state=swa_state, rla_state=rla_state, time_step=new_time_step
@@ -800,7 +802,7 @@ def init_states(
             query=query,
             key=key,
             value=value,
-            live_step_len=time_step,
+            unpadded_len=time_step,
             cached_states=init_states,
             kv_state=kv_state,
             attention_logit_biases=attention_logit_biases,