
Commit de7cdd1 (1 parent: adcaca1)

Apply minor improvements using tree utility functions. Deprecate optax.global_norm in favor of optax.tree.norm.

File tree: 11 files changed (+25, -26 lines)

examples/perturbations.ipynb — 1 addition, 2 deletions

@@ -41,7 +41,6 @@
     "source": [
     "import jax\n",
     "import jax.numpy as jnp\n",
-    "import operator\n",
     "from jax import tree_util as jtu\n",
     "\n",
     "import optax.tree\n",
@@ -773,7 +772,7 @@
     " pert_softmax = pert_argmax_fun(rng, inputs)\n",
     " argmax = argmax_tree(inputs)\n",
     " diffs = jax.tree.map(lambda x, y: jnp.sum((x - y) ** 2 / 4), argmax, pert_softmax)\n",
-    " return jax.tree.reduce(operator.add, diffs)"
+    " return optax.tree.sum(diffs)"
    ]
   },
   {
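
For reference, a minimal sketch of the substitution above, assuming optax.tree.sum adds up every element across all leaves of a pytree (the toy diffs tree below is illustrative, not taken from the notebook):

import operator

import jax
import jax.numpy as jnp
import optax.tree

# Toy tree standing in for the notebook's per-leaf squared differences.
diffs = {'a': jnp.array([1.0, 2.0]), 'b': jnp.array(3.0)}

# Old pattern: sum each leaf, then fold the leaves together with operator.add.
old = jax.tree.reduce(operator.add, jax.tree.map(jnp.sum, diffs))

# New pattern: one call that sums every element across all leaves.
new = optax.tree.sum(diffs)

assert jnp.allclose(old, new)  # both equal 1 + 2 + 3 = 6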

optax/_src/linear_algebra.py — 10 additions, 3 deletions

@@ -17,6 +17,7 @@
 from collections.abc import Callable
 import functools
 from typing import Optional, Union
+import warnings

 import chex
 import jax
@@ -33,10 +34,16 @@ def _normalize_tree(x):


 def global_norm(updates: base.PyTree) -> chex.Array:
-  """Compute the global norm across a nested structure of tensors."""
-  return jnp.sqrt(
-      sum(jnp.sum(numerics.abs_sq(x)) for x in jax.tree.leaves(updates))
+  """Compute the global norm across a nested structure of tensors.
+
+  .. warning::
+    Deprecated in favor of :func:`optax.tree.norm`.
+  """
+  warnings.warn(
+      "optax.global_norm is deprecated in favor of optax.tree.norm",
+      DeprecationWarning
   )
+  return optax.tree.norm(updates)


 def _power_iteration_cond_fun(error_tolerance, num_iters, loop_vars):
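
For downstream users the deprecation above is a one-line migration; a small sketch, assuming optax.tree.norm returns the same global L2 norm that global_norm used to compute (which is what the new implementation delegates to; the example values are illustrative):

import jax.numpy as jnp
import optax
import optax.tree

updates = {'w': jnp.array([[3.0, 0.0], [0.0, 4.0]]), 'b': jnp.zeros(2)}

# Deprecated path: still works, but now emits a DeprecationWarning.
old_norm = optax.global_norm(updates)

# Preferred replacement: global L2 norm over all leaves of the tree.
new_norm = optax.tree.norm(updates)

assert jnp.allclose(old_norm, new_norm)  # sqrt(3**2 + 4**2) = 5.0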

optax/_src/linear_algebra_test.py — 1 addition, 1 deletion

@@ -53,7 +53,7 @@ def test_global_norm(self):
     }
     np.testing.assert_array_equal(
         jnp.sqrt(jnp.sum(flat_updates**2)),
-        linear_algebra.global_norm(nested_updates),
+        optax.tree.norm(nested_updates),
     )

   def test_power_iteration_cond_fun(self, dim=6):

optax/_src/utils.py — 0 additions, 1 deletion

@@ -345,4 +345,3 @@ def _value_and_grad(
 # TODO(b/183800387): remove legacy aliases.
 safe_norm = numerics.safe_norm
 safe_int32_increment = numerics.safe_int32_increment
-global_norm = linear_algebra.global_norm

optax/contrib/_sam.py — 2 additions, 1 deletion

@@ -55,6 +55,7 @@
 from optax._src import base
 from optax._src import update
 from optax._src import utils
+import optax.tree

 # As a helper for SAM we need a gradient normalizing transformation.

@@ -74,7 +75,7 @@ def init_fn(params):

   def update_fn(updates, state, params=None):
     del params
-    g_norm = utils.global_norm(updates)
+    g_norm = optax.tree.norm(updates)
     updates = jax.tree.map(lambda g: g / g_norm, updates)
     return updates, state
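
The SAM helper above rescales every leaf by the global norm; a standalone sketch of that normalization, assuming optax.tree.norm as used in the diff (the gradient values are illustrative):

import jax
import jax.numpy as jnp
import optax.tree

grads = {'w': jnp.array([3.0, 4.0]), 'b': jnp.array([0.0])}

# Global L2 norm across all leaves, then divide each leaf by it so the
# result has unit global norm (SAM's normalized ascent direction).
g_norm = optax.tree.norm(grads)
unit_grads = jax.tree.map(lambda g: g / g_norm, grads)

assert jnp.allclose(optax.tree.norm(unit_grads), 1.0)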

optax/contrib/_sophia.py — 2 additions, 4 deletions

@@ -157,10 +157,8 @@ def update_fn(updates, state: SophiaState, params=None, **hess_fn_kwargs):
         lambda m, h: m / jnp.maximum(gamma * h, eps), mu_hat, state.nu
     )
     if clip_threshold is not None:
-      sum_not_clipped = jax.tree.reduce(
-          lambda x, y: x + y,
-          jax.tree.map(lambda u: jnp.sum(jnp.abs(u) < clip_threshold), updates),
-      )
+      not_clipped = jax.tree.map(lambda u: jnp.abs(u) < clip_threshold, updates)
+      sum_not_clipped = optax.tree.sum(not_clipped)
       if verbose:
         win_rate = sum_not_clipped / optax.tree.size(updates)
         jax.lax.cond(
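
The Sophia change counts un-clipped coordinates by summing a boolean mask over the whole tree; a sketch of that counting pattern, assuming optax.tree.sum and optax.tree.size behave as used above (the threshold and updates below are illustrative):

import jax
import jax.numpy as jnp
import optax.tree

clip_threshold = 1.0
updates = {'w': jnp.array([0.5, -2.0, 0.1]), 'b': jnp.array([3.0])}

# Boolean mask per leaf: True where the update magnitude stays below the threshold.
not_clipped = jax.tree.map(lambda u: jnp.abs(u) < clip_threshold, updates)

# Summing the booleans across the tree counts the un-clipped entries.
sum_not_clipped = optax.tree.sum(not_clipped)          # 2 here
win_rate = sum_not_clipped / optax.tree.size(updates)  # 2 / 4 = 0.5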

optax/perturbations/_make_pert_test.py — 1 addition, 2 deletions

@@ -16,7 +16,6 @@
 """Tests for optax.perturbations, checking values and gradients."""

 from functools import partial  # pylint: disable=g-importing-member
-import operator

 from absl.testing import absltest
 from absl.testing import parameterized
@@ -159,7 +158,7 @@ def loss(tree):
       pred = apply_element_tree(tree)
       pred_true = apply_element_tree(example_tree)
       tree_loss = jax.tree.map(lambda x, y: (x - y) ** 2, pred, pred_true)
-      list_loss = jax.tree.reduce(operator.add, tree_loss)
+      list_loss = optax.tree.sum(tree_loss)
       return jax.tree.map(lambda *leaves: sum(leaves) / len(leaves), list_loss)

     loss_pert = jax.jit(_make_pert.make_perturbed_fun(

optax/transforms/_accumulation.py — 3 additions, 8 deletions

@@ -176,11 +176,8 @@ def skip_not_finite(
     - `num_not_finite`: total number of inf and NaN found in `updates`.
   """
   del gradient_step, params
-  all_is_finite = [
-      jnp.sum(jnp.logical_not(jnp.isfinite(p)))
-      for p in jax.tree.leaves(updates)
-  ]
-  num_not_finite = jnp.sum(jnp.array(all_is_finite))
+  not_finite = jax.tree.map(lambda x: ~jnp.isfinite(x), updates)
+  num_not_finite = optax.tree.sum(not_finite)
   should_skip = num_not_finite > 0
   return should_skip, {
       'should_skip': should_skip,
@@ -210,9 +207,7 @@ def skip_large_updates(
     - `norm_squared`: overall norm square of the `updates`.
   """
   del gradient_step, params
-  norm_sq = jnp.sum(
-      jnp.array([jnp.sum(p**2) for p in jax.tree.leaves(updates)])
-  )
+  norm_sq = optax.tree.norm(updates, squared=True)
   # This will also return True if `norm_sq` is NaN.
   should_skip = jnp.logical_not(norm_sq < max_squared_norm)
   return should_skip, {'should_skip': should_skip, 'norm_squared': norm_sq}
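
Both rewrites above fold a manual loop over jax.tree.leaves into a single tree reduction; a small sketch of the two quantities, assuming the squared=True argument of optax.tree.norm used in the diff (the trees below are illustrative):

import jax
import jax.numpy as jnp
import optax.tree

# skip_not_finite pattern: count inf/NaN entries across the whole tree.
updates = {'w': jnp.array([1.0, jnp.inf]), 'b': jnp.array([2.0])}
not_finite = jax.tree.map(lambda x: ~jnp.isfinite(x), updates)
num_not_finite = optax.tree.sum(not_finite)  # 1 here

# skip_large_updates pattern: squared global norm in one call.
finite_updates = {'w': jnp.array([1.0, 2.0]), 'b': jnp.array([2.0])}
norm_sq = optax.tree.norm(finite_updates, squared=True)  # 1 + 4 + 4 = 9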

optax/transforms/_clipping.py — 2 additions, 2 deletions

@@ -90,7 +90,7 @@ def clip_by_global_norm(max_norm: float) -> base.GradientTransformation:

   def update_fn(updates, state, params=None):
     del params
-    g_norm = linear_algebra.global_norm(updates)
+    g_norm = optax.tree.norm(updates)
     # TODO(b/163995078): revert back to the following (faster) implementation
     # once analyzed how it affects backprop through update (e.g. meta-gradients)
     # g_norm = jnp.maximum(max_norm, g_norm)
@@ -154,7 +154,7 @@ def per_example_global_norm_clip(
         " `grads` to have a batch dimension in the 0th axis."
     )

-  global_grad_norms = jax.vmap(linear_algebra.global_norm)(grads)
+  global_grad_norms = jax.vmap(optax.tree.norm)(grads)
   multipliers = jnp.nan_to_num(
       jnp.minimum(l2_norm_clip / global_grad_norms, 1.0), nan=1.0
   )
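
The per-example clipping path maps the tree norm over a leading batch axis; a sketch of that vmap usage, assuming per-example grads whose leaves all share the 0th (batch) dimension, as the function's error message requires (values are illustrative):

import jax
import jax.numpy as jnp
import optax.tree

# Per-example gradients: every leaf carries the batch dimension first (batch = 2).
grads = {
    'w': jnp.array([[3.0, 4.0], [0.0, 1.0]]),
    'b': jnp.array([[0.0], [0.0]]),
}

# vmap over the batch axis yields one global norm per example.
per_example_norms = jax.vmap(optax.tree.norm)(grads)  # [5.0, 1.0]

# Clipping multipliers analogous to per_example_global_norm_clip (NaN handling omitted).
l2_norm_clip = 1.0
multipliers = jnp.minimum(l2_norm_clip / per_example_norms, 1.0)  # [0.2, 1.0]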

optax/transforms/_clipping_test.py — 2 additions, 1 deletion

@@ -21,6 +21,7 @@
 import numpy as np
 from optax._src import linear_algebra
 from optax.transforms import _clipping
+import optax.tree


 STEPS = 50
@@ -71,7 +72,7 @@ def test_clip_by_global_norm(self):
       # Check that the clipper actually works and global norm is <= max_norm
       updates, _ = clipper.update(updates, None)
       self.assertAlmostEqual(
-          linear_algebra.global_norm(updates), 1.0 / i, places=6
+          optax.tree.norm(updates), 1.0 / i, places=6
       )
       # Check that continuously clipping won't cause numerical issues.
       updates_step, _ = clipper.update(self.per_step_updates, None)
