Changes from all commits (73 commits)
317501e
Replace cf.rank==0 with utils.distributed.is_root
Jul 16, 2025
77de417
replace cf.rank==0 with weathergen.utils.distributed.is_root
Jul 16, 2025
6439618
Merge branch 'ecmwf:develop' into develop
csjfwang Jul 22, 2025
8993875
Merge branch 'ecmwf:develop' into develop
csjfwang Jul 25, 2025
f4a9d85
Merge branch 'ecmwf:develop' into develop
csjfwang Jul 28, 2025
f8fdef4
Merge branch 'ecmwf:develop' into develop
csjfwang Jul 29, 2025
ca89e7b
Merge branch 'ecmwf:develop' into develop
csjfwang Jul 30, 2025
49d7a4d
Merge branch 'ecmwf:develop' into develop
csjfwang Jul 31, 2025
f39f094
Merge branch 'ecmwf:develop' into develop
csjfwang Jul 31, 2025
ebb03ea
Merge branch 'ecmwf:develop' into develop
csjfwang Aug 25, 2025
f40737d
Merge branch 'ecmwf:develop' into develop
csjfwang Aug 28, 2025
87fa078
Merge branch 'ecmwf:develop' into develop
csjfwang Sep 10, 2025
5dfe275
Merge branch 'ecmwf:develop' into develop
csjfwang Sep 19, 2025
b7244d9
Merge branch 'ecmwf:develop' into develop
csjfwang Sep 22, 2025
5be41f5
Merge branch 'ecmwf:develop' into develop
csjfwang Sep 22, 2025
39d3965
Merge branch 'ecmwf:develop' into develop
csjfwang Sep 23, 2025
015ec88
Merge branch 'ecmwf:develop' into develop
csjfwang Sep 24, 2025
cb1b7cc
Merge branch 'ecmwf:develop' into develop
csjfwang Oct 1, 2025
90da4cf
Merge branch 'ecmwf:develop' into develop
csjfwang Oct 20, 2025
f04891b
Merge branch 'ecmwf:develop' into develop
csjfwang Oct 21, 2025
105d992
Merge branch 'ecmwf:develop' into develop
csjfwang Oct 24, 2025
5f56073
Merge branch 'ecmwf:develop' into develop
csjfwang Oct 26, 2025
95ee18a
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 3, 2025
3c702d3
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 10, 2025
6f14a30
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 13, 2025
5e87881
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 14, 2025
0c7d305
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 24, 2025
e43ac94
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 25, 2025
5f63bcc
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 26, 2025
c51eb94
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 26, 2025
dd5acc2
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 26, 2025
f03672d
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 27, 2025
49c52e1
Merge branch 'ecmwf:develop' into develop
csjfwang Nov 28, 2025
c6356a2
Merge branch 'ecmwf:develop' into develop
csjfwang Dec 1, 2025
36c709a
Merge branch 'ecmwf:develop' into develop
csjfwang Dec 1, 2025
765276a
Merge branch 'ecmwf:develop' into develop
csjfwang Dec 9, 2025
f3eb78a
Merge branch 'ecmwf:develop' into develop
csjfwang Dec 10, 2025
542f23e
Merge branch 'ecmwf:develop' into develop
csjfwang Jan 23, 2026
d14f61f
Merge branch 'ecmwf:develop' into develop
csjfwang Jan 30, 2026
692703b
Merge branch 'ecmwf:develop' into develop
csjfwang Feb 10, 2026
165f498
Merge branch 'ecmwf:develop' into develop
csjfwang Feb 15, 2026
a5adf2a
Merge branch 'ecmwf:develop' into develop
csjfwang Feb 18, 2026
0442d5d
Merge branch 'ecmwf:develop' into develop
csjfwang Feb 21, 2026
eb8480d
Merge branch 'ecmwf:develop' into develop
csjfwang Feb 24, 2026
fff2626
Merge branch 'ecmwf:develop' into develop
csjfwang Feb 26, 2026
c612dff
Merge branch 'ecmwf:develop' into develop
csjfwang Feb 26, 2026
a2f56d3
Merge branch 'ecmwf:develop' into develop
csjfwang Feb 27, 2026
eca4792
Merge branch 'ecmwf:develop' into develop
csjfwang Mar 6, 2026
e0562f7
Merge branch 'ecmwf:develop' into develop
csjfwang Mar 18, 2026
bbe7916
spherical rope, 1st version
Mar 19, 2026
a9929c6
fix lint; change name of rope_spherical_l to rope_spherical_band
Mar 19, 2026
fb474e3
Spherical RoPE with post qk lnorm
Mar 24, 2026
295527f
Refactor and harden RoPE config flow by unifying rope_mode resolution…
Mar 24, 2026
e3e7108
Merge branch 'ecmwf:develop' into develop
csjfwang Mar 26, 2026
ef91a58
Merge branch 'develop' into spherical-rope
Mar 26, 2026
bb78f7d
Merge branch 'ecmwf:develop' into develop
csjfwang Mar 28, 2026
b4005f1
Merge branch 'ecmwf:develop' into develop
csjfwang Mar 31, 2026
401c41d
Merge branch 'ecmwf:develop' into develop
csjfwang Apr 3, 2026
dd2fabc
Merge branch 'ecmwf:develop' into develop
csjfwang Apr 24, 2026
4796c71
Merge branch 'develop' into spherical-rope
csjfwang Apr 24, 2026
9b8f8ad
remove rope_post_mod_qk_lnorm arg in the config, to make it
csjfwang Apr 24, 2026
200ceb7
clean resolve_rope_mode related code, no support rope_2D arg
csjfwang Apr 24, 2026
7eea678
clean spherical_harmonics_band_all_pixels, etc.
csjfwang Apr 24, 2026
f6e3800
clean conjugate arg and remove redundant num_complex slicing
csjfwang Apr 24, 2026
0123d7c
add comments to spherical rope core function - healpy_band_maps
csjfwang Apr 25, 2026
f4ee3e6
Merge branch 'ecmwf:develop' into develop
csjfwang Apr 25, 2026
cef6341
Merge branch 'develop' into spherical-rope
csjfwang Apr 25, 2026
f72a172
Merge branch 'ecmwf:develop' into develop
csjfwang Apr 28, 2026
d0a05dc
allow for backwards compatability the rope_2D option and add a
csjfwang May 11, 2026
ba5fe4d
Merge branch 'ecmwf:develop' into develop
csjfwang May 11, 2026
6b7766d
Merge branch 'develop' into spherical-rope
csjfwang May 11, 2026
6797363
Suppress healpy logs during spherical RoPE setup
csjfwang May 11, 2026
04f6224
remove rope option to unrelated config, remove non-standard comments
csjfwang May 12, 2026
6 changes: 5 additions & 1 deletion config/config_forecasting.yml
@@ -66,7 +66,11 @@ forecast_att_dense_rate: 1.0

healpix_level: 5

rope_2D: False
# Generalized RoPE selector.
Collaborator: It would make more sense to have a section positional_encoding in the future

rope_mode: none # one of: none, 2d, spherical
# Optional spherical harmonic band for spherical RoPE. If null, the model picks one
# conservative shared band that fits all spherical-RoPE attention modules.
rope_spherical_band: null

with_mixed_precision: True
with_flash_attention: True
2 changes: 0 additions & 2 deletions config/config_forecasting_eerie.yml
@@ -66,8 +66,6 @@ forecast_att_dense_rate: 1.0

healpix_level: 5

rope_2D: False

with_mixed_precision: True
with_flash_attention: True
compile_model: False
10 changes: 6 additions & 4 deletions config/config_jepa.yml
@@ -66,10 +66,12 @@ forecast_att_dense_rate: 1.0
with_step_conditioning: True # False

healpix_level: 5
# Use 2D RoPE instead of traditional global positional encoding
# When True: uses 2D RoPE based on healpix cell coordinates (lat/lon)
# When False: uses traditional pe_global positional encoding
rope_2D: False

# Generalized RoPE selector.
rope_mode: none # one of: none, 2d, spherical
# Optional spherical harmonic band for spherical RoPE. If null, the model picks one
# conservative shared band that fits all spherical-RoPE attention modules.
rope_spherical_band: null

with_mixed_precision: True
with_flash_attention: True
9 changes: 5 additions & 4 deletions config/default_config.yml
@@ -67,10 +67,11 @@ num_register_tokens: 0

healpix_level: 5

# Use 2D RoPE instead of traditional global positional encoding
# When True: uses 2D RoPE based on healpix cell coordinates (lat/lon)
# When False: uses traditional pe_global positional encoding
rope_2D: False
# Generalized RoPE selector.
rope_mode: none # one of: none, 2d, spherical
# Optional spherical harmonic band for spherical RoPE. If null, the model picks one
# conservative shared band that fits all spherical-RoPE attention modules.
rope_spherical_band: null

with_mixed_precision: True
with_flash_attention: True
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,6 +12,7 @@ requires-python = ">=3.12,<3.13"
dependencies = [
'numpy~=2.2',
'astropy_healpix~=1.1.2',
'healpy>=1.19,<2',
'zarr~=3.1.3',
'pandas~=2.2',
'tqdm',
@@ -273,4 +274,3 @@ members = [
# Explicitly not depending on 'packages/dashboard' : this causes issues when deploying
# the streamlit dashboard.
]
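The new healpy dependency backs the spherical RoPE setup; the commit history points to a core helper, healpy_band_maps, that evaluates spherical-harmonic band maps on the HEALPix grid. As an illustration only (a hypothetical sketch, not the repository's healpy_band_maps), one way to obtain per-pixel values of a single band l with healpy:

```python
# Hypothetical sketch (not the repository's healpy_band_maps): evaluate real-valued
# spherical-harmonic maps of a single band l at HEALPix pixel centres, e.g. to derive
# per-token rotation coefficients for spherical RoPE at healpix_level = 5 (nside = 32).
import numpy as np
import healpy as hp


def band_maps(nside: int, band: int) -> np.ndarray:
    """Real-valued maps spanning the band l = band, one per m = 0..band, shape (band + 1, npix)."""
    lmax = band
    maps = []
    for m in range(band + 1):
        alm = np.zeros(hp.Alm.getsize(lmax), dtype=np.complex128)
        alm[hp.Alm.getidx(lmax, band, m)] = 1.0  # unit coefficient for (l, m)
        maps.append(hp.alm2map(alm, nside))
    return np.stack(maps)


maps = band_maps(nside=2**5, band=8)  # healpix_level 5; the band value here is made up
print(maps.shape)  # (9, 12288)
```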

66 changes: 45 additions & 21 deletions src/weathergen/model/attention.py
@@ -14,13 +14,13 @@
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

from weathergen.model.norms import AdaLayerNorm, RMSNorm
from weathergen.model.positional_encoding import rotary_pos_emb_2d
from weathergen.model.positional_encoding import apply_rope

"""
Attention blocks used by WeatherGenerator.

Some blocks optionally apply 2D RoPE. When enabled, the caller must provide per-token 2D
coordinates aligned with the token order (lat, lon in radians).
Some blocks optionally apply RoPE-like positional modulation. When enabled, the caller must
provide per-token coordinates aligned with the token order (lat, lon in radians).
"""


@@ -40,7 +40,7 @@ def __init__(
dim_aux=None,
norm_eps=1e-5,
attention_dtype=torch.bfloat16,
with_2d_rope=False,
rope_mode="none",
):
super(MultiSelfAttentionHeadVarlen, self).__init__()

@@ -49,7 +49,10 @@ def __init__(
self.with_flash = with_flash
self.softcap = softcap
self.with_residual = with_residual
self.with_2d_rope = with_2d_rope
self.rope_mode = rope_mode
self.rope_post_mod_qk_lnorm = rope_mode == "spherical"
if self.rope_post_mod_qk_lnorm:
assert with_qk_lnorm, "rope_post_mod_qk_lnorm=True requires with_qk_lnorm=True"

assert dim_embed % num_heads == 0
self.dim_head_proj = dim_embed // num_heads if dim_head_proj is None else dim_head_proj
@@ -79,6 +82,9 @@ def __init__(
lnorm = qk_norm if with_qk_lnorm else torch.nn.Identity
self.lnorm_q = lnorm(self.dim_head_proj, eps=norm_eps)
self.lnorm_k = lnorm(self.dim_head_proj, eps=norm_eps)
post_rope_lnorm = norm if self.rope_post_mod_qk_lnorm else torch.nn.Identity
self.post_rope_lnorm_q = post_rope_lnorm(self.dim_head_proj, eps=norm_eps)
self.post_rope_lnorm_k = post_rope_lnorm(self.dim_head_proj, eps=norm_eps)

self.dtype = attention_dtype

@@ -96,10 +102,12 @@ def forward(self, x, x_lens, ada_ln_aux=None, coords=None):
ks = self.lnorm_k(self.proj_heads_k(x).reshape(s)).to(self.dtype)
vs = self.proj_heads_v(x).reshape(s)

if self.with_2d_rope:
if coords is None:
raise ValueError("coords must be provided when with_2d_rope=True")
qs, ks = rotary_pos_emb_2d(qs, ks, coords, unsqueeze_dim=1)
qs, ks = apply_rope(
qs, ks, coords, self.rope_mode, 1
)
if self.rope_post_mod_qk_lnorm:
qs = self.post_rope_lnorm_q(qs).to(self.dtype)
ks = self.post_rope_lnorm_k(ks).to(self.dtype)

# set dropout rate according to training/eval mode as required by flash_attn
dropout_rate = self.dropout_rate if self.training else 0.0
@@ -225,15 +233,18 @@ def __init__(
dim_aux=None,
norm_eps=1e-5,
attention_dtype=torch.bfloat16,
with_2d_rope=False,
rope_mode="none",
):
super(MultiSelfAttentionHeadLocal, self).__init__()

self.num_heads = num_heads
self.with_flash = with_flash
self.softcap = softcap
self.with_residual = with_residual
self.with_2d_rope = with_2d_rope
self.rope_mode = rope_mode
self.rope_post_mod_qk_lnorm = rope_mode == "spherical"
if self.rope_post_mod_qk_lnorm:
assert with_qk_lnorm, "rope_post_mod_qk_lnorm=True requires with_qk_lnorm=True"

assert dim_embed % num_heads == 0
self.dim_head_proj = dim_embed // num_heads if dim_head_proj is None else dim_head_proj
@@ -263,6 +274,9 @@ def __init__(
lnorm = qk_norm if with_qk_lnorm else torch.nn.Identity
self.lnorm_q = lnorm(self.dim_head_proj, eps=norm_eps)
self.lnorm_k = lnorm(self.dim_head_proj, eps=norm_eps)
post_rope_lnorm = norm if self.rope_post_mod_qk_lnorm else torch.nn.Identity
self.post_rope_lnorm_q = post_rope_lnorm(self.dim_head_proj, eps=norm_eps)
self.post_rope_lnorm_k = post_rope_lnorm(self.dim_head_proj, eps=norm_eps)

self.dtype = attention_dtype
assert with_flash, "Only flash attention supported."
@@ -288,10 +302,12 @@ def forward(self, x, coords=None, ada_ln_aux=None):
ks = self.lnorm_k(self.proj_heads_k(x).reshape(s)).to(self.dtype).permute([0, 2, 1, 3])
vs = self.proj_heads_v(x).reshape(s).permute([0, 2, 1, 3])

if self.with_2d_rope:
if coords is None:
raise ValueError("coords must be provided when with_2d_rope=True")
qs, ks = rotary_pos_emb_2d(qs, ks, coords, unsqueeze_dim=1)
qs, ks = apply_rope(
qs, ks, coords, self.rope_mode, 1
)
if self.rope_post_mod_qk_lnorm:
qs = self.post_rope_lnorm_q(qs).to(self.dtype)
ks = self.post_rope_lnorm_k(ks).to(self.dtype)

outs = self.flex_attention(qs, ks, vs, block_mask=self.block_mask).transpose(1, 2)

@@ -540,7 +556,7 @@ def __init__(
dim_aux=None,
norm_eps=1e-5,
attention_dtype=torch.bfloat16,
with_2d_rope=False,
rope_mode="none",
):
super(MultiSelfAttentionHead, self).__init__()

@@ -549,7 +565,10 @@ def __init__(
self.softcap = softcap
self.dropout_rate = dropout_rate
self.with_residual = with_residual
self.with_2d_rope = with_2d_rope
self.rope_mode = rope_mode
self.rope_post_mod_qk_lnorm = rope_mode == "spherical"
if self.rope_post_mod_qk_lnorm:
assert with_qk_lnorm, "rope_post_mod_qk_lnorm=True requires with_qk_lnorm=True"

assert dim_embed % num_heads == 0
self.dim_head_proj = dim_embed // num_heads if dim_head_proj is None else dim_head_proj
@@ -579,6 +598,9 @@ def __init__(
lnorm = qk_norm if with_qk_lnorm else torch.nn.Identity
self.lnorm_q = lnorm(self.dim_head_proj, eps=norm_eps)
self.lnorm_k = lnorm(self.dim_head_proj, eps=norm_eps)
post_rope_lnorm = norm if self.rope_post_mod_qk_lnorm else torch.nn.Identity
self.post_rope_lnorm_q = post_rope_lnorm(self.dim_head_proj, eps=norm_eps)
self.post_rope_lnorm_k = post_rope_lnorm(self.dim_head_proj, eps=norm_eps)

self.dtype = attention_dtype
if with_flash:
@@ -599,10 +621,12 @@ def forward(self, x, coords=None, ada_ln_aux=None):
ks = self.lnorm_k(self.proj_heads_k(x).reshape(s)).to(self.dtype)
vs = self.proj_heads_v(x).reshape(s).to(self.dtype)

if self.with_2d_rope:
if coords is None:
raise ValueError("coords must be provided when with_2d_rope=True")
qs, ks = rotary_pos_emb_2d(qs, ks, coords, unsqueeze_dim=2)
qs, ks = apply_rope(
qs, ks, coords, self.rope_mode, 2
)
if self.rope_post_mod_qk_lnorm:
qs = self.post_rope_lnorm_q(qs).to(self.dtype)
ks = self.post_rope_lnorm_k(ks).to(self.dtype)

# set dropout rate according to training/eval mode as required by flash_attn
dropout_rate = self.dropout_rate if self.training else 0.0
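Across the three attention blocks, the diff replaces the with_2d_rope flag with rope_mode and routes queries and keys through apply_rope(qs, ks, coords, rope_mode, unsqueeze_dim); for rope_mode == "spherical" the queries and keys are additionally re-normalized after the modulation. A minimal standalone sketch of that pattern in plain PyTorch, with a simple 1D rotation standing in for the package's apply_rope (all names and shapes here are assumed):

```python
# Standalone sketch of the rope_mode / post-RoPE QK-norm pattern; the rotation is a
# plain 1D RoPE stand-in, not the package's apply_rope.
import torch


def toy_rope(q, k, pos, base=10000.0):
    """Rotate query/key pairs by per-token angles derived from a 1D position."""
    d = q.shape[-1] // 2
    inv_freq = base ** (-torch.arange(d, dtype=q.dtype) / d)
    ang = pos[..., None] * inv_freq                                # (seq, d/2)
    cos, sin = ang.cos()[..., None, :], ang.sin()[..., None, :]   # broadcast over heads

    def rot(x):
        x1, x2 = x[..., :d], x[..., d:]
        return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

    return rot(q), rot(k)


seq, heads, dim_head = 16, 4, 32
q = torch.randn(seq, heads, dim_head)
k = torch.randn(seq, heads, dim_head)
pos = torch.linspace(0.0, 1.0, seq)

rope_mode = "spherical"                     # "none", "2d" or "spherical" in the config
post_norm_q = torch.nn.LayerNorm(dim_head)  # only instantiated for spherical mode
post_norm_k = torch.nn.LayerNorm(dim_head)

if rope_mode != "none":
    q, k = toy_rope(q, k, pos)
if rope_mode == "spherical":                # re-normalize q/k after the modulation
    q, k = post_norm_q(q), post_norm_k(k)
```

The extra post-modulation LayerNorm presumably keeps the q/k scales stable when the spherical modulation is not norm-preserving.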
25 changes: 22 additions & 3 deletions src/weathergen/model/encoder.py
@@ -133,7 +133,11 @@ def forward(self, model_params, batch):
tokens_global = checkpoint(
self.ae_global_engine,
tokens_global,
coords=model_params.rope_coords,
coords=(
model_params.rope_spherical_coeffs.unbind(dim=-1)
if model_params.rope_spherical_coeffs is not None
else model_params.rope_coords
),
use_reentrant=False,
)

@@ -221,6 +225,8 @@ def aggregation_engine_unmasked(
tokens_global_register_class,
tokens_lens,
rope_cell_coords=None,
rope_cell_coeffs=None,
rope_extra_coeffs=None,
):
"""
Aggregation engine on the global latents of unmasked cells
@@ -251,8 +257,19 @@
)

# Build packed coords matching the interleaved token order
if rope_cell_coords is not None:
num_extra = self.num_class_tokens + self.num_register_tokens
num_extra = self.num_class_tokens + self.num_register_tokens
if rope_cell_coeffs is not None:
extra_real, extra_imag = rope_extra_coeffs.unbind(dim=-1)
cell_real, cell_imag = rope_cell_coeffs.unbind(dim=-1)
packed_real = []
packed_imag = []
for mask_b in cell_mask.flatten(0, 1):
packed_real.append(extra_real)
packed_imag.append(extra_imag)
packed_real.append(cell_real[mask_b])
packed_imag.append(cell_imag[mask_b])
packed_coords = (torch.cat(packed_real, dim=0), torch.cat(packed_imag, dim=0))
elif rope_cell_coords is not None:
zero_coords = torch.zeros(
num_extra, 2, device=rope_cell_coords.device, dtype=rope_cell_coords.dtype
)
@@ -316,6 +333,8 @@ def assimilate_local(
tokens_global_register_class,
batch.tokens_lens,
rope_cell_coords=model_params.rope_cell_coords,
rope_cell_coeffs=model_params.rope_spherical_cell_coeffs,
rope_extra_coeffs=model_params.rope_spherical_extra_coeffs,
)

# final processing
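The new packing branch in aggregation_engine_unmasked interleaves the spherical RoPE coefficients in the same order as the tokens: for every batch element, the class/register coefficients come first, followed by the coefficients of that element's unmasked cells. A toy, self-contained illustration of that interleaving (all names and shapes are made up):

```python
# Toy illustration of packing (real, imag) RoPE coefficients to match the interleaved
# token order: per batch element, class/register tokens first, then unmasked cells.
import torch

num_cells, num_extra = 6, 2
cell_coeffs = torch.randn(num_cells, 4, 2)   # (cells, coeff_dim, real/imag)
extra_coeffs = torch.zeros(num_extra, 4, 2)  # class + register tokens
cell_mask = torch.tensor([[1, 0, 1, 1, 0, 1],
                          [0, 1, 1, 0, 1, 0]], dtype=torch.bool)  # (batch, cells)

extra_real, extra_imag = extra_coeffs.unbind(dim=-1)
cell_real, cell_imag = cell_coeffs.unbind(dim=-1)
packed_real, packed_imag = [], []
for mask_b in cell_mask:                     # one group per batch element
    packed_real += [extra_real, cell_real[mask_b]]
    packed_imag += [extra_imag, cell_imag[mask_b]]
packed_coords = (torch.cat(packed_real, dim=0), torch.cat(packed_imag, dim=0))
print(packed_coords[0].shape)  # torch.Size([11, 4]): 2x2 extra tokens + 4 + 3 cell tokens
```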
14 changes: 9 additions & 5 deletions src/weathergen/model/engines.py
@@ -29,6 +29,7 @@
StreamEmbedTransformer,
)
from weathergen.model.layers import MLP
from weathergen.model.positional_encoding import get_rope_mode
from weathergen.model.utils import ActivationFactory
from weathergen.utils.utils import get_dtype

@@ -389,6 +390,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
super(QueryAggregationEngine, self).__init__()
self.cf = cf
self.num_healpix_cells = num_healpix_cells
rope_mode = get_rope_mode(self.cf)

self.ae_aggregation_blocks = torch.nn.ModuleList()

@@ -409,7 +411,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
qk_norm_type=self.cf.get("qk_norm_type", self.cf.norm_type),
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
with_2d_rope=self.cf.get("rope_2D", False),
rope_mode=rope_mode,
)
)
else:
@@ -465,6 +467,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
super(GlobalAssimilationEngine, self).__init__()
self.cf = cf
self.num_healpix_cells = num_healpix_cells
rope_mode = get_rope_mode(self.cf)

self.ae_global_blocks = torch.nn.ModuleList()

@@ -485,7 +488,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
qk_norm_type=self.cf.get("qk_norm_type", self.cf.norm_type),
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
with_2d_rope=self.cf.get("rope_2D", False),
rope_mode=rope_mode,
)
)
else:
@@ -502,7 +505,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
qk_norm_type=self.cf.get("qk_norm_type", self.cf.norm_type),
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
with_2d_rope=self.cf.get("rope_2D", False),
rope_mode=rope_mode,
)
)
# MLP block
@@ -553,6 +556,7 @@ def __init__(self, cf: Config, mode_cfg, num_healpix_cells: int, dim_aux: int =
super(ForecastingEngine, self).__init__()
self.cf = cf
self.num_healpix_cells = num_healpix_cells
rope_mode = get_rope_mode(self.cf)
self.fe_blocks = torch.nn.ModuleList()

global_rate = int(1 / self.cf.forecast_att_dense_rate)
@@ -572,7 +576,7 @@ def __init__(self, cf: Config, mode_cfg, num_healpix_cells: int, dim_aux: int =
dim_aux=dim_aux,
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
with_2d_rope=self.cf.get("rope_2D", False),
rope_mode=rope_mode,
)
)
else:
@@ -590,7 +594,7 @@ def __init__(self, cf: Config, mode_cfg, num_healpix_cells: int, dim_aux: int =
dim_aux=dim_aux,
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
with_2d_rope=self.cf.get("rope_2D", False),
rope_mode=rope_mode,
)
)
# Add MLP block
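All engines now obtain the positional-encoding mode from get_rope_mode(cf) rather than reading rope_2D directly; per the commit history, the legacy rope_2D flag remains supported for backwards compatibility. The resolver itself is not part of this diff; a plausible sketch of such a resolution, under the assumption that it maps the old boolean onto the new selector (hypothetical, the actual get_rope_mode may differ):

```python
# Hypothetical sketch of rope_mode resolution with legacy rope_2D support; the actual
# get_rope_mode in weathergen.model.positional_encoding may differ.
VALID_ROPE_MODES = ("none", "2d", "spherical")


def resolve_rope_mode(cf) -> str:
    mode = cf.get("rope_mode", None)
    if mode is None:
        # Legacy path: the old boolean flag maps onto the new selector.
        mode = "2d" if cf.get("rope_2D", False) else "none"
    mode = str(mode).lower()
    if mode not in VALID_ROPE_MODES:
        raise ValueError(f"rope_mode must be one of {VALID_ROPE_MODES}, got {mode!r}")
    return mode
```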