worldfnd
diff --git a/src/algebra/ntt/cooley_tukey.rs b/src/algebra/ntt/cooley_tukey.rs
Lines changed: 247 additions & 0 deletions
diff --git a/src/algebra/ntt/mod.rs b/src/algebra/ntt/mod.rs
Lines changed: 104 additions & 2 deletions
@@ -356,6 +356,164 @@ impl<F: Field> NttEngine<F> {
             size => self.ntt_recurse(values, roots, size),
         }
     }
+
+    /// Output-pruned NTT (Sorensen-Burrus, radix-2 DIT).
+    ///
+    /// Computes the size-`size` NTT of `values` (zero-padded to `size` if
+    /// shorter) and returns the outputs at positions `indices`, in input
+    /// order. Output `j` equals the full NTT at position `indices[j]`.
+    ///
+    /// Builds a fresh [`PartialNttPlan`] per call, then performs field
+    /// arithmetic only for the butterflies that feed the queried outputs.
+    /// Reuse a plan via [`NttEngine::ntt_partial_with_plan_into`] when many
+    /// transforms share the same `(size, indices)`.
+    ///
+    /// Panics unless `size` is a power of two, every index is `< size`, and
+    /// `values.len() <= size`.
+    pub fn ntt_partial(&self, values: &[F], size: usize, indices: &[usize]) -> Vec<F> {
+        let plan = PartialNttPlan::new(size, indices);
+        let mut out = vec![F::ZERO; indices.len()];
+        self.ntt_partial_with_plan_into(values, &plan, &mut out, 1);
+        out
+    }
+
+    /// Run a pruned NTT using a precomputed plan, writing the value for
+    /// `plan.indices[j]` to `out[j * stride]` (contiguous when `stride == 1`).
+    ///
+    /// Reusing one plan skips the O(size · log size) mask construction, but each
+    /// call still allocates a `size`-element work buffer. Panics if `values` is
+    /// longer than the plan size or `out` is too short for the last strided slot.
+    pub fn ntt_partial_with_plan_into(
+        &self,
+        values: &[F],
+        plan: &PartialNttPlan,
+        out: &mut [F],
+        stride: usize,
+    ) {
+        let size = plan.size;
+        let indices = &plan.indices;
+        assert!(values.len() <= size, "input longer than NTT size");
+        if indices.is_empty() {
+            return;
+        }
+        assert!(
+            out.len() >= (indices.len() - 1) * stride + 1,
+            "output buffer too small for stride"
+        );
+        if size == 1 {
+            let v = values.first().copied().unwrap_or(F::ZERO);
+            for j in 0..indices.len() {
+                out[j * stride] = v;
+            }
+            return;
+        }
+
+        let log_n = size.trailing_zeros() as usize;
+        let roots = self.roots_table(size);
+
+        // Scatter inputs to bit-reversed slots; unflagged or padded slots stay zero.
+        let mut work = vec![F::ZERO; size];
+        let shift = (usize::BITS as usize) - log_n;
+        for (j, &c) in values.iter().enumerate() {
+            let rev = j.reverse_bits() >> shift;
+            if plan.mask[0][rev] {
+                work[rev] = c;
+            }
+        }
+
+        // Forward DIT: a butterfly runs only if one of its outputs is flagged.
+        // The shared roots table may hold roots at a larger order than `size`;
+        // `roots[k * twiddle_step]` retrieves ω_m^k regardless.
+        for stage in 1..=log_n {
+            let m = 1usize << stage;
+            let half = m >> 1;
+            let twiddle_step = roots.len() / m;
+            let cur = &plan.mask[stage];
+            let mut base = 0;
+            while base < size {
+                for k in 0..half {
+                    let a = base + k;
+                    let b = a + half;
+                    if cur[a] || cur[b] {
+                        let w = roots[k * twiddle_step];
+                        let t = work[b] * w;
+                        let u = work[a];
+                        work[a] = u + t;
+                        work[b] = u - t;
+                    }
+                }
+                base += m;
+            }
+        }
+
+        for (j, &i) in indices.iter().enumerate() {
+            out[j * stride] = work[i];
+        }
+    }
+}
+
+/// Pruning plan for an output-pruned NTT.
+///
+/// Holds the queried output indices and the precomputed per-stage
+/// "needed-position" masks used by [`NttEngine::ntt_partial_with_plan_into`];
+/// the masks occupy `(log2(size) + 1) * size` booleans. Build once per
+/// `(size, indices)` and reuse across NTTs of the same shape.
+#[derive(Debug, Clone)]
+pub struct PartialNttPlan {
+    size: usize,
+    indices: Vec<usize>,
+    /// `mask[stage][p]` is true iff position `p` after `stage` DIT stages
+    /// must be correct for the final outputs. `mask[log_n]` mirrors
+    /// `indices`; `mask[0]` selects the bit-reversed input positions that
+    /// must be loaded.
+    mask: Vec<Vec<bool>>,
+}
+
+impl PartialNttPlan {
+    /// Builds the per-stage masks for a size-`size` transform queried at
+    /// `indices`; `indices` is stored as given (order and duplicates kept).
+    ///
+    /// Panics if `size` is not a power of two or any index is `>= size`.
+    pub fn new(size: usize, indices: &[usize]) -> Self {
+        assert!(size.is_power_of_two(), "size must be a power of two");
+        assert!(
+            indices.iter().all(|&i| i < size),
+            "query index out of range"
+        );
+        let log_n = size.trailing_zeros() as usize;
+        let mut mask: Vec<Vec<bool>> = vec![vec![false; size]; log_n + 1];
+        // Seed the last stage with the queried output positions.
+        for &i in indices {
+            mask[log_n][i] = true;
+        }
+        // Propagate backwards: a butterfly with a flagged output needs both
+        // of its inputs from the previous stage.
+        for stage in (1..=log_n).rev() {
+            let m = 1usize << stage;
+            let half = m >> 1;
+            // Split so row `stage` can be read while row `stage - 1` is written.
+            let (lo, hi) = mask.split_at_mut(stage);
+            let cur = &hi[0];
+            let prev = &mut lo[stage - 1];
+            let mut base = 0;
+            while base < size {
+                for k in 0..half {
+                    let a = base + k;
+                    let b = a + half;
+                    if cur[a] || cur[b] {
+                        prev[a] = true;
+                        prev[b] = true;
+                    }
+                }
+                base += m;
+            }
+        }
+        Self {
+            size,
+            indices: indices.to_vec(),
+            mask,
+        }
+    }
+
+    /// Transform size this plan was built for.
+    pub fn size(&self) -> usize {
+        self.size
+    }
+
+    /// Queried output positions, in the order passed to [`PartialNttPlan::new`].
+    pub fn indices(&self) -> &[usize] {
+        &self.indices
+    }
 }
 
 /// Applies twiddle factors to a slice of field elements in-place.
@@ -963,4 +1121,93 @@ mod tests {
 
         assert_eq!(values_ntt, expected_values);
     }
+
+    #[test]
+    fn test_ntt_partial_matches_full() {
+        use ark_std::{rand::Rng, UniformRand};
+
+        let engine = NttEngine::<Field64>::new_from_fftfield();
+        let mut rng = ark_std::test_rng();
+
+        for &size in &[4usize, 16, 64, 256, 1024, 1 << 15] {
+            for _ in 0..8 {
+                // Reference: full NTT of the same random coefficients.
+                let coeffs: Vec<_> = (0..size).map(|_| Field64::rand(&mut rng)).collect();
+                let mut full = coeffs.clone();
+                engine.ntt_batch(&mut full, size);
+
+                // Pick k distinct indices via Fisher-Yates, with k in 1..=min(size, 64).
+                let k = rng.gen_range(1..=size.min(64));
+                let mut perm: Vec<usize> = (0..size).collect();
+                for i in (1..size).rev() {
+                    perm.swap(i, rng.gen_range(0..=i));
+                }
+                let indices: Vec<usize> = perm.into_iter().take(k).collect();
+
+                let partial = engine.ntt_partial(&coeffs, size, &indices);
+                assert_eq!(partial.len(), indices.len());
+                for (j, &idx) in indices.iter().enumerate() {
+                    assert_eq!(partial[j], full[idx], "size={size} idx={idx}");
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_ntt_partial_zero_padded_input() {
+        // Input shorter than the transform (m < size): the partial NTT must match a
+        // full NTT of the zero-padded coefficients at up to 8 evenly spaced outputs.
+        use ark_std::UniformRand;
+
+        let engine = NttEngine::<Field64>::new_from_fftfield();
+        let mut rng = ark_std::test_rng();
+
+        for (m, size) in [(1usize, 4), (4, 16), (256, 1024), (1 << 13, 1 << 15)] {
+            let coeffs: Vec<_> = (0..m).map(|_| Field64::rand(&mut rng)).collect();
+            let mut padded = coeffs.clone();
+            padded.resize(size, Field64::ZERO);
+            engine.ntt_batch(&mut padded, size);
+
+            let stride = (size / 8).max(1);
+            let indices: Vec<usize> = (0..size).step_by(stride).take(8).collect();
+            let partial = engine.ntt_partial(&coeffs, size, &indices);
+            for (j, &idx) in indices.iter().enumerate() {
+                assert_eq!(partial[j], padded[idx], "m={m} size={size} idx={idx}");
+            }
+        }
+    }
+
+    #[test]
+    fn test_ntt_partial_edge_cases() {
+        use ark_std::UniformRand;
+
+        let engine = NttEngine::<Field64>::new_from_fftfield();
+        let mut rng = ark_std::test_rng();
+
+        // Empty index set yields an empty output.
+        let coeffs: Vec<_> = (0..16).map(|_| Field64::rand(&mut rng)).collect();
+        let out = engine.ntt_partial(&coeffs, 16, &[]);
+        assert!(out.is_empty());
+
+        // Single queried index at the first, last and several interior positions.
+        let coeffs: Vec<_> = (0..64).map(|_| Field64::rand(&mut rng)).collect();
+        let mut full = coeffs.clone();
+        engine.ntt_batch(&mut full, 64);
+        for idx in [0usize, 1, 31, 32, 63] {
+            let out = engine.ntt_partial(&coeffs, 64, &[idx]);
+            assert_eq!(out, vec![full[idx]], "idx={idx}");
+        }
+
+        // Repeated indices: every occurrence must yield the matching output.
+        let indices = vec![5usize, 5, 17, 5, 17];
+        let out = engine.ntt_partial(&coeffs, 64, &indices);
+        for (j, &idx) in indices.iter().enumerate() {
+            assert_eq!(out[j], full[idx]);
+        }
+
+        // size = 1: every requested output equals values[0].
+        let single = vec![Field64::from(42)];
+        let out = engine.ntt_partial(&single, 1, &[0, 0, 0]);
+        assert_eq!(out, vec![Field64::from(42); 3]);
+    }
 }
@@ -21,9 +21,9 @@ use static_assertions::assert_obj_safe;
 #[cfg(feature = "tracing")]
 use tracing::instrument;
 
-use self::matrix::MatrixMut;
+use self::{cooley_tukey::NttEngine, matrix::MatrixMut};
 pub use self::{
-    cooley_tukey::{generator, intt, intt_batch, ntt, ntt_batch},
+    cooley_tukey::{generator, intt, intt_batch, ntt, ntt_batch, PartialNttPlan},
     transpose::transpose,
     wavelet::{inverse_wavelet_transform, wavelet_transform},
 };
@@ -93,6 +93,54 @@ pub fn interleaved_rs_encode<F: 'static>(
     engine.interleaved_encode(interleaved_coeffs, codeword_length, interleaving_depth)
 }
 
+/// Partial Reed-Solomon encode that materialises only the rows at `indices`.
+///
+/// Equivalent to taking [`interleaved_rs_encode`]'s output (a row-major
+/// `(codeword_length, num_polys * interleaving_depth)` matrix) and keeping
+/// row `indices[r]` as row `r` of the row-major result. Returns an empty vector
+/// when `coeffs` or `indices` is empty. `codeword_length` must be a power of two
+/// and every index must be below it (enforced by [`PartialNttPlan::new`]).
+///
+/// Uses an output-pruned NTT (see [`PartialNttPlan`]). Only the selected
+/// rows are materialised in the result, but the plan's masks and each
+/// column's scratch buffer still scale with `codeword_length`. The plan is
+/// built once and reused for every polynomial × interleaving slot.
+#[cfg_attr(feature = "tracing", instrument(level = "debug", skip(coeffs, indices), fields(size = coeffs.len(), k = indices.len())))]
+pub fn partial_interleaved_rs_encode<F: FftField>(
+    coeffs: &[&[F]],
+    codeword_length: usize,
+    interleaving_depth: usize,
+    indices: &[usize],
+) -> Vec<F> {
+    if coeffs.is_empty() || indices.is_empty() {
+        return Vec::new();
+    }
+    let poly_size = coeffs[0].len();
+    for poly in coeffs {
+        assert_eq!(poly.len(), poly_size);
+    }
+    assert!(poly_size.is_multiple_of(interleaving_depth));
+    let message_length = poly_size / interleaving_depth;
+    assert!(codeword_length.is_multiple_of(message_length));
+
+    let num_polys = coeffs.len();
+    let num_cols = num_polys * interleaving_depth;
+    let k = indices.len();
+
+    let engine = NttEngine::<F>::new_from_cache();
+    let plan = PartialNttPlan::new(codeword_length, indices);
+
+    let mut out = vec![F::ZERO; k * num_cols];
+    for (poly_idx, poly) in coeffs.iter().enumerate() {
+        for slot_idx in 0..interleaving_depth {
+            let col = poly_idx * interleaving_depth + slot_idx;
+            let block = &poly[slot_idx * message_length..(slot_idx + 1) * message_length];
+            engine.ntt_partial_with_plan_into(block, &plan, &mut out[col..], num_cols);
+        }
+    }
+    out
+}
+
 ///
 /// RS encode coefficients grouped in `interleaving_depth` contiguous blocks
 /// at the rate 1/`expansion`, then interleave the evaluations per point.
@@ -350,4 +398,58 @@ mod tests {
             interleaved_rs_encode(&[poly.as_slice()], codeword_length, 1 << folding_factor);
         assert_eq!(expected, interleaved_ntt);
     }
+
+    #[test]
+    fn test_partial_interleaved_rs_encode_matches_full() {
+        use ark_std::{rand::Rng, UniformRand};
+
+        let mut rng = ark_std::test_rng();
+
+        // Shapes are (num_polys, interleaving_depth, message_length,
+        // codeword_length). Every case uses codeword_length = 4 * message_length;
+        // the set covers depth 1, depth 8 and a two-polynomial depth-4 batch.
+        let cases = [
+            (1usize, 1usize, 64usize, 256usize),
+            (1, 8, 16, 64),
+            (2, 4, 32, 128),
+            (1, 8, 1 << 10, 1 << 12),
+        ];
+
+        for (num_polys, interleaving_depth, message_length, codeword_length) in cases {
+            let poly_size = message_length * interleaving_depth;
+            let polys: Vec<Vec<Field64>> = (0..num_polys)
+                .map(|_| (0..poly_size).map(|_| Field64::rand(&mut rng)).collect())
+                .collect();
+            let poly_slices: Vec<&[Field64]> = polys.iter().map(Vec::as_slice).collect();
+
+            let full = interleaved_rs_encode(&poly_slices, codeword_length, interleaving_depth);
+            let num_cols = num_polys * interleaving_depth;
+            assert_eq!(full.len(), codeword_length * num_cols);
+
+            // Pick k distinct rows via Fisher-Yates, with k in 1..=min(codeword_length, 16).
+            let k = rng.gen_range(1..=codeword_length.min(16));
+            let mut perm: Vec<usize> = (0..codeword_length).collect();
+            for i in (1..codeword_length).rev() {
+                perm.swap(i, rng.gen_range(0..=i));
+            }
+            let indices: Vec<usize> = perm.into_iter().take(k).collect();
+
+            let partial = partial_interleaved_rs_encode(
+                &poly_slices,
+                codeword_length,
+                interleaving_depth,
+                &indices,
+            );
+            assert_eq!(partial.len(), k * num_cols);
+
+            for (row, &idx) in indices.iter().enumerate() {
+                let full_row = &full[idx * num_cols..(idx + 1) * num_cols];
+                let partial_row = &partial[row * num_cols..(row + 1) * num_cols];
+                assert_eq!(
+                    partial_row, full_row,
+                    "shape=({num_polys},{interleaving_depth},{message_length},{codeword_length}) row idx={idx}"
+                );
+            }
+        }
+    }
 }