diff --git a/src/algorithms/bravo.rs b/src/algorithms/bravo.rs index 9989f96..1ceeead 100644 --- a/src/algorithms/bravo.rs +++ b/src/algorithms/bravo.rs @@ -10,7 +10,7 @@ /// The initial implementation was translated from mathematical notation in the paper /// to Rust by Claude 4.5 Opus. use fearless_simd::prelude::*; -use fearless_simd::{f32x4, f32x8, f64x2, f64x4, Simd}; +use fearless_simd::{f32x4, f32x8, f64x4, f64x8, Simd}; /// Macro to generate bit_rev_bravo implementations for concrete types. /// Used instead of generics because `fearless_simd` doesn't let us be generic over the exact float type. @@ -138,8 +138,8 @@ macro_rules! impl_bit_rev_bravo { // which is necessary for using the native vector width impl_bit_rev_bravo!(bit_rev_bravo_chunk_4_f32, f32, f32x4, 4); impl_bit_rev_bravo!(bit_rev_bravo_chunk_8_f32, f32, f32x8, 8); -impl_bit_rev_bravo!(bit_rev_bravo_chunk_2_f64, f64, f64x2, 2); impl_bit_rev_bravo!(bit_rev_bravo_chunk_4_f64, f64, f64x4, 4); +impl_bit_rev_bravo!(bit_rev_bravo_chunk_8_f64, f64, f64x8, 8); /// Performs in-place bit-reversal permutation using the CO-BRAVO algorithm. /// @@ -163,8 +163,10 @@ pub fn bit_rev_bravo_f32(simd: S, data: &mut [f32], n: usize) { #[inline(always)] // required by fearless_simd pub fn bit_rev_bravo_f64(simd: S, data: &mut [f64], n: usize) { match ::N { - 2 => bit_rev_bravo_chunk_2_f64(simd, data, n), // SSE, NEON and fallback - _ => bit_rev_bravo_chunk_4_f64(simd, data, n), + // despite exceeding the native vector width, it is profitable to use larger chunks + // according to benchmarks on both Zen4 and Apple M4 + 2 => bit_rev_bravo_chunk_4_f64(simd, data, n), // SSE, NEON and fallback + _ => bit_rev_bravo_chunk_8_f64(simd, data, n), // fearless_simd has no native support for AVX-512 yet } }