diff --git a/Cargo.lock b/Cargo.lock index ecf6dd6..d9809a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -165,7 +165,7 @@ checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" [[package]] name = "earshot" -version = "0.1.0" +version = "1.0.0" dependencies = [ "criterion", "libm", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] @@ -362,18 +362,28 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.209" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.209" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -394,9 +404,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.76" +version = "2.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index c637fe2..33a4320 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,19 +1,18 @@ [package] name = "earshot" -version = "0.1.0" +version = "1.0.0" description = "Ridiculously fast & accurate voice activity detection in pure Rust" repository = "https://github.com/pykeio/earshot" authors = [ "Carson M " ] -license = "MIT" -edition = "2021" -exclude = ["tests/data", ".github"] +license = "MIT OR Apache-2.0" +edition = "2024" +exclude = ["tests/data", "benches/", "examples/", ".github"] +rust-version = "1.87" [features] -default = [ "std", "embed-weights" ] +default = [ "std" ] # Currently just impls `std::error::Error` for the `Error` type. std = [] -# Embed the default model weights in the binary. Enables `Default` for `QuantizedPredictor`. -embed-weights = [] [dependencies] libm = "0.2" diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LICENSE b/LICENSE-MIT similarity index 97% rename from LICENSE rename to LICENSE-MIT index 91c7f17..74f0f32 100644 --- a/LICENSE +++ b/LICENSE-MIT @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 pyke.io +Copyright (c) 2025-2026 pyke.io Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 19207a6..bfd844d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,41 @@ # Earshot Ridiculously fast & accurate voice activity detection in pure Rust. -Achieves an RTF of 0.0014; 10x faster than Silero/TEN VAD. +Achieves an RTF of 0.0007 (1,270x real time): **20x faster** than Silero VAD v6 & TEN VAD - and more accurate, too! -## Performance -Compiling with `RUSTFLAGS="-C target-cpu=native"` in release mode is highly recommended as it can cut processing time in half. +> If you find Earshot useful, please consider [sponsoring pyke.io](https://opencollective.com/pyke-osai). + + + +## Usage + +```rs +use earshot::Detector; + +// Create a new VAD detector using the default NN. +let mut detector = Detector::default(); + +let mut frame_receiver = ... +while let Some(frame) = frame_receiver.recv() { + // `frame` is Vec with length 256. Each frame passed to the detector must be exactly 256 samples. + // f32 [-1, 1] frames are also supported with `predict_f32`. + let score = detector.predict_i16(&frame); + // Score is between 0-1; 0 = no voice, 1 = voice. + if score >= 0.5 { // 0.5 is a good default threshold, but can be customized. + println!("Voice detected!"); + } +} +``` + +## Binary & memory size +Earshot is very embedded-friendly: each instance of `Detector` uses ~8 KiB of memory to store the audio buffer & neural network state. Binary footprint is ~100 KiB; the neural network is 75 KiB of that. + +In contrast, Silero's model is 2 MiB, TEN's is 310 KiB, but both require ONNX Runtime, which adds an additional 8 MB to your binary (+ a whole lot more memory). + +## `#![no_std]` +Earshot supports `#![no_std]`, but it does require an allocator. The `std` feature is enabled by default, so add `default-features = false` to enable `#![no_std]`: + +```toml +[dependencies] +earshot = { version = "1", default-features = false } +``` diff --git a/benches/vad.rs b/benches/vad.rs index df396cf..27f2299 100644 --- a/benches/vad.rs +++ b/benches/vad.rs @@ -1,10 +1,10 @@ use std::hint::black_box; use criterion::{Criterion, criterion_group, criterion_main}; -use earshot::{Detector, QuantizedPredictor}; +use earshot::Detector; fn bench_vad(c: &mut Criterion) { - let mut vad = Detector::::default(); + let mut vad = Detector::default(); c.bench_function("Single frame - f32", |b| { let frame = (0..256 as i16).map(|i| i.wrapping_mul(i) as f32).collect::>(); b.iter(|| { diff --git a/examples/extract-voice.rs b/examples/extract-voice.rs index 4f5ff03..c110d71 100644 --- a/examples/extract-voice.rs +++ b/examples/extract-voice.rs @@ -1,11 +1,10 @@ -use core::{mem, ptr, slice}; use std::{ env::args, fs::{self, File}, io::Write }; -use earshot::{Detector, QuantizedPredictor}; +use earshot::Detector; fn main() { let mut args = args().skip(1); @@ -18,7 +17,7 @@ fn main() { return; }; - let mut detector = Detector::::default(); + let mut detector = Detector::default(); let mut out = File::create(output).unwrap(); @@ -26,15 +25,13 @@ fn main() { for x in wav[44..].chunks_exact(512) { let mut samples = vec![0; 256]; for i in 0..256 { - samples[i] = i16::from_le_bytes([x[(i * 2)], x[(i * 2) + 1]]); + samples[i] = i16::from_le_bytes([x[i * 2], x[(i * 2) + 1]]); } let score = detector.predict_i16(&samples); if score >= 0.5 { - println!("voice"); out.write_all(&x).unwrap(); } else { - println!("silence {score}"); } } diff --git a/rustfmt.toml b/rustfmt.toml index 8d861c5..e7e25d8 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,4 +1,4 @@ -edition = "2021" +edition = "2024" style_edition = "2024" unstable_features = true diff --git a/src/default_predictor.rs b/src/default_predictor.rs new file mode 100644 index 0000000..3fcc42a --- /dev/null +++ b/src/default_predictor.rs @@ -0,0 +1,194 @@ +const _WEIGHTS_LEN: usize = include_bytes!("weights.bin").len(); +static WEIGHTS: &[u8; _WEIGHTS_LEN] = { + #[repr(C, align(4))] + struct AlignedData(T); + + const __DATA: &'static AlignedData<[u8; _WEIGHTS_LEN]> = &AlignedData(*include_bytes!("weights.bin")); + &__DATA.0 +}; + +const fn weight<'a, const SIZE: usize>(offset: usize) -> &'a [f32; SIZE] { + unsafe { &*(WEIGHTS.as_ptr().cast::().add(offset) as *const [_; SIZE]) } +} + +static NORM_WEIGHT: &[f32; 40] = weight(0); +static LAYER1_KERNEL: &[f32; 9] = weight(40); +static LAYER1_WEIGHT: &[f32; 16] = weight(49); +static LAYER1_BIAS: &[f32; 16] = weight(65); +static LAYER2_KERNEL: &[f32; 48] = weight(81); +static LAYER2_WEIGHT: &[f32; 256] = weight(129); +static LAYER2_BIAS: &[f32; 16] = weight(385); +static LAYER3_KERNEL: &[f32; 48] = weight(401); +static LAYER3_WEIGHT: &[f32; 256] = weight(449); +static LAYER3_BIAS: &[f32; 16] = weight(705); +static RNN1_WEIGHT: &[f32; 10240] = weight(721); +static RNN2_WEIGHT: &[f32; 8192] = weight(10961); +static OUTPUT_WEIGHT: &[f32; 128] = weight(19153); + +pub struct DefaultPredictor { + state: Vec +} + +impl DefaultPredictor { + pub fn new() -> Self { + Self { state: vec![0.0; 128] } + } +} + +impl crate::Predictor for DefaultPredictor { + fn reset(&mut self) { + self.state.fill(0.0); + } + + fn normalize(&self, features: &mut [f32]) { + let i_rms = 1. / (features.iter().map(|x| x * x).sum::() / features.len() as f32).sqrt(); + for (i, v) in features.iter_mut().enumerate() { + *v = NORM_WEIGHT[i] * *v * i_rms; + } + } + + fn predict(&mut self, features: &[f32], buffer: &mut [f32]) -> f32 { + let (buffer1, buffer2) = buffer.split_at_mut(288); + input_layer1(&features, buffer1); + input_layer2_3::<18, 9, false>(&buffer1[..288], LAYER2_KERNEL, LAYER2_WEIGHT, LAYER2_BIAS, &mut buffer2[..144]); + input_layer2_3::<9, 5, true>(&buffer2[..144], LAYER3_KERNEL, LAYER3_WEIGHT, LAYER3_BIAS, &mut buffer1[..80]); + mingru::<80>(&buffer1[..80], &self.state[..64], RNN1_WEIGHT, &mut buffer2[..128]); + self.state[..64].copy_from_slice(&buffer2[..64]); + mingru::<64>(&buffer2[..64], &self.state[64..128], RNN2_WEIGHT, &mut buffer1[..128]); + self.state[64..128].copy_from_slice(&buffer1[..64]); + output(&buffer2[..64], &buffer1[..64]) + } +} + +#[inline(never)] +fn input_layer1(features: &[f32], output: &mut [f32]) { + const NUM_FRAMES: usize = 3; + const NUM_FEATURES: usize = 40; + + const KERNEL_SIZE: usize = 3; + const { + assert!((NUM_FRAMES - KERNEL_SIZE) / 1 + 1 == 1); + }; + const DEPTHWISE_NUM_FEATURES: usize = (NUM_FEATURES - KERNEL_SIZE) / 1 + 1; + const OUT_CHANNELS: usize = 16; + + const POOL_KERNEL_SIZE: usize = 3; + const POOL_STRIDE: usize = 2; + const POOLED_COLS: usize = (DEPTHWISE_NUM_FEATURES - POOL_KERNEL_SIZE) / POOL_STRIDE + 1; + + output.fill(0.0); + + let mut row = [0.0_f32; DEPTHWISE_NUM_FEATURES]; + for ox in 0..DEPTHWISE_NUM_FEATURES { + // depthwise conv + let mut sum = 0.0; + for kh in 0..KERNEL_SIZE { + for kw in 0..KERNEL_SIZE { + let w = ox + kw; + let input_idx = (kh * NUM_FEATURES) + w; + sum += features[input_idx] * LAYER1_KERNEL[(kh * KERNEL_SIZE) + kw]; + } + } + + row[ox] = sum; + } + + for c in 0..OUT_CHANNELS { + let mut new_row = [0.0; DEPTHWISE_NUM_FEATURES]; + for ox in 0..DEPTHWISE_NUM_FEATURES { + // pointwise conv + new_row[ox] = (row[ox] * LAYER1_WEIGHT[c]) + LAYER1_BIAS[c]; + } + + // max pool over row + let out_row_offs = POOLED_COLS * c; + for q in 0..POOLED_COLS { + for x in 0..POOL_KERNEL_SIZE { + let out_q = &mut output[out_row_offs + q]; + // `out` is zeroed, so this also acts as ReLU + *out_q = (*out_q).max(new_row[(q * POOL_STRIDE) + x]); + } + } + } +} + +#[inline(never)] +fn input_layer2_3( + features: &[f32], + kernel: &[f32; 48], + weight: &[f32; 256], + bias: &[f32; 16], + output: &mut [f32] +) { + const HORIZONTAL_KERNEL_SIZE: usize = 3; + const STRIDE: usize = 2; + const CHANNELS: usize = 16; + + output.fill(0.0); + + for ox in 0..OUT_FEATURES { + let mut dw = [0.0; CHANNELS]; + for c in 0..CHANNELS { + // depthwise conv + let mut sum = 0.0; + for kw in 0..HORIZONTAL_KERNEL_SIZE { + let ix = (ox * STRIDE + kw) as isize - 1; + if ix < 0 || ix >= IN_FEATURES as isize { + continue; + } + sum += features[(c * IN_FEATURES) + ix as usize] * kernel[(c * HORIZONTAL_KERNEL_SIZE) + kw]; + } + + dw[c] = sum; + } + + // pointwise conv + for oc in 0..CHANNELS { + let mut ic = 0.0; + for c in 0..CHANNELS { + let sum = dw[c]; + ic += sum * weight[(oc * CHANNELS) + c]; + } + + let ptr = if !LAYER3 { &mut output[(oc * OUT_FEATURES) + ox] } else { &mut output[(ox * CHANNELS) + oc] }; + *ptr = (ic + bias[oc]).max(0.0); + } + } +} + +#[inline(never)] +fn mingru(features: &[f32], h: &[f32], weight: &[f32], out: &mut [f32]) { + for d in 0..128 { + let mut o = 0.0; + let ri = d * IN_DIM; + + for f in 0..IN_DIM { + o += features[f] * weight[ri + f]; + } + + out[d] = o; + } + + for i in 0..64 { + let g = (out[64 + i] * 0.25).clamp(0.0, 1.0); + let v = &mut out[i]; + *v = (1. - g) * h[i] + g * *v; + } +} + +#[inline] +fn sigmoid(x: f32) -> f32 { + 1. / (1. + (-x).exp()) +} + +#[inline(never)] +fn output(out_1: &[f32], out_2: &[f32]) -> f32 { + let mut out = 0.0; + for f in 0..64 { + out += out_1[f] * OUTPUT_WEIGHT[f]; + } + for f in 0..64 { + out += out_2[f] * OUTPUT_WEIGHT[64 + f]; + } + sigmoid(out) +} diff --git a/src/lib.rs b/src/lib.rs index d1799e5..8538da8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,59 +5,39 @@ extern crate alloc; use alloc::{boxed::Box, vec}; use core::{f32, ptr}; +mod default_predictor; mod fft; -mod quantized_predictor; mod util; -#[cfg(feature = "embed-weights")] -pub use self::quantized_predictor::default_weights as default_quantized_weights; -pub use self::quantized_predictor::{PackedWeights, QuantizedPredictor}; +pub use self::default_predictor::DefaultPredictor; use self::util::OnceLock; +/// Used by [`Detector`] to predict the VAD score of a frame based on extracted features. +/// +/// # Stability +/// If you wish to implement `Predictor` yourself, note that **the API is unstable and subject to change!** pub trait Predictor { + #[doc(hidden)] fn reset(&mut self); + + #[doc(hidden)] + fn normalize(&self, features: &mut [f32]); + + #[doc(hidden)] fn predict(&mut self, features: &[f32], buffer: &mut [f32]) -> f32; } const FFT_SIZE: usize = 1024; const WINDOW_SIZE: usize = 768; const N_MELS: usize = 40; -const N_FEATURES: usize = N_MELS + 1; +const N_FEATURES: usize = N_MELS; const N_CONTEXT_FRAMES: usize = 3; const N_BINS: usize = FFT_SIZE / 2 + 1; const PRE_EMPHASIS_COEFF: f32 = 0.97; const POWER_FAC: f32 = 1. / (32768.0f32 * 32768.0); -#[rustfmt::skip] -const FEATURE_MEANS: [f32; 40] = [ - -8.198236465454, -6.265716552734, -5.483818531036, -4.758691310883, - -4.417088985443, -4.142892837524, -3.912850379944, -3.845927953720, - -3.657090425491, -3.723418712616, -3.876134157181, -3.843890905380, - -3.690405130386, -3.756065845490, -3.698696136475, -3.650463104248, - -3.700468778610, -3.567321300507, -3.498900175095, -3.477807044983, - -3.458816051483, -3.444923877716, -3.401328563690, -3.306261301041, - -3.278556823730, -3.233250856400, -3.198616027832, -3.204526424408, - -3.208798646927, -3.257838010788, -3.381376743317, -3.534021377563, - -3.640867948532, -3.726858854294, -3.773730993271, -3.804667234421, - -3.832901000977, -3.871120452881, -3.990592956543, -4.480289459229 -]; - -#[rustfmt::skip] -const FEATURE_STDS: [f32; 40] = [ - 5.166063785553, 4.977209568024, 4.698895931244, 4.630621433258, - 4.634347915649, 4.641156196594, 4.640676498413, 4.666367053986, - 4.650534629822, 4.640020847321, 4.637400150299, 4.620099067688, - 4.596316337585, 4.562654972076, 4.554360389709, 4.566910743713, - 4.562489986420, 4.562412738800, 4.585299491882, 4.600179672241, - 4.592845916748, 4.585922718048, 4.583496570587, 4.626092910767, - 4.626957893372, 4.626289367676, 4.637005805969, 4.683015823364, - 4.726813793182, 4.734289646149, 4.753227233887, 4.849722862244, - 4.869434833527, 4.884482860565, 4.921327114105, 4.959212303162, - 4.996619224548, 5.044823646545, 5.072216987610, 5.096439361572 -]; - struct Filters { - mel_coeffs: Box<[f32]>, + mel_coeffs: Box<[(usize, Box<[f32]>)]>, window: Box<[f32]> } @@ -73,15 +53,20 @@ impl Filters { bin_points[i] = ((FFT_SIZE as f32 + 1.) * hz / 16000.) as usize; } - let mut mel_coeffs = vec![0.0; N_MELS * N_BINS].into_boxed_slice(); + let mut mel_coeffs = Vec::with_capacity(N_MELS); for i in 0..N_MELS { + let mut points = Vec::with_capacity(bin_points[i + 2] - bin_points[i]); for j in bin_points[i]..bin_points[i + 1] { - mel_coeffs[(i * N_BINS) + j] = (j - bin_points[i]) as f32 / (bin_points[i + 1] - bin_points[i]) as f32; + points.push((j - bin_points[i]) as f32 / (bin_points[i + 1] - bin_points[i]) as f32); } for j in bin_points[i + 1]..bin_points[i + 2] { - mel_coeffs[(i * N_BINS) + j] = (bin_points[i + 2] - j) as f32 / (bin_points[i + 2] - bin_points[i + 1]) as f32; + points.push((bin_points[i + 2] - j) as f32 / (bin_points[i + 2] - bin_points[i + 1]) as f32); } + + // Mel filterbank is naturally very sparse. Rather than waste compute & storage on the whole matrix, only store + // non-zero elements. + mel_coeffs.push((bin_points[i], points.into_boxed_slice())); } // hann window @@ -92,13 +77,16 @@ impl Filters { window[i] = x * x; } - Self { mel_coeffs, window } + Self { + mel_coeffs: mel_coeffs.into_boxed_slice(), + window + } } } static FILTERS: OnceLock = OnceLock::new(); -pub struct Detector

{ +pub struct Detector

{ predictor: P, prev_signal: f32, sample_ring_buffer: Box<[f32]>, @@ -106,9 +94,9 @@ pub struct Detector

{ buffer: Box<[f32]> } -impl Default for Detector

{ +impl Default for Detector { fn default() -> Self { - Self::new(P::default()) + Self::new(DefaultPredictor::new()) } } @@ -223,13 +211,14 @@ impl Detector

{ let cur_frame_features = &mut self.features[(N_FEATURES * (N_CONTEXT_FRAMES - 1))..]; for i in 0..N_MELS { let mut per_band_value = 0.; - for j in 0..N_BINS { - per_band_value += self.buffer[j] * filters.mel_coeffs[(i * N_BINS) + j]; + let (start, ref coeffs) = filters.mel_coeffs[i]; + for (offs, coeff) in coeffs.iter().enumerate() { + per_band_value += self.buffer[start + offs] * *coeff; } - per_band_value = libm::logf(per_band_value + 1e-20); - cur_frame_features[i] = (per_band_value - FEATURE_MEANS[i]) / FEATURE_STDS[i]; + cur_frame_features[i] = libm::logf(per_band_value + 1e-20); } + self.predictor.normalize(cur_frame_features); self.predictor.predict(&self.features, &mut self.buffer) } diff --git a/src/quantized-model.bin b/src/quantized-model.bin deleted file mode 100644 index b6b77fe..0000000 Binary files a/src/quantized-model.bin and /dev/null differ diff --git a/src/quantized_predictor.rs b/src/quantized_predictor.rs deleted file mode 100644 index 010d69b..0000000 --- a/src/quantized_predictor.rs +++ /dev/null @@ -1,489 +0,0 @@ -use alloc::{boxed::Box, vec}; -use core::{mem, slice}; - -use super::{Predictor, util::OnceLock}; - -struct BitBufferReader<'d> { - pub buf: &'d [u8], - idx: usize, - bit_buffer: u32, - n_bits: u32 -} - -impl<'d> BitBufferReader<'d> { - pub fn new(buffer: &'d [u8]) -> Self { - Self { - buf: buffer, - idx: 0, - bit_buffer: 0, - n_bits: 0 - } - } - - pub fn read(&mut self, len: u32) -> i32 { - while self.n_bits < len { - let byte = self.buf[self.idx]; - self.idx += 1; - - self.bit_buffer |= (byte as u32) << self.n_bits; - self.n_bits += 8; - } - - let bits = self.bit_buffer & ((1 << len) - 1); - self.bit_buffer >>= len; - self.n_bits -= len; - - let sign = (bits & (1 << (len - 1))) != 0; - (if sign { -1 << len } else { 0 }) | bits as i32 - } - - pub fn read_array(&mut self, bit_len: u32, cnt: usize) -> Box<[T]> { - (0..cnt).map(|_| T::from_i32(self.read(bit_len))).collect() - } -} - -trait FromI32 { - fn from_i32(x: i32) -> Self; -} - -impl FromI32 for i8 { - fn from_i32(x: i32) -> Self { - x as i8 - } -} -impl FromI32 for i16 { - fn from_i32(x: i32) -> Self { - x as i16 - } -} -impl FromI32 for i32 { - fn from_i32(x: i32) -> Self { - x - } -} - -pub struct PackedWeights { - layer1_kernel: Box<[i16]>, - layer1_weight: Box<[i16]>, - layer1_bias: Box<[i16]>, - layer2_kernel: Box<[i16]>, - layer2_weight: Box<[i16]>, - layer2_bias: Box<[i16]>, - layer3_kernel: Box<[i16]>, - layer3_weight: Box<[i16]>, - layer3_bias: Box<[i16]>, - lstm1_ih: Box<[i16]>, - lstm1_hh: Box<[i16]>, - lstm1_bias: Box<[i16]>, - lstm2_ih: Box<[i16]>, - lstm2_hh: Box<[i16]>, - lstm2_bias: Box<[i16]>, - out1_weight: Box<[i16]>, - out1_bias: Box<[i16]>, - out2_weight: Box<[i16]>, - out2_bias: i8 -} - -impl PackedWeights { - pub fn new(bytes: &[u8]) -> Self { - assert_eq!(bytes.len(), 135783, "invalid length for packed QuantizedPredictor weights"); - let mut reader = BitBufferReader::new(bytes); - Self { - layer1_kernel: reader.read_array(14, 9), - layer1_weight: reader.read_array(14, 16), - layer1_bias: reader.read_array(12, 16), - layer2_kernel: reader.read_array(15, 48), - layer2_weight: reader.read_array(16, 256), - layer2_bias: reader.read_array(14, 16), - layer3_kernel: reader.read_array(14, 48), - layer3_weight: reader.read_array(15, 256), - layer3_bias: reader.read_array(12, 16), - lstm1_ih: reader.read_array(15, 20480), - lstm1_hh: reader.read_array(14, 16384), - lstm1_bias: reader.read_array(12, 256), - lstm2_ih: reader.read_array(15, 16384), - lstm2_hh: reader.read_array(14, 16384), - lstm2_bias: reader.read_array(12, 256), - out1_weight: reader.read_array(14, 4096), - out1_bias: reader.read_array(11, 32), - out2_weight: reader.read_array(13, 32), - out2_bias: reader.read(4) as i8 - } - } -} - -#[cfg(feature = "embed-weights")] -static DEFAULT_WEIGHT_BYTES: &[u8] = include_bytes!("quantized-model.bin"); -#[cfg(feature = "embed-weights")] -static DEFAULT_WEIGHTS: OnceLock = OnceLock::new(); - -#[cfg(feature = "embed-weights")] -pub fn default_weights() -> &'static PackedWeights { - DEFAULT_WEIGHTS.get_or_init(|| PackedWeights::new(DEFAULT_WEIGHT_BYTES)) -} - -pub struct ActivationTables { - sigmoid: Box<[i32]>, - tanh: Box<[i32]> -} - -impl ActivationTables { - const Q11_SCALE: i32 = 2048; // 2 ** 11 - const Q11_SCALE_FLOAT: f32 = 2048.; - const SIGMOID_MAX: i32 = Self::Q11_SCALE * 6; // sigmoid goes asymptotic < -6 or > 6, so limit computation to between these values - const TANH_MAX: i32 = Self::Q11_SCALE * 4; // ^ 4 for tanh - pub const OUT_SCALE: f32 = 65536.; // 2 ** 16, outputs in Q16 - - pub fn new() -> Self { - let sigmoid_len = Self::SIGMOID_MAX * 2 + 1; - let mut sigmoid_table = vec![0; sigmoid_len as usize].into_boxed_slice(); - for i in 0..sigmoid_len { - let v = Self::_real_sigmoid((i - (Self::SIGMOID_MAX)) as f32 / Self::Q11_SCALE_FLOAT); - sigmoid_table[i as usize] = libm::roundevenf(v * Self::OUT_SCALE) as i32; - } - let tanh_len = Self::TANH_MAX * 2 + 1; - let mut tanh_table = vec![0; tanh_len as usize].into_boxed_slice(); - for i in 0..tanh_len { - let v = libm::tanhf((i - (Self::TANH_MAX)) as f32 / Self::Q11_SCALE_FLOAT); - tanh_table[i as usize] = libm::roundevenf(v * Self::OUT_SCALE) as i32; - } - - Self { - sigmoid: sigmoid_table, - tanh: tanh_table - } - } - - #[inline] - fn _real_sigmoid(x: f32) -> f32 { - 1. / (1. + libm::expf(-x)) - } - - #[inline] - pub fn sigmoid(&self, x: i32) -> i32 { - unsafe { - *self - .sigmoid - .get_unchecked((x + Self::SIGMOID_MAX).clamp(0, Self::SIGMOID_MAX * 2) as usize) - } - } - #[inline] - pub fn tanh(&self, x: i32) -> i32 { - unsafe { *self.tanh.get_unchecked((x + Self::TANH_MAX).clamp(0, Self::TANH_MAX * 2) as usize) } - } -} - -static ACTIVATION_TABLES: OnceLock = OnceLock::new(); - -pub struct QuantizedPredictor<'w> { - weights: &'w PackedWeights, - state: Box<[i32]> -} - -impl<'w> QuantizedPredictor<'w> { - pub fn new(weights: &'w PackedWeights) -> Self { - Self { - weights, - state: vec![0; 256].into_boxed_slice() - } - } -} - -#[cfg(feature = "embed-weights")] -impl Default for QuantizedPredictor<'static> { - fn default() -> Self { - Self::new(default_weights()) - } -} - -impl Predictor for QuantizedPredictor<'_> { - fn reset(&mut self) { - self.state.fill(0); - } - - fn predict(&mut self, features: &[f32], buffer: &mut [f32]) -> f32 { - assert_eq!(features.len(), 41 * 3); - assert!(buffer.len() > 464); - - let buffer = unsafe { mem::transmute::<&mut [f32], &mut [i32]>(buffer) }; - - let buffer_ptr = buffer.as_mut_ptr(); - input_layer1(features, &self.weights.layer1_kernel, &self.weights.layer1_weight, &self.weights.layer1_bias, &mut buffer[..304]); - input_layer2(&buffer[..304], &self.weights.layer2_kernel, &self.weights.layer2_weight, &self.weights.layer2_bias, unsafe { - slice::from_raw_parts_mut(buffer_ptr.add(304), 160) - }); - input_layer3(&buffer[304..], &self.weights.layer3_kernel, &self.weights.layer3_weight, &self.weights.layer3_bias, unsafe { - slice::from_raw_parts_mut(buffer_ptr, 80) - }); - lstm::<80, { 80 * 256 }>( - &buffer[..80], - &self.state[..64], - &self.state[64..128], - &self.weights.lstm1_ih, - &self.weights.lstm1_hh, - &self.weights.lstm1_bias, - unsafe { slice::from_raw_parts_mut(buffer_ptr.add(80), 256) } - ); - self.state[..128].copy_from_slice(&buffer[80..208]); - lstm::<64, { 64 * 256 }>( - &self.state[..128], - &self.state[128..192], - &self.state[192..], - &self.weights.lstm2_ih, - &self.weights.lstm2_hh, - &self.weights.lstm2_bias, - &mut buffer[..256] - ); - self.state[128..].copy_from_slice(&buffer[..128]); - output( - &self.state[..128], - &self.state[128..], - &self.weights.out1_weight, - &self.weights.out1_bias, - &self.weights.out2_weight, - self.weights.out2_bias - ) - } -} - -#[inline(never)] -fn input_layer1(features: &[f32], kernel: &[i16], weight: &[i16], bias: &[i16], output: &mut [i32]) { - const NUM_FRAMES: usize = 3; - const NUM_FEATURES: usize = 41; - const FEATURES_INPUT: usize = const { NUM_FRAMES * NUM_FEATURES }; - - const KERNEL_SIZE: usize = 3; - const { - assert!((NUM_FRAMES - KERNEL_SIZE) / 1 + 1 == 1); - }; - const DEPTHWISE_NUM_FEATURES: usize = (NUM_FEATURES - KERNEL_SIZE) / 1 + 1; - const OUT_CHANNELS: usize = 16; - - const POOL_KERNEL_SIZE: usize = 3; - const POOL_STRIDE: usize = 2; - const SCALE_FACTOR: f32 = (1 << 16) as f32; - const POOLED_COLS: usize = (DEPTHWISE_NUM_FEATURES - POOL_KERNEL_SIZE) / POOL_STRIDE + 1; - - output.fill(0); - - assert_eq!(features.len(), FEATURES_INPUT); - - let mut tmp = [0i32; FEATURES_INPUT]; - // doing this conversion in the convolution loop kills performance - for i in 0..FEATURES_INPUT { - unsafe { - // convert to Q16 - *tmp.get_unchecked_mut(i) = libm::floorf(*features.get_unchecked(i) * SCALE_FACTOR) as i32; - }; - } - - let mut row = [0; DEPTHWISE_NUM_FEATURES]; - for c in 0..OUT_CHANNELS { - for ox in 0..DEPTHWISE_NUM_FEATURES { - // depthwise conv - let mut sum = 0; - for kh in 0..KERNEL_SIZE { - for kw in 0..KERNEL_SIZE { - let w = ox + kw; - let input_idx = (kh * NUM_FEATURES) + w; - unsafe { - // Q16 * Q13 = Q29 - sum += *tmp.get_unchecked(input_idx) as i64 * *kernel.get_unchecked((kh * KERNEL_SIZE) + kw) as i64; - } - } - } - - // pointwise conv - unsafe { - // Q29 * Q13 = Q42. bias is Q12 so shift left by 42-12=30 - let x = (sum * *weight.get_unchecked(c) as i64) + ((*bias.get_unchecked(c) as i64) << 30); - // shift down to Q16 - *row.get_unchecked_mut(ox) = (x >> 26) as i32; - } - } - - // max pool over row - let out_row_offs = POOLED_COLS * c; - for q in 0..POOLED_COLS { - for x in 0..POOL_KERNEL_SIZE { - let out_q = unsafe { output.as_mut_ptr().add(out_row_offs + q) }; - // `output` is initially zeroed, so this also acts as ReLU - unsafe { *out_q = (*out_q).max(*row.get_unchecked((q * POOL_STRIDE) + x)) }; - } - } - } -} - -#[inline(never)] -fn input_layer2(features: &[i32], kernel: &[i16], weight: &[i16], bias: &[i16], output: &mut [i32]) { - const HORIZONTAL_KERNEL_SIZE: usize = 3; - const STRIDE: usize = 2; - const CHANNELS: usize = 16; - - const IN_FEATURES: usize = 19; - const OUT_FEATURES: usize = 10; - - output.fill(0); - - for ox in 0..OUT_FEATURES { - let mut row = [0; CHANNELS]; - for c in 0..CHANNELS { - // depthwise conv - let mut sum = 0; - for kw in 0..HORIZONTAL_KERNEL_SIZE { - let ix = (ox * STRIDE + kw) as isize - 1; - if ix < 0 || ix >= IN_FEATURES as isize { - continue; - } - - // Q16 * Q13 = Q29 - unsafe { - sum += *features.get_unchecked((c * IN_FEATURES) + ix as usize) as i64 * *kernel.get_unchecked((c * HORIZONTAL_KERNEL_SIZE) + kw) as i64; - } - } - - // pointwise conv - for oc in 0..CHANNELS { - unsafe { - // Q29 * Q13 = Q42 - let r = sum * *weight.get_unchecked((oc * CHANNELS) + c) as i64; - *row.get_unchecked_mut(oc) += r; - } - } - } - - // apply pointwise conv bias + relu - for oc in 0..CHANNELS { - unsafe { - // bias is Q12 so shift left by 42-12=30 - let br = *row.get_unchecked(oc) + ((*bias.get_unchecked(oc) as i64) << 30); - // shift down to Q16 - *output.get_unchecked_mut((oc * OUT_FEATURES) + ox) = ((br >> 26) as i32).max(0); - } - } - } -} - -#[inline(never)] -fn input_layer3(features: &[i32], kernel: &[i16], weight: &[i16], bias: &[i16], output: &mut [i32]) { - const HORIZONTAL_KERNEL_SIZE: usize = 3; - const STRIDE: usize = 2; - const CHANNELS: usize = 16; - - const IN_FEATURES: usize = 10; - const OUT_FEATURES: usize = 5; - - output.fill(0); - - for ox in 0..OUT_FEATURES { - let mut row = [0; CHANNELS]; - for c in 0..CHANNELS { - // depthwise conv - let mut sum = 0i64; - for kw in 0..HORIZONTAL_KERNEL_SIZE { - let ix = ox * STRIDE + kw; // layer 3 does not use left padding - if ix >= IN_FEATURES { - continue; - } - unsafe { - // Q16 * Q13 = Q29 - sum += *features.get_unchecked((c * IN_FEATURES) + ix as usize) as i64 * *kernel.get_unchecked((c * HORIZONTAL_KERNEL_SIZE) + kw) as i64; - } - } - - // pointwise conv - for oc in 0..CHANNELS { - unsafe { - // Q29 * Q13 = Q42 - *row.get_unchecked_mut(oc) += sum * *weight.get_unchecked((oc * CHANNELS) + c) as i64; - } - } - } - - // apply pointwise conv bias + relu - for oc in 0..CHANNELS { - unsafe { - // bias is Q12 so shift left by 42-12=30 - let r = *row.get_unchecked_mut(oc) + ((*bias.get_unchecked(oc) as i64) << 30); - let ptr = output.get_unchecked_mut((ox * CHANNELS) + oc); - // shift down to Q16 - *ptr = (r >> 26).max(0) as i32; - } - } - } -} - -#[inline(never)] -fn lstm(features: &[i32], h: &[i32], c: &[i32], weight_ih: &[i16], weight_hh: &[i16], bias: &[i16], out: &mut [i32]) { - for d in 0..256 { - // init with Q10 bias, shifted left by 18 to get 10+18=Q28 - let mut o = (unsafe { *bias.get_unchecked(d) } as i64) << 18; - let (ri, rh) = (d * IN_DIM, d * 64); - - for f in 0..IN_DIM { - unsafe { - // Q16 * Q12 = Q28 - o += *features.get_unchecked(f) as i64 * *weight_ih.get_unchecked(ri + f) as i64; - }; - } - - for g in 0..64 { - unsafe { - // Q16 * Q12 = Q28 - o += *h.get_unchecked(g) as i64 * *weight_hh.get_unchecked(rh + g) as i64; - } - } - - unsafe { - // shift down from Q28 to Q11 - *out.get_unchecked_mut(d) = (o >> 17) as i32; - }; - } - - let act = ACTIVATION_TABLES.get_or_init(ActivationTables::new); - for i in 0..64 { - unsafe { - // layout is [input, output, forget, cell] - let ix = act.sigmoid(*out.get_unchecked(i)) as i64; - let fx = act.sigmoid(*out.get_unchecked(128 + i)) as i64; - let cx = act.tanh(*out.get_unchecked(192 + i)) as i64; - // Q16 * Q16 = Q32; Q16 * Q16 = Q32 - let x = (fx * *c.get_unchecked(i) as i64) + (ix * cx); - let xt = act.tanh((x >> 21) as i32) as i64; // Q11 in, Q16 out - // arrange outputs as [hidden, cell] - let o = act.sigmoid(mem::replace(out.get_unchecked_mut(64 + i), (x >> 16) as i32)) as i64; - // Q16 * Q16 = Q32, shift down to Q16 - *out.get_unchecked_mut(i) = ((o * xt) >> 16) as i32; - }; - } -} - -#[inline(never)] -fn output(out_1: &[i32], out_2: &[i32], weight_1: &[i16], bias_1: &[i16], weight_2: &[i16], bias_2: i8) -> f32 { - let mut temp = [0; 32]; - for h in 0..64 { - for f in 0..32 { - unsafe { - // Q16 * Q12 = Q28, shift down to Q19 - let mut o = *out_2.get_unchecked(h) as i64 * *weight_1.get_unchecked(h * 32 + f) as i64; - o += *out_1.get_unchecked(h) as i64 * *weight_1.get_unchecked((h + 64) * 32 + f) as i64; - *temp.get_unchecked_mut(f) += (o >> 9) as i32; - } - } - } - - let mut out = 0; - for f in 0..32 { - unsafe { - // bias is Q10 so shift left by 19-10=9 - let q = *temp.get_unchecked(f) as i64 + ((*bias_1.get_unchecked(f) as i64) << 9); - // Q19 * Q13 = Q32 - out += q.max(0) * *weight_2.get_unchecked(f) as i64; - } - } - // bias is Q9 so shift left by 32-9=23 - out += (bias_2 as i64) << 23; - // shift down to Q11 - out >>= 21; - ACTIVATION_TABLES.get_or_init(ActivationTables::new).sigmoid(out as i32) as f32 / ActivationTables::OUT_SCALE -} diff --git a/src/util.rs b/src/util.rs index 2846e93..2620b43 100644 --- a/src/util.rs +++ b/src/util.rs @@ -30,7 +30,7 @@ impl OnceLock { #[inline] unsafe fn get_unchecked(&self) -> &T { - &*(*self.data.get()).as_ptr() + unsafe { &*(*self.data.get()).as_ptr() } } #[inline] diff --git a/src/weights.bin b/src/weights.bin new file mode 100644 index 0000000..aa3f56f Binary files /dev/null and b/src/weights.bin differ diff --git a/tests/data/audio_tiny16.raw b/tests/data/audio_tiny16.raw deleted file mode 100644 index a2a3b32..0000000 Binary files a/tests/data/audio_tiny16.raw and /dev/null differ