diff --git a/Cargo.lock b/Cargo.lock
index ecf6dd6..d9809a8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -165,7 +165,7 @@ checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
 
 [[package]]
 name = "earshot"
-version = "0.1.0"
+version = "1.0.0"
 dependencies = [
  "criterion",
  "libm",
@@ -280,9 +280,9 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.86"
+version = "1.0.106"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
 dependencies = [
  "unicode-ident",
 ]
@@ -362,18 +362,28 @@ dependencies = [
 
 [[package]]
 name = "serde"
-version = "1.0.209"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.209"
+version = "1.0.228"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -394,9 +404,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.76"
+version = "2.0.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
+checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/Cargo.toml b/Cargo.toml
index c637fe2..33a4320 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,19 +1,18 @@
 [package]
 name = "earshot"
-version = "0.1.0"
+version = "1.0.0"
 description = "Ridiculously fast & accurate voice activity detection in pure Rust"
 repository = "https://github.com/pykeio/earshot"
 authors = [ "Carson M <carson@pyke.io>" ]
-license = "MIT"
-edition = "2021"
-exclude = ["tests/data", ".github"]
+license = "MIT OR Apache-2.0"
+edition = "2024"
+exclude = ["tests/data", "benches/", "examples/", ".github"]
+rust-version = "1.87"
 
 [features]
-default = [ "std", "embed-weights" ]
+default = [ "std" ]
 # Currently just impls `std::error::Error` for the `Error` type.
 std = []
-# Embed the default model weights in the binary. Enables `Default` for `QuantizedPredictor`.
-embed-weights = []
 
 [dependencies]
 libm = "0.2"
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/LICENSE b/LICENSE-MIT
similarity index 97%
rename from LICENSE
rename to LICENSE-MIT
index 91c7f17..74f0f32 100644
--- a/LICENSE
+++ b/LICENSE-MIT
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2025 pyke.io
+Copyright (c) 2025-2026 pyke.io
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 19207a6..bfd844d 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,41 @@
 # Earshot
 Ridiculously fast & accurate voice activity detection in pure Rust.
 
-Achieves an RTF of 0.0014; 10x faster than Silero/TEN VAD.
+Achieves an RTF of 0.0007 (1,270x real time): **20x faster** than Silero VAD v6 & TEN VAD - and more accurate, too!
 
-## Performance
-Compiling with `RUSTFLAGS="-C target-cpu=native"` in release mode is highly recommended as it can cut processing time in half.
+> If you find Earshot useful, please consider [sponsoring pyke.io](https://opencollective.com/pyke-osai).
+
+<img src="https://i.pyke.io/earshot-1.0-pr.png"/>
+
+## Usage
+
+```rs
+use earshot::Detector;
+
+// Create a new VAD detector using the default NN.
+let mut detector = Detector::default();
+
+let mut frame_receiver = ...
+while let Some(frame) = frame_receiver.recv() {
+	// `frame` is Vec<i16> with length 256. Each frame passed to the detector must be exactly 256 samples.
+	// f32 [-1, 1] frames are also supported with `predict_f32`.
+	let score = detector.predict_i16(&frame);
+	// Score is between 0-1; 0 = no voice, 1 = voice.
+	if score >= 0.5 { // 0.5 is a good default threshold, but can be customized.
+		println!("Voice detected!");
+	}
+}
+```
+
+## Binary & memory size
+Earshot is very embedded-friendly: each instance of `Detector` uses ~8 KiB of memory to store the audio buffer & neural network state. Binary footprint is ~100 KiB; the neural network is 75 KiB of that.
+
+In contrast, Silero's model is 2 MiB and TEN's is 310 KiB, and both require ONNX Runtime, which adds another 8 MB to your binary (and a whole lot more memory).
+
+## `#![no_std]`
+Earshot supports `#![no_std]`, but it does require an allocator. The `std` feature is enabled by default, so add `default-features = false` to enable `#![no_std]`:
+
+```toml
+[dependencies]
+earshot = { version = "1", default-features = false }
+```
diff --git a/benches/vad.rs b/benches/vad.rs
index df396cf..27f2299 100644
--- a/benches/vad.rs
+++ b/benches/vad.rs
@@ -1,10 +1,10 @@
 use std::hint::black_box;
 
 use criterion::{Criterion, criterion_group, criterion_main};
-use earshot::{Detector, QuantizedPredictor};
+use earshot::Detector;
 
 fn bench_vad(c: &mut Criterion) {
-	let mut vad = Detector::<QuantizedPredictor>::default();
+	let mut vad = Detector::default();
 	c.bench_function("Single frame - f32", |b| {
 		let frame = (0..256 as i16).map(|i| i.wrapping_mul(i) as f32).collect::<Vec<_>>();
 		b.iter(|| {
diff --git a/examples/extract-voice.rs b/examples/extract-voice.rs
index 4f5ff03..c110d71 100644
--- a/examples/extract-voice.rs
+++ b/examples/extract-voice.rs
@@ -1,11 +1,10 @@
-use core::{mem, ptr, slice};
 use std::{
 	env::args,
 	fs::{self, File},
 	io::Write
 };
 
-use earshot::{Detector, QuantizedPredictor};
+use earshot::Detector;
 
 fn main() {
 	let mut args = args().skip(1);
@@ -18,7 +17,7 @@ fn main() {
 		return;
 	};
 
-	let mut detector = Detector::<QuantizedPredictor>::default();
+	let mut detector = Detector::default();
 
 	let mut out = File::create(output).unwrap();
 
@@ -26,15 +25,12 @@ fn main() {
 	for x in wav[44..].chunks_exact(512) {
 		let mut samples = vec![0; 256];
 		for i in 0..256 {
-			samples[i] = i16::from_le_bytes([x[(i * 2)], x[(i * 2) + 1]]);
+			samples[i] = i16::from_le_bytes([x[i * 2], x[(i * 2) + 1]]);
 		}
 
 		let score = detector.predict_i16(&samples);
 		if score >= 0.5 {
-			println!("voice");
 			out.write_all(&x).unwrap();
-		} else {
-			println!("silence {score}");
 		}
 	}
 
diff --git a/rustfmt.toml b/rustfmt.toml
index 8d861c5..e7e25d8 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -1,4 +1,4 @@
-edition = "2021"
+edition = "2024"
 style_edition = "2024"
 unstable_features = true
 
diff --git a/src/default_predictor.rs b/src/default_predictor.rs
new file mode 100644
index 0000000..3fcc42a
--- /dev/null
+++ b/src/default_predictor.rs
@@ -0,0 +1,236 @@
+//! The neural network behind [`DefaultPredictor`], with its weights embedded in the binary.
+
+use alloc::{vec, vec::Vec};
+
+const _WEIGHTS_LEN: usize = include_bytes!("weights.bin").len();
+/// Raw contents of `weights.bin`, stored 4-byte aligned so they can be viewed in place as `f32`s.
+///
+/// NOTE(review): the bytes are reinterpreted in native byte order, so the file is presumably little-endian `f32`
+/// data; confirm before targeting a big-endian platform.
+static WEIGHTS: &[u8; _WEIGHTS_LEN] = {
+	#[repr(C, align(4))]
+	struct AlignedData<T: ?Sized>(T);
+
+	const __DATA: &'static AlignedData<[u8; _WEIGHTS_LEN]> = &AlignedData(*include_bytes!("weights.bin"));
+	&__DATA.0
+};
+
+/// Returns the `SIZE` consecutive `f32`s that start `offset` floats (not bytes) into [`WEIGHTS`].
+///
+/// Every caller in this file is a `static` initializer, so the bounds assertion is evaluated at compile time.
+const fn weight<'a, const SIZE: usize>(offset: usize) -> &'a [f32; SIZE] {
+	assert!(offset + SIZE <= _WEIGHTS_LEN / core::mem::size_of::<f32>(), "weight range exceeds weights.bin");
+	// SAFETY: `AlignedData` gives `WEIGHTS` 4-byte alignment, the assertion above keeps `offset..offset + SIZE` inside
+	// the array, the data is `'static` and never mutated, and every bit pattern is a valid `f32`.
+	unsafe { &*(WEIGHTS.as_ptr().cast::<f32>().add(offset) as *const [_; SIZE]) }
+}
+
+// Tensors are packed back to back; each offset (in `f32`s) is the previous offset plus the previous length.
+static NORM_WEIGHT: &[f32; 40] = weight(0);
+static LAYER1_KERNEL: &[f32; 9] = weight(40);
+static LAYER1_WEIGHT: &[f32; 16] = weight(49);
+static LAYER1_BIAS: &[f32; 16] = weight(65);
+static LAYER2_KERNEL: &[f32; 48] = weight(81);
+static LAYER2_WEIGHT: &[f32; 256] = weight(129);
+static LAYER2_BIAS: &[f32; 16] = weight(385);
+static LAYER3_KERNEL: &[f32; 48] = weight(401);
+static LAYER3_WEIGHT: &[f32; 256] = weight(449);
+static LAYER3_BIAS: &[f32; 16] = weight(705);
+static RNN1_WEIGHT: &[f32; 10240] = weight(721);
+static RNN2_WEIGHT: &[f32; 8192] = weight(10961);
+static OUTPUT_WEIGHT: &[f32; 128] = weight(19153);
+
+/// The default [`Predictor`](crate::Predictor), backed by the weights embedded in this crate.
+pub struct DefaultPredictor {
+	/// Recurrent state carried between frames: `[..64]` for the first `mingru` layer, `[64..]` for the second.
+	state: Vec<f32>
+}
+
+impl DefaultPredictor {
+	/// Creates a predictor whose recurrent state is all zeros.
+	pub fn new() -> Self {
+		Self { state: vec![0.0; 128] }
+	}
+}
+
+impl Default for DefaultPredictor {
+	fn default() -> Self {
+		Self::new()
+	}
+}
+
+impl crate::Predictor for DefaultPredictor {
+	/// Zeroes the recurrent state.
+	fn reset(&mut self) {
+		self.state.fill(0.0);
+	}
+
+	/// Divides `features` by their root mean square, then scales each one by its `NORM_WEIGHT` entry.
+	fn normalize(&self, features: &mut [f32]) {
+		let mean_square = features.iter().map(|x| x * x).sum::<f32>() / features.len() as f32;
+		// `libm`, because `f32::sqrt` is not available without `std`.
+		let i_rms = 1. / libm::sqrtf(mean_square);
+		for (i, v) in features.iter_mut().enumerate() {
+			*v = NORM_WEIGHT[i] * *v * i_rms;
+		}
+	}
+
+	/// Runs the network on `features`, using `buffer` as scratch space, and returns the sigmoid of the output layer.
+	fn predict(&mut self, features: &[f32], buffer: &mut [f32]) -> f32 {
+		// The two halves are used alternately, so each layer reads from one and writes to the other.
+		let (buffer1, buffer2) = buffer.split_at_mut(288);
+		input_layer1(features, buffer1);
+		input_layer2_3::<18, 9, false>(&buffer1[..288], LAYER2_KERNEL, LAYER2_WEIGHT, LAYER2_BIAS, &mut buffer2[..144]);
+		input_layer2_3::<9, 5, true>(&buffer2[..144], LAYER3_KERNEL, LAYER3_WEIGHT, LAYER3_BIAS, &mut buffer1[..80]);
+		mingru::<80>(&buffer1[..80], &self.state[..64], RNN1_WEIGHT, &mut buffer2[..128]);
+		self.state[..64].copy_from_slice(&buffer2[..64]);
+		mingru::<64>(&buffer2[..64], &self.state[64..128], RNN2_WEIGHT, &mut buffer1[..128]);
+		self.state[64..128].copy_from_slice(&buffer1[..64]);
+		output(&buffer2[..64], &buffer1[..64])
+	}
+}
+
+/// Input layer 1: a 3x3 depthwise convolution over the 3x40 feature window, a 1-to-16 pointwise convolution, and a
+/// width-3, stride-2 max pool that doubles as ReLU. Writes 16 rows of 18 values to `output`.
+#[inline(never)]
+fn input_layer1(features: &[f32], output: &mut [f32]) {
+	const NUM_FRAMES: usize = 3;
+	const NUM_FEATURES: usize = 40;
+
+	const KERNEL_SIZE: usize = 3;
+	const {
+		assert!((NUM_FRAMES - KERNEL_SIZE) / 1 + 1 == 1);
+	};
+	const DEPTHWISE_NUM_FEATURES: usize = (NUM_FEATURES - KERNEL_SIZE) / 1 + 1;
+	const OUT_CHANNELS: usize = 16;
+
+	const POOL_KERNEL_SIZE: usize = 3;
+	const POOL_STRIDE: usize = 2;
+	const POOLED_COLS: usize = (DEPTHWISE_NUM_FEATURES - POOL_KERNEL_SIZE) / POOL_STRIDE + 1;
+
+	output.fill(0.0);
+
+	let mut row = [0.0_f32; DEPTHWISE_NUM_FEATURES];
+	for ox in 0..DEPTHWISE_NUM_FEATURES {
+		// depthwise conv
+		let mut sum = 0.0;
+		for kh in 0..KERNEL_SIZE {
+			for kw in 0..KERNEL_SIZE {
+				let w = ox + kw;
+				let input_idx = (kh * NUM_FEATURES) + w;
+				sum += features[input_idx] * LAYER1_KERNEL[(kh * KERNEL_SIZE) + kw];
+			}
+		}
+
+		row[ox] = sum;
+	}
+
+	for c in 0..OUT_CHANNELS {
+		let mut new_row = [0.0; DEPTHWISE_NUM_FEATURES];
+		for ox in 0..DEPTHWISE_NUM_FEATURES {
+			// pointwise conv
+			new_row[ox] = (row[ox] * LAYER1_WEIGHT[c]) + LAYER1_BIAS[c];
+		}
+
+		// max pool over row
+		let out_row_offs = POOLED_COLS * c;
+		for q in 0..POOLED_COLS {
+			for x in 0..POOL_KERNEL_SIZE {
+				let out_q = &mut output[out_row_offs + q];
+				// `output` is zeroed, so this also acts as ReLU
+				*out_q = (*out_q).max(new_row[(q * POOL_STRIDE) + x]);
+			}
+		}
+	}
+}
+
+/// Input layers 2 and 3: a width-3, stride-2 depthwise convolution (taps that fall one step outside the row are
+/// skipped, i.e. zero padding) followed by a 16-to-16 pointwise convolution with bias and ReLU.
+///
+/// `features` holds 16 rows of `IN_FEATURES` values. `output` is written as 16 rows of `OUT_FEATURES` values, or, when
+/// `LAYER3` is set, as `OUT_FEATURES` rows of 16 values.
+#[inline(never)]
+fn input_layer2_3<const IN_FEATURES: usize, const OUT_FEATURES: usize, const LAYER3: bool>(
+	features: &[f32],
+	kernel: &[f32; 48],
+	weight: &[f32; 256],
+	bias: &[f32; 16],
+	output: &mut [f32]
+) {
+	const HORIZONTAL_KERNEL_SIZE: usize = 3;
+	const STRIDE: usize = 2;
+	const CHANNELS: usize = 16;
+
+	output.fill(0.0);
+
+	for ox in 0..OUT_FEATURES {
+		let mut dw = [0.0; CHANNELS];
+		for c in 0..CHANNELS {
+			// depthwise conv
+			let mut sum = 0.0;
+			for kw in 0..HORIZONTAL_KERNEL_SIZE {
+				let ix = (ox * STRIDE + kw) as isize - 1;
+				if ix < 0 || ix >= IN_FEATURES as isize {
+					continue;
+				}
+				sum += features[(c * IN_FEATURES) + ix as usize] * kernel[(c * HORIZONTAL_KERNEL_SIZE) + kw];
+			}
+
+			dw[c] = sum;
+		}
+
+		// pointwise conv
+		for oc in 0..CHANNELS {
+			let mut ic = 0.0;
+			for c in 0..CHANNELS {
+				let sum = dw[c];
+				ic += sum * weight[(oc * CHANNELS) + c];
+			}
+
+			let ptr = if !LAYER3 { &mut output[(oc * OUT_FEATURES) + ox] } else { &mut output[(ox * CHANNELS) + oc] };
+			*ptr = (ic + bias[oc]).max(0.0);
+		}
+	}
+}
+
+/// Recurrent layer: fills `out[..128]` with the dot products of `features` and the 128 rows of `weight` (`IN_DIM`
+/// columns each), then blends the first 64 with the previous state `h` using gates taken from the last 64:
+/// `out[i] = (1 - g) * h[i] + g * out[i]`, where `g = clamp(out[64 + i] * 0.25, 0, 1)`.
+#[inline(never)]
+fn mingru<const IN_DIM: usize>(features: &[f32], h: &[f32], weight: &[f32], out: &mut [f32]) {
+	for d in 0..128 {
+		let mut o = 0.0;
+		let ri = d * IN_DIM;
+
+		for f in 0..IN_DIM {
+			o += features[f] * weight[ri + f];
+		}
+
+		out[d] = o;
+	}
+
+	for i in 0..64 {
+		let g = (out[64 + i] * 0.25).clamp(0.0, 1.0);
+		let v = &mut out[i];
+		*v = (1. - g) * h[i] + g * *v;
+	}
+}
+
+/// Logistic function. Uses `libm` because `f32::exp` is not available without `std`.
+#[inline]
+fn sigmoid(x: f32) -> f32 {
+	1. / (1. + libm::expf(-x))
+}
+
+/// Output layer: the dot product of both recurrent outputs (64 values each) with `OUTPUT_WEIGHT`, through a sigmoid.
+#[inline(never)]
+fn output(out_1: &[f32], out_2: &[f32]) -> f32 {
+	let mut out = 0.0;
+	for f in 0..64 {
+		out += out_1[f] * OUTPUT_WEIGHT[f];
+	}
+	for f in 0..64 {
+		out += out_2[f] * OUTPUT_WEIGHT[64 + f];
+	}
+	sigmoid(out)
+}
diff --git a/src/lib.rs b/src/lib.rs
index d1799e5..8538da8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,59 +5,39 @@ extern crate alloc;
 use alloc::{boxed::Box, vec};
 use core::{f32, ptr};
 
+mod default_predictor;
 mod fft;
-mod quantized_predictor;
 mod util;
 
-#[cfg(feature = "embed-weights")]
-pub use self::quantized_predictor::default_weights as default_quantized_weights;
-pub use self::quantized_predictor::{PackedWeights, QuantizedPredictor};
+pub use self::default_predictor::DefaultPredictor;
 use self::util::OnceLock;
 
+/// Used by [`Detector`] to predict the VAD score of a frame based on extracted features.
+///
+/// # Stability
+/// If you wish to implement `Predictor` yourself, note that **the API is unstable and subject to change!**
 pub trait Predictor {
+	#[doc(hidden)]
 	fn reset(&mut self);
+
+	#[doc(hidden)]
+	fn normalize(&self, features: &mut [f32]);
+
+	#[doc(hidden)]
 	fn predict(&mut self, features: &[f32], buffer: &mut [f32]) -> f32;
 }
 
 const FFT_SIZE: usize = 1024;
 const WINDOW_SIZE: usize = 768;
 const N_MELS: usize = 40;
-const N_FEATURES: usize = N_MELS + 1;
+const N_FEATURES: usize = N_MELS;
 const N_CONTEXT_FRAMES: usize = 3;
 const N_BINS: usize = FFT_SIZE / 2 + 1;
 const PRE_EMPHASIS_COEFF: f32 = 0.97;
 const POWER_FAC: f32 = 1. / (32768.0f32 * 32768.0);
 
-#[rustfmt::skip]
-const FEATURE_MEANS: [f32; 40] = [
-    -8.198236465454, -6.265716552734, -5.483818531036, -4.758691310883,
-	-4.417088985443, -4.142892837524, -3.912850379944, -3.845927953720,
-	-3.657090425491, -3.723418712616, -3.876134157181, -3.843890905380,
-    -3.690405130386, -3.756065845490, -3.698696136475, -3.650463104248,
-	-3.700468778610, -3.567321300507, -3.498900175095, -3.477807044983,
-	-3.458816051483, -3.444923877716, -3.401328563690, -3.306261301041,
-    -3.278556823730, -3.233250856400, -3.198616027832, -3.204526424408,
-	-3.208798646927, -3.257838010788, -3.381376743317, -3.534021377563,
-	-3.640867948532, -3.726858854294, -3.773730993271, -3.804667234421,
-    -3.832901000977, -3.871120452881, -3.990592956543, -4.480289459229
-];
-
-#[rustfmt::skip]
-const FEATURE_STDS: [f32; 40] = [
-    5.166063785553, 4.977209568024, 4.698895931244, 4.630621433258,
-	4.634347915649, 4.641156196594, 4.640676498413, 4.666367053986,
-	4.650534629822, 4.640020847321, 4.637400150299, 4.620099067688,
-    4.596316337585, 4.562654972076, 4.554360389709, 4.566910743713,
-	4.562489986420, 4.562412738800, 4.585299491882, 4.600179672241,
-	4.592845916748, 4.585922718048, 4.583496570587, 4.626092910767,
-    4.626957893372, 4.626289367676, 4.637005805969, 4.683015823364,
-	4.726813793182, 4.734289646149, 4.753227233887, 4.849722862244,
-	4.869434833527, 4.884482860565, 4.921327114105, 4.959212303162,
-    4.996619224548, 5.044823646545, 5.072216987610, 5.096439361572
-];
-
 struct Filters {
-	mel_coeffs: Box<[f32]>,
+	mel_coeffs: Box<[(usize, Box<[f32]>)]>,
 	window: Box<[f32]>
 }
 
@@ -73,15 +53,20 @@ impl Filters {
 			bin_points[i] = ((FFT_SIZE as f32 + 1.) * hz / 16000.) as usize;
 		}
 
-		let mut mel_coeffs = vec![0.0; N_MELS * N_BINS].into_boxed_slice();
+		let mut mel_coeffs = vec::Vec::with_capacity(N_MELS);
 		for i in 0..N_MELS {
+			let mut points = vec::Vec::with_capacity(bin_points[i + 2] - bin_points[i]);
 			for j in bin_points[i]..bin_points[i + 1] {
-				mel_coeffs[(i * N_BINS) + j] = (j - bin_points[i]) as f32 / (bin_points[i + 1] - bin_points[i]) as f32;
+				points.push((j - bin_points[i]) as f32 / (bin_points[i + 1] - bin_points[i]) as f32);
 			}
 
 			for j in bin_points[i + 1]..bin_points[i + 2] {
-				mel_coeffs[(i * N_BINS) + j] = (bin_points[i + 2] - j) as f32 / (bin_points[i + 2] - bin_points[i + 1]) as f32;
+				points.push((bin_points[i + 2] - j) as f32 / (bin_points[i + 2] - bin_points[i + 1]) as f32);
 			}
+
+			// Mel filterbank is naturally very sparse. Rather than waste compute & storage on the whole matrix, only store
+			// non-zero elements.
+			mel_coeffs.push((bin_points[i], points.into_boxed_slice()));
 		}
 
 		// hann window
 		// hann window
@@ -92,13 +77,16 @@ impl Filters {
 			window[i] = x * x;
 		}
 
-		Self { mel_coeffs, window }
+		Self {
+			mel_coeffs: mel_coeffs.into_boxed_slice(),
+			window
+		}
 	}
 }
 
 static FILTERS: OnceLock<Filters> = OnceLock::new();
 
-pub struct Detector<P> {
+pub struct Detector<P = DefaultPredictor> {
 	predictor: P,
 	prev_signal: f32,
 	sample_ring_buffer: Box<[f32]>,
@@ -106,9 +94,9 @@ pub struct Detector<P> {
 	buffer: Box<[f32]>
 }
 
-impl<P: Predictor + Default> Default for Detector<P> {
+impl Default for Detector<DefaultPredictor> {
 	fn default() -> Self {
-		Self::new(P::default())
+		Self::new(DefaultPredictor::new())
 	}
 }
 
@@ -223,13 +211,14 @@ impl<P: Predictor> Detector<P> {
 		let cur_frame_features = &mut self.features[(N_FEATURES * (N_CONTEXT_FRAMES - 1))..];
 		for i in 0..N_MELS {
 			let mut per_band_value = 0.;
-			for j in 0..N_BINS {
-				per_band_value += self.buffer[j] * filters.mel_coeffs[(i * N_BINS) + j];
+			let (start, ref coeffs) = filters.mel_coeffs[i];
+			for (offs, coeff) in coeffs.iter().enumerate() {
+				per_band_value += self.buffer[start + offs] * *coeff;
 			}
 
-			per_band_value = libm::logf(per_band_value + 1e-20);
-			cur_frame_features[i] = (per_band_value - FEATURE_MEANS[i]) / FEATURE_STDS[i];
+			cur_frame_features[i] = libm::logf(per_band_value + 1e-20);
 		}
+		self.predictor.normalize(cur_frame_features);
 
 		self.predictor.predict(&self.features, &mut self.buffer)
 	}
diff --git a/src/quantized-model.bin b/src/quantized-model.bin
deleted file mode 100644
index b6b77fe..0000000
Binary files a/src/quantized-model.bin and /dev/null differ
diff --git a/src/quantized_predictor.rs b/src/quantized_predictor.rs
deleted file mode 100644
index 010d69b..0000000
--- a/src/quantized_predictor.rs
+++ /dev/null
@@ -1,489 +0,0 @@
-use alloc::{boxed::Box, vec};
-use core::{mem, slice};
-
-use super::{Predictor, util::OnceLock};
-
-struct BitBufferReader<'d> {
-	pub buf: &'d [u8],
-	idx: usize,
-	bit_buffer: u32,
-	n_bits: u32
-}
-
-impl<'d> BitBufferReader<'d> {
-	pub fn new(buffer: &'d [u8]) -> Self {
-		Self {
-			buf: buffer,
-			idx: 0,
-			bit_buffer: 0,
-			n_bits: 0
-		}
-	}
-
-	pub fn read(&mut self, len: u32) -> i32 {
-		while self.n_bits < len {
-			let byte = self.buf[self.idx];
-			self.idx += 1;
-
-			self.bit_buffer |= (byte as u32) << self.n_bits;
-			self.n_bits += 8;
-		}
-
-		let bits = self.bit_buffer & ((1 << len) - 1);
-		self.bit_buffer >>= len;
-		self.n_bits -= len;
-
-		let sign = (bits & (1 << (len - 1))) != 0;
-		(if sign { -1 << len } else { 0 }) | bits as i32
-	}
-
-	pub fn read_array<T: FromI32>(&mut self, bit_len: u32, cnt: usize) -> Box<[T]> {
-		(0..cnt).map(|_| T::from_i32(self.read(bit_len))).collect()
-	}
-}
-
-trait FromI32 {
-	fn from_i32(x: i32) -> Self;
-}
-
-impl FromI32 for i8 {
-	fn from_i32(x: i32) -> Self {
-		x as i8
-	}
-}
-impl FromI32 for i16 {
-	fn from_i32(x: i32) -> Self {
-		x as i16
-	}
-}
-impl FromI32 for i32 {
-	fn from_i32(x: i32) -> Self {
-		x
-	}
-}
-
-pub struct PackedWeights {
-	layer1_kernel: Box<[i16]>,
-	layer1_weight: Box<[i16]>,
-	layer1_bias: Box<[i16]>,
-	layer2_kernel: Box<[i16]>,
-	layer2_weight: Box<[i16]>,
-	layer2_bias: Box<[i16]>,
-	layer3_kernel: Box<[i16]>,
-	layer3_weight: Box<[i16]>,
-	layer3_bias: Box<[i16]>,
-	lstm1_ih: Box<[i16]>,
-	lstm1_hh: Box<[i16]>,
-	lstm1_bias: Box<[i16]>,
-	lstm2_ih: Box<[i16]>,
-	lstm2_hh: Box<[i16]>,
-	lstm2_bias: Box<[i16]>,
-	out1_weight: Box<[i16]>,
-	out1_bias: Box<[i16]>,
-	out2_weight: Box<[i16]>,
-	out2_bias: i8
-}
-
-impl PackedWeights {
-	pub fn new(bytes: &[u8]) -> Self {
-		assert_eq!(bytes.len(), 135783, "invalid length for packed QuantizedPredictor weights");
-		let mut reader = BitBufferReader::new(bytes);
-		Self {
-			layer1_kernel: reader.read_array(14, 9),
-			layer1_weight: reader.read_array(14, 16),
-			layer1_bias: reader.read_array(12, 16),
-			layer2_kernel: reader.read_array(15, 48),
-			layer2_weight: reader.read_array(16, 256),
-			layer2_bias: reader.read_array(14, 16),
-			layer3_kernel: reader.read_array(14, 48),
-			layer3_weight: reader.read_array(15, 256),
-			layer3_bias: reader.read_array(12, 16),
-			lstm1_ih: reader.read_array(15, 20480),
-			lstm1_hh: reader.read_array(14, 16384),
-			lstm1_bias: reader.read_array(12, 256),
-			lstm2_ih: reader.read_array(15, 16384),
-			lstm2_hh: reader.read_array(14, 16384),
-			lstm2_bias: reader.read_array(12, 256),
-			out1_weight: reader.read_array(14, 4096),
-			out1_bias: reader.read_array(11, 32),
-			out2_weight: reader.read_array(13, 32),
-			out2_bias: reader.read(4) as i8
-		}
-	}
-}
-
-#[cfg(feature = "embed-weights")]
-static DEFAULT_WEIGHT_BYTES: &[u8] = include_bytes!("quantized-model.bin");
-#[cfg(feature = "embed-weights")]
-static DEFAULT_WEIGHTS: OnceLock<PackedWeights> = OnceLock::new();
-
-#[cfg(feature = "embed-weights")]
-pub fn default_weights() -> &'static PackedWeights {
-	DEFAULT_WEIGHTS.get_or_init(|| PackedWeights::new(DEFAULT_WEIGHT_BYTES))
-}
-
-pub struct ActivationTables {
-	sigmoid: Box<[i32]>,
-	tanh: Box<[i32]>
-}
-
-impl ActivationTables {
-	const Q11_SCALE: i32 = 2048; // 2 ** 11
-	const Q11_SCALE_FLOAT: f32 = 2048.;
-	const SIGMOID_MAX: i32 = Self::Q11_SCALE * 6; // sigmoid goes asymptotic < -6 or > 6, so limit computation to between these values
-	const TANH_MAX: i32 = Self::Q11_SCALE * 4; // ^ 4 for tanh
-	pub const OUT_SCALE: f32 = 65536.; // 2 ** 16, outputs in Q16
-
-	pub fn new() -> Self {
-		let sigmoid_len = Self::SIGMOID_MAX * 2 + 1;
-		let mut sigmoid_table = vec![0; sigmoid_len as usize].into_boxed_slice();
-		for i in 0..sigmoid_len {
-			let v = Self::_real_sigmoid((i - (Self::SIGMOID_MAX)) as f32 / Self::Q11_SCALE_FLOAT);
-			sigmoid_table[i as usize] = libm::roundevenf(v * Self::OUT_SCALE) as i32;
-		}
-		let tanh_len = Self::TANH_MAX * 2 + 1;
-		let mut tanh_table = vec![0; tanh_len as usize].into_boxed_slice();
-		for i in 0..tanh_len {
-			let v = libm::tanhf((i - (Self::TANH_MAX)) as f32 / Self::Q11_SCALE_FLOAT);
-			tanh_table[i as usize] = libm::roundevenf(v * Self::OUT_SCALE) as i32;
-		}
-
-		Self {
-			sigmoid: sigmoid_table,
-			tanh: tanh_table
-		}
-	}
-
-	#[inline]
-	fn _real_sigmoid(x: f32) -> f32 {
-		1. / (1. + libm::expf(-x))
-	}
-
-	#[inline]
-	pub fn sigmoid(&self, x: i32) -> i32 {
-		unsafe {
-			*self
-				.sigmoid
-				.get_unchecked((x + Self::SIGMOID_MAX).clamp(0, Self::SIGMOID_MAX * 2) as usize)
-		}
-	}
-	#[inline]
-	pub fn tanh(&self, x: i32) -> i32 {
-		unsafe { *self.tanh.get_unchecked((x + Self::TANH_MAX).clamp(0, Self::TANH_MAX * 2) as usize) }
-	}
-}
-
-static ACTIVATION_TABLES: OnceLock<ActivationTables> = OnceLock::new();
-
-pub struct QuantizedPredictor<'w> {
-	weights: &'w PackedWeights,
-	state: Box<[i32]>
-}
-
-impl<'w> QuantizedPredictor<'w> {
-	pub fn new(weights: &'w PackedWeights) -> Self {
-		Self {
-			weights,
-			state: vec![0; 256].into_boxed_slice()
-		}
-	}
-}
-
-#[cfg(feature = "embed-weights")]
-impl Default for QuantizedPredictor<'static> {
-	fn default() -> Self {
-		Self::new(default_weights())
-	}
-}
-
-impl Predictor for QuantizedPredictor<'_> {
-	fn reset(&mut self) {
-		self.state.fill(0);
-	}
-
-	fn predict(&mut self, features: &[f32], buffer: &mut [f32]) -> f32 {
-		assert_eq!(features.len(), 41 * 3);
-		assert!(buffer.len() > 464);
-
-		let buffer = unsafe { mem::transmute::<&mut [f32], &mut [i32]>(buffer) };
-
-		let buffer_ptr = buffer.as_mut_ptr();
-		input_layer1(features, &self.weights.layer1_kernel, &self.weights.layer1_weight, &self.weights.layer1_bias, &mut buffer[..304]);
-		input_layer2(&buffer[..304], &self.weights.layer2_kernel, &self.weights.layer2_weight, &self.weights.layer2_bias, unsafe {
-			slice::from_raw_parts_mut(buffer_ptr.add(304), 160)
-		});
-		input_layer3(&buffer[304..], &self.weights.layer3_kernel, &self.weights.layer3_weight, &self.weights.layer3_bias, unsafe {
-			slice::from_raw_parts_mut(buffer_ptr, 80)
-		});
-		lstm::<80, { 80 * 256 }>(
-			&buffer[..80],
-			&self.state[..64],
-			&self.state[64..128],
-			&self.weights.lstm1_ih,
-			&self.weights.lstm1_hh,
-			&self.weights.lstm1_bias,
-			unsafe { slice::from_raw_parts_mut(buffer_ptr.add(80), 256) }
-		);
-		self.state[..128].copy_from_slice(&buffer[80..208]);
-		lstm::<64, { 64 * 256 }>(
-			&self.state[..128],
-			&self.state[128..192],
-			&self.state[192..],
-			&self.weights.lstm2_ih,
-			&self.weights.lstm2_hh,
-			&self.weights.lstm2_bias,
-			&mut buffer[..256]
-		);
-		self.state[128..].copy_from_slice(&buffer[..128]);
-		output(
-			&self.state[..128],
-			&self.state[128..],
-			&self.weights.out1_weight,
-			&self.weights.out1_bias,
-			&self.weights.out2_weight,
-			self.weights.out2_bias
-		)
-	}
-}
-
-#[inline(never)]
-fn input_layer1(features: &[f32], kernel: &[i16], weight: &[i16], bias: &[i16], output: &mut [i32]) {
-	const NUM_FRAMES: usize = 3;
-	const NUM_FEATURES: usize = 41;
-	const FEATURES_INPUT: usize = const { NUM_FRAMES * NUM_FEATURES };
-
-	const KERNEL_SIZE: usize = 3;
-	const {
-		assert!((NUM_FRAMES - KERNEL_SIZE) / 1 + 1 == 1);
-	};
-	const DEPTHWISE_NUM_FEATURES: usize = (NUM_FEATURES - KERNEL_SIZE) / 1 + 1;
-	const OUT_CHANNELS: usize = 16;
-
-	const POOL_KERNEL_SIZE: usize = 3;
-	const POOL_STRIDE: usize = 2;
-	const SCALE_FACTOR: f32 = (1 << 16) as f32;
-	const POOLED_COLS: usize = (DEPTHWISE_NUM_FEATURES - POOL_KERNEL_SIZE) / POOL_STRIDE + 1;
-
-	output.fill(0);
-
-	assert_eq!(features.len(), FEATURES_INPUT);
-
-	let mut tmp = [0i32; FEATURES_INPUT];
-	// doing this conversion in the convolution loop kills performance
-	for i in 0..FEATURES_INPUT {
-		unsafe {
-			// convert to Q16
-			*tmp.get_unchecked_mut(i) = libm::floorf(*features.get_unchecked(i) * SCALE_FACTOR) as i32;
-		};
-	}
-
-	let mut row = [0; DEPTHWISE_NUM_FEATURES];
-	for c in 0..OUT_CHANNELS {
-		for ox in 0..DEPTHWISE_NUM_FEATURES {
-			// depthwise conv
-			let mut sum = 0;
-			for kh in 0..KERNEL_SIZE {
-				for kw in 0..KERNEL_SIZE {
-					let w = ox + kw;
-					let input_idx = (kh * NUM_FEATURES) + w;
-					unsafe {
-						// Q16 * Q13 = Q29
-						sum += *tmp.get_unchecked(input_idx) as i64 * *kernel.get_unchecked((kh * KERNEL_SIZE) + kw) as i64;
-					}
-				}
-			}
-
-			// pointwise conv
-			unsafe {
-				// Q29 * Q13 = Q42. bias is Q12 so shift left by 42-12=30
-				let x = (sum * *weight.get_unchecked(c) as i64) + ((*bias.get_unchecked(c) as i64) << 30);
-				// shift down to Q16
-				*row.get_unchecked_mut(ox) = (x >> 26) as i32;
-			}
-		}
-
-		// max pool over row
-		let out_row_offs = POOLED_COLS * c;
-		for q in 0..POOLED_COLS {
-			for x in 0..POOL_KERNEL_SIZE {
-				let out_q = unsafe { output.as_mut_ptr().add(out_row_offs + q) };
-				// `output` is initially zeroed, so this also acts as ReLU
-				unsafe { *out_q = (*out_q).max(*row.get_unchecked((q * POOL_STRIDE) + x)) };
-			}
-		}
-	}
-}
-
-#[inline(never)]
-fn input_layer2(features: &[i32], kernel: &[i16], weight: &[i16], bias: &[i16], output: &mut [i32]) {
-	const HORIZONTAL_KERNEL_SIZE: usize = 3;
-	const STRIDE: usize = 2;
-	const CHANNELS: usize = 16;
-
-	const IN_FEATURES: usize = 19;
-	const OUT_FEATURES: usize = 10;
-
-	output.fill(0);
-
-	for ox in 0..OUT_FEATURES {
-		let mut row = [0; CHANNELS];
-		for c in 0..CHANNELS {
-			// depthwise conv
-			let mut sum = 0;
-			for kw in 0..HORIZONTAL_KERNEL_SIZE {
-				let ix = (ox * STRIDE + kw) as isize - 1;
-				if ix < 0 || ix >= IN_FEATURES as isize {
-					continue;
-				}
-
-				// Q16 * Q13 = Q29
-				unsafe {
-					sum += *features.get_unchecked((c * IN_FEATURES) + ix as usize) as i64 * *kernel.get_unchecked((c * HORIZONTAL_KERNEL_SIZE) + kw) as i64;
-				}
-			}
-
-			// pointwise conv
-			for oc in 0..CHANNELS {
-				unsafe {
-					// Q29 * Q13 = Q42
-					let r = sum * *weight.get_unchecked((oc * CHANNELS) + c) as i64;
-					*row.get_unchecked_mut(oc) += r;
-				}
-			}
-		}
-
-		// apply pointwise conv bias + relu
-		for oc in 0..CHANNELS {
-			unsafe {
-				// bias is Q12 so shift left by 42-12=30
-				let br = *row.get_unchecked(oc) + ((*bias.get_unchecked(oc) as i64) << 30);
-				// shift down to Q16
-				*output.get_unchecked_mut((oc * OUT_FEATURES) + ox) = ((br >> 26) as i32).max(0);
-			}
-		}
-	}
-}
-
-#[inline(never)]
-fn input_layer3(features: &[i32], kernel: &[i16], weight: &[i16], bias: &[i16], output: &mut [i32]) {
-	const HORIZONTAL_KERNEL_SIZE: usize = 3;
-	const STRIDE: usize = 2;
-	const CHANNELS: usize = 16;
-
-	const IN_FEATURES: usize = 10;
-	const OUT_FEATURES: usize = 5;
-
-	output.fill(0);
-
-	for ox in 0..OUT_FEATURES {
-		let mut row = [0; CHANNELS];
-		for c in 0..CHANNELS {
-			// depthwise conv
-			let mut sum = 0i64;
-			for kw in 0..HORIZONTAL_KERNEL_SIZE {
-				let ix = ox * STRIDE + kw; // layer 3 does not use left padding
-				if ix >= IN_FEATURES {
-					continue;
-				}
-				unsafe {
-					// Q16 * Q13 = Q29
-					sum += *features.get_unchecked((c * IN_FEATURES) + ix as usize) as i64 * *kernel.get_unchecked((c * HORIZONTAL_KERNEL_SIZE) + kw) as i64;
-				}
-			}
-
-			// pointwise conv
-			for oc in 0..CHANNELS {
-				unsafe {
-					// Q29 * Q13 = Q42
-					*row.get_unchecked_mut(oc) += sum * *weight.get_unchecked((oc * CHANNELS) + c) as i64;
-				}
-			}
-		}
-
-		// apply pointwise conv bias + relu
-		for oc in 0..CHANNELS {
-			unsafe {
-				// bias is Q12 so shift left by 42-12=30
-				let r = *row.get_unchecked_mut(oc) + ((*bias.get_unchecked(oc) as i64) << 30);
-				let ptr = output.get_unchecked_mut((ox * CHANNELS) + oc);
-				// shift down to Q16
-				*ptr = (r >> 26).max(0) as i32;
-			}
-		}
-	}
-}
-
-#[inline(never)]
-fn lstm<const IN_DIM: usize, const IH_DIM: usize>(features: &[i32], h: &[i32], c: &[i32], weight_ih: &[i16], weight_hh: &[i16], bias: &[i16], out: &mut [i32]) {
-	for d in 0..256 {
-		// init with Q10 bias, shifted left by 18 to get 10+18=Q28
-		let mut o = (unsafe { *bias.get_unchecked(d) } as i64) << 18;
-		let (ri, rh) = (d * IN_DIM, d * 64);
-
-		for f in 0..IN_DIM {
-			unsafe {
-				// Q16 * Q12 = Q28
-				o += *features.get_unchecked(f) as i64 * *weight_ih.get_unchecked(ri + f) as i64;
-			};
-		}
-
-		for g in 0..64 {
-			unsafe {
-				// Q16 * Q12 = Q28
-				o += *h.get_unchecked(g) as i64 * *weight_hh.get_unchecked(rh + g) as i64;
-			}
-		}
-
-		unsafe {
-			// shift down from Q28 to Q11
-			*out.get_unchecked_mut(d) = (o >> 17) as i32;
-		};
-	}
-
-	let act = ACTIVATION_TABLES.get_or_init(ActivationTables::new);
-	for i in 0..64 {
-		unsafe {
-			// layout is [input, output, forget, cell]
-			let ix = act.sigmoid(*out.get_unchecked(i)) as i64;
-			let fx = act.sigmoid(*out.get_unchecked(128 + i)) as i64;
-			let cx = act.tanh(*out.get_unchecked(192 + i)) as i64;
-			// Q16 * Q16 = Q32; Q16 * Q16 = Q32
-			let x = (fx * *c.get_unchecked(i) as i64) + (ix * cx);
-			let xt = act.tanh((x >> 21) as i32) as i64; // Q11 in, Q16 out
-			// arrange outputs as [hidden, cell]
-			let o = act.sigmoid(mem::replace(out.get_unchecked_mut(64 + i), (x >> 16) as i32)) as i64;
-			// Q16 * Q16 = Q32, shift down to Q16
-			*out.get_unchecked_mut(i) = ((o * xt) >> 16) as i32;
-		};
-	}
-}
-
-#[inline(never)]
-fn output(out_1: &[i32], out_2: &[i32], weight_1: &[i16], bias_1: &[i16], weight_2: &[i16], bias_2: i8) -> f32 {
-	let mut temp = [0; 32];
-	for h in 0..64 {
-		for f in 0..32 {
-			unsafe {
-				// Q16 * Q12 = Q28, shift down to Q19
-				let mut o = *out_2.get_unchecked(h) as i64 * *weight_1.get_unchecked(h * 32 + f) as i64;
-				o += *out_1.get_unchecked(h) as i64 * *weight_1.get_unchecked((h + 64) * 32 + f) as i64;
-				*temp.get_unchecked_mut(f) += (o >> 9) as i32;
-			}
-		}
-	}
-
-	let mut out = 0;
-	for f in 0..32 {
-		unsafe {
-			// bias is Q10 so shift left by 19-10=9
-			let q = *temp.get_unchecked(f) as i64 + ((*bias_1.get_unchecked(f) as i64) << 9);
-			// Q19 * Q13 = Q32
-			out += q.max(0) * *weight_2.get_unchecked(f) as i64;
-		}
-	}
-	// bias is Q9 so shift left by 32-9=23
-	out += (bias_2 as i64) << 23;
-	// shift down to Q11
-	out >>= 21;
-	ACTIVATION_TABLES.get_or_init(ActivationTables::new).sigmoid(out as i32) as f32 / ActivationTables::OUT_SCALE
-}
diff --git a/src/util.rs b/src/util.rs
index 2846e93..2620b43 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -30,7 +30,7 @@ impl<T> OnceLock<T> {
 
 	#[inline]
 	unsafe fn get_unchecked(&self) -> &T {
-		&*(*self.data.get()).as_ptr()
+		unsafe { &*(*self.data.get()).as_ptr() }
 	}
 
 	#[inline]
diff --git a/src/weights.bin b/src/weights.bin
new file mode 100644
index 0000000..aa3f56f
Binary files /dev/null and b/src/weights.bin differ
diff --git a/tests/data/audio_tiny16.raw b/tests/data/audio_tiny16.raw
deleted file mode 100644
index a2a3b32..0000000
Binary files a/tests/data/audio_tiny16.raw and /dev/null differ