From b78f1e139b462c2a157d761eb176f9b4e1f6ea54 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 15:40:59 +0200 Subject: [PATCH 01/20] perf(sort): use lookup table to speed up sequence conversion to bitpacked representation Looking into sort performance (see https://github.com/nextstrain/nextclade/issues/1647), I noticed that the various string containment operations where performance limiting. Instead of doing multiple string containment call, this PR uses a compile time lookup table to turn a sequence into a bitpacked representation. A quick benchmark on 240MB of SC2 sequences show this reduces core sort algorithm runtime (excluding minimizer download) by 42%. --- .../nextclade/src/sort/minimizer_search.rs | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index baa3dc865..6eeafaf4f 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -243,32 +243,36 @@ const fn invertible_hash(x: u64) -> u64 { x } +const NUCLEOTIDE_LOOKUP: [(bool, u8); 256] = { + let mut table = [(false, 0); 256]; + table[b'A' as usize] = (true, 0b11); // A=11=3 + table[b'C' as usize] = (true, 0b10); // C=10=2 + table[b'G' as usize] = (true, 0b00); // G=00=0 + table[b'T' as usize] = (true, 0b01); // T=01=1 + table +}; + fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { let cutoff = params.cutoff as u64; let mut x = 0; let mut j = 0; - for (i, nuc) in kmer.iter().enumerate() { - let nuc = *nuc as char; - + // Create a bit-packed representation of the kmer + // where each nucleotide is represented by 2 bits: + // A=11, C=10, G=00, T=01 + // We skip every third nucleotide to pick up conserved patterns + for (i, &nuc) in kmer.iter().enumerate() { if i % 3 == 2 { continue; // skip every third nucleotide to pick up conserved patterns } - if !"ACGT".contains(nuc) { - return cutoff + 1; // break out of loop, return hash above cutoff - } - - // A=11=3, C=10=2, G=00=0, T=01=1 - if "AC".contains(nuc) { - x += 1 << j; - } - - if "AT".contains(nuc) { - x += 1 << (j + 1); + let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as usize]; + if !is_valid { + return cutoff + 1; // invalid nucleotide } + x |= (bits as u64) << j; j += 2; } From 969d40eaa445606157c9e214635308dd04636600 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 15:45:38 +0200 Subject: [PATCH 02/20] tweaks to comments --- packages/nextclade/src/sort/minimizer_search.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 6eeafaf4f..f0a6e1ca6 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -244,7 +244,7 @@ const fn invertible_hash(x: u64) -> u64 { } const NUCLEOTIDE_LOOKUP: [(bool, u8); 256] = { - let mut table = [(false, 0); 256]; + let mut table = [(false, 0); 256]; // Non-ACGT table[b'A' as usize] = (true, 0b11); // A=11=3 table[b'C' as usize] = (true, 0b10); // C=10=2 table[b'G' as usize] = (true, 0b00); // G=00=0 @@ -261,10 +261,10 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { // Create a bit-packed representation of the kmer // where each nucleotide is represented by 2 bits: // A=11, C=10, G=00, T=01 - // We skip every third nucleotide to pick up conserved patterns + // Skip every third nucleotide to pick up conserved patterns for (i, &nuc) in kmer.iter().enumerate() { if i % 3 == 2 { - continue; // skip every third nucleotide to pick up conserved patterns + continue; } let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as usize]; From 0faced0af8bdd3239f88262d614430f28a307128 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 16:07:35 +0200 Subject: [PATCH 03/20] Also accept lower case --- packages/nextclade/src/sort/minimizer_search.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index f0a6e1ca6..884b96af9 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -246,9 +246,13 @@ const fn invertible_hash(x: u64) -> u64 { const NUCLEOTIDE_LOOKUP: [(bool, u8); 256] = { let mut table = [(false, 0); 256]; // Non-ACGT table[b'A' as usize] = (true, 0b11); // A=11=3 + table[b'a' as usize] = (true, 0b11); // a=11=3 table[b'C' as usize] = (true, 0b10); // C=10=2 + table[b'c' as usize] = (true, 0b10); // c=10=2 table[b'G' as usize] = (true, 0b00); // G=00=0 + table[b'g' as usize] = (true, 0b00); // g=00=0 table[b'T' as usize] = (true, 0b01); // T=01=1 + table[b't' as usize] = (true, 0b01); // t=01=1 table }; From 5e2127432721f1d91507a50f3bfb4a078a66767c Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 16:13:36 +0200 Subject: [PATCH 04/20] T/C were actually wrong in comments previously --- packages/nextclade/src/sort/minimizer_search.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 884b96af9..a89e8f1ef 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -246,13 +246,13 @@ const fn invertible_hash(x: u64) -> u64 { const NUCLEOTIDE_LOOKUP: [(bool, u8); 256] = { let mut table = [(false, 0); 256]; // Non-ACGT table[b'A' as usize] = (true, 0b11); // A=11=3 - table[b'a' as usize] = (true, 0b11); // a=11=3 - table[b'C' as usize] = (true, 0b10); // C=10=2 - table[b'c' as usize] = (true, 0b10); // c=10=2 + table[b'a' as usize] = (true, 0b11); + table[b'T' as usize] = (true, 0b10); // T=10=2 + table[b't' as usize] = (true, 0b10); table[b'G' as usize] = (true, 0b00); // G=00=0 - table[b'g' as usize] = (true, 0b00); // g=00=0 - table[b'T' as usize] = (true, 0b01); // T=01=1 - table[b't' as usize] = (true, 0b01); // t=01=1 + table[b'g' as usize] = (true, 0b00); + table[b'C' as usize] = (true, 0b01); // C=01=1 + table[b'c' as usize] = (true, 0b01); table }; @@ -271,7 +271,7 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { continue; } - let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as usize]; + let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as u8]; if !is_valid { return cutoff + 1; // invalid nucleotide } From 2ed072beefba466e9081565edafeb9535c8917a5 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 16:17:52 +0200 Subject: [PATCH 05/20] Fix T/C mixup --- packages/nextclade/src/sort/minimizer_search.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index a89e8f1ef..4d3de42a0 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -246,16 +246,13 @@ const fn invertible_hash(x: u64) -> u64 { const NUCLEOTIDE_LOOKUP: [(bool, u8); 256] = { let mut table = [(false, 0); 256]; // Non-ACGT table[b'A' as usize] = (true, 0b11); // A=11=3 - table[b'a' as usize] = (true, 0b11); table[b'T' as usize] = (true, 0b10); // T=10=2 - table[b't' as usize] = (true, 0b10); table[b'G' as usize] = (true, 0b00); // G=00=0 - table[b'g' as usize] = (true, 0b00); table[b'C' as usize] = (true, 0b01); // C=01=1 - table[b'c' as usize] = (true, 0b01); table }; +/// Expects kmer to be all uppercase letters fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { let cutoff = params.cutoff as u64; @@ -271,7 +268,7 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { continue; } - let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as u8]; + let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as usize]; if !is_valid { return cutoff + 1; // invalid nucleotide } From 9e72eb2a5577e96e9752dc6c425d58791cd88d5c Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 16:36:04 +0200 Subject: [PATCH 06/20] Reuse our skip_every iterator extension --- packages/nextclade/src/align/seed_match.rs | 4 ++-- packages/nextclade/src/sort/minimizer_search.rs | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/packages/nextclade/src/align/seed_match.rs b/packages/nextclade/src/align/seed_match.rs index 0888b066b..bb50363ea 100644 --- a/packages/nextclade/src/align/seed_match.rs +++ b/packages/nextclade/src/align/seed_match.rs @@ -16,7 +16,7 @@ use std::cmp::{max, min}; use std::collections::{BTreeMap, VecDeque}; /// Copied from https://stackoverflow.com/a/75084739/7483211 -struct SkipEvery { +pub struct SkipEvery { inner: I, every: usize, index: usize, @@ -44,7 +44,7 @@ impl Iterator for SkipEvery { } } -trait IteratorSkipEveryExt: Iterator + Sized { +pub trait IteratorSkipEveryExt: Iterator + Sized { fn skip_every(self, every: usize) -> SkipEvery { SkipEvery::new(self, every) } diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 4d3de42a0..afc7c1320 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -1,3 +1,4 @@ +use crate::align::seed_match::IteratorSkipEveryExt; use crate::io::fasta::FastaRecord; use crate::sort::minimizer_index::{MinimizerIndexJson, MinimizerIndexParams}; use crate::sort::params::NextcladeSeqSortParams; @@ -263,11 +264,7 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { // where each nucleotide is represented by 2 bits: // A=11, C=10, G=00, T=01 // Skip every third nucleotide to pick up conserved patterns - for (i, &nuc) in kmer.iter().enumerate() { - if i % 3 == 2 { - continue; - } - + for (i, &nuc) in kmer.iter().skip_every(3).enumerate() { let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as usize]; if !is_valid { return cutoff + 1; // invalid nucleotide From 1248066246a652395554565c13487da811863dd1 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 16:50:00 +0200 Subject: [PATCH 07/20] fix lint --- packages/nextclade/src/sort/minimizer_search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index afc7c1320..d51b36119 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -264,7 +264,7 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { // where each nucleotide is represented by 2 bits: // A=11, C=10, G=00, T=01 // Skip every third nucleotide to pick up conserved patterns - for (i, &nuc) in kmer.iter().skip_every(3).enumerate() { + for &nuc in kmer.iter().skip_every(3) { let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as usize]; if !is_valid { return cutoff + 1; // invalid nucleotide From 73e0638d7bcd7cae9e6582122a71cdf625aa613e Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 17:55:18 +0200 Subject: [PATCH 08/20] Give capacity hints --- packages/nextclade/src/sort/minimizer_search.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index d51b36119..a019bd8e5 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -283,7 +283,9 @@ pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerIndexParam let seq_str = preprocess_seq(&seq.seq); let n = seq_str.len().saturating_sub(k); - let mut minimizers = Vec::with_capacity(n); + // We keep only every cutoff/2^32-th minimizer + let expected_n_minimizers=n * (2<<32) / (cutoff as usize); + let mut minimizers = Vec::with_capacity(2*expected_n_minimizers); for i in 0..n { let kmer = &seq_str.as_bytes()[i..i + k]; let mhash = get_hash(kmer, params); @@ -292,7 +294,9 @@ pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerIndexParam minimizers.push(mhash); } } - minimizers.into_iter().unique().collect_vec() + let mut result = Vec::with_capacity(minimizers.len()); + result.extend(minimizers.into_iter().unique()); + result } fn preprocess_seq(seq: impl AsRef) -> String { From ae438d1888f7cf5f364f59f3033cc55850a6fc5b Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 18:09:30 +0200 Subject: [PATCH 09/20] Use u32 as hashes are always 32-bit --- .../nextclade/src/sort/minimizer_index.rs | 4 +-- .../nextclade/src/sort/minimizer_search.rs | 29 +++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_index.rs b/packages/nextclade/src/sort/minimizer_index.rs index 4bdcefd7d..1c40df7dc 100644 --- a/packages/nextclade/src/sort/minimizer_index.rs +++ b/packages/nextclade/src/sort/minimizer_index.rs @@ -14,7 +14,7 @@ pub const MINIMIZER_INDEX_SCHEMA_VERSION_FROM: &str = "3.0.0"; pub const MINIMIZER_INDEX_SCHEMA_VERSION_TO: &str = "3.0.0"; pub const MINIMIZER_INDEX_ALGO_VERSION: &str = "1"; -pub type MinimizerMap = BTreeMap>; +pub type MinimizerMap = BTreeMap>; /// Contains external configuration and data specific for a particular pathogen #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] @@ -56,7 +56,7 @@ pub fn serde_deserialize_minimizers<'de, D: Deserializer<'de>>(deserializer: D) let res = map .into_iter() - .map(|(k, v)| Ok((u64::from_str(&k)?, v))) + .map(|(k, v)| Ok((u32::from_str(&k)?, v))) .collect::>() .map_err(serde::de::Error::custom)?; diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index a019bd8e5..0e39b238c 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -232,15 +232,14 @@ pub struct DatasetSuggestionStats { pub qry_indices: Vec, } -const fn invertible_hash(x: u64) -> u64 { - let m: u64 = (1 << 32) - 1; - let mut x: u64 = (!x).wrapping_add(x << 21) & m; +const fn invertible_hash(x: u32) -> u32 { + let mut x = (!x).wrapping_add(x << 21); x = x ^ (x >> 24); - x = (x + (x << 3) + (x << 8)) & m; + x = x.wrapping_add(x << 3).wrapping_add(x << 8); x = x ^ (x >> 14); - x = (x + (x << 2) + (x << 4)) & m; + x = x.wrapping_add(x << 2).wrapping_add(x << 4); x = x ^ (x >> 28); - x = (x + (x << 31)) & m; + x = x.wrapping_add(x << 31); x } @@ -254,11 +253,11 @@ const NUCLEOTIDE_LOOKUP: [(bool, u8); 256] = { }; /// Expects kmer to be all uppercase letters -fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { - let cutoff = params.cutoff as u64; +fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u32 { + let cutoff = params.cutoff as u32; - let mut x = 0; - let mut j = 0; + let mut x: u32 = 0; + let mut j: u8 = 0; // Create a bit-packed representation of the kmer // where each nucleotide is represented by 2 bits: @@ -270,22 +269,22 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { return cutoff + 1; // invalid nucleotide } - x |= (bits as u64) << j; + x |= (bits as u32) << j; j += 2; } invertible_hash(x) } -pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerIndexParams) -> Vec { +pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerIndexParams) -> Vec { let k = params.k as usize; - let cutoff = params.cutoff as u64; + let cutoff = params.cutoff as u32; let seq_str = preprocess_seq(&seq.seq); let n = seq_str.len().saturating_sub(k); // We keep only every cutoff/2^32-th minimizer - let expected_n_minimizers=n * (2<<32) / (cutoff as usize); - let mut minimizers = Vec::with_capacity(2*expected_n_minimizers); + let expected_n_minimizers = n * (1 << 32) / (cutoff as usize); + let mut minimizers = Vec::with_capacity(2 * expected_n_minimizers); for i in 0..n { let kmer = &seq_str.as_bytes()[i..i + k]; let mhash = get_hash(kmer, params); From 3805161838f22ebfb407117d4ebc368d9d751fbe Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 18:33:18 +0200 Subject: [PATCH 10/20] More idiomatic and faster yet (~5%) --- .../nextclade/src/sort/minimizer_search.rs | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 0e39b238c..727bc47af 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -277,25 +277,31 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u32 { } pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerIndexParams) -> Vec { - let k = params.k as usize; - let cutoff = params.cutoff as u32; + let k = params.k as usize; + let cutoff = params.cutoff as u32; - let seq_str = preprocess_seq(&seq.seq); - let n = seq_str.len().saturating_sub(k); - // We keep only every cutoff/2^32-th minimizer - let expected_n_minimizers = n * (1 << 32) / (cutoff as usize); - let mut minimizers = Vec::with_capacity(2 * expected_n_minimizers); - for i in 0..n { - let kmer = &seq_str.as_bytes()[i..i + k]; - let mhash = get_hash(kmer, params); - // accept only hashes below cutoff --> reduces the size of the index and the number of lookups - if mhash < cutoff { - minimizers.push(mhash); - } - } - let mut result = Vec::with_capacity(minimizers.len()); - result.extend(minimizers.into_iter().unique()); - result + let seq_str = preprocess_seq(&seq.seq); + let n_kmers = seq_str.len().saturating_sub(k); + + // 1. Estimate the number of items that will pass the filter. + // The probability of a random u32 hash being < cutoff is (cutoff / u32::MAX). + // We use floating-point math for a more accurate calculation. + let expected_count = (n_kmers as f64 * (cutoff as f64 / u32::MAX as f64)) as usize; + + // 2. Pre-allocate a Vec with the estimated capacity. + let mut minimizers = Vec::with_capacity(expected_count); + + // 3. Define the iterator chain. + let minimizer_iter = seq_str.as_bytes() + .windows(k) + .map(|kmer| get_hash(kmer, params)) + .filter(|&mhash| mhash < cutoff) + .unique(); + + // 4. Extend the vector, which consumes the iterator and fills the Vec. + minimizers.extend(minimizer_iter); + + minimizers } fn preprocess_seq(seq: impl AsRef) -> String { From 50b8a4afee1763ca2d3eeef978cae57a25fe0e42 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 18:58:30 +0200 Subject: [PATCH 11/20] Faster still Now 2.26s vs 4.91 in lateset version --- .../nextclade/src/sort/minimizer_search.rs | 51 ++++++++----------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 727bc47af..00003e196 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -45,17 +45,8 @@ pub fn run_minimizer_search( ) -> Result { let n_refs = index.references.len(); - let minimizers = get_ref_search_minimizers(fasta_record, &index.params); - let mut hit_counts = vec![0; n_refs]; - for m in minimizers { - if let Some(mz) = index.minimizers.get(&m) { - for (ri, hit_count) in hit_counts.iter_mut().enumerate() { - if mz.contains(&ri) { - *hit_count += 1; - } - } - } - } + let hit_counts = calculate_minimizer_hits(fasta_record, index, n_refs); + // we expect hits to be proportional to the length of the sequence and the number of minimizers per reference let mut scores: Vec = vec![0.0; hit_counts.len()]; @@ -276,32 +267,32 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u32 { invertible_hash(x) } -pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerIndexParams) -> Vec { +pub fn calculate_minimizer_hits( + fasta_record: &FastaRecord, + index: &MinimizerIndexJson, + n_refs: usize, +) -> Vec { + let params = &index.params; let k = params.k as usize; let cutoff = params.cutoff as u32; - let seq_str = preprocess_seq(&seq.seq); - let n_kmers = seq_str.len().saturating_sub(k); + let seq_str = preprocess_seq(&fasta_record.seq); - // 1. Estimate the number of items that will pass the filter. - // The probability of a random u32 hash being < cutoff is (cutoff / u32::MAX). - // We use floating-point math for a more accurate calculation. - let expected_count = (n_kmers as f64 * (cutoff as f64 / u32::MAX as f64)) as usize; - - // 2. Pre-allocate a Vec with the estimated capacity. - let mut minimizers = Vec::with_capacity(expected_count); - - // 3. Define the iterator chain. - let minimizer_iter = seq_str.as_bytes() + seq_str.as_bytes() .windows(k) .map(|kmer| get_hash(kmer, params)) .filter(|&mhash| mhash < cutoff) - .unique(); - - // 4. Extend the vector, which consumes the iterator and fills the Vec. - minimizers.extend(minimizer_iter); - - minimizers + .unique() + .fold(vec![0; n_refs], |mut acc, m| { + if let Some(locations) = index.minimizers.get(&m) { + for &ref_idx in locations { + if let Some(count) = acc.get_mut(ref_idx) { + *count += 1; + } + } + } + acc + }) } fn preprocess_seq(seq: impl AsRef) -> String { From 04fa6b4ad52d65f1458e5cb14d418bf55cb2c0f3 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 19:22:08 +0200 Subject: [PATCH 12/20] Convert only once --- .../nextclade/src/sort/minimizer_search.rs | 50 ++++++++++++------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 00003e196..adfdbf47d 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -47,7 +47,6 @@ pub fn run_minimizer_search( let hit_counts = calculate_minimizer_hits(fasta_record, index, n_refs); - // we expect hits to be proportional to the length of the sequence and the number of minimizers per reference let mut scores: Vec = vec![0.0; hit_counts.len()]; for i in 0..n_refs { @@ -234,29 +233,32 @@ const fn invertible_hash(x: u32) -> u32 { x } -const NUCLEOTIDE_LOOKUP: [(bool, u8); 256] = { - let mut table = [(false, 0); 256]; // Non-ACGT - table[b'A' as usize] = (true, 0b11); // A=11=3 - table[b'T' as usize] = (true, 0b10); // T=10=2 - table[b'G' as usize] = (true, 0b00); // G=00=0 - table[b'C' as usize] = (true, 0b01); // C=01=1 +const INVALID_NUCLEOTIDE_VALUE: u8 = 4; // Sentinel value for invalid nucleotides + +const NUCLEOTIDE_LOOKUP: [u8; 256] = { + let mut table = [INVALID_NUCLEOTIDE_VALUE; 256]; // Use sentinel value 4 for invalid nucleotides + table[b'A' as usize] = 0b11; // A=11=3 + table[b'a' as usize] = 0b11; // a=11=3 + table[b'T' as usize] = 0b10; // T=10=2 + table[b't' as usize] = 0b10; // t=10=2 + table[b'G' as usize] = 0b00; // G=00=0 + table[b'g' as usize] = 0b00; // g=00=0 + table[b'C' as usize] = 0b01; // C=01=1 + table[b'c' as usize] = 0b01; // c=01=1 table }; -/// Expects kmer to be all uppercase letters +// Expects bit-encoded kmer where each nucleotide is represented by 2 bits +// A=11, C=10, G=00, T=01 and invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u32 { let cutoff = params.cutoff as u32; let mut x: u32 = 0; let mut j: u8 = 0; - // Create a bit-packed representation of the kmer - // where each nucleotide is represented by 2 bits: - // A=11, C=10, G=00, T=01 // Skip every third nucleotide to pick up conserved patterns - for &nuc in kmer.iter().skip_every(3) { - let (is_valid, bits) = NUCLEOTIDE_LOOKUP[nuc as usize]; - if !is_valid { + for &bits in kmer.iter().skip_every(3) { + if bits == INVALID_NUCLEOTIDE_VALUE { return cutoff + 1; // invalid nucleotide } @@ -278,8 +280,7 @@ pub fn calculate_minimizer_hits( let seq_str = preprocess_seq(&fasta_record.seq); - seq_str.as_bytes() - .windows(k) + seq_str.windows(k) .map(|kmer| get_hash(kmer, params)) .filter(|&mhash| mhash < cutoff) .unique() @@ -295,6 +296,19 @@ pub fn calculate_minimizer_hits( }) } -fn preprocess_seq(seq: impl AsRef) -> String { - seq.as_ref().to_uppercase().replace('-', "") +// Create a bit-packed representation of the kmer +// where each nucleotide is represented by 2 bits: +// A=11, C=10, G=00, T=01 +// Invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE +fn preprocess_seq(seq: impl AsRef) -> Vec{ + seq.as_ref() + .bytes() + .filter_map(|b| { + if b == b'-' { + None // skip gaps + } else { + Some(NUCLEOTIDE_LOOKUP[b as usize]) + } + }) + .collect() } From 1e578756ba1e2d26c4dd01c13d777e92566efcd4 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 19:36:44 +0200 Subject: [PATCH 13/20] Condense --- .../nextclade/src/sort/minimizer_search.rs | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index adfdbf47d..11d07b240 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -250,7 +250,7 @@ const NUCLEOTIDE_LOOKUP: [u8; 256] = { // Expects bit-encoded kmer where each nucleotide is represented by 2 bits // A=11, C=10, G=00, T=01 and invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u32 { +fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> Option { let cutoff = params.cutoff as u32; let mut x: u32 = 0; @@ -259,14 +259,14 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u32 { // Skip every third nucleotide to pick up conserved patterns for &bits in kmer.iter().skip_every(3) { if bits == INVALID_NUCLEOTIDE_VALUE { - return cutoff + 1; // invalid nucleotide + return None; } x |= (bits as u32) << j; j += 2; } - invertible_hash(x) + Some(invertible_hash(x)) } pub fn calculate_minimizer_hits( @@ -278,11 +278,10 @@ pub fn calculate_minimizer_hits( let k = params.k as usize; let cutoff = params.cutoff as u32; - let seq_str = preprocess_seq(&fasta_record.seq); + let seq_str = preprocess_seq(fasta_record.seq); seq_str.windows(k) - .map(|kmer| get_hash(kmer, params)) - .filter(|&mhash| mhash < cutoff) + .filter_map(|kmer| get_hash(kmer, params)) .unique() .fold(vec![0; n_refs], |mut acc, m| { if let Some(locations) = index.minimizers.get(&m) { @@ -300,12 +299,12 @@ pub fn calculate_minimizer_hits( // where each nucleotide is represented by 2 bits: // A=11, C=10, G=00, T=01 // Invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn preprocess_seq(seq: impl AsRef) -> Vec{ - seq.as_ref() - .bytes() - .filter_map(|b| { +fn preprocess_seq(seq: String) -> Vec{ + seq.as_bytes() + .iter() + .filter_map(|&b| { if b == b'-' { - None // skip gaps + None } else { Some(NUCLEOTIDE_LOOKUP[b as usize]) } From 9eb2efa16e6a11c0d4ec6bc32a6b37bc14589461 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 19:44:30 +0200 Subject: [PATCH 14/20] Use filter map, this is actually slow --- packages/nextclade/src/sort/minimizer_search.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 11d07b240..1410939ae 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -250,9 +250,7 @@ const NUCLEOTIDE_LOOKUP: [u8; 256] = { // Expects bit-encoded kmer where each nucleotide is represented by 2 bits // A=11, C=10, G=00, T=01 and invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> Option { - let cutoff = params.cutoff as u32; - +fn get_hash(kmer: &[u8]) -> Option { let mut x: u32 = 0; let mut j: u8 = 0; @@ -274,14 +272,10 @@ pub fn calculate_minimizer_hits( index: &MinimizerIndexJson, n_refs: usize, ) -> Vec { - let params = &index.params; - let k = params.k as usize; - let cutoff = params.cutoff as u32; - - let seq_str = preprocess_seq(fasta_record.seq); + let seq_str = preprocess_seq(&fasta_record.seq); - seq_str.windows(k) - .filter_map(|kmer| get_hash(kmer, params)) + seq_str.windows(index.params.k as usize) + .filter_map(|kmer| get_hash(kmer)) .unique() .fold(vec![0; n_refs], |mut acc, m| { if let Some(locations) = index.minimizers.get(&m) { @@ -299,7 +293,7 @@ pub fn calculate_minimizer_hits( // where each nucleotide is represented by 2 bits: // A=11, C=10, G=00, T=01 // Invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn preprocess_seq(seq: String) -> Vec{ +fn preprocess_seq(seq: &str) -> Vec { seq.as_bytes() .iter() .filter_map(|&b| { From 66db9c2cda3b7ccf5c980b88fdfecb55f5761cfe Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 19:45:19 +0200 Subject: [PATCH 15/20] Revert "Use filter map, this is actually slow" This reverts commit 9eb2efa16e6a11c0d4ec6bc32a6b37bc14589461. --- packages/nextclade/src/sort/minimizer_search.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 1410939ae..11d07b240 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -250,7 +250,9 @@ const NUCLEOTIDE_LOOKUP: [u8; 256] = { // Expects bit-encoded kmer where each nucleotide is represented by 2 bits // A=11, C=10, G=00, T=01 and invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn get_hash(kmer: &[u8]) -> Option { +fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> Option { + let cutoff = params.cutoff as u32; + let mut x: u32 = 0; let mut j: u8 = 0; @@ -272,10 +274,14 @@ pub fn calculate_minimizer_hits( index: &MinimizerIndexJson, n_refs: usize, ) -> Vec { - let seq_str = preprocess_seq(&fasta_record.seq); + let params = &index.params; + let k = params.k as usize; + let cutoff = params.cutoff as u32; + + let seq_str = preprocess_seq(fasta_record.seq); - seq_str.windows(index.params.k as usize) - .filter_map(|kmer| get_hash(kmer)) + seq_str.windows(k) + .filter_map(|kmer| get_hash(kmer, params)) .unique() .fold(vec![0; n_refs], |mut acc, m| { if let Some(locations) = index.minimizers.get(&m) { @@ -293,7 +299,7 @@ pub fn calculate_minimizer_hits( // where each nucleotide is represented by 2 bits: // A=11, C=10, G=00, T=01 // Invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn preprocess_seq(seq: &str) -> Vec { +fn preprocess_seq(seq: String) -> Vec{ seq.as_bytes() .iter() .filter_map(|&b| { From 6fd11378cb96b52539230114b7e46d9b1c1232b8 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 19:45:23 +0200 Subject: [PATCH 16/20] Revert "Condense" This reverts commit 1e578756ba1e2d26c4dd01c13d777e92566efcd4. --- .../nextclade/src/sort/minimizer_search.rs | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 11d07b240..adfdbf47d 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -250,7 +250,7 @@ const NUCLEOTIDE_LOOKUP: [u8; 256] = { // Expects bit-encoded kmer where each nucleotide is represented by 2 bits // A=11, C=10, G=00, T=01 and invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> Option { +fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u32 { let cutoff = params.cutoff as u32; let mut x: u32 = 0; @@ -259,14 +259,14 @@ fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> Option { // Skip every third nucleotide to pick up conserved patterns for &bits in kmer.iter().skip_every(3) { if bits == INVALID_NUCLEOTIDE_VALUE { - return None; + return cutoff + 1; // invalid nucleotide } x |= (bits as u32) << j; j += 2; } - Some(invertible_hash(x)) + invertible_hash(x) } pub fn calculate_minimizer_hits( @@ -278,10 +278,11 @@ pub fn calculate_minimizer_hits( let k = params.k as usize; let cutoff = params.cutoff as u32; - let seq_str = preprocess_seq(fasta_record.seq); + let seq_str = preprocess_seq(&fasta_record.seq); seq_str.windows(k) - .filter_map(|kmer| get_hash(kmer, params)) + .map(|kmer| get_hash(kmer, params)) + .filter(|&mhash| mhash < cutoff) .unique() .fold(vec![0; n_refs], |mut acc, m| { if let Some(locations) = index.minimizers.get(&m) { @@ -299,12 +300,12 @@ pub fn calculate_minimizer_hits( // where each nucleotide is represented by 2 bits: // A=11, C=10, G=00, T=01 // Invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn preprocess_seq(seq: String) -> Vec{ - seq.as_bytes() - .iter() - .filter_map(|&b| { +fn preprocess_seq(seq: impl AsRef) -> Vec{ + seq.as_ref() + .bytes() + .filter_map(|b| { if b == b'-' { - None + None // skip gaps } else { Some(NUCLEOTIDE_LOOKUP[b as usize]) } From e3d6a81c96c1ed66b737cd7326035c9427d975af Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 9 Jun 2025 19:50:37 +0200 Subject: [PATCH 17/20] go brrr --- .../nextclade/src/sort/minimizer_search.rs | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index adfdbf47d..9cb53e070 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -300,15 +300,14 @@ pub fn calculate_minimizer_hits( // where each nucleotide is represented by 2 bits: // A=11, C=10, G=00, T=01 // Invalid nucleotides are represented by INVALID_NUCLEOTIDE_VALUE -fn preprocess_seq(seq: impl AsRef) -> Vec{ - seq.as_ref() - .bytes() - .filter_map(|b| { - if b == b'-' { - None // skip gaps - } else { - Some(NUCLEOTIDE_LOOKUP[b as usize]) - } - }) - .collect() +fn preprocess_seq(seq: &str) -> Vec{ + let mut result = Vec::with_capacity(seq.len()); + result.extend(seq.bytes().filter_map(|b| { + if b == b'-' { + None + } else { + Some(NUCLEOTIDE_LOOKUP[b as usize]) + } + })); + result } From 68bdc524de502f52d5dd611af2d3ab82d3f53746 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 10 Jun 2025 09:33:06 +0200 Subject: [PATCH 18/20] Use gxhash --- Cargo.lock | 48 +++++++++++-------- Cargo.toml | 1 + packages/nextclade/Cargo.toml | 1 + .../nextclade/src/sort/minimizer_search.rs | 4 ++ 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f68bb2277..79433439b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -529,7 +529,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -828,7 +828,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1586fa608b1dab41f667475b4a41faec5ba680aee428bfa5de4ea520fdc6e901" dependencies = [ "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -857,7 +857,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -868,7 +868,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -1009,7 +1009,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -1155,7 +1155,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -1256,6 +1256,15 @@ version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" +[[package]] +name = "gxhash" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ce1bab7aa741d4e7042b2aae415b78741f267a98a7271ea226cd5ba6c43d7d" +dependencies = [ + "rustversion", +] + [[package]] name = "half" version = "1.8.2" @@ -1772,6 +1781,7 @@ dependencies = [ "flate2", "gcollections", "getrandom", + "gxhash", "indexmap 1.9.3", "intervallum", "itertools 0.11.0", @@ -1999,7 +2009,7 @@ checksum = "fa59f025cde9c698fcb4fcb3533db4621795374065bee908215263488f2d2a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -2196,9 +2206,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.63" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] @@ -2262,9 +2272,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.29" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -2701,7 +2711,7 @@ checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -2735,7 +2745,7 @@ checksum = "bcec881020c684085e55a25f7fd888954d56609ef363479dc5a1305eb0d40cab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -2961,7 +2971,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -2993,9 +3003,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.22" +version = "2.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2efbeae7acf4eabd6bcdcbd11c92f45231ddda7539edc7806bd1a04a03b24616" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" dependencies = [ "proc-macro2", "quote", @@ -3037,7 +3047,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", ] [[package]] @@ -3411,7 +3421,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", "wasm-bindgen-shared", ] @@ -3445,7 +3455,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.22", + "syn 2.0.101", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/Cargo.toml b/Cargo.toml index 5f01ba70b..961834f76 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,6 +55,7 @@ eyre = "=0.6.8" flate2 = "=1.0.26" gcollections = "=1.5.0" getrandom = { version = "=0.2.10", features = ["js"] } +gxhash = "=3.5.0" indexmap = { version = "=1.9.3", features = ["serde"] } intervallum = "=1.4.0" itertools = "=0.11.0" diff --git a/packages/nextclade/Cargo.toml b/packages/nextclade/Cargo.toml index 528540beb..0e4322831 100644 --- a/packages/nextclade/Cargo.toml +++ b/packages/nextclade/Cargo.toml @@ -39,6 +39,7 @@ eyre = { workspace = true } flate2 = { workspace = true } gcollections = { workspace = true } getrandom = { workspace = true } +gxhash = { workspace = true } indexmap = { workspace = true } intervallum = { workspace = true } itertools = { workspace = true } diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 9cb53e070..eaeb2c75e 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -12,6 +12,7 @@ use ordered_float::OrderedFloat; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::collections::{BTreeMap, BTreeSet}; +use gxhash::{HashSet, HashSetExt}; #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] @@ -278,6 +279,9 @@ pub fn calculate_minimizer_hits( let k = params.k as usize; let cutoff = params.cutoff as u32; + // let expected_n_minimizers = fasta_record.seq.len() as f64 * cutoff as f64 / (1_u64 << 32) as f64; + // let mut seen: HashSet<_> = HashSet::with_capacity(expected_n_minimizers as usize); + let seq_str = preprocess_seq(&fasta_record.seq); seq_str.windows(k) From a617da5ae71c1c8b01b5ceb6866229fe61f25a38 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 10 Jun 2025 09:38:54 +0200 Subject: [PATCH 19/20] Use std Hashset instead of gxhash --- Cargo.lock | 48 ++++++++----------- Cargo.toml | 1 - packages/nextclade/Cargo.toml | 1 - .../nextclade/src/sort/minimizer_search.rs | 10 ++-- 4 files changed, 23 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 79433439b..f68bb2277 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -529,7 +529,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -828,7 +828,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1586fa608b1dab41f667475b4a41faec5ba680aee428bfa5de4ea520fdc6e901" dependencies = [ "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -857,7 +857,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -868,7 +868,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -1009,7 +1009,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -1155,7 +1155,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -1256,15 +1256,6 @@ version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" -[[package]] -name = "gxhash" -version = "3.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3ce1bab7aa741d4e7042b2aae415b78741f267a98a7271ea226cd5ba6c43d7d" -dependencies = [ - "rustversion", -] - [[package]] name = "half" version = "1.8.2" @@ -1781,7 +1772,6 @@ dependencies = [ "flate2", "gcollections", "getrandom", - "gxhash", "indexmap 1.9.3", "intervallum", "itertools 0.11.0", @@ -2009,7 +1999,7 @@ checksum = "fa59f025cde9c698fcb4fcb3533db4621795374065bee908215263488f2d2a1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -2206,9 +2196,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.95" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" dependencies = [ "unicode-ident", ] @@ -2272,9 +2262,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.40" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" dependencies = [ "proc-macro2", ] @@ -2711,7 +2701,7 @@ checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -2745,7 +2735,7 @@ checksum = "bcec881020c684085e55a25f7fd888954d56609ef363479dc5a1305eb0d40cab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -2971,7 +2961,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -3003,9 +2993,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.101" +version = "2.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +checksum = "2efbeae7acf4eabd6bcdcbd11c92f45231ddda7539edc7806bd1a04a03b24616" dependencies = [ "proc-macro2", "quote", @@ -3047,7 +3037,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", ] [[package]] @@ -3421,7 +3411,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", "wasm-bindgen-shared", ] @@ -3455,7 +3445,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.22", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/Cargo.toml b/Cargo.toml index 961834f76..5f01ba70b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,7 +55,6 @@ eyre = "=0.6.8" flate2 = "=1.0.26" gcollections = "=1.5.0" getrandom = { version = "=0.2.10", features = ["js"] } -gxhash = "=3.5.0" indexmap = { version = "=1.9.3", features = ["serde"] } intervallum = "=1.4.0" itertools = "=0.11.0" diff --git a/packages/nextclade/Cargo.toml b/packages/nextclade/Cargo.toml index 0e4322831..528540beb 100644 --- a/packages/nextclade/Cargo.toml +++ b/packages/nextclade/Cargo.toml @@ -39,7 +39,6 @@ eyre = { workspace = true } flate2 = { workspace = true } gcollections = { workspace = true } getrandom = { workspace = true } -gxhash = { workspace = true } indexmap = { workspace = true } intervallum = { workspace = true } itertools = { workspace = true } diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index eaeb2c75e..12a66722a 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -11,8 +11,7 @@ use log::debug; use ordered_float::OrderedFloat; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use std::collections::{BTreeMap, BTreeSet}; -use gxhash::{HashSet, HashSetExt}; +use std::collections::{BTreeMap, BTreeSet, HashSet}; #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] @@ -279,15 +278,14 @@ pub fn calculate_minimizer_hits( let k = params.k as usize; let cutoff = params.cutoff as u32; - // let expected_n_minimizers = fasta_record.seq.len() as f64 * cutoff as f64 / (1_u64 << 32) as f64; - // let mut seen: HashSet<_> = HashSet::with_capacity(expected_n_minimizers as usize); + let expected_n_minimizers = fasta_record.seq.len() as f64 * cutoff as f64 / (1_u64 << 32) as f64; + let mut seen: HashSet<_> = HashSet::with_capacity(expected_n_minimizers as usize); let seq_str = preprocess_seq(&fasta_record.seq); seq_str.windows(k) .map(|kmer| get_hash(kmer, params)) - .filter(|&mhash| mhash < cutoff) - .unique() + .filter(|&mhash| mhash < cutoff && seen.insert(mhash)) .fold(vec![0; n_refs], |mut acc, m| { if let Some(locations) = index.minimizers.get(&m) { for &ref_idx in locations { From e385f1c7bcfdb148c667368adb19ca7a2715f1b0 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 10 Jun 2025 09:46:55 +0200 Subject: [PATCH 20/20] Use fast version of unique --- packages/nextclade/src/sort/minimizer_search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/nextclade/src/sort/minimizer_search.rs b/packages/nextclade/src/sort/minimizer_search.rs index 12a66722a..2cd2252a2 100644 --- a/packages/nextclade/src/sort/minimizer_search.rs +++ b/packages/nextclade/src/sort/minimizer_search.rs @@ -285,7 +285,7 @@ pub fn calculate_minimizer_hits( seq_str.windows(k) .map(|kmer| get_hash(kmer, params)) - .filter(|&mhash| mhash < cutoff && seen.insert(mhash)) + .filter(|&mhash| mhash < cutoff && seen.insert(mhash)) // Faster than .unique() .fold(vec![0; n_refs], |mut acc, m| { if let Some(locations) = index.minimizers.get(&m) { for &ref_idx in locations {