From e2e7e704498f591f83cde97f53117628c5f77894 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Fri, 28 Nov 2025 15:14:18 +0200 Subject: [PATCH 01/31] Add lib_ts_chainalign stub. --- Cargo.lock | 7 +++++++ Cargo.toml | 2 +- lib_ts_chainalign/Cargo.toml | 12 ++++++++++++ lib_ts_chainalign/src/lib.rs | 14 ++++++++++++++ lib_tsalign/Cargo.toml | 2 +- 5 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 lib_ts_chainalign/Cargo.toml create mode 100644 lib_ts_chainalign/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index c91b223..7d89556 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -460,6 +460,13 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lib_ts_chainalign" +version = "0.1.0" +dependencies = [ + "generic_a_star", +] + [[package]] name = "lib_tsalign" version = "0.19.1" diff --git a/Cargo.toml b/Cargo.toml index 2c1f08d..d990e6b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ members = [ "python_bindings", # Internal - "tsalign-tests", + "tsalign-tests", "lib_ts_chainalign", ] resolver = "2" diff --git a/lib_ts_chainalign/Cargo.toml b/lib_ts_chainalign/Cargo.toml new file mode 100644 index 0000000..eef794d --- /dev/null +++ b/lib_ts_chainalign/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "lib_ts_chainalign" +description = "A chaining-based sequence-to-sequence aligner that accounts for template switches" +authors = ["Sebastian Schmidt "] +version = "0.1.0" +license.workspace = true +edition.workspace = true +rust-version.workspace = true +repository.workspace = true + +[dependencies] +generic_a_star = {version = "0.19.1", path = "../generic_a_star"} \ No newline at end of file diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs new file mode 100644 index 0000000..b93cf3f --- /dev/null +++ b/lib_ts_chainalign/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: u64, right: u64) -> u64 
{ + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} diff --git a/lib_tsalign/Cargo.toml b/lib_tsalign/Cargo.toml index 7956740..a5b9566 100644 --- a/lib_tsalign/Cargo.toml +++ b/lib_tsalign/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "lib_tsalign" description = "A sequence-to-sequence aligner that accounts for template switches" -license.workspace = true authors = ["Sebastian Schmidt "] version = "0.19.1" +license.workspace = true edition.workspace = true rust-version.workspace = true repository.workspace = true From d95cd0254fd5cd91536aa6da9cda867fad8ce263 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Fri, 28 Nov 2025 16:41:32 +0200 Subject: [PATCH 02/31] Gap affine cost lower bound. --- Cargo.lock | 19 +- Cargo.toml | 3 +- generic_a_star/src/cost.rs | 138 +++++++++- lib_ts_chainalign/Cargo.toml | 4 +- lib_ts_chainalign/src/lib.rs | 15 +- lib_ts_chainalign/src/lower_bounds.rs | 1 + .../src/lower_bounds/gap_affine.rs | 60 +++++ .../src/lower_bounds/gap_affine/algo.rs | 242 ++++++++++++++++++ .../src/lower_bounds/gap_affine/tests.rs | 118 +++++++++ 9 files changed, 581 insertions(+), 19 deletions(-) create mode 100644 lib_ts_chainalign/src/lower_bounds.rs create mode 100644 lib_ts_chainalign/src/lower_bounds/gap_affine.rs create mode 100644 lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs create mode 100644 lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs diff --git a/Cargo.lock b/Cargo.lock index 7d89556..6cd833a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -465,6 +465,8 @@ name = "lib_ts_chainalign" version = "0.1.0" dependencies = [ "generic_a_star", + "ndarray 0.17.1", + "num-traits", ] [[package]] @@ -476,7 +478,7 @@ dependencies = [ "extend_map", "generic_a_star", "log", - "ndarray", + "ndarray 0.16.1", "noisy_float", "nom", "num-traits", @@ -581,6 +583,21 @@ dependencies = [ "serde", ] +[[package]] +name = "ndarray" +version = "0.17.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c7c9125e8f6f10c9da3aad044cc918cf8784fa34de857b1aa68038eb05a50a9" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + [[package]] name = "noisy_float" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index d990e6b..03cb19d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "generic_a_star", "seed_chain", "lib_tsalign", + "lib_ts_chainalign", "lib_tsshow", "tsalign", @@ -11,7 +12,7 @@ members = [ "python_bindings", # Internal - "tsalign-tests", "lib_ts_chainalign", + "tsalign-tests", ] resolver = "2" diff --git a/generic_a_star/src/cost.rs b/generic_a_star/src/cost.rs index d0733e4..a1cf4e6 100644 --- a/generic_a_star/src/cost.rs +++ b/generic_a_star/src/cost.rs @@ -9,8 +9,7 @@ use num_traits::{Bounded, CheckedAdd, CheckedSub, SaturatingSub, Zero}; /// The cost of an A* node. pub trait AStarCost: - From - + From + From + Add + Sub + SaturatingSub @@ -35,6 +34,8 @@ pub trait AStarCost: fn as_u64(&self) -> u64; fn as_primitive(&self) -> Self::CostType; + + fn from_primitive(value: Self::CostType) -> Self; } macro_rules! primitive_cost { @@ -58,6 +59,10 @@ macro_rules! primitive_cost { fn as_primitive(&self) -> Self::CostType { self.0 } + + fn from_primitive(value: Self::CostType) -> Self { + Self(value) + } } impl From<$primitive> for $name { @@ -160,3 +165,132 @@ primitive_cost!(U32Cost, u32); primitive_cost!(I32Cost, i32); primitive_cost!(U64Cost, u64); primitive_cost!(I64Cost, i64); + +/// A pair of cost types where the first cost is prioritised over the second cost. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct OrderedPairCost(pub A, pub B); + +impl AStarCost for OrderedPairCost { + type CostType = A::CostType; + + fn as_f64(&self) -> f64 { + self.0.as_f64() + } + + fn as_u64(&self) -> u64 { + self.0.as_u64() + } + + fn as_primitive(&self) -> Self::CostType { + self.0.as_primitive() + } + + fn from_primitive(value: Self::CostType) -> Self { + Self(A::from_primitive(value), B::zero()) + } +} + +impl Display for OrderedPairCost { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "({}, {})", self.0, self.1) + } +} + +impl Zero for OrderedPairCost { + fn zero() -> Self { + Self(A::zero(), B::zero()) + } + + fn is_zero(&self) -> bool { + self.0.is_zero() && self.1.is_zero() + } +} + +impl Bounded for OrderedPairCost { + fn min_value() -> Self { + Self(A::min_value(), B::min_value()) + } + + fn max_value() -> Self { + Self(A::max_value(), B::max_value()) + } +} + +impl, B: From> From for OrderedPairCost { + fn from(value: u8) -> Self { + Self(A::from(value), B::from(0)) + } +} + +impl, B: Add> Add for OrderedPairCost { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0, self.1 + rhs.1) + } +} + +impl, B: Sub> Sub for OrderedPairCost { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0 - rhs.0, self.1 - rhs.1) + } +} + +impl FromStr for OrderedPairCost { + type Err = (); + + fn from_str(s: &str) -> Result { + let Some(s) = s.strip_prefix('(') else { + return Err(()); + }; + let Some(s) = s.strip_suffix(')') else { + return Err(()); + }; + let mut parts = s.splitn(2, ','); + let s = parts.next().ok_or(())?.trim(); + let a = s.parse::().map_err(|_| ())?; + let s = parts.next().ok_or(())?.trim(); + let b = s.parse::().map_err(|_| ())?; + Ok(Self(a, b)) + } +} + +impl AddAssign for OrderedPairCost { + fn add_assign(&mut self, rhs: Self) { + self.0 += rhs.0; + self.1 += rhs.1; + } +} + +impl SubAssign 
for OrderedPairCost { + fn sub_assign(&mut self, rhs: Self) { + self.0 -= rhs.0; + self.1 -= rhs.1; + } +} + +impl CheckedAdd for OrderedPairCost { + fn checked_add(&self, rhs: &Self) -> Option { + Some(Self( + self.0.checked_add(&rhs.0)?, + self.1.checked_add(&rhs.1)?, + )) + } +} + +impl CheckedSub for OrderedPairCost { + fn checked_sub(&self, rhs: &Self) -> Option { + Some(Self( + self.0.checked_sub(&rhs.0)?, + self.1.checked_sub(&rhs.1)?, + )) + } +} + +impl SaturatingSub for OrderedPairCost { + fn saturating_sub(&self, rhs: &Self) -> Self { + Self(self.0.saturating_sub(&rhs.0), self.1.saturating_sub(&rhs.1)) + } +} diff --git a/lib_ts_chainalign/Cargo.toml b/lib_ts_chainalign/Cargo.toml index eef794d..2f4def0 100644 --- a/lib_ts_chainalign/Cargo.toml +++ b/lib_ts_chainalign/Cargo.toml @@ -9,4 +9,6 @@ rust-version.workspace = true repository.workspace = true [dependencies] -generic_a_star = {version = "0.19.1", path = "../generic_a_star"} \ No newline at end of file +generic_a_star = { version = "0.19.1", path = "../generic_a_star" } +ndarray = "0.17.1" +num-traits.workspace = true diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index b93cf3f..403ca0c 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -1,14 +1 @@ -pub fn add(left: u64, right: u64) -> u64 { - left + right -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} +mod lower_bounds; diff --git a/lib_ts_chainalign/src/lower_bounds.rs b/lib_ts_chainalign/src/lower_bounds.rs new file mode 100644 index 0000000..23fca46 --- /dev/null +++ b/lib_ts_chainalign/src/lower_bounds.rs @@ -0,0 +1 @@ +pub mod gap_affine; diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine.rs new file mode 100644 index 0000000..0750641 --- /dev/null +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine.rs @@ -0,0 +1,60 @@ +use 
generic_a_star::{AStar, AStarNode, cost::AStarCost}; +use ndarray::Array2; + +use crate::lower_bounds::gap_affine::algo::Context; + +mod algo; +#[cfg(test)] +mod tests; + +pub struct GapAffineLowerBounds { + max_n: usize, + lower_bounds: Array2, +} + +pub struct GapAffineLowerBoundCostTable { + pub substitution: Cost, + pub gap_open: Cost, + pub gap_extend: Cost, +} + +impl GapAffineLowerBounds { + #[expect(dead_code)] + pub fn new( + max_n: usize, + max_match_run: u32, + cost_table: &GapAffineLowerBoundCostTable, + ) -> Self { + let mut lower_bounds = Array2::::from_elem((max_n + 1, max_n + 1), Cost::max_value()); + lower_bounds[[0, 0]] = Cost::zero(); + let context = Context::new(cost_table, max_match_run, max_n); + let mut a_star = AStar::new(context); + a_star.initialise(); + a_star.search_until(|_, node| { + if node.identifier.has_non_match { + let lower_bound = &mut lower_bounds[[node.identifier.a, node.identifier.b]]; + *lower_bound = (*lower_bound).min(node.cost().0); + } + false + }); + + Self { + max_n, + lower_bounds, + } + } +} + +impl GapAffineLowerBounds { + #[expect(dead_code)] + pub fn max_n(&self) -> usize { + self.max_n + } +} + +impl GapAffineLowerBounds { + #[expect(dead_code)] + pub fn lower_bound(&self, a: usize, b: usize) -> Cost { + self.lower_bounds[[a, b]] + } +} diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs new file mode 100644 index 0000000..d97954a --- /dev/null +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs @@ -0,0 +1,242 @@ +use std::fmt::Display; + +use generic_a_star::{ + AStarContext, AStarNode, + cost::{AStarCost, OrderedPairCost, U32Cost}, + reset::Reset, +}; +use num_traits::Zero; + +use crate::lower_bounds::gap_affine::GapAffineLowerBoundCostTable; + +pub struct Context<'a, Cost> { + costs: &'a GapAffineLowerBoundCostTable, + max_match_run: u32, + max_n: usize, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct Node { + pub 
identifier: Identifier, + pub cost: Cost, + pub match_run: u32, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct Identifier { + pub a: usize, + pub b: usize, + /// True if this node was reached via at least one non-match. + pub has_non_match: bool, + gap_type: GapType, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum GapType { + None, + InA, + InB, +} + +impl<'a, Cost> Context<'a, Cost> { + pub fn new( + costs: &'a GapAffineLowerBoundCostTable, + max_match_run: u32, + max_n: usize, + ) -> Self { + Self { + costs, + max_match_run, + max_n, + } + } +} + +impl AStarContext for Context<'_, Cost> { + type Node = Node; + + fn create_root(&self) -> Self::Node { + Node { + identifier: Identifier { + a: 0, + b: 0, + has_non_match: false, + gap_type: GapType::None, + }, + cost: Cost::zero(), + match_run: 0, + } + } + + fn generate_successors(&mut self, node: &Self::Node, output: &mut impl Extend) { + let Node { + identifier: + Identifier { + a, + b, + has_non_match, + gap_type, + }, + cost, + match_run, + } = node; + + if *a < self.max_n && *b < self.max_n { + if *match_run < self.max_match_run { + // Match + let new_cost = *cost; + output.extend(std::iter::once(Node { + identifier: Identifier { + a: a + 1, + b: b + 1, + has_non_match: *has_non_match, + gap_type: GapType::None, + }, + cost: new_cost, + match_run: match_run + 1, + })); + } + + // Substitution + let new_cost = *cost + self.costs.substitution; + output.extend(std::iter::once(Node { + identifier: Identifier { + a: a + 1, + b: b + 1, + has_non_match: true, + gap_type: GapType::None, + }, + cost: new_cost, + match_run: 0, + })); + } + + if *a < self.max_n { + // Gap in B + let new_cost = *cost + + match gap_type { + GapType::InB => self.costs.gap_extend, + _ => self.costs.gap_open, + }; + output.extend(std::iter::once(Node { + identifier: Identifier { + a: a + 1, + b: *b, + has_non_match: true, + gap_type: GapType::InB, + }, + cost: new_cost, 
+ match_run: 0, + })); + } + + if *b < self.max_n { + // Gap in A + let new_cost = *cost + + match gap_type { + GapType::InA => self.costs.gap_extend, + _ => self.costs.gap_open, + }; + output.extend(std::iter::once(Node { + identifier: Identifier { + a: *a, + b: b + 1, + has_non_match: true, + gap_type: GapType::InA, + }, + cost: new_cost, + match_run: 0, + })); + } + } + + fn is_target(&self, _node: &Self::Node) -> bool { + // Run until whole matrix is filled + false + } + + fn cost_limit(&self) -> Option<::Cost> { + None + } + + fn memory_limit(&self) -> Option { + None + } +} + +impl Reset for Context<'_, Cost> { + fn reset(&mut self) { + // No internal state to reset + } +} + +impl AStarNode for Node { + type Identifier = Identifier; + + type EdgeType = (); + + // Use match run as secondary cost + type Cost = OrderedPairCost; + + fn identifier(&self) -> &Self::Identifier { + &self.identifier + } + + fn cost(&self) -> Self::Cost { + OrderedPairCost(self.cost, U32Cost::from_primitive(self.match_run)) + } + + fn a_star_lower_bound(&self) -> Self::Cost { + OrderedPairCost(Cost::zero(), U32Cost::zero()) + } + + fn secondary_maximisable_score(&self) -> usize { + 0 + } + + fn predecessor(&self) -> Option<&Self::Identifier> { + // Backtracking not supported + None + } + + fn predecessor_edge_type(&self) -> Option { + // Backtracking not supported + None + } +} + +impl Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}: {}", self.identifier, self.cost) + } +} + +impl Display for Identifier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "({}, {}, {})", self.a, self.b, self.gap_type) + } +} + +impl Display for GapType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GapType::None => write!(f, "M/S"), + GapType::InA => write!(f, "GA"), + GapType::InB => write!(f, "GB"), + } + } +} + +impl PartialOrd for Node { + fn partial_cmp(&self, other: 
&Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Node { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.cost + .cmp(&other.cost) + .then_with(|| self.match_run.cmp(&other.match_run)) + } +} diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs new file mode 100644 index 0000000..0dd4859 --- /dev/null +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs @@ -0,0 +1,118 @@ +use generic_a_star::cost::{AStarCost, U32Cost}; +use ndarray::Array2; + +use crate::lower_bounds::gap_affine::{GapAffineLowerBoundCostTable, GapAffineLowerBounds}; + +#[test] +fn test_max_match_run_0() { + let cost_table = GapAffineLowerBoundCostTable { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }; + let max_n = 2; + let lower_bounds = GapAffineLowerBounds::new(max_n, 0, &cost_table); + + assert_eq!(lower_bounds.max_n(), max_n); + + #[rustfmt::skip] + let exepcted_lower_bounds = Array2::from_shape_vec( + (max_n+1, max_n+1), + vec![ + 0u32, 3, 4, + 3, 2, 5, + 4, 5, 4, + ], + ) + .unwrap(); + + for a in 0..=max_n { + for b in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound(a, b).as_primitive(), + exepcted_lower_bounds[[a, b]], + "lower bound({}, {})", + a, + b + ); + } + } +} + +#[test] +fn test_max_match_run_1() { + let cost_table = GapAffineLowerBoundCostTable { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }; + let max_n = 4; + let lower_bounds = GapAffineLowerBounds::new(max_n, 1, &cost_table); + + assert_eq!(lower_bounds.max_n(), max_n); + + #[rustfmt::skip] + let exepcted_lower_bounds = Array2::from_shape_vec( + (max_n+1, max_n+1), + vec![ + 0u32, 3, 4, 5, 6, + 3, 2, 3, 4, 5, + 4, 3, 2, 3, 4, + 5, 4, 3, 2, 5, + 6, 5, 4, 5, 4, + ], + ) + .unwrap(); + + for a in 0..=max_n { + for b in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound(a, b).as_primitive(), + 
exepcted_lower_bounds[[a, b]], + "lower bound({}, {})", + a, + b + ); + } + } +} + +#[test] +fn test_max_match_run_2() { + let cost_table = GapAffineLowerBoundCostTable { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }; + let max_n = 6; + let lower_bounds = GapAffineLowerBounds::new(max_n, 2, &cost_table); + + assert_eq!(lower_bounds.max_n(), max_n); + + #[rustfmt::skip] + let exepcted_lower_bounds = Array2::from_shape_vec( + (max_n+1, max_n+1), + vec![ + 0u32, 3, 4, 5, 6, 7, 8, + 3, 2, 3, 4, 5, 6, 7, + 4, 3, 2, 3, 4, 5, 6, + 5, 4, 3, 2, 3, 4, 5, + 6, 5, 4, 3, 2, 3, 4, + 7, 6, 5, 4, 3, 2, 5, + 8, 7, 6, 5, 4, 5, 4, + ], + ) + .unwrap(); + + for a in 0..=max_n { + for b in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound(a, b).as_primitive(), + exepcted_lower_bounds[[a, b]], + "lower bound({}, {})", + a, + b + ); + } + } +} From 0fc75c700732658625b0f8605e8047f6f6eef99a Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 1 Dec 2025 12:26:28 +0200 Subject: [PATCH 03/31] Precompute TS jump lower bounds for 12-jump. 
--- lib_ts_chainalign/src/costs.rs | 22 ++++++ lib_ts_chainalign/src/lib.rs | 19 +++++ lib_ts_chainalign/src/lower_bounds.rs | 1 + .../src/lower_bounds/gap_affine.rs | 44 +++++------ .../src/lower_bounds/gap_affine/algo.rs | 10 +-- .../src/lower_bounds/gap_affine/tests.rs | 14 +--- lib_ts_chainalign/src/lower_bounds/ts_jump.rs | 78 +++++++++++++++++++ 7 files changed, 146 insertions(+), 42 deletions(-) create mode 100644 lib_ts_chainalign/src/costs.rs create mode 100644 lib_ts_chainalign/src/lower_bounds/ts_jump.rs diff --git a/lib_ts_chainalign/src/costs.rs b/lib_ts_chainalign/src/costs.rs new file mode 100644 index 0000000..6836df9 --- /dev/null +++ b/lib_ts_chainalign/src/costs.rs @@ -0,0 +1,22 @@ +use std::ops::Range; + +pub struct GapAffineCosts { + pub substitution: Cost, + pub gap_open: Cost, + pub gap_extend: Cost, +} + +#[expect(dead_code)] +pub struct TsLimits { + pub jump_12: Range, + pub jump_34: Range, + pub length_23: Range, +} + +pub struct AlignmentCosts { + pub primary_costs: GapAffineCosts, + pub secondary_costs: GapAffineCosts, + pub ts_base_cost: Cost, + #[expect(dead_code)] + pub ts_limits: TsLimits, +} diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index 403ca0c..5d4cb8a 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -1 +1,20 @@ +use generic_a_star::cost::AStarCost; + +use crate::{ + costs::AlignmentCosts, + lower_bounds::{gap_affine::GapAffineLowerBounds, ts_jump::TsJumpLowerBounds}, +}; + +mod costs; mod lower_bounds; + +#[expect(dead_code)] +fn compute_lower_bounds( + max_n: usize, + max_match_run: u32, + costs: &AlignmentCosts, +) { + let _gap_affine_lower_bounds = + GapAffineLowerBounds::new(max_n, max_match_run, &costs.primary_costs); + let _ts_jump_lower_bounds = TsJumpLowerBounds::new(max_n, max_match_run, costs); +} diff --git a/lib_ts_chainalign/src/lower_bounds.rs b/lib_ts_chainalign/src/lower_bounds.rs index 23fca46..273e7bf 100644 --- 
a/lib_ts_chainalign/src/lower_bounds.rs +++ b/lib_ts_chainalign/src/lower_bounds.rs @@ -1 +1,2 @@ pub mod gap_affine; +pub mod ts_jump; diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine.rs index 0750641..0563473 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine.rs @@ -1,30 +1,19 @@ use generic_a_star::{AStar, AStarNode, cost::AStarCost}; -use ndarray::Array2; +use ndarray::{Array1, Array2}; -use crate::lower_bounds::gap_affine::algo::Context; +use crate::{costs::GapAffineCosts, lower_bounds::gap_affine::algo::Context}; mod algo; #[cfg(test)] mod tests; pub struct GapAffineLowerBounds { - max_n: usize, lower_bounds: Array2, -} - -pub struct GapAffineLowerBoundCostTable { - pub substitution: Cost, - pub gap_open: Cost, - pub gap_extend: Cost, + variable_gap2_lower_bounds: Array1, } impl GapAffineLowerBounds { - #[expect(dead_code)] - pub fn new( - max_n: usize, - max_match_run: u32, - cost_table: &GapAffineLowerBoundCostTable, - ) -> Self { + pub fn new(max_n: usize, max_match_run: u32, cost_table: &GapAffineCosts) -> Self { let mut lower_bounds = Array2::::from_elem((max_n + 1, max_n + 1), Cost::max_value()); lower_bounds[[0, 0]] = Cost::zero(); let context = Context::new(cost_table, max_match_run, max_n); @@ -37,24 +26,29 @@ impl GapAffineLowerBounds { } false }); + let variable_gap2_lower_bounds = Array1::from_iter((0..=max_n).map(|gap1| { + (0..=max_n) + .map(|gap2| lower_bounds[[gap1, gap2]]) + .min() + .unwrap() + })); Self { - max_n, lower_bounds, + variable_gap2_lower_bounds, } } } -impl GapAffineLowerBounds { - #[expect(dead_code)] - pub fn max_n(&self) -> usize { - self.max_n +impl GapAffineLowerBounds { + /// A lower bound of the cost for chaining two anchors with the given gaps. + /// The lower bound is symmetric, so the order of the gaps does not matter. 
+ pub fn lower_bound(&self, gap1: usize, gap2: usize) -> Cost { + self.lower_bounds[[gap1, gap2]] } -} -impl GapAffineLowerBounds { - #[expect(dead_code)] - pub fn lower_bound(&self, a: usize, b: usize) -> Cost { - self.lower_bounds[[a, b]] + /// A lower bound of the cost for chaining two anchors with only one specified gap length. + pub fn variable_gap2_lower_bound(&self, gap: usize) -> Cost { + self.variable_gap2_lower_bounds[[gap]] } } diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs index d97954a..82b8769 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs @@ -7,10 +7,10 @@ use generic_a_star::{ }; use num_traits::Zero; -use crate::lower_bounds::gap_affine::GapAffineLowerBoundCostTable; +use crate::costs::GapAffineCosts; pub struct Context<'a, Cost> { - costs: &'a GapAffineLowerBoundCostTable, + costs: &'a GapAffineCosts, max_match_run: u32, max_n: usize, } @@ -39,11 +39,7 @@ pub enum GapType { } impl<'a, Cost> Context<'a, Cost> { - pub fn new( - costs: &'a GapAffineLowerBoundCostTable, - max_match_run: u32, - max_n: usize, - ) -> Self { + pub fn new(costs: &'a GapAffineCosts, max_match_run: u32, max_n: usize) -> Self { Self { costs, max_match_run, diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs index 0dd4859..6991e6e 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs @@ -1,11 +1,11 @@ use generic_a_star::cost::{AStarCost, U32Cost}; use ndarray::Array2; -use crate::lower_bounds::gap_affine::{GapAffineLowerBoundCostTable, GapAffineLowerBounds}; +use crate::{costs::GapAffineCosts, lower_bounds::gap_affine::GapAffineLowerBounds}; #[test] fn test_max_match_run_0() { - let cost_table = GapAffineLowerBoundCostTable { + let cost_table = GapAffineCosts { substitution: 
U32Cost::from(2u8), gap_open: U32Cost::from(3u8), gap_extend: U32Cost::from(1u8), @@ -13,8 +13,6 @@ fn test_max_match_run_0() { let max_n = 2; let lower_bounds = GapAffineLowerBounds::new(max_n, 0, &cost_table); - assert_eq!(lower_bounds.max_n(), max_n); - #[rustfmt::skip] let exepcted_lower_bounds = Array2::from_shape_vec( (max_n+1, max_n+1), @@ -41,7 +39,7 @@ fn test_max_match_run_0() { #[test] fn test_max_match_run_1() { - let cost_table = GapAffineLowerBoundCostTable { + let cost_table = GapAffineCosts { substitution: U32Cost::from(2u8), gap_open: U32Cost::from(3u8), gap_extend: U32Cost::from(1u8), @@ -49,8 +47,6 @@ fn test_max_match_run_1() { let max_n = 4; let lower_bounds = GapAffineLowerBounds::new(max_n, 1, &cost_table); - assert_eq!(lower_bounds.max_n(), max_n); - #[rustfmt::skip] let exepcted_lower_bounds = Array2::from_shape_vec( (max_n+1, max_n+1), @@ -79,7 +75,7 @@ fn test_max_match_run_1() { #[test] fn test_max_match_run_2() { - let cost_table = GapAffineLowerBoundCostTable { + let cost_table = GapAffineCosts { substitution: U32Cost::from(2u8), gap_open: U32Cost::from(3u8), gap_extend: U32Cost::from(1u8), @@ -87,8 +83,6 @@ fn test_max_match_run_2() { let max_n = 6; let lower_bounds = GapAffineLowerBounds::new(max_n, 2, &cost_table); - assert_eq!(lower_bounds.max_n(), max_n); - #[rustfmt::skip] let exepcted_lower_bounds = Array2::from_shape_vec( (max_n+1, max_n+1), diff --git a/lib_ts_chainalign/src/lower_bounds/ts_jump.rs b/lib_ts_chainalign/src/lower_bounds/ts_jump.rs new file mode 100644 index 0000000..194952e --- /dev/null +++ b/lib_ts_chainalign/src/lower_bounds/ts_jump.rs @@ -0,0 +1,78 @@ +use generic_a_star::cost::AStarCost; +use ndarray::{Array1, Array3}; + +use crate::{costs::AlignmentCosts, lower_bounds::gap_affine::GapAffineLowerBounds}; + +pub struct TsJumpLowerBounds { + primary_lower_bounds: GapAffineLowerBounds, + secondary_lower_bounds: GapAffineLowerBounds, + lower_bounds_12: Array1, + lower_bounds_1234: Array3, +} + +impl 
TsJumpLowerBounds { + pub fn new(max_n: usize, max_match_run: u32, cost_table: &AlignmentCosts) -> Self { + let primary_lower_bounds = + GapAffineLowerBounds::new(max_n, max_match_run, &cost_table.primary_costs); + let secondary_lower_bounds = + GapAffineLowerBounds::new(max_n, max_match_run, &cost_table.secondary_costs); + + // This way of calculating the lower bound for the 12-jump does not take the shape limits of the template switch into account. + // However, most of the time these limits are gonna be big, so they should not have a big impact on the lower bound. + let mut lower_bounds_12 = Array1::::from_elem(max_n + 1, Cost::max_value()); + for primary_descendant_gap in 0..=max_n { + for secondary_descendant_gap in 0..=max_n - primary_descendant_gap { + let lower_bound = primary_lower_bounds + .variable_gap2_lower_bound(primary_descendant_gap) + + cost_table.ts_base_cost + + secondary_lower_bounds.variable_gap2_lower_bound(secondary_descendant_gap); + lower_bounds_12[[primary_descendant_gap + secondary_descendant_gap]] = + lower_bounds_12[[primary_descendant_gap + secondary_descendant_gap]] + .min(lower_bound); + } + } + + todo!() + } +} + +impl TsJumpLowerBounds { + /// A lower bound of the cost for chaining two primary anchors with the given gaps. + /// The lower bound is symmetric, so the order of the gaps does not matter. + #[expect(dead_code)] + pub fn primary_lower_bound(&self, gap1: usize, gap2: usize) -> Cost { + self.primary_lower_bounds.lower_bound(gap1, gap2) + } + + /// A lower bound of the cost for chaining two secondary anchors with the given gaps. + /// The lower bound is symmetric, so the order of the gaps does not matter. + #[expect(dead_code)] + pub fn secondary_lower_bound(&self, gap1: usize, gap2: usize) -> Cost { + self.secondary_lower_bounds.lower_bound(gap1, gap2) + } + + /// A lower bound of the cost for chaining a primary anchor with a secondary anchor. 
+ /// As the ancestor gap is determined by the 34-jump which is not know when the 12-jump is evaluated, + /// this lower bound only depends on the descendant gap. + /// + /// This lower bound takes the template switch base cost into account. + #[expect(dead_code)] + pub fn lower_bound_12(&self, descendant_gap: usize) -> Cost { + self.lower_bounds_12[[descendant_gap]] + } + + /// A lower bound of the cost of the jump chainings of a template switch. + /// This is a bound to chaining a primary anchor with a secondary anchor for the 12-jump, then chaining some secondary anchors (possibly none), + /// and finally chaining the last secondary anchor with a primary anchor for the 34-jump. + /// + /// **Note:** This lower bound supersedes the 12-jump lower bound, so adding both together would be incorrect. + #[expect(dead_code)] + pub fn lower_bound_1234( + &self, + ancestor_gap: usize, + descendant_gap1: usize, + descendant_gap2: usize, + ) -> Cost { + self.lower_bounds_1234[[ancestor_gap, descendant_gap1, descendant_gap2]] + } +} From 1b86a71394bcdf97d64e4076a32fc8c46f603272 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 1 Dec 2025 14:44:38 +0200 Subject: [PATCH 04/31] Precompute lower bounds also for 34-jump. 
--- lib_ts_chainalign/src/lib.rs | 9 +- .../src/lower_bounds/gap_affine.rs | 21 ++- .../src/lower_bounds/gap_affine/tests.rs | 108 ++++++++++++++ lib_ts_chainalign/src/lower_bounds/ts_jump.rs | 75 +++++----- .../src/lower_bounds/ts_jump/tests.rs | 138 ++++++++++++++++++ 5 files changed, 308 insertions(+), 43 deletions(-) create mode 100644 lib_ts_chainalign/src/lower_bounds/ts_jump/tests.rs diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index 5d4cb8a..43e65da 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -14,7 +14,12 @@ fn compute_lower_bounds( max_match_run: u32, costs: &AlignmentCosts, ) { - let _gap_affine_lower_bounds = + let gap_affine_lower_bounds = GapAffineLowerBounds::new(max_n, max_match_run, &costs.primary_costs); - let _ts_jump_lower_bounds = TsJumpLowerBounds::new(max_n, max_match_run, costs); + let ts_jump_lower_bounds = TsJumpLowerBounds::new(max_n, max_match_run, costs); + + // Remove dead code warnings + gap_affine_lower_bounds.lower_bound(0, 0); + ts_jump_lower_bounds.lower_bound_12(0); + ts_jump_lower_bounds.lower_bound_34(0); } diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine.rs index 0563473..51d3611 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine.rs @@ -14,13 +14,30 @@ pub struct GapAffineLowerBounds { impl GapAffineLowerBounds { pub fn new(max_n: usize, max_match_run: u32, cost_table: &GapAffineCosts) -> Self { - let mut lower_bounds = Array2::::from_elem((max_n + 1, max_n + 1), Cost::max_value()); + Self::compute(max_n, max_match_run, cost_table, false) + } + + pub(super) fn new_allow_all_matches( + max_n: usize, + max_match_run: u32, + cost_table: &GapAffineCosts, + ) -> Self { + Self::compute(max_n, max_match_run, cost_table, true) + } + + fn compute( + max_n: usize, + max_match_run: u32, + cost_table: &GapAffineCosts, + allow_all_match_run: bool, + ) -> 
Self { + let mut lower_bounds = Array2::from_elem((max_n + 1, max_n + 1), Cost::max_value()); lower_bounds[[0, 0]] = Cost::zero(); let context = Context::new(cost_table, max_match_run, max_n); let mut a_star = AStar::new(context); a_star.initialise(); a_star.search_until(|_, node| { - if node.identifier.has_non_match { + if node.identifier.has_non_match || allow_all_match_run { let lower_bound = &mut lower_bounds[[node.identifier.a, node.identifier.b]]; *lower_bound = (*lower_bound).min(node.cost().0); } diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs index 6991e6e..e3e995b 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs @@ -110,3 +110,111 @@ fn test_max_match_run_2() { } } } + +#[test] +fn test_max_match_run_0_allow_all_matches() { + let cost_table = GapAffineCosts { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }; + let max_n = 2; + let lower_bounds = GapAffineLowerBounds::new_allow_all_matches(max_n, 0, &cost_table); + + #[rustfmt::skip] + let exepcted_lower_bounds = Array2::from_shape_vec( + (max_n+1, max_n+1), + vec![ + 0u32, 3, 4, + 3, 2, 5, + 4, 5, 4, + ], + ) + .unwrap(); + + for a in 0..=max_n { + for b in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound(a, b).as_primitive(), + exepcted_lower_bounds[[a, b]], + "lower bound({}, {})", + a, + b + ); + } + } +} + +#[test] +fn test_max_match_run_1_allow_all_matches() { + let cost_table = GapAffineCosts { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }; + let max_n = 4; + let lower_bounds = GapAffineLowerBounds::new_allow_all_matches(max_n, 1, &cost_table); + + #[rustfmt::skip] + let exepcted_lower_bounds = Array2::from_shape_vec( + (max_n+1, max_n+1), + vec![ + 0u32, 3, 4, 5, 6, + 3, 0, 3, 4, 5, + 4, 3, 2, 3, 4, + 5, 4, 3, 2, 5, + 6, 5, 
4, 5, 4, + ], + ) + .unwrap(); + + for a in 0..=max_n { + for b in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound(a, b).as_primitive(), + exepcted_lower_bounds[[a, b]], + "lower bound({}, {})", + a, + b + ); + } + } +} + +#[test] +fn test_max_match_run_2_allow_all_matches() { + let cost_table = GapAffineCosts { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }; + let max_n = 6; + let lower_bounds = GapAffineLowerBounds::new_allow_all_matches(max_n, 2, &cost_table); + + #[rustfmt::skip] + let exepcted_lower_bounds = Array2::from_shape_vec( + (max_n+1, max_n+1), + vec![ + 0u32, 3, 4, 5, 6, 7, 8, + 3, 0, 3, 4, 5, 6, 7, + 4, 3, 0, 3, 4, 5, 6, + 5, 4, 3, 2, 3, 4, 5, + 6, 5, 4, 3, 2, 3, 4, + 7, 6, 5, 4, 3, 2, 5, + 8, 7, 6, 5, 4, 5, 4, + ], + ) + .unwrap(); + + for a in 0..=max_n { + for b in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound(a, b).as_primitive(), + exepcted_lower_bounds[[a, b]], + "lower bound({}, {})", + a, + b + ); + } + } +} diff --git a/lib_ts_chainalign/src/lower_bounds/ts_jump.rs b/lib_ts_chainalign/src/lower_bounds/ts_jump.rs index 194952e..93ac6f8 100644 --- a/lib_ts_chainalign/src/lower_bounds/ts_jump.rs +++ b/lib_ts_chainalign/src/lower_bounds/ts_jump.rs @@ -1,25 +1,32 @@ use generic_a_star::cost::AStarCost; -use ndarray::{Array1, Array3}; +use ndarray::Array1; use crate::{costs::AlignmentCosts, lower_bounds::gap_affine::GapAffineLowerBounds}; +#[cfg(test)] +mod tests; + pub struct TsJumpLowerBounds { - primary_lower_bounds: GapAffineLowerBounds, - secondary_lower_bounds: GapAffineLowerBounds, lower_bounds_12: Array1, - lower_bounds_1234: Array3, + lower_bounds_34: Array1, } impl TsJumpLowerBounds { pub fn new(max_n: usize, max_match_run: u32, cost_table: &AlignmentCosts) -> Self { - let primary_lower_bounds = - GapAffineLowerBounds::new(max_n, max_match_run, &cost_table.primary_costs); - let secondary_lower_bounds = - GapAffineLowerBounds::new(max_n, max_match_run, 
&cost_table.secondary_costs); + let primary_lower_bounds = GapAffineLowerBounds::new_allow_all_matches( + max_n, + max_match_run, + &cost_table.primary_costs, + ); + let secondary_lower_bounds = GapAffineLowerBounds::new_allow_all_matches( + max_n, + max_match_run, + &cost_table.secondary_costs, + ); // This way of calculating the lower bound for the 12-jump does not take the shape limits of the template switch into account. // However, most of the time these limits are gonna be big, so they should not have a big impact on the lower bound. - let mut lower_bounds_12 = Array1::::from_elem(max_n + 1, Cost::max_value()); + let mut lower_bounds_12 = Array1::from_elem(max_n + 1, Cost::max_value()); for primary_descendant_gap in 0..=max_n { for secondary_descendant_gap in 0..=max_n - primary_descendant_gap { let lower_bound = primary_lower_bounds @@ -32,47 +39,37 @@ impl TsJumpLowerBounds { } } - todo!() + let mut lower_bounds_34 = Array1::from_elem(max_n + 1, Cost::max_value()); + for secondary_descendant_gap in 0..=max_n { + for primary_descendant_gap in 0..=max_n - secondary_descendant_gap { + let lower_bound = secondary_lower_bounds + .variable_gap2_lower_bound(secondary_descendant_gap) + + primary_lower_bounds.variable_gap2_lower_bound(primary_descendant_gap); + lower_bounds_34[[primary_descendant_gap + secondary_descendant_gap]] = + lower_bounds_34[[primary_descendant_gap + secondary_descendant_gap]] + .min(lower_bound); + } + } + + Self { + lower_bounds_12, + lower_bounds_34, + } } } impl TsJumpLowerBounds { - /// A lower bound of the cost for chaining two primary anchors with the given gaps. - /// The lower bound is symmetric, so the order of the gaps does not matter. - #[expect(dead_code)] - pub fn primary_lower_bound(&self, gap1: usize, gap2: usize) -> Cost { - self.primary_lower_bounds.lower_bound(gap1, gap2) - } - - /// A lower bound of the cost for chaining two secondary anchors with the given gaps. 
- /// The lower bound is symmetric, so the order of the gaps does not matter. - #[expect(dead_code)] - pub fn secondary_lower_bound(&self, gap1: usize, gap2: usize) -> Cost { - self.secondary_lower_bounds.lower_bound(gap1, gap2) - } - /// A lower bound of the cost for chaining a primary anchor with a secondary anchor. - /// As the ancestor gap is determined by the 34-jump which is not know when the 12-jump is evaluated, - /// this lower bound only depends on the descendant gap. /// /// This lower bound takes the template switch base cost into account. - #[expect(dead_code)] pub fn lower_bound_12(&self, descendant_gap: usize) -> Cost { self.lower_bounds_12[[descendant_gap]] } - /// A lower bound of the cost of the jump chainings of a template switch. - /// This is a bound to chaining a primary anchor with a secondary anchor for the 12-jump, then chaining some secondary anchors (possibly none), - /// and finally chaining the last secondary anchor with a primary anchor for the 34-jump. + /// A lower bound of the cost for chaining a secondary anchor with a primary anchor. /// - /// **Note:** This lower bound supersedes the 12-jump lower bound, so adding both together would be incorrect. - #[expect(dead_code)] - pub fn lower_bound_1234( - &self, - ancestor_gap: usize, - descendant_gap1: usize, - descendant_gap2: usize, - ) -> Cost { - self.lower_bounds_1234[[ancestor_gap, descendant_gap1, descendant_gap2]] + /// This lower bound does **not** take the template switch base cost into account. 
+ pub fn lower_bound_34(&self, descendant_gap: usize) -> Cost { + self.lower_bounds_34[[descendant_gap]] } } diff --git a/lib_ts_chainalign/src/lower_bounds/ts_jump/tests.rs b/lib_ts_chainalign/src/lower_bounds/ts_jump/tests.rs new file mode 100644 index 0000000..1a02f1f --- /dev/null +++ b/lib_ts_chainalign/src/lower_bounds/ts_jump/tests.rs @@ -0,0 +1,138 @@ +use generic_a_star::cost::{AStarCost, U32Cost}; + +use crate::{ + costs::{AlignmentCosts, GapAffineCosts, TsLimits}, + lower_bounds::ts_jump::TsJumpLowerBounds, +}; + +#[test] +fn test_max_match_run_0() { + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }, + secondary_costs: GapAffineCosts { + substitution: U32Cost::from(4u8), + gap_open: U32Cost::from(6u8), + gap_extend: U32Cost::from(2u8), + }, + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..200, + }, + }; + + let max_n = 2; + let lower_bounds = TsJumpLowerBounds::new(max_n, 0, &cost_table); + + let expected_lower_bounds_12 = [2, 4, 6]; + let expected_lower_bounds_34 = + expected_lower_bounds_12.map(|cost| cost - cost_table.ts_base_cost.as_primitive()); + + for descendant_gap in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound_12(descendant_gap).as_primitive(), + expected_lower_bounds_12[descendant_gap], + "lower_bound_12({})", + descendant_gap + ); + assert_eq!( + lower_bounds.lower_bound_34(descendant_gap).as_primitive(), + expected_lower_bounds_34[descendant_gap], + "lower_bound_34({})", + descendant_gap + ); + } +} + +#[test] +fn test_max_match_run_1() { + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }, + secondary_costs: GapAffineCosts { + substitution: U32Cost::from(4u8), + gap_open: U32Cost::from(6u8), + gap_extend: 
U32Cost::from(2u8), + }, + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..200, + }, + }; + + let max_n = 8; + let lower_bounds = TsJumpLowerBounds::new(max_n, 1, &cost_table); + + let expected_lower_bounds_12 = [2, 2, 2, 4, 4, 6, 6, 8, 8]; + let expected_lower_bounds_34 = + expected_lower_bounds_12.map(|cost| cost - cost_table.ts_base_cost.as_primitive()); + + for descendant_gap in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound_12(descendant_gap).as_primitive(), + expected_lower_bounds_12[descendant_gap], + "lower_bound_12({})", + descendant_gap + ); + assert_eq!( + lower_bounds.lower_bound_34(descendant_gap).as_primitive(), + expected_lower_bounds_34[descendant_gap], + "lower_bound_34({})", + descendant_gap + ); + } +} + +#[test] +fn test_max_match_run_2() { + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts { + substitution: U32Cost::from(2u8), + gap_open: U32Cost::from(3u8), + gap_extend: U32Cost::from(1u8), + }, + secondary_costs: GapAffineCosts { + substitution: U32Cost::from(4u8), + gap_open: U32Cost::from(6u8), + gap_extend: U32Cost::from(2u8), + }, + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..200, + }, + }; + + let max_n = 9; + let lower_bounds = TsJumpLowerBounds::new(max_n, 2, &cost_table); + + let expected_lower_bounds_12 = [2, 2, 2, 2, 2, 4, 4, 4, 6, 6]; + let expected_lower_bounds_34 = + expected_lower_bounds_12.map(|cost| cost - cost_table.ts_base_cost.as_primitive()); + + for descendant_gap in 0..=max_n { + assert_eq!( + lower_bounds.lower_bound_12(descendant_gap).as_primitive(), + expected_lower_bounds_12[descendant_gap], + "lower_bound_12({})", + descendant_gap + ); + assert_eq!( + lower_bounds.lower_bound_34(descendant_gap).as_primitive(), + expected_lower_bounds_34[descendant_gap], + "lower_bound_34({})", + descendant_gap + ); + } +} From d073f98c506d4297cc21dc5bfc184cc14346a99a 
Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 1 Dec 2025 16:29:23 +0200 Subject: [PATCH 05/31] Implement gap filling for gap-affine alignment. --- generic_a_star/src/lib.rs | 14 ++ lib_ts_chainalign/src/alignment.rs | 59 +++++ .../src/alignment/coordinates.rs | 107 +++++++++ lib_ts_chainalign/src/alignment/sequences.rs | 19 ++ lib_ts_chainalign/src/costs.rs | 28 ++- lib_ts_chainalign/src/gap_filling.rs | 3 + .../src/gap_filling/gap_affine.rs | 64 +++++ .../src/gap_filling/gap_affine/algo.rs | 222 ++++++++++++++++++ .../src/gap_filling/gap_affine/tests.rs | 151 ++++++++++++ lib_ts_chainalign/src/lib.rs | 6 +- lib_ts_chainalign/src/lower_bounds.rs | 2 + .../src/lower_bounds/gap_affine.rs | 5 +- .../src/lower_bounds/gap_affine/algo.rs | 75 +++--- 13 files changed, 707 insertions(+), 48 deletions(-) create mode 100644 lib_ts_chainalign/src/alignment.rs create mode 100644 lib_ts_chainalign/src/alignment/coordinates.rs create mode 100644 lib_ts_chainalign/src/alignment/sequences.rs create mode 100644 lib_ts_chainalign/src/gap_filling.rs create mode 100644 lib_ts_chainalign/src/gap_filling/gap_affine.rs create mode 100644 lib_ts_chainalign/src/gap_filling/gap_affine/algo.rs create mode 100644 lib_ts_chainalign/src/gap_filling/gap_affine/tests.rs diff --git a/generic_a_star/src/lib.rs b/generic_a_star/src/lib.rs index 7baf8fd..ea825eb 100644 --- a/generic_a_star/src/lib.rs +++ b/generic_a_star/src/lib.rs @@ -512,6 +512,20 @@ impl AStar { None } } + + /// Reconstruct the path from a root node to the target node. + pub fn reconstruct_path(&self) -> Vec<::EdgeType> { + let AStarState::Terminated { + result: AStarResult::FoundTarget { .. 
}, + } = &self.state + else { + panic!("Cannot reconstruct path since no target was found.") + }; + + let mut result = self.backtrack().collect::>(); + result.reverse(); + result + } } impl AStarResult { diff --git a/lib_ts_chainalign/src/alignment.rs b/lib_ts_chainalign/src/alignment.rs new file mode 100644 index 0000000..b28c018 --- /dev/null +++ b/lib_ts_chainalign/src/alignment.rs @@ -0,0 +1,59 @@ +//! Representation of an alignment. + +use std::fmt::Display; + +pub mod coordinates; +pub mod sequences; + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub enum AlignmentType { + Match, + Substitution, + Gap1, + Gap2, + TsStart, + TsEnd, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum GapType { + None, + /// A gap in sequence 1, meaning that sequence 2 has characters that are missing from sequence 1. + In1, + /// A gap in sequence 2, meaning that sequence 1 has characters that are missing from sequence 2. + In2, +} + +pub struct Alignment { + pub alignment: Vec<(usize, AlignmentType)>, +} + +impl Display for GapType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GapType::None => write!(f, "M/S"), + GapType::In1 => write!(f, "GA"), + GapType::In2 => write!(f, "GB"), + } + } +} + +impl FromIterator for Alignment { + fn from_iter>(iter: T) -> Self { + let mut alignment = Vec::new(); + for alignment_type in iter { + if Some(alignment_type) == alignment.last().map(|(_, alignment_type)| *alignment_type) { + alignment.last_mut().unwrap().0 += 1; + } else { + alignment.push((1, alignment_type)); + } + } + Self { alignment } + } +} + +impl From> for Alignment { + fn from(alignment_types: Vec) -> Self { + alignment_types.into_iter().collect() + } +} diff --git a/lib_ts_chainalign/src/alignment/coordinates.rs b/lib_ts_chainalign/src/alignment/coordinates.rs new file mode 100644 index 0000000..844c305 --- /dev/null +++ b/lib_ts_chainalign/src/alignment/coordinates.rs @@ -0,0 +1,107 @@ +use 
std::{fmt::Display, ops::Range}; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct AlignmentCoordinates { + seq1: SequenceOrdinate, + seq2: SequenceOrdinate, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct SequenceOrdinate { + ordinate: usize, + rc: bool, +} + +impl AlignmentCoordinates { + pub fn new_forwards(seq1: usize, seq2: usize) -> Self { + Self { + seq1: SequenceOrdinate::new(seq1, false), + seq2: SequenceOrdinate::new(seq2, false), + } + } + + pub fn seq1(&self) -> SequenceOrdinate { + self.seq1 + } + + pub fn seq2(&self) -> SequenceOrdinate { + self.seq2 + } + + pub fn can_increment_1(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { + self.seq1() + .can_increment(start.seq1().ordinate()..end.seq1().ordinate()) + } + + pub fn can_increment_2(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { + self.seq2() + .can_increment(start.seq2().ordinate()..end.seq2().ordinate()) + } + + pub fn can_increment_both( + &self, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + ) -> bool { + self.can_increment_1(start, end) && self.can_increment_2(start, end) + } + + pub fn increment_1(&self) -> Self { + Self { + seq1: SequenceOrdinate::new( + if self.seq1.is_rc() { + self.seq1.ordinate().wrapping_sub(1) + } else { + self.seq1.ordinate() + 1 + }, + self.seq1.is_rc(), + ), + seq2: self.seq2, + } + } + + pub fn increment_2(&self) -> Self { + Self { + seq1: self.seq1, + seq2: SequenceOrdinate::new( + if self.seq2.is_rc() { + self.seq2.ordinate().wrapping_sub(1) + } else { + self.seq2.ordinate() + 1 + }, + self.seq2.is_rc(), + ), + } + } + + pub fn increment_both(&self) -> Self { + self.increment_1().increment_2() + } +} + +impl SequenceOrdinate { + pub fn new(ordinate: usize, rc: bool) -> Self { + Self { ordinate, rc } + } + + /// Returns the sequence index of this ordinate. 
+ /// + /// If the index runs over the end of the sequence, it may roll around to `usize::MAX` in reverse complements. + pub fn ordinate(&self) -> usize { + self.ordinate + } + + pub fn is_rc(&self) -> bool { + self.rc + } + + pub fn can_increment(&self, range: Range) -> bool { + range.contains(&self.ordinate) + } +} + +impl Display for SequenceOrdinate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.ordinate, if self.rc { "rc" } else { "" }) + } +} diff --git a/lib_ts_chainalign/src/alignment/sequences.rs b/lib_ts_chainalign/src/alignment/sequences.rs new file mode 100644 index 0000000..86707ab --- /dev/null +++ b/lib_ts_chainalign/src/alignment/sequences.rs @@ -0,0 +1,19 @@ +use crate::alignment::coordinates::AlignmentCoordinates; + +pub struct AlignmentSequences { + seq1: Vec, + seq2: Vec, +} + +impl AlignmentSequences { + pub fn new(seq1: Vec, seq2: Vec) -> Self { + Self { seq1, seq2 } + } + + pub fn characters(&self, coordinates: AlignmentCoordinates) -> (u8, u8) { + ( + self.seq1[coordinates.seq1().ordinate()], + self.seq2[coordinates.seq2().ordinate()], + ) + } +} diff --git a/lib_ts_chainalign/src/costs.rs b/lib_ts_chainalign/src/costs.rs index 6836df9..164186b 100644 --- a/lib_ts_chainalign/src/costs.rs +++ b/lib_ts_chainalign/src/costs.rs @@ -6,7 +6,6 @@ pub struct GapAffineCosts { pub gap_extend: Cost, } -#[expect(dead_code)] pub struct TsLimits { pub jump_12: Range, pub jump_34: Range, @@ -17,6 +16,31 @@ pub struct AlignmentCosts { pub primary_costs: GapAffineCosts, pub secondary_costs: GapAffineCosts, pub ts_base_cost: Cost, - #[expect(dead_code)] pub ts_limits: TsLimits, } + +impl GapAffineCosts { + pub fn new(substitution: Cost, gap_open: Cost, gap_extend: Cost) -> Self { + Self { + substitution, + gap_open, + gap_extend, + } + } +} + +impl AlignmentCosts { + pub fn new( + primary_costs: GapAffineCosts, + secondary_costs: GapAffineCosts, + ts_base_cost: Cost, + ts_limits: TsLimits, + ) -> Self { + 
Self { + primary_costs, + secondary_costs, + ts_base_cost, + ts_limits, + } + } +} diff --git a/lib_ts_chainalign/src/gap_filling.rs b/lib_ts_chainalign/src/gap_filling.rs new file mode 100644 index 0000000..1302c2d --- /dev/null +++ b/lib_ts_chainalign/src/gap_filling.rs @@ -0,0 +1,3 @@ +//! Fill the gap between two chained anchors with an optimal alignment. + +pub mod gap_affine; diff --git a/lib_ts_chainalign/src/gap_filling/gap_affine.rs b/lib_ts_chainalign/src/gap_filling/gap_affine.rs new file mode 100644 index 0000000..7e95184 --- /dev/null +++ b/lib_ts_chainalign/src/gap_filling/gap_affine.rs @@ -0,0 +1,64 @@ +use generic_a_star::{AStar, AStarResult, cost::AStarCost}; + +use crate::{ + alignment::{Alignment, coordinates::AlignmentCoordinates, sequences::AlignmentSequences}, + costs::GapAffineCosts, + gap_filling::gap_affine::algo::Context, +}; + +mod algo; +#[cfg(test)] +mod tests; + +pub struct GapAffineAlignment { + start: AlignmentCoordinates, + end: AlignmentCoordinates, + alignment: Alignment, + cost: Cost, +} + +impl GapAffineAlignment { + pub fn new( + start: AlignmentCoordinates, + end: AlignmentCoordinates, + sequences: &AlignmentSequences, + cost_table: &GapAffineCosts, + ) -> Self { + let context = Context::new(cost_table, sequences, start, end); + let mut a_star = AStar::new(context); + a_star.initialise(); + match a_star.search() { + AStarResult::FoundTarget { cost, .. } => Self { + start, + end, + alignment: a_star.reconstruct_path().into(), + cost, + }, + AStarResult::ExceededCostLimit { .. } => unreachable!("Cost limit is None"), + AStarResult::ExceededMemoryLimit { .. 
} => unreachable!("Cost limit is None"), + AStarResult::NoTarget => { + panic!("No gap-affine alignment found between the given coordinates") + } + } + } +} + +impl GapAffineAlignment { + pub fn start(&self) -> AlignmentCoordinates { + self.start + } + + pub fn end(&self) -> AlignmentCoordinates { + self.end + } + + pub fn alignment(&self) -> &Alignment { + &self.alignment + } +} + +impl GapAffineAlignment { + pub fn cost(&self) -> Cost { + self.cost + } +} diff --git a/lib_ts_chainalign/src/gap_filling/gap_affine/algo.rs b/lib_ts_chainalign/src/gap_filling/gap_affine/algo.rs new file mode 100644 index 0000000..6836267 --- /dev/null +++ b/lib_ts_chainalign/src/gap_filling/gap_affine/algo.rs @@ -0,0 +1,222 @@ +use std::fmt::Display; + +use generic_a_star::{AStarContext, AStarNode, cost::AStarCost, reset::Reset}; + +use crate::{ + alignment::{ + AlignmentType, GapType, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, + }, + costs::GapAffineCosts, +}; + +pub struct Context<'costs, 'sequences, Cost> { + costs: &'costs GapAffineCosts, + sequences: &'sequences AlignmentSequences, + start: AlignmentCoordinates, + end: AlignmentCoordinates, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct Node { + pub identifier: Identifier, + pub predecessor: Option, + pub predecessor_alignment_type: Option, + pub cost: Cost, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct Identifier { + pub coordinates: AlignmentCoordinates, + gap_type: GapType, +} + +impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { + pub fn new( + costs: &'costs GapAffineCosts, + sequences: &'sequences AlignmentSequences, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + ) -> Self { + Self { + costs, + sequences, + start, + end, + } + } +} + +impl AStarContext for Context<'_, '_, Cost> { + type Node = Node; + + fn create_root(&self) -> Self::Node { + Node { + identifier: Identifier { + coordinates: self.start, + gap_type: 
GapType::None, + }, + predecessor: None, + predecessor_alignment_type: None, + cost: Cost::zero(), + } + } + + fn generate_successors(&mut self, node: &Self::Node, output: &mut impl Extend) { + let Node { + identifier, cost, .. + } = node; + let predecessor = Some(*identifier); + let Identifier { + coordinates, + gap_type, + } = *identifier; + + if coordinates.can_increment_both(self.start, self.end) { + let (c1, c2) = self.sequences.characters(coordinates); + let is_match = c1 == c2; + + if is_match { + // Match + let new_cost = *cost; + output.extend(std::iter::once(Node { + identifier: Identifier { + coordinates: coordinates.increment_both(), + gap_type: GapType::None, + }, + predecessor, + predecessor_alignment_type: Some(AlignmentType::Match), + cost: new_cost, + })); + } else { + // Substitution + let new_cost = *cost + self.costs.substitution; + output.extend(std::iter::once(Node { + identifier: Identifier { + coordinates: coordinates.increment_both(), + gap_type: GapType::None, + }, + predecessor, + predecessor_alignment_type: Some(AlignmentType::Substitution), + cost: new_cost, + })); + } + } + + if coordinates.can_increment_1(self.start, self.end) { + // Gap in 2 + let new_cost = *cost + + match gap_type { + GapType::In2 => self.costs.gap_extend, + _ => self.costs.gap_open, + }; + output.extend(std::iter::once(Node { + identifier: Identifier { + coordinates: coordinates.increment_1(), + gap_type: GapType::In2, + }, + predecessor, + predecessor_alignment_type: Some(AlignmentType::Gap2), + cost: new_cost, + })); + } + + if coordinates.can_increment_2(self.start, self.end) { + // Gap in 1 + let new_cost = *cost + + match gap_type { + GapType::In1 => self.costs.gap_extend, + _ => self.costs.gap_open, + }; + output.extend(std::iter::once(Node { + identifier: Identifier { + coordinates: coordinates.increment_2(), + gap_type: GapType::In1, + }, + predecessor, + predecessor_alignment_type: Some(AlignmentType::Gap1), + cost: new_cost, + })); + } + } + + fn 
is_target(&self, node: &Self::Node) -> bool { + node.identifier.coordinates == self.end + } + + fn cost_limit(&self) -> Option<::Cost> { + None + } + + fn memory_limit(&self) -> Option { + None + } +} + +impl Reset for Context<'_, '_, Cost> { + fn reset(&mut self) { + unimplemented!() + } +} + +impl AStarNode for Node { + type Identifier = Identifier; + + type EdgeType = AlignmentType; + + // Use match run as secondary cost + type Cost = Cost; + + fn identifier(&self) -> &Self::Identifier { + &self.identifier + } + + fn cost(&self) -> Self::Cost { + self.cost + } + + fn a_star_lower_bound(&self) -> Self::Cost { + Cost::zero() + } + + fn secondary_maximisable_score(&self) -> usize { + 0 + } + + fn predecessor(&self) -> Option<&Self::Identifier> { + self.predecessor.as_ref() + } + + fn predecessor_edge_type(&self) -> Option { + self.predecessor_alignment_type + } +} + +impl Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}: {}", self.identifier, self.cost) + } +} + +impl Display for Identifier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "({}, {}, {})", + self.coordinates.seq1(), + self.coordinates.seq2(), + self.gap_type + ) + } +} + +impl PartialOrd for Node { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Node { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.cost.cmp(&other.cost) + } +} diff --git a/lib_ts_chainalign/src/gap_filling/gap_affine/tests.rs b/lib_ts_chainalign/src/gap_filling/gap_affine/tests.rs new file mode 100644 index 0000000..ad664b0 --- /dev/null +++ b/lib_ts_chainalign/src/gap_filling/gap_affine/tests.rs @@ -0,0 +1,151 @@ +use generic_a_star::cost::U32Cost; + +use crate::alignment::AlignmentType; +use crate::gap_filling::gap_affine::{AlignmentCoordinates, GapAffineAlignment}; +use crate::{alignment::sequences::AlignmentSequences, costs::GapAffineCosts}; + +#[test] +fn 
test_start_end() { + let seq1 = b"ACGT".to_vec(); + let seq2 = b"ACGTT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(0, 0); + let end = AlignmentCoordinates::new_forwards(4, 5); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![(4, AlignmentType::Match), (1, AlignmentType::Gap1)] + ); + assert_eq!(alignment.cost(), U32Cost::from(3u8)); +} + +#[test] +fn test_partial_alignment() { + let seq1 = b"ACCGT".to_vec(); + let seq2 = b"ACGGTT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(1, 1); + let end = AlignmentCoordinates::new_forwards(4, 4); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::Match), + (1, AlignmentType::Substitution), + (1, AlignmentType::Match) + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(2u8)); +} + +#[test] +fn test_gap_directions() { + let seq1 = b"ACGCCGTGTTCT".to_vec(); + let seq2 = b"ACGGTGTTAACT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(1, 1); + let end = AlignmentCoordinates::new_forwards(11, 11); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + 
alignment.alignment().alignment, + vec![ + (2, AlignmentType::Match), + (2, AlignmentType::Gap2), + (5, AlignmentType::Match), + (2, AlignmentType::Gap1), + (1, AlignmentType::Match) + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(8u8)); +} + +#[test] +fn test_extremity_gaps() { + let seq1 = b"ACGCCGTGTTCT".to_vec(); + let seq2 = b"ACGGTGTTAACT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(3, 3); + let end = AlignmentCoordinates::new_forwards(10, 10); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (2, AlignmentType::Gap2), + (5, AlignmentType::Match), + (2, AlignmentType::Gap1), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(8u8)); +} + +#[test] +fn test_extremity_substitutions() { + let seq1 = b"AGGGA".to_vec(); + let seq2 = b"TGGGT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(0, 0); + let end = AlignmentCoordinates::new_forwards(5, 5); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::Substitution), + (3, AlignmentType::Match), + (1, AlignmentType::Substitution), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(4u8)); +} + +#[test] +fn test_substitutions_as_gaps() { + let seq1 = b"AAAAAAAAAAAAAAAAAAAA".to_vec(); + let seq2 = b"TTTTTTTTTTTTTTTTTTTT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + 
GapAffineCosts::new(U32Cost::from(3u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(0, 0); + let end = AlignmentCoordinates::new_forwards(20, 20); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert!( + alignment.alignment().alignment + == vec![(20, AlignmentType::Gap1), (20, AlignmentType::Gap2),] + || alignment.alignment().alignment + == vec![(20, AlignmentType::Gap2), (20, AlignmentType::Gap1),] + ); + assert_eq!(alignment.cost(), U32Cost::from(44u8)); +} diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index 43e65da..d4fcd9d 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -5,8 +5,10 @@ use crate::{ lower_bounds::{gap_affine::GapAffineLowerBounds, ts_jump::TsJumpLowerBounds}, }; -mod costs; -mod lower_bounds; +pub mod alignment; +pub mod costs; +pub mod gap_filling; +pub mod lower_bounds; #[expect(dead_code)] fn compute_lower_bounds( diff --git a/lib_ts_chainalign/src/lower_bounds.rs b/lib_ts_chainalign/src/lower_bounds.rs index 273e7bf..032e8c7 100644 --- a/lib_ts_chainalign/src/lower_bounds.rs +++ b/lib_ts_chainalign/src/lower_bounds.rs @@ -1,2 +1,4 @@ +//! Compute lower bounds for chaining anchors with gaps. 
+ pub mod gap_affine; pub mod ts_jump; diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine.rs index 51d3611..2330de7 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine.rs @@ -38,7 +38,10 @@ impl GapAffineLowerBounds { a_star.initialise(); a_star.search_until(|_, node| { if node.identifier.has_non_match || allow_all_match_run { - let lower_bound = &mut lower_bounds[[node.identifier.a, node.identifier.b]]; + let lower_bound = &mut lower_bounds[[ + node.identifier.coordinates.seq1().ordinate(), + node.identifier.coordinates.seq2().ordinate(), + ]]; *lower_bound = (*lower_bound).min(node.cost().0); } false diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs index 82b8769..8ff6515 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs @@ -7,7 +7,10 @@ use generic_a_star::{ }; use num_traits::Zero; -use crate::costs::GapAffineCosts; +use crate::{ + alignment::{GapType, coordinates::AlignmentCoordinates}, + costs::GapAffineCosts, +}; pub struct Context<'a, Cost> { costs: &'a GapAffineCosts, @@ -24,20 +27,12 @@ pub struct Node { #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] pub struct Identifier { - pub a: usize, - pub b: usize, + pub coordinates: AlignmentCoordinates, /// True if this node was reached via at least one non-match. 
pub has_non_match: bool, gap_type: GapType, } -#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub enum GapType { - None, - InA, - InB, -} - impl<'a, Cost> Context<'a, Cost> { pub fn new(costs: &'a GapAffineCosts, max_match_run: u32, max_n: usize) -> Self { Self { @@ -54,8 +49,7 @@ impl AStarContext for Context<'_, Cost> { fn create_root(&self) -> Self::Node { Node { identifier: Identifier { - a: 0, - b: 0, + coordinates: AlignmentCoordinates::new_forwards(0, 0), has_non_match: false, gap_type: GapType::None, }, @@ -68,8 +62,7 @@ impl AStarContext for Context<'_, Cost> { let Node { identifier: Identifier { - a, - b, + coordinates, has_non_match, gap_type, }, @@ -77,14 +70,16 @@ impl AStarContext for Context<'_, Cost> { match_run, } = node; - if *a < self.max_n && *b < self.max_n { + if coordinates.seq1().can_increment(0..self.max_n) + && coordinates.seq2().can_increment(0..self.max_n) + { if *match_run < self.max_match_run { // Match let new_cost = *cost; output.extend(std::iter::once(Node { identifier: Identifier { - a: a + 1, - b: b + 1, + coordinates: coordinates.increment_both(), + has_non_match: *has_non_match, gap_type: GapType::None, }, @@ -97,8 +92,7 @@ impl AStarContext for Context<'_, Cost> { let new_cost = *cost + self.costs.substitution; output.extend(std::iter::once(Node { identifier: Identifier { - a: a + 1, - b: b + 1, + coordinates: coordinates.increment_both(), has_non_match: true, gap_type: GapType::None, }, @@ -107,38 +101,36 @@ impl AStarContext for Context<'_, Cost> { })); } - if *a < self.max_n { - // Gap in B + if coordinates.seq1().can_increment(0..self.max_n) { + // Gap in 2 let new_cost = *cost + match gap_type { - GapType::InB => self.costs.gap_extend, + GapType::In2 => self.costs.gap_extend, _ => self.costs.gap_open, }; output.extend(std::iter::once(Node { identifier: Identifier { - a: a + 1, - b: *b, + coordinates: coordinates.increment_1(), has_non_match: true, - gap_type: GapType::InB, + gap_type: GapType::In2, }, 
cost: new_cost, match_run: 0, })); } - if *b < self.max_n { - // Gap in A + if coordinates.seq2().can_increment(0..self.max_n) { + // Gap in 1 let new_cost = *cost + match gap_type { - GapType::InA => self.costs.gap_extend, + GapType::In1 => self.costs.gap_extend, _ => self.costs.gap_open, }; output.extend(std::iter::once(Node { identifier: Identifier { - a: *a, - b: b + 1, + coordinates: coordinates.increment_2(), has_non_match: true, - gap_type: GapType::InA, + gap_type: GapType::In1, }, cost: new_cost, match_run: 0, @@ -162,7 +154,7 @@ impl AStarContext for Context<'_, Cost> { impl Reset for Context<'_, Cost> { fn reset(&mut self) { - // No internal state to reset + unimplemented!() } } @@ -209,17 +201,14 @@ impl Display for Node { impl Display for Identifier { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "({}, {}, {})", self.a, self.b, self.gap_type) - } -} - -impl Display for GapType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - GapType::None => write!(f, "M/S"), - GapType::InA => write!(f, "GA"), - GapType::InB => write!(f, "GB"), - } + write!( + f, + "({}, {}, {}, {})", + self.coordinates.seq1(), + self.coordinates.seq2(), + self.gap_type, + self.has_non_match + ) } } From ec3eaa2e51f2234dc036b13b83547bce79d05e54 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 1 Dec 2025 16:31:09 +0200 Subject: [PATCH 06/31] Rename alignment modules. 
--- .../src/{lower_bounds.rs => chaining_lower_bounds.rs} | 0 .../{lower_bounds => chaining_lower_bounds}/gap_affine.rs | 2 +- .../gap_affine/algo.rs | 0 .../gap_affine/tests.rs | 2 +- .../src/{lower_bounds => chaining_lower_bounds}/ts_jump.rs | 2 +- .../ts_jump/tests.rs | 2 +- lib_ts_chainalign/src/{gap_filling.rs => exact_chaining.rs} | 0 .../src/{gap_filling => exact_chaining}/gap_affine.rs | 2 +- .../src/{gap_filling => exact_chaining}/gap_affine/algo.rs | 0 .../src/{gap_filling => exact_chaining}/gap_affine/tests.rs | 2 +- lib_ts_chainalign/src/lib.rs | 6 +++--- 11 files changed, 9 insertions(+), 9 deletions(-) rename lib_ts_chainalign/src/{lower_bounds.rs => chaining_lower_bounds.rs} (100%) rename lib_ts_chainalign/src/{lower_bounds => chaining_lower_bounds}/gap_affine.rs (96%) rename lib_ts_chainalign/src/{lower_bounds => chaining_lower_bounds}/gap_affine/algo.rs (100%) rename lib_ts_chainalign/src/{lower_bounds => chaining_lower_bounds}/gap_affine/tests.rs (98%) rename lib_ts_chainalign/src/{lower_bounds => chaining_lower_bounds}/ts_jump.rs (97%) rename lib_ts_chainalign/src/{lower_bounds => chaining_lower_bounds}/ts_jump/tests.rs (98%) rename lib_ts_chainalign/src/{gap_filling.rs => exact_chaining.rs} (100%) rename lib_ts_chainalign/src/{gap_filling => exact_chaining}/gap_affine.rs (97%) rename lib_ts_chainalign/src/{gap_filling => exact_chaining}/gap_affine/algo.rs (100%) rename lib_ts_chainalign/src/{gap_filling => exact_chaining}/gap_affine/tests.rs (98%) diff --git a/lib_ts_chainalign/src/lower_bounds.rs b/lib_ts_chainalign/src/chaining_lower_bounds.rs similarity index 100% rename from lib_ts_chainalign/src/lower_bounds.rs rename to lib_ts_chainalign/src/chaining_lower_bounds.rs diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs similarity index 96% rename from lib_ts_chainalign/src/lower_bounds/gap_affine.rs rename to lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs index 
2330de7..34655d7 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs @@ -1,7 +1,7 @@ use generic_a_star::{AStar, AStarNode, cost::AStarCost}; use ndarray::{Array1, Array2}; -use crate::{costs::GapAffineCosts, lower_bounds::gap_affine::algo::Context}; +use crate::{chaining_lower_bounds::gap_affine::algo::Context, costs::GapAffineCosts}; mod algo; #[cfg(test)] diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs similarity index 100% rename from lib_ts_chainalign/src/lower_bounds/gap_affine/algo.rs rename to lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs diff --git a/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/tests.rs similarity index 98% rename from lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs rename to lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/tests.rs index e3e995b..84f68f8 100644 --- a/lib_ts_chainalign/src/lower_bounds/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/tests.rs @@ -1,7 +1,7 @@ use generic_a_star::cost::{AStarCost, U32Cost}; use ndarray::Array2; -use crate::{costs::GapAffineCosts, lower_bounds::gap_affine::GapAffineLowerBounds}; +use crate::{chaining_lower_bounds::gap_affine::GapAffineLowerBounds, costs::GapAffineCosts}; #[test] fn test_max_match_run_0() { diff --git a/lib_ts_chainalign/src/lower_bounds/ts_jump.rs b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs similarity index 97% rename from lib_ts_chainalign/src/lower_bounds/ts_jump.rs rename to lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs index 93ac6f8..a65e583 100644 --- a/lib_ts_chainalign/src/lower_bounds/ts_jump.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs @@ -1,7 +1,7 @@ use generic_a_star::cost::AStarCost; use ndarray::Array1; -use crate::{costs::AlignmentCosts, 
lower_bounds::gap_affine::GapAffineLowerBounds}; +use crate::{chaining_lower_bounds::gap_affine::GapAffineLowerBounds, costs::AlignmentCosts}; #[cfg(test)] mod tests; diff --git a/lib_ts_chainalign/src/lower_bounds/ts_jump/tests.rs b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump/tests.rs similarity index 98% rename from lib_ts_chainalign/src/lower_bounds/ts_jump/tests.rs rename to lib_ts_chainalign/src/chaining_lower_bounds/ts_jump/tests.rs index 1a02f1f..aa9ea00 100644 --- a/lib_ts_chainalign/src/lower_bounds/ts_jump/tests.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump/tests.rs @@ -1,8 +1,8 @@ use generic_a_star::cost::{AStarCost, U32Cost}; use crate::{ + chaining_lower_bounds::ts_jump::TsJumpLowerBounds, costs::{AlignmentCosts, GapAffineCosts, TsLimits}, - lower_bounds::ts_jump::TsJumpLowerBounds, }; #[test] diff --git a/lib_ts_chainalign/src/gap_filling.rs b/lib_ts_chainalign/src/exact_chaining.rs similarity index 100% rename from lib_ts_chainalign/src/gap_filling.rs rename to lib_ts_chainalign/src/exact_chaining.rs diff --git a/lib_ts_chainalign/src/gap_filling/gap_affine.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs similarity index 97% rename from lib_ts_chainalign/src/gap_filling/gap_affine.rs rename to lib_ts_chainalign/src/exact_chaining/gap_affine.rs index 7e95184..807b2e4 100644 --- a/lib_ts_chainalign/src/gap_filling/gap_affine.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs @@ -3,7 +3,7 @@ use generic_a_star::{AStar, AStarResult, cost::AStarCost}; use crate::{ alignment::{Alignment, coordinates::AlignmentCoordinates, sequences::AlignmentSequences}, costs::GapAffineCosts, - gap_filling::gap_affine::algo::Context, + exact_chaining::gap_affine::algo::Context, }; mod algo; diff --git a/lib_ts_chainalign/src/gap_filling/gap_affine/algo.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs similarity index 100% rename from lib_ts_chainalign/src/gap_filling/gap_affine/algo.rs rename to 
lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs diff --git a/lib_ts_chainalign/src/gap_filling/gap_affine/tests.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs similarity index 98% rename from lib_ts_chainalign/src/gap_filling/gap_affine/tests.rs rename to lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs index ad664b0..a929bc5 100644 --- a/lib_ts_chainalign/src/gap_filling/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs @@ -1,7 +1,7 @@ use generic_a_star::cost::U32Cost; use crate::alignment::AlignmentType; -use crate::gap_filling::gap_affine::{AlignmentCoordinates, GapAffineAlignment}; +use crate::exact_chaining::gap_affine::{AlignmentCoordinates, GapAffineAlignment}; use crate::{alignment::sequences::AlignmentSequences, costs::GapAffineCosts}; #[test] diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index d4fcd9d..bce1629 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -1,14 +1,14 @@ use generic_a_star::cost::AStarCost; use crate::{ + chaining_lower_bounds::{gap_affine::GapAffineLowerBounds, ts_jump::TsJumpLowerBounds}, costs::AlignmentCosts, - lower_bounds::{gap_affine::GapAffineLowerBounds, ts_jump::TsJumpLowerBounds}, }; pub mod alignment; +pub mod chaining_lower_bounds; pub mod costs; -pub mod gap_filling; -pub mod lower_bounds; +pub mod exact_chaining; #[expect(dead_code)] fn compute_lower_bounds( From 1d4d9715f90f05f0537d72dbce944db6c6064427 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Tue, 2 Dec 2025 10:34:26 +0200 Subject: [PATCH 07/31] Restrict match runs in exact chaining. 
--- .../chaining_lower_bounds/gap_affine/algo.rs | 2 +- .../src/exact_chaining/gap_affine.rs | 5 +- .../src/exact_chaining/gap_affine/algo.rs | 59 ++++++++---- .../src/exact_chaining/gap_affine/tests.rs | 92 +++++++++++++++++-- 4 files changed, 131 insertions(+), 27 deletions(-) diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs index 8ff6515..f30230f 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs @@ -195,7 +195,7 @@ impl AStarNode for Node { impl Display for Node { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}: {}", self.identifier, self.cost) + write!(f, "{}: {}, {}", self.identifier, self.cost, self.match_run) } } diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs index 807b2e4..e95b0fb 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs @@ -23,8 +23,9 @@ impl GapAffineAlignment { end: AlignmentCoordinates, sequences: &AlignmentSequences, cost_table: &GapAffineCosts, + max_match_run: u32, ) -> Self { - let context = Context::new(cost_table, sequences, start, end); + let context = Context::new(cost_table, sequences, start, end, max_match_run); let mut a_star = AStar::new(context); a_star.initialise(); match a_star.search() { @@ -32,7 +33,7 @@ impl GapAffineAlignment { start, end, alignment: a_star.reconstruct_path().into(), - cost, + cost: cost.0, }, AStarResult::ExceededCostLimit { .. } => unreachable!("Cost limit is None"), AStarResult::ExceededMemoryLimit { .. 
} => unreachable!("Cost limit is None"), diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs index 6836267..f8f1bf4 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs @@ -1,6 +1,11 @@ use std::fmt::Display; -use generic_a_star::{AStarContext, AStarNode, cost::AStarCost, reset::Reset}; +use generic_a_star::{ + AStarContext, AStarNode, + cost::{AStarCost, OrderedPairCost, U32Cost}, + reset::Reset, +}; +use num_traits::Zero; use crate::{ alignment::{ @@ -14,6 +19,7 @@ pub struct Context<'costs, 'sequences, Cost> { sequences: &'sequences AlignmentSequences, start: AlignmentCoordinates, end: AlignmentCoordinates, + max_match_run: u32, } #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -22,6 +28,7 @@ pub struct Node { pub predecessor: Option, pub predecessor_alignment_type: Option, pub cost: Cost, + pub match_run: u32, } #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] @@ -36,12 +43,14 @@ impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { sequences: &'sequences AlignmentSequences, start: AlignmentCoordinates, end: AlignmentCoordinates, + max_match_run: u32, ) -> Self { Self { costs, sequences, start, end, + max_match_run, } } } @@ -58,12 +67,16 @@ impl AStarContext for Context<'_, '_, Cost> { predecessor: None, predecessor_alignment_type: None, cost: Cost::zero(), + match_run: 0, } } fn generate_successors(&mut self, node: &Self::Node, output: &mut impl Extend) { let Node { - identifier, cost, .. + identifier, + cost, + match_run, + .. 
} = node; let predecessor = Some(*identifier); let Identifier { @@ -76,17 +89,22 @@ impl AStarContext for Context<'_, '_, Cost> { let is_match = c1 == c2; if is_match { - // Match - let new_cost = *cost; - output.extend(std::iter::once(Node { - identifier: Identifier { - coordinates: coordinates.increment_both(), - gap_type: GapType::None, - }, - predecessor, - predecessor_alignment_type: Some(AlignmentType::Match), - cost: new_cost, - })); + // Disallow runs of matches longer than the maximum. + // This is because we do not want the exact chaining to find new anchors (which actually already exist). + if *match_run < self.max_match_run { + // Match + let new_cost = *cost; + output.extend(std::iter::once(Node { + identifier: Identifier { + coordinates: coordinates.increment_both(), + gap_type: GapType::None, + }, + predecessor, + predecessor_alignment_type: Some(AlignmentType::Match), + cost: new_cost, + match_run: match_run + 1, + })); + } } else { // Substitution let new_cost = *cost + self.costs.substitution; @@ -98,6 +116,7 @@ impl AStarContext for Context<'_, '_, Cost> { predecessor, predecessor_alignment_type: Some(AlignmentType::Substitution), cost: new_cost, + match_run: 0, })); } } @@ -117,6 +136,7 @@ impl AStarContext for Context<'_, '_, Cost> { predecessor, predecessor_alignment_type: Some(AlignmentType::Gap2), cost: new_cost, + match_run: 0, })); } @@ -135,6 +155,7 @@ impl AStarContext for Context<'_, '_, Cost> { predecessor, predecessor_alignment_type: Some(AlignmentType::Gap1), cost: new_cost, + match_run: 0, })); } } @@ -164,18 +185,18 @@ impl AStarNode for Node { type EdgeType = AlignmentType; // Use match run as secondary cost - type Cost = Cost; + type Cost = OrderedPairCost; fn identifier(&self) -> &Self::Identifier { &self.identifier } fn cost(&self) -> Self::Cost { - self.cost + OrderedPairCost(self.cost, U32Cost::from_primitive(self.match_run)) } fn a_star_lower_bound(&self) -> Self::Cost { - Cost::zero() + OrderedPairCost(Cost::zero(), 
U32Cost::zero()) } fn secondary_maximisable_score(&self) -> usize { @@ -193,7 +214,7 @@ impl AStarNode for Node { impl Display for Node { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}: {}", self.identifier, self.cost) + write!(f, "{}: {}, {}", self.identifier, self.cost, self.match_run) } } @@ -217,6 +238,8 @@ impl PartialOrd for Node { impl Ord for Node { fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.cost.cmp(&other.cost) + self.cost + .cmp(&other.cost) + .then_with(|| self.match_run.cmp(&other.match_run)) } } diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs index a929bc5..6e4cbb0 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs @@ -14,7 +14,7 @@ fn test_start_end() { let start = AlignmentCoordinates::new_forwards(0, 0); let end = AlignmentCoordinates::new_forwards(4, 5); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -35,7 +35,7 @@ fn test_partial_alignment() { let start = AlignmentCoordinates::new_forwards(1, 1); let end = AlignmentCoordinates::new_forwards(4, 4); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -60,7 +60,7 @@ fn test_gap_directions() { let start = AlignmentCoordinates::new_forwards(1, 1); let end = AlignmentCoordinates::new_forwards(11, 11); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); 
assert_eq!(alignment.end(), end); @@ -87,7 +87,7 @@ fn test_extremity_gaps() { let start = AlignmentCoordinates::new_forwards(3, 3); let end = AlignmentCoordinates::new_forwards(10, 10); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -112,7 +112,7 @@ fn test_extremity_substitutions() { let start = AlignmentCoordinates::new_forwards(0, 0); let end = AlignmentCoordinates::new_forwards(5, 5); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -137,7 +137,7 @@ fn test_substitutions_as_gaps() { let start = AlignmentCoordinates::new_forwards(0, 0); let end = AlignmentCoordinates::new_forwards(20, 20); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -149,3 +149,83 @@ fn test_substitutions_as_gaps() { ); assert_eq!(alignment.cost(), U32Cost::from(44u8)); } + +#[test] +fn test_max_match_run_0() { + let seq1 = b"AAAAAAAAAA".to_vec(); + let seq2 = b"AACAACCAAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(1, 1); + let end = AlignmentCoordinates::new_forwards(9, 9); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 0); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert!( + alignment.alignment().alignment + == vec![(8, AlignmentType::Gap1), (8, AlignmentType::Gap2),] 
+ || alignment.alignment().alignment + == vec![(8, AlignmentType::Gap2), (8, AlignmentType::Gap1),] + ); + assert_eq!(alignment.cost(), U32Cost::from(20u8)); +} + +#[test] +fn test_max_match_run_1() { + let seq1 = b"AAAAAAAAAA".to_vec(); + let seq2 = b"AACAACCAAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(1, 1); + let end = AlignmentCoordinates::new_forwards(9, 9); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 1); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::Match), + (1, AlignmentType::Substitution), + (1, AlignmentType::Match), + (1, AlignmentType::Gap2), + (1, AlignmentType::Match), + (2, AlignmentType::Substitution), + (1, AlignmentType::Match), + (1, AlignmentType::Gap1), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(12u8)); +} + +#[test] +fn test_max_match_run_2() { + let seq1 = b"AAAAAAAAAA".to_vec(); + let seq2 = b"AACAACCAAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_forwards(1, 1); + let end = AlignmentCoordinates::new_forwards(9, 9); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + (2, AlignmentType::Substitution), + (2, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(6u8)); +} From 3821835c2bd9d70bd19618f70925cc3127af6113 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: 
Tue, 2 Dec 2025 11:40:35 +0200 Subject: [PATCH 08/31] Refactor alignment coordinates. --- lib_ts_chainalign/src/alignment.rs | 31 +- .../src/alignment/coordinates.rs | 187 +++++++---- lib_ts_chainalign/src/alignment/sequences.rs | 23 +- .../src/chaining_lower_bounds/gap_affine.rs | 4 +- .../chaining_lower_bounds/gap_affine/algo.rs | 32 +- .../chaining_lower_bounds/ts_jump/tests.rs | 3 + lib_ts_chainalign/src/costs.rs | 1 + lib_ts_chainalign/src/exact_chaining.rs | 1 + .../src/exact_chaining/gap_affine.rs | 4 + .../src/exact_chaining/gap_affine/algo.rs | 32 +- .../src/exact_chaining/gap_affine/tests.rs | 36 +- .../src/exact_chaining/ts_12_jump.rs | 68 ++++ .../src/exact_chaining/ts_12_jump/algo.rs | 310 ++++++++++++++++++ .../src/exact_chaining/ts_12_jump/tests.rs | 1 + 14 files changed, 599 insertions(+), 134 deletions(-) create mode 100644 lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs create mode 100644 lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs create mode 100644 lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs diff --git a/lib_ts_chainalign/src/alignment.rs b/lib_ts_chainalign/src/alignment.rs index b28c018..fc9339f 100644 --- a/lib_ts_chainalign/src/alignment.rs +++ b/lib_ts_chainalign/src/alignment.rs @@ -11,17 +11,38 @@ pub enum AlignmentType { Substitution, Gap1, Gap2, - TsStart, + TsStart { + ancestor: TsAncestor, + descendant: TsDescendant, + }, TsEnd, } +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum TsAncestor { + Seq1, + Seq2, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum TsDescendant { + Seq1, + Seq2, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct TsKind { + ancestor: TsAncestor, + descendant: TsDescendant, +} + #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum GapType { None, /// A gap in sequence 1, meaning that sequence 2 has characters that are missing from sequence 1. 
- In1, + InA, /// A gap in sequence 2, meaning that sequence 1 has characters that are missing from sequence 2. - In2, + InB, } pub struct Alignment { @@ -32,8 +53,8 @@ impl Display for GapType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { GapType::None => write!(f, "M/S"), - GapType::In1 => write!(f, "GA"), - GapType::In2 => write!(f, "GB"), + GapType::InA => write!(f, "GA"), + GapType::InB => write!(f, "GB"), } } } diff --git a/lib_ts_chainalign/src/alignment/coordinates.rs b/lib_ts_chainalign/src/alignment/coordinates.rs index 844c305..5f92f5b 100644 --- a/lib_ts_chainalign/src/alignment/coordinates.rs +++ b/lib_ts_chainalign/src/alignment/coordinates.rs @@ -1,107 +1,156 @@ -use std::{fmt::Display, ops::Range}; +use std::fmt::Display; -#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub struct AlignmentCoordinates { - seq1: SequenceOrdinate, - seq2: SequenceOrdinate, -} +use crate::alignment::TsKind; #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub struct SequenceOrdinate { - ordinate: usize, - rc: bool, +pub enum AlignmentCoordinates { + Primary { + a: usize, + b: usize, + }, + Secondary { + ancestor: usize, + descendant: usize, + ts_kind: TsKind, + }, } impl AlignmentCoordinates { - pub fn new_forwards(seq1: usize, seq2: usize) -> Self { - Self { - seq1: SequenceOrdinate::new(seq1, false), - seq2: SequenceOrdinate::new(seq2, false), + pub fn new_primary(a: usize, b: usize) -> Self { + Self::Primary { a, b } + } + + pub fn primary_ordinate_a(&self) -> Option { + match self { + AlignmentCoordinates::Primary { a, .. } => Some(*a), + AlignmentCoordinates::Secondary { .. } => None, } } - pub fn seq1(&self) -> SequenceOrdinate { - self.seq1 + pub fn primary_ordinate_b(&self) -> Option { + match self { + AlignmentCoordinates::Primary { b, .. } => Some(*b), + AlignmentCoordinates::Secondary { .. 
} => None, + } } - pub fn seq2(&self) -> SequenceOrdinate { - self.seq2 + pub fn secondary_ordinate_ancestor(&self) -> Option { + match self { + AlignmentCoordinates::Secondary { ancestor, .. } => Some(*ancestor), + AlignmentCoordinates::Primary { .. } => None, + } } - pub fn can_increment_1(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { - self.seq1() - .can_increment(start.seq1().ordinate()..end.seq1().ordinate()) + pub fn secondary_ordinate_descendant(&self) -> Option { + match self { + AlignmentCoordinates::Secondary { descendant, .. } => Some(*descendant), + AlignmentCoordinates::Primary { .. } => None, + } } - pub fn can_increment_2(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { - self.seq2() - .can_increment(start.seq2().ordinate()..end.seq2().ordinate()) + pub fn ts_kind(&self) -> Option { + match self { + AlignmentCoordinates::Secondary { ts_kind, .. } => Some(*ts_kind), + AlignmentCoordinates::Primary { .. } => None, + } } - pub fn can_increment_both( - &self, - start: AlignmentCoordinates, - end: AlignmentCoordinates, - ) -> bool { - self.can_increment_1(start, end) && self.can_increment_2(start, end) + pub fn is_primary(&self) -> bool { + matches!(self, AlignmentCoordinates::Primary { .. }) } - pub fn increment_1(&self) -> Self { - Self { - seq1: SequenceOrdinate::new( - if self.seq1.is_rc() { - self.seq1.ordinate().wrapping_sub(1) - } else { - self.seq1.ordinate() + 1 - }, - self.seq1.is_rc(), - ), - seq2: self.seq2, - } + pub fn is_secondary(&self) -> bool { + matches!(self, AlignmentCoordinates::Secondary { .. 
}) } - pub fn increment_2(&self) -> Self { - Self { - seq1: self.seq1, - seq2: SequenceOrdinate::new( - if self.seq2.is_rc() { - self.seq2.ordinate().wrapping_sub(1) - } else { - self.seq2.ordinate() + 1 - }, - self.seq2.is_rc(), - ), + pub fn can_increment_a(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { + assert_eq!(self.ts_kind(), start.ts_kind()); + assert_eq!(self.ts_kind(), end.ts_kind()); + + match self { + AlignmentCoordinates::Primary { a, .. } => { + (start.primary_ordinate_a().unwrap()..end.primary_ordinate_a().unwrap()).contains(a) + } + AlignmentCoordinates::Secondary { ancestor, .. } => { + (start.secondary_ordinate_ancestor().unwrap() + ..end.secondary_ordinate_ancestor().unwrap()) + .contains(ancestor) + } } } - pub fn increment_both(&self) -> Self { - self.increment_1().increment_2() + pub fn can_increment_b(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { + assert_eq!(self.ts_kind(), start.ts_kind()); + assert_eq!(self.ts_kind(), end.ts_kind()); + + match self { + AlignmentCoordinates::Primary { b, .. } => { + (start.primary_ordinate_b().unwrap()..end.primary_ordinate_b().unwrap()).contains(b) + } + AlignmentCoordinates::Secondary { descendant, .. } => { + (start.secondary_ordinate_descendant().unwrap() + ..end.secondary_ordinate_descendant().unwrap()) + .contains(descendant) + } + } } -} -impl SequenceOrdinate { - pub fn new(ordinate: usize, rc: bool) -> Self { - Self { ordinate, rc } + pub fn can_increment_both( + &self, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + ) -> bool { + self.can_increment_a(start, end) && self.can_increment_b(start, end) } - /// Returns the sequence index of this ordinate. - /// - /// If the index runs over the end of the sequence, it may roll around to `usize::MAX` in reverse complements. 
- pub fn ordinate(&self) -> usize { - self.ordinate + pub fn increment_a(&self) -> Self { + match self { + AlignmentCoordinates::Primary { a, b } => { + AlignmentCoordinates::Primary { a: a + 1, b: *b } + } + AlignmentCoordinates::Secondary { + ancestor, + descendant, + ts_kind, + } => AlignmentCoordinates::Secondary { + ancestor: ancestor - 1, + descendant: *descendant, + ts_kind: *ts_kind, + }, + } } - pub fn is_rc(&self) -> bool { - self.rc + pub fn increment_b(&self) -> Self { + match self { + AlignmentCoordinates::Primary { a, b } => { + AlignmentCoordinates::Primary { a: *a, b: b + 1 } + } + AlignmentCoordinates::Secondary { + ancestor, + descendant, + ts_kind, + } => AlignmentCoordinates::Secondary { + ancestor: *ancestor, + descendant: descendant + 1, + ts_kind: *ts_kind, + }, + } } - pub fn can_increment(&self, range: Range) -> bool { - range.contains(&self.ordinate) + pub fn increment_both(&self) -> Self { + self.increment_a().increment_b() } } -impl Display for SequenceOrdinate { +impl Display for AlignmentCoordinates { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}{}", self.ordinate, if self.rc { "rc" } else { "" }) + match self { + AlignmentCoordinates::Primary { a, b } => write!(f, "({}, {})", a, b), + AlignmentCoordinates::Secondary { + ancestor, + descendant, + ts_kind, + } => write!(f, "({}, {}, {:?})", ancestor, descendant, ts_kind), + } } } diff --git a/lib_ts_chainalign/src/alignment/sequences.rs b/lib_ts_chainalign/src/alignment/sequences.rs index 86707ab..ccdf184 100644 --- a/lib_ts_chainalign/src/alignment/sequences.rs +++ b/lib_ts_chainalign/src/alignment/sequences.rs @@ -1,4 +1,4 @@ -use crate::alignment::coordinates::AlignmentCoordinates; +use crate::alignment::{TsAncestor, TsDescendant, coordinates::AlignmentCoordinates}; pub struct AlignmentSequences { seq1: Vec, @@ -11,9 +11,22 @@ impl AlignmentSequences { } pub fn characters(&self, coordinates: AlignmentCoordinates) -> (u8, u8) { - ( - 
self.seq1[coordinates.seq1().ordinate()], - self.seq2[coordinates.seq2().ordinate()], - ) + match coordinates { + AlignmentCoordinates::Primary { a, b } => (self.seq1[a], self.seq2[b]), + AlignmentCoordinates::Secondary { + ancestor, + descendant, + ts_kind, + } => ( + match ts_kind.ancestor { + TsAncestor::Seq1 => self.seq1[ancestor], + TsAncestor::Seq2 => self.seq2[ancestor], + }, + match ts_kind.descendant { + TsDescendant::Seq1 => self.seq1[descendant], + TsDescendant::Seq2 => self.seq2[descendant], + }, + ), + } } } diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs index 34655d7..8e1d94e 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs @@ -39,8 +39,8 @@ impl GapAffineLowerBounds { a_star.search_until(|_, node| { if node.identifier.has_non_match || allow_all_match_run { let lower_bound = &mut lower_bounds[[ - node.identifier.coordinates.seq1().ordinate(), - node.identifier.coordinates.seq2().ordinate(), + node.identifier.coordinates.primary_ordinate_a().unwrap(), + node.identifier.coordinates.primary_ordinate_b().unwrap(), ]]; *lower_bound = (*lower_bound).min(node.cost().0); } diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs index f30230f..21b26e0 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs @@ -49,7 +49,7 @@ impl AStarContext for Context<'_, Cost> { fn create_root(&self) -> Self::Node { Node { identifier: Identifier { - coordinates: AlignmentCoordinates::new_forwards(0, 0), + coordinates: AlignmentCoordinates::new_primary(0, 0), has_non_match: false, gap_type: GapType::None, }, @@ -69,10 +69,10 @@ impl AStarContext for Context<'_, Cost> { cost, match_run, } = node; + let start = 
AlignmentCoordinates::new_primary(0, 0); + let end = AlignmentCoordinates::new_primary(self.max_n, self.max_n); - if coordinates.seq1().can_increment(0..self.max_n) - && coordinates.seq2().can_increment(0..self.max_n) - { + if coordinates.can_increment_both(start, end) { if *match_run < self.max_match_run { // Match let new_cost = *cost; @@ -101,36 +101,36 @@ impl AStarContext for Context<'_, Cost> { })); } - if coordinates.seq1().can_increment(0..self.max_n) { - // Gap in 2 + if coordinates.can_increment_a(start, end) { + // Gap in b let new_cost = *cost + match gap_type { - GapType::In2 => self.costs.gap_extend, + GapType::InB => self.costs.gap_extend, _ => self.costs.gap_open, }; output.extend(std::iter::once(Node { identifier: Identifier { - coordinates: coordinates.increment_1(), + coordinates: coordinates.increment_a(), has_non_match: true, - gap_type: GapType::In2, + gap_type: GapType::InB, }, cost: new_cost, match_run: 0, })); } - if coordinates.seq2().can_increment(0..self.max_n) { - // Gap in 1 + if coordinates.can_increment_b(start, end) { + // Gap in a let new_cost = *cost + match gap_type { - GapType::In1 => self.costs.gap_extend, + GapType::InA => self.costs.gap_extend, _ => self.costs.gap_open, }; output.extend(std::iter::once(Node { identifier: Identifier { - coordinates: coordinates.increment_2(), + coordinates: coordinates.increment_b(), has_non_match: true, - gap_type: GapType::In1, + gap_type: GapType::InA, }, cost: new_cost, match_run: 0, @@ -204,8 +204,8 @@ impl Display for Identifier { write!( f, "({}, {}, {}, {})", - self.coordinates.seq1(), - self.coordinates.seq2(), + self.coordinates.primary_ordinate_a().unwrap(), + self.coordinates.primary_ordinate_b().unwrap(), self.gap_type, self.has_non_match ) diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump/tests.rs b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump/tests.rs index aa9ea00..d69d9a3 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump/tests.rs +++ 
b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump/tests.rs @@ -23,6 +23,7 @@ fn test_max_match_run_0() { jump_12: -100..100, jump_34: -100..100, length_23: 0..200, + ancestor_gap: -100..100, }, }; @@ -67,6 +68,7 @@ fn test_max_match_run_1() { jump_12: -100..100, jump_34: -100..100, length_23: 0..200, + ancestor_gap: -100..100, }, }; @@ -111,6 +113,7 @@ fn test_max_match_run_2() { jump_12: -100..100, jump_34: -100..100, length_23: 0..200, + ancestor_gap: -100..100, }, }; diff --git a/lib_ts_chainalign/src/costs.rs b/lib_ts_chainalign/src/costs.rs index 164186b..2d6a075 100644 --- a/lib_ts_chainalign/src/costs.rs +++ b/lib_ts_chainalign/src/costs.rs @@ -10,6 +10,7 @@ pub struct TsLimits { pub jump_12: Range, pub jump_34: Range, pub length_23: Range, + pub ancestor_gap: Range, } pub struct AlignmentCosts { diff --git a/lib_ts_chainalign/src/exact_chaining.rs b/lib_ts_chainalign/src/exact_chaining.rs index 1302c2d..988526b 100644 --- a/lib_ts_chainalign/src/exact_chaining.rs +++ b/lib_ts_chainalign/src/exact_chaining.rs @@ -1,3 +1,4 @@ //! Fill the gap between two chained anchors with an optimal alignment. 
pub mod gap_affine; +pub mod ts_12_jump; diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs index e95b0fb..3986084 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs @@ -25,6 +25,10 @@ impl GapAffineAlignment { cost_table: &GapAffineCosts, max_match_run: u32, ) -> Self { + assert!( + start.is_primary() && end.is_primary() || start.is_secondary() && end.is_secondary() + ); + let context = Context::new(cost_table, sequences, start, end, max_match_run); let mut a_star = AStar::new(context); a_star.initialise(); diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs index f8f1bf4..f16a33d 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs @@ -85,8 +85,8 @@ impl AStarContext for Context<'_, '_, Cost> { } = *identifier; if coordinates.can_increment_both(self.start, self.end) { - let (c1, c2) = self.sequences.characters(coordinates); - let is_match = c1 == c2; + let (ca, cb) = self.sequences.characters(coordinates); + let is_match = ca == cb; if is_match { // Disallow runs of matches longer than the maximum. 
@@ -121,17 +121,17 @@ impl AStarContext for Context<'_, '_, Cost> { } } - if coordinates.can_increment_1(self.start, self.end) { - // Gap in 2 + if coordinates.can_increment_a(self.start, self.end) { + // Gap in b let new_cost = *cost + match gap_type { - GapType::In2 => self.costs.gap_extend, + GapType::InB => self.costs.gap_extend, _ => self.costs.gap_open, }; output.extend(std::iter::once(Node { identifier: Identifier { - coordinates: coordinates.increment_1(), - gap_type: GapType::In2, + coordinates: coordinates.increment_a(), + gap_type: GapType::InB, }, predecessor, predecessor_alignment_type: Some(AlignmentType::Gap2), @@ -140,17 +140,17 @@ impl AStarContext for Context<'_, '_, Cost> { })); } - if coordinates.can_increment_2(self.start, self.end) { - // Gap in 1 + if coordinates.can_increment_b(self.start, self.end) { + // Gap in a let new_cost = *cost + match gap_type { - GapType::In1 => self.costs.gap_extend, + GapType::InA => self.costs.gap_extend, _ => self.costs.gap_open, }; output.extend(std::iter::once(Node { identifier: Identifier { - coordinates: coordinates.increment_2(), - gap_type: GapType::In1, + coordinates: coordinates.increment_b(), + gap_type: GapType::InA, }, predecessor, predecessor_alignment_type: Some(AlignmentType::Gap1), @@ -220,13 +220,7 @@ impl Display for Node { impl Display for Identifier { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "({}, {}, {})", - self.coordinates.seq1(), - self.coordinates.seq2(), - self.gap_type - ) + write!(f, "({}, {})", self.coordinates, self.gap_type) } } diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs index 6e4cbb0..9055eb3 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs @@ -12,8 +12,8 @@ fn test_start_end() { let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), 
U32Cost::from(1u8)); - let start = AlignmentCoordinates::new_forwards(0, 0); - let end = AlignmentCoordinates::new_forwards(4, 5); + let start = AlignmentCoordinates::new_primary(0, 0); + let end = AlignmentCoordinates::new_primary(4, 5); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); @@ -33,8 +33,8 @@ fn test_partial_alignment() { let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); - let start = AlignmentCoordinates::new_forwards(1, 1); - let end = AlignmentCoordinates::new_forwards(4, 4); + let start = AlignmentCoordinates::new_primary(1, 1); + let end = AlignmentCoordinates::new_primary(4, 4); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); @@ -58,8 +58,8 @@ fn test_gap_directions() { let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); - let start = AlignmentCoordinates::new_forwards(1, 1); - let end = AlignmentCoordinates::new_forwards(11, 11); + let start = AlignmentCoordinates::new_primary(1, 1); + let end = AlignmentCoordinates::new_primary(11, 11); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); @@ -85,8 +85,8 @@ fn test_extremity_gaps() { let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); - let start = AlignmentCoordinates::new_forwards(3, 3); - let end = AlignmentCoordinates::new_forwards(10, 10); + let start = AlignmentCoordinates::new_primary(3, 3); + let end = AlignmentCoordinates::new_primary(10, 10); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); @@ -110,8 +110,8 @@ fn test_extremity_substitutions() { let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); - let 
start = AlignmentCoordinates::new_forwards(0, 0); - let end = AlignmentCoordinates::new_forwards(5, 5); + let start = AlignmentCoordinates::new_primary(0, 0); + let end = AlignmentCoordinates::new_primary(5, 5); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); @@ -135,8 +135,8 @@ fn test_substitutions_as_gaps() { let cost_table = GapAffineCosts::new(U32Cost::from(3u8), U32Cost::from(3u8), U32Cost::from(1u8)); - let start = AlignmentCoordinates::new_forwards(0, 0); - let end = AlignmentCoordinates::new_forwards(20, 20); + let start = AlignmentCoordinates::new_primary(0, 0); + let end = AlignmentCoordinates::new_primary(20, 20); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); assert_eq!(alignment.start(), start); @@ -158,8 +158,8 @@ fn test_max_match_run_0() { let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); - let start = AlignmentCoordinates::new_forwards(1, 1); - let end = AlignmentCoordinates::new_forwards(9, 9); + let start = AlignmentCoordinates::new_primary(1, 1); + let end = AlignmentCoordinates::new_primary(9, 9); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 0); assert_eq!(alignment.start(), start); @@ -181,8 +181,8 @@ fn test_max_match_run_1() { let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); - let start = AlignmentCoordinates::new_forwards(1, 1); - let end = AlignmentCoordinates::new_forwards(9, 9); + let start = AlignmentCoordinates::new_primary(1, 1); + let end = AlignmentCoordinates::new_primary(9, 9); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 1); assert_eq!(alignment.start(), start); @@ -211,8 +211,8 @@ fn test_max_match_run_2() { let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); - let start = 
AlignmentCoordinates::new_forwards(1, 1); - let end = AlignmentCoordinates::new_forwards(9, 9); + let start = AlignmentCoordinates::new_primary(1, 1); + let end = AlignmentCoordinates::new_primary(9, 9); let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 2); assert_eq!(alignment.start(), start); diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs new file mode 100644 index 0000000..f5c7c1e --- /dev/null +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs @@ -0,0 +1,68 @@ +use generic_a_star::{AStar, AStarResult, cost::AStarCost}; + +use crate::{ + alignment::{Alignment, coordinates::AlignmentCoordinates, sequences::AlignmentSequences}, + costs::AlignmentCosts, + exact_chaining::ts_12_jump::algo::Context, +}; + +mod algo; +#[cfg(test)] +mod tests; + +pub struct Ts12JumpAlignment { + start: AlignmentCoordinates, + end: AlignmentCoordinates, + alignment: Alignment, + cost: Cost, +} + +impl Ts12JumpAlignment { + pub fn new( + start: AlignmentCoordinates, + end: AlignmentCoordinates, + sequences: &AlignmentSequences, + cost_table: &AlignmentCosts, + max_match_run: u32, + ) -> Self { + assert!(start.is_primary()); + assert!(end.is_secondary()); + + let context = Context::new(cost_table, sequences, start, end, max_match_run); + let mut a_star = AStar::new(context); + a_star.initialise(); + match a_star.search() { + AStarResult::FoundTarget { cost, .. } => Self { + start, + end, + alignment: a_star.reconstruct_path().into(), + cost: cost.0, + }, + AStarResult::ExceededCostLimit { .. } => unreachable!("Cost limit is None"), + AStarResult::ExceededMemoryLimit { .. 
} => unreachable!("Cost limit is None"), + AStarResult::NoTarget => { + panic!("No TS 12-jump alignment found between the given coordinates") + } + } + } +} + +impl Ts12JumpAlignment { + pub fn start(&self) -> AlignmentCoordinates { + self.start + } + + pub fn end(&self) -> AlignmentCoordinates { + self.end + } + + pub fn alignment(&self) -> &Alignment { + &self.alignment + } +} + +impl Ts12JumpAlignment { + pub fn cost(&self) -> Cost { + self.cost + } +} diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs new file mode 100644 index 0000000..2d81c8d --- /dev/null +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs @@ -0,0 +1,310 @@ +use std::fmt::Display; + +use generic_a_star::{ + AStarContext, AStarNode, + cost::{AStarCost, OrderedPairCost, U32Cost}, + reset::Reset, +}; +use num_traits::Zero; + +use crate::{ + alignment::{ + AlignmentType, GapType, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, + }, + costs::AlignmentCosts, +}; + +pub struct Context<'costs, 'sequences, Cost> { + costs: &'costs AlignmentCosts, + sequences: &'sequences AlignmentSequences, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + max_match_run: u32, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct Node { + pub identifier: Identifier, + pub predecessor: Option, + pub predecessor_alignment_type: Option, + pub cost: Cost, + pub match_run: u32, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum Identifier { + Primary { + coordinates: AlignmentCoordinates, + gap_type: GapType, + }, + Jump12 { + coordinates: AlignmentCoordinates, + jump: isize, + }, + Secondary { + coordinates: AlignmentCoordinates, + gap_type: GapType, + }, +} + +impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { + pub fn new( + costs: &'costs AlignmentCosts, + sequences: &'sequences AlignmentSequences, + start: AlignmentCoordinates, + end: 
AlignmentCoordinates, + max_match_run: u32, + ) -> Self { + Self { + costs, + sequences, + start, + end, + max_match_run, + } + } +} + +impl AStarContext for Context<'_, '_, Cost> { + type Node = Node; + + fn create_root(&self) -> Self::Node { + Node { + identifier: Identifier::Primary { + coordinates: self.start, + gap_type: GapType::None, + }, + predecessor: None, + predecessor_alignment_type: None, + cost: Cost::zero(), + match_run: 0, + } + } + + fn generate_successors(&mut self, node: &Self::Node, output: &mut impl Extend) { + let Node { + identifier, + cost, + match_run, + .. + } = node; + let predecessor = Some(*identifier); + + let coordinates = identifier.coordinates(); + let gap_type = identifier.gap_type(); + let is_primary = matches!(identifier, Identifier::Primary { .. }); + let gap_affine_costs = if is_primary { + &self.costs.primary_costs + } else { + &self.costs.secondary_costs + }; + + // Generate gap-affine successors. + if coordinates.can_increment_both(self.start, self.end) { + let (ca, cb) = self.sequences.characters(coordinates); + let is_match = ca == cb; + + if is_match { + // Disallow runs of matches longer than the maximum. + // This is because we do not want the exact chaining to find new anchors (which actually already exist). 
+ if *match_run < self.max_match_run { + // Match + let new_cost = *cost; + output.extend(std::iter::once(Node { + identifier: Identifier::new_primary_secondary( + is_primary, + coordinates.increment_both(), + GapType::None, + ), + predecessor, + predecessor_alignment_type: Some(AlignmentType::Match), + cost: new_cost, + match_run: match_run + 1, + })); + } + } else { + // Substitution + let new_cost = *cost + gap_affine_costs.substitution; + output.extend(std::iter::once(Node { + identifier: Identifier::new_primary_secondary( + is_primary, + coordinates.increment_both(), + GapType::None, + ), + predecessor, + predecessor_alignment_type: Some(AlignmentType::Substitution), + cost: new_cost, + match_run: 0, + })); + } + } + + if coordinates.can_increment_a(self.start, self.end) { + // Gap in b + let new_cost = *cost + + match gap_type { + GapType::InB => gap_affine_costs.gap_extend, + _ => gap_affine_costs.gap_open, + }; + output.extend(std::iter::once(Node { + identifier: Identifier::new_primary_secondary( + is_primary, + coordinates.increment_a(), + GapType::InB, + ), + predecessor, + predecessor_alignment_type: Some(AlignmentType::Gap2), + cost: new_cost, + match_run: 0, + })); + } + + if coordinates.can_increment_b(self.start, self.end) { + // Gap in a + let new_cost = *cost + + match gap_type { + GapType::InA => gap_affine_costs.gap_extend, + _ => gap_affine_costs.gap_open, + }; + output.extend(std::iter::once(Node { + identifier: Identifier::new_primary_secondary( + is_primary, + coordinates.increment_b(), + GapType::InA, + ), + predecessor, + predecessor_alignment_type: Some(AlignmentType::Gap1), + cost: new_cost, + match_run: 0, + })); + } + + // Generate jump successors. 
+ if is_primary { + //let coordinates = coordinates.jump_12(self.end.ts_kind()); + } + } + + fn is_target(&self, node: &Self::Node) -> bool { + node.identifier.coordinates() == self.end + } + + fn cost_limit(&self) -> Option<::Cost> { + None + } + + fn memory_limit(&self) -> Option { + None + } +} + +impl Reset for Context<'_, '_, Cost> { + fn reset(&mut self) { + unimplemented!() + } +} + +impl AStarNode for Node { + type Identifier = Identifier; + + type EdgeType = AlignmentType; + + // Use match run as secondary cost + type Cost = OrderedPairCost; + + fn identifier(&self) -> &Self::Identifier { + &self.identifier + } + + fn cost(&self) -> Self::Cost { + OrderedPairCost(self.cost, U32Cost::from_primitive(self.match_run)) + } + + fn a_star_lower_bound(&self) -> Self::Cost { + OrderedPairCost(Cost::zero(), U32Cost::zero()) + } + + fn secondary_maximisable_score(&self) -> usize { + 0 + } + + fn predecessor(&self) -> Option<&Self::Identifier> { + self.predecessor.as_ref() + } + + fn predecessor_edge_type(&self) -> Option { + self.predecessor_alignment_type + } +} + +impl Identifier { + pub fn new_primary_secondary( + is_primary: bool, + coordinates: AlignmentCoordinates, + gap_type: GapType, + ) -> Self { + if is_primary { + Identifier::Primary { + coordinates, + gap_type, + } + } else { + Identifier::Secondary { + coordinates, + gap_type, + } + } + } + + pub fn coordinates(&self) -> AlignmentCoordinates { + match self { + Identifier::Primary { coordinates, .. } => *coordinates, + Identifier::Jump12 { coordinates, .. } => *coordinates, + Identifier::Secondary { coordinates, .. } => *coordinates, + } + } + + pub fn gap_type(&self) -> GapType { + match self { + Identifier::Primary { gap_type, .. } => *gap_type, + Identifier::Jump12 { .. } => GapType::None, + Identifier::Secondary { gap_type, .. 
} => *gap_type, + } + } +} + +impl Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}: {}, {}", self.identifier, self.cost, self.match_run) + } +} + +impl Display for Identifier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}({}, {})", + match self { + Identifier::Primary { .. } => "P".to_string(), + Identifier::Jump12 { jump, .. } => format!("J12[{jump}]"), + Identifier::Secondary { .. } => "S".to_string(), + }, + self.coordinates(), + self.gap_type(), + ) + } +} + +impl PartialOrd for Node { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Node { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.cost + .cmp(&other.cost) + .then_with(|| self.match_run.cmp(&other.match_run)) + } +} diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs @@ -0,0 +1 @@ + From 7040766fe41f4084052e1ce3c70b7b011ddf8670 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Tue, 2 Dec 2025 13:09:42 +0200 Subject: [PATCH 09/31] Implement TS 12-jump exact chaining. 
--- lib_ts_chainalign/src/alignment.rs | 9 +- .../src/alignment/coordinates.rs | 97 +++++++++++++++---- lib_ts_chainalign/src/alignment/sequences.rs | 8 ++ .../chaining_lower_bounds/gap_affine/algo.rs | 2 +- .../src/exact_chaining/gap_affine/algo.rs | 6 +- .../src/exact_chaining/gap_affine/tests.rs | 22 ++--- .../src/exact_chaining/ts_12_jump/algo.rs | 30 ++++-- 7 files changed, 126 insertions(+), 48 deletions(-) diff --git a/lib_ts_chainalign/src/alignment.rs b/lib_ts_chainalign/src/alignment.rs index fc9339f..3b50c91 100644 --- a/lib_ts_chainalign/src/alignment.rs +++ b/lib_ts_chainalign/src/alignment.rs @@ -9,12 +9,9 @@ pub mod sequences; pub enum AlignmentType { Match, Substitution, - Gap1, - Gap2, - TsStart { - ancestor: TsAncestor, - descendant: TsDescendant, - }, + GapA, + GapB, + TsStart { jump: isize, ts_kind: TsKind }, TsEnd, } diff --git a/lib_ts_chainalign/src/alignment/coordinates.rs b/lib_ts_chainalign/src/alignment/coordinates.rs index 5f92f5b..1736e87 100644 --- a/lib_ts_chainalign/src/alignment/coordinates.rs +++ b/lib_ts_chainalign/src/alignment/coordinates.rs @@ -1,6 +1,6 @@ use std::fmt::Display; -use crate::alignment::TsKind; +use crate::alignment::{TsAncestor, TsDescendant, TsKind}; #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum AlignmentCoordinates { @@ -63,35 +63,50 @@ impl AlignmentCoordinates { matches!(self, AlignmentCoordinates::Secondary { .. }) } + /// Checks if ordinate a can be incremented. + /// In secondary alignments, ordinate b is the ancestor. + /// + /// If ordinate a and the boundary have the same TS kind (possibly `None`), then the check is performed normally. + /// If the TS kinds differ, then there is a jump before the boundary, and ordinate a can always be incremented. 
pub fn can_increment_a(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { - assert_eq!(self.ts_kind(), start.ts_kind()); - assert_eq!(self.ts_kind(), end.ts_kind()); - match self { AlignmentCoordinates::Primary { a, .. } => { - (start.primary_ordinate_a().unwrap()..end.primary_ordinate_a().unwrap()).contains(a) + if self.ts_kind() == end.ts_kind() { + // Incrementing primary ordinate a is always a plus operation, so we only need to check the upper bound. + (..end.primary_ordinate_a().unwrap()).contains(a) + } else { + true + } } AlignmentCoordinates::Secondary { ancestor, .. } => { - (start.secondary_ordinate_ancestor().unwrap() - ..end.secondary_ordinate_ancestor().unwrap()) - .contains(ancestor) + if self.ts_kind() == start.ts_kind() { + // Incrementing the secondary ancestor is always a minus operation, so we only need to check the lower bound. + (start.secondary_ordinate_ancestor().unwrap()..).contains(ancestor) + } else { + true + } } } } - pub fn can_increment_b(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { - assert_eq!(self.ts_kind(), start.ts_kind()); - assert_eq!(self.ts_kind(), end.ts_kind()); - - match self { - AlignmentCoordinates::Primary { b, .. } => { - (start.primary_ordinate_b().unwrap()..end.primary_ordinate_b().unwrap()).contains(b) - } - AlignmentCoordinates::Secondary { descendant, .. } => { - (start.secondary_ordinate_descendant().unwrap() - ..end.secondary_ordinate_descendant().unwrap()) - .contains(descendant) + /// Checks if ordinate b can be incremented. + /// In secondary alignments, ordinate b is the descendant. + /// + /// If ordinate b and the `end` boundary have the same TS kind (possibly `None`), then the check is performed normally. + /// If the TS kinds differ, then there is a jump before the `end` boundary, and ordinate b can always be incremented. 
+ pub fn can_increment_b(&self, end: AlignmentCoordinates) -> bool { + if self.ts_kind() == end.ts_kind() { + // Incrementing ordinate b is always a plus operation, so we only need to check the upper bound. + match self { + AlignmentCoordinates::Primary { b, .. } => { + (..end.primary_ordinate_b().unwrap()).contains(b) + } + AlignmentCoordinates::Secondary { descendant, .. } => { + (..end.secondary_ordinate_descendant().unwrap()).contains(descendant) + } } + } else { + true } } @@ -100,7 +115,7 @@ impl AlignmentCoordinates { start: AlignmentCoordinates, end: AlignmentCoordinates, ) -> bool { - self.can_increment_a(start, end) && self.can_increment_b(start, end) + self.can_increment_a(start, end) && self.can_increment_b(end) } pub fn increment_a(&self) -> Self { @@ -140,6 +155,46 @@ impl AlignmentCoordinates { pub fn increment_both(&self) -> Self { self.increment_a().increment_b() } + + /// Generate all possible 12-jumps. + /// + /// The TS kind is given by the start coordinates. + /// The left and right limits of the jump are given by the start and end coordinates. + /// The end coordinates must be in primary form and simply be the end of the aligned sequences. + /// The start coordinates are in secondary form. 
+ pub fn generate_12_jumps( + &self, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + ) -> impl Iterator { + let Self::Primary { a, b } = *self else { + panic!("Can only generate 12-jumps from primary coordinates"); + }; + let ts_kind = start.ts_kind().unwrap(); + let ancestor_zero = match ts_kind.ancestor { + TsAncestor::Seq1 => a, + TsAncestor::Seq2 => b, + } as isize; + let ancestor_limit = match ts_kind.ancestor { + TsAncestor::Seq1 => end.primary_ordinate_a().unwrap(), + TsAncestor::Seq2 => end.primary_ordinate_b().unwrap(), + }; + let descendant = match ts_kind.descendant { + TsDescendant::Seq1 => a, + TsDescendant::Seq2 => b, + }; + + (start.secondary_ordinate_ancestor().unwrap()..ancestor_limit).map(move |ancestor| { + ( + ancestor as isize - ancestor_zero, + Self::Secondary { + ancestor, + descendant, + ts_kind, + }, + ) + }) + } } impl Display for AlignmentCoordinates { diff --git a/lib_ts_chainalign/src/alignment/sequences.rs b/lib_ts_chainalign/src/alignment/sequences.rs index ccdf184..f7c3a9b 100644 --- a/lib_ts_chainalign/src/alignment/sequences.rs +++ b/lib_ts_chainalign/src/alignment/sequences.rs @@ -29,4 +29,12 @@ impl AlignmentSequences { ), } } + + pub fn start(&self) -> AlignmentCoordinates { + AlignmentCoordinates::new_primary(0, 0) + } + + pub fn end(&self) -> AlignmentCoordinates { + AlignmentCoordinates::new_primary(self.seq1.len(), self.seq2.len()) + } } diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs index 21b26e0..3c615d6 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs @@ -119,7 +119,7 @@ impl AStarContext for Context<'_, Cost> { })); } - if coordinates.can_increment_b(start, end) { + if coordinates.can_increment_b(end) { // Gap in a let new_cost = *cost + match gap_type { diff --git 
a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs index f16a33d..c619f9c 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs @@ -134,13 +134,13 @@ impl AStarContext for Context<'_, '_, Cost> { gap_type: GapType::InB, }, predecessor, - predecessor_alignment_type: Some(AlignmentType::Gap2), + predecessor_alignment_type: Some(AlignmentType::GapB), cost: new_cost, match_run: 0, })); } - if coordinates.can_increment_b(self.start, self.end) { + if coordinates.can_increment_b(self.end) { // Gap in a let new_cost = *cost + match gap_type { @@ -153,7 +153,7 @@ impl AStarContext for Context<'_, '_, Cost> { gap_type: GapType::InA, }, predecessor, - predecessor_alignment_type: Some(AlignmentType::Gap1), + predecessor_alignment_type: Some(AlignmentType::GapA), cost: new_cost, match_run: 0, })); diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs index 9055eb3..affbb2a 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs @@ -20,7 +20,7 @@ fn test_start_end() { assert_eq!(alignment.end(), end); assert_eq!( alignment.alignment().alignment, - vec![(4, AlignmentType::Match), (1, AlignmentType::Gap1)] + vec![(4, AlignmentType::Match), (1, AlignmentType::GapA)] ); assert_eq!(alignment.cost(), U32Cost::from(3u8)); } @@ -68,9 +68,9 @@ fn test_gap_directions() { alignment.alignment().alignment, vec![ (2, AlignmentType::Match), - (2, AlignmentType::Gap2), + (2, AlignmentType::GapB), (5, AlignmentType::Match), - (2, AlignmentType::Gap1), + (2, AlignmentType::GapA), (1, AlignmentType::Match) ] ); @@ -94,9 +94,9 @@ fn test_extremity_gaps() { assert_eq!( alignment.alignment().alignment, vec![ - (2, AlignmentType::Gap2), + (2, AlignmentType::GapB), (5, AlignmentType::Match), - (2, 
AlignmentType::Gap1), + (2, AlignmentType::GapA), ] ); assert_eq!(alignment.cost(), U32Cost::from(8u8)); @@ -143,9 +143,9 @@ fn test_substitutions_as_gaps() { assert_eq!(alignment.end(), end); assert!( alignment.alignment().alignment - == vec![(20, AlignmentType::Gap1), (20, AlignmentType::Gap2),] + == vec![(20, AlignmentType::GapA), (20, AlignmentType::GapB),] || alignment.alignment().alignment - == vec![(20, AlignmentType::Gap2), (20, AlignmentType::Gap1),] + == vec![(20, AlignmentType::GapB), (20, AlignmentType::GapA),] ); assert_eq!(alignment.cost(), U32Cost::from(44u8)); } @@ -166,9 +166,9 @@ fn test_max_match_run_0() { assert_eq!(alignment.end(), end); assert!( alignment.alignment().alignment - == vec![(8, AlignmentType::Gap1), (8, AlignmentType::Gap2),] + == vec![(8, AlignmentType::GapA), (8, AlignmentType::GapB),] || alignment.alignment().alignment - == vec![(8, AlignmentType::Gap2), (8, AlignmentType::Gap1),] + == vec![(8, AlignmentType::GapB), (8, AlignmentType::GapA),] ); assert_eq!(alignment.cost(), U32Cost::from(20u8)); } @@ -193,11 +193,11 @@ fn test_max_match_run_1() { (1, AlignmentType::Match), (1, AlignmentType::Substitution), (1, AlignmentType::Match), - (1, AlignmentType::Gap2), + (1, AlignmentType::GapB), (1, AlignmentType::Match), (2, AlignmentType::Substitution), (1, AlignmentType::Match), - (1, AlignmentType::Gap1), + (1, AlignmentType::GapA), ] ); assert_eq!(alignment.cost(), U32Cost::from(12u8)); diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs index 2d81c8d..972ce28 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs @@ -39,7 +39,6 @@ pub enum Identifier { }, Jump12 { coordinates: AlignmentCoordinates, - jump: isize, }, Secondary { coordinates: AlignmentCoordinates, @@ -55,6 +54,9 @@ impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { end: AlignmentCoordinates, 
max_match_run: u32, ) -> Self { + assert!(start.ts_kind().is_none()); + assert!(end.ts_kind().is_some()); + Self { costs, sequences, @@ -153,13 +155,13 @@ impl AStarContext for Context<'_, '_, Cost> { GapType::InB, ), predecessor, - predecessor_alignment_type: Some(AlignmentType::Gap2), + predecessor_alignment_type: Some(AlignmentType::GapB), cost: new_cost, match_run: 0, })); } - if coordinates.can_increment_b(self.start, self.end) { + if coordinates.can_increment_b(self.end) { // Gap in a let new_cost = *cost + match gap_type { @@ -173,7 +175,7 @@ impl AStarContext for Context<'_, '_, Cost> { GapType::InA, ), predecessor, - predecessor_alignment_type: Some(AlignmentType::Gap1), + predecessor_alignment_type: Some(AlignmentType::GapA), cost: new_cost, match_run: 0, })); @@ -181,7 +183,23 @@ impl AStarContext for Context<'_, '_, Cost> { // Generate jump successors. if is_primary { - //let coordinates = coordinates.jump_12(self.end.ts_kind()); + let new_cost = *cost + self.costs.ts_base_cost; + + // This generates too many jumps, most of these are gonna be much too far. + output.extend( + coordinates + .generate_12_jumps(self.end, self.sequences.end()) + .map(|(jump, coordinates)| Node { + identifier: Identifier::Jump12 { coordinates }, + predecessor, + predecessor_alignment_type: Some(AlignmentType::TsStart { + jump, + ts_kind: coordinates.ts_kind().unwrap(), + }), + cost: new_cost, + match_run: 0, + }), + ); } } @@ -286,7 +304,7 @@ impl Display for Identifier { "{}({}, {})", match self { Identifier::Primary { .. } => "P".to_string(), - Identifier::Jump12 { jump, .. } => format!("J12[{jump}]"), + Identifier::Jump12 { .. } => "J".to_string(), Identifier::Secondary { .. } => "S".to_string(), }, self.coordinates(), From c32f72d1bb4f30ba03e3c2d7fbc9c69e23a03119 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Tue, 2 Dec 2025 14:06:28 +0200 Subject: [PATCH 10/31] Fix gap filling for gap-affine secondary alignment. 
--- lib_ts_chainalign/src/alignment.rs | 21 +----- .../src/alignment/coordinates.rs | 40 +++++------ lib_ts_chainalign/src/alignment/sequences.rs | 15 ++-- lib_ts_chainalign/src/alignment/ts_kind.rs | 36 ++++++++++ .../chaining_lower_bounds/gap_affine/algo.rs | 5 +- .../src/exact_chaining/gap_affine.rs | 3 +- .../src/exact_chaining/gap_affine/algo.rs | 17 +++-- .../src/exact_chaining/gap_affine/tests.rs | 71 ++++++++++++++++--- .../src/exact_chaining/ts_12_jump.rs | 3 +- .../src/exact_chaining/ts_12_jump/algo.rs | 17 +++-- .../src/exact_chaining/ts_12_jump/tests.rs | 70 ++++++++++++++++++ 11 files changed, 228 insertions(+), 70 deletions(-) create mode 100644 lib_ts_chainalign/src/alignment/ts_kind.rs diff --git a/lib_ts_chainalign/src/alignment.rs b/lib_ts_chainalign/src/alignment.rs index 3b50c91..5e57b99 100644 --- a/lib_ts_chainalign/src/alignment.rs +++ b/lib_ts_chainalign/src/alignment.rs @@ -2,8 +2,11 @@ use std::fmt::Display; +use crate::alignment::ts_kind::TsKind; + pub mod coordinates; pub mod sequences; +pub mod ts_kind; #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub enum AlignmentType { @@ -15,24 +18,6 @@ pub enum AlignmentType { TsEnd, } -#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub enum TsAncestor { - Seq1, - Seq2, -} - -#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub enum TsDescendant { - Seq1, - Seq2, -} - -#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub struct TsKind { - ancestor: TsAncestor, - descendant: TsDescendant, -} - #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum GapType { None, diff --git a/lib_ts_chainalign/src/alignment/coordinates.rs b/lib_ts_chainalign/src/alignment/coordinates.rs index 1736e87..72eca5c 100644 --- a/lib_ts_chainalign/src/alignment/coordinates.rs +++ b/lib_ts_chainalign/src/alignment/coordinates.rs @@ -1,6 +1,6 @@ use std::fmt::Display; -use crate::alignment::{TsAncestor, TsDescendant, TsKind}; +use 
crate::alignment::ts_kind::{TsAncestor, TsDescendant, TsKind}; #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum AlignmentCoordinates { @@ -20,6 +20,14 @@ impl AlignmentCoordinates { Self::Primary { a, b } } + pub fn new_secondary(ancestor: usize, descendant: usize, ts_kind: TsKind) -> Self { + Self::Secondary { + ancestor, + descendant, + ts_kind, + } + } + pub fn primary_ordinate_a(&self) -> Option { match self { AlignmentCoordinates::Primary { a, .. } => Some(*a), @@ -64,28 +72,24 @@ impl AlignmentCoordinates { } /// Checks if ordinate a can be incremented. - /// In secondary alignments, ordinate b is the ancestor. + /// In secondary alignments, ordinate a is the ancestor. /// /// If ordinate a and the boundary have the same TS kind (possibly `None`), then the check is performed normally. /// If the TS kinds differ, then there is a jump before the boundary, and ordinate a can always be incremented. - pub fn can_increment_a(&self, start: AlignmentCoordinates, end: AlignmentCoordinates) -> bool { - match self { - AlignmentCoordinates::Primary { a, .. } => { - if self.ts_kind() == end.ts_kind() { + pub fn can_increment_a(&self, end: AlignmentCoordinates) -> bool { + if self.ts_kind() == end.ts_kind() { + match self { + AlignmentCoordinates::Primary { a, .. } => { // Incrementing primary ordinate a is always a plus operation, so we only need to check the upper bound. (..end.primary_ordinate_a().unwrap()).contains(a) - } else { - true } - } - AlignmentCoordinates::Secondary { ancestor, .. } => { - if self.ts_kind() == start.ts_kind() { + AlignmentCoordinates::Secondary { ancestor, .. } => { // Incrementing the secondary ancestor is always a minus operation, so we only need to check the lower bound. 
- (start.secondary_ordinate_ancestor().unwrap()..).contains(ancestor) - } else { - true + (end.secondary_ordinate_ancestor().unwrap()..).contains(ancestor) } } + } else { + true } } @@ -110,12 +114,8 @@ impl AlignmentCoordinates { } } - pub fn can_increment_both( - &self, - start: AlignmentCoordinates, - end: AlignmentCoordinates, - ) -> bool { - self.can_increment_a(start, end) && self.can_increment_b(end) + pub fn can_increment_both(&self, end: AlignmentCoordinates) -> bool { + self.can_increment_a(end) && self.can_increment_b(end) } pub fn increment_a(&self) -> Self { diff --git a/lib_ts_chainalign/src/alignment/sequences.rs b/lib_ts_chainalign/src/alignment/sequences.rs index f7c3a9b..64419da 100644 --- a/lib_ts_chainalign/src/alignment/sequences.rs +++ b/lib_ts_chainalign/src/alignment/sequences.rs @@ -1,4 +1,7 @@ -use crate::alignment::{TsAncestor, TsDescendant, coordinates::AlignmentCoordinates}; +use crate::alignment::{ + coordinates::AlignmentCoordinates, + ts_kind::{TsAncestor, TsDescendant}, +}; pub struct AlignmentSequences { seq1: Vec, @@ -10,7 +13,11 @@ impl AlignmentSequences { Self { seq1, seq2 } } - pub fn characters(&self, coordinates: AlignmentCoordinates) -> (u8, u8) { + pub fn characters( + &self, + coordinates: AlignmentCoordinates, + rc_fn: &dyn Fn(u8) -> u8, + ) -> (u8, u8) { match coordinates { AlignmentCoordinates::Primary { a, b } => (self.seq1[a], self.seq2[b]), AlignmentCoordinates::Secondary { @@ -22,10 +29,10 @@ impl AlignmentSequences { TsAncestor::Seq1 => self.seq1[ancestor], TsAncestor::Seq2 => self.seq2[ancestor], }, - match ts_kind.descendant { + rc_fn(match ts_kind.descendant { TsDescendant::Seq1 => self.seq1[descendant], TsDescendant::Seq2 => self.seq2[descendant], - }, + }), ), } } diff --git a/lib_ts_chainalign/src/alignment/ts_kind.rs b/lib_ts_chainalign/src/alignment/ts_kind.rs new file mode 100644 index 0000000..c7f8993 --- /dev/null +++ b/lib_ts_chainalign/src/alignment/ts_kind.rs @@ -0,0 +1,36 @@ +#[derive(Debug, Clone, 
Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct TsKind { + pub ancestor: TsAncestor, + pub descendant: TsDescendant, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum TsAncestor { + Seq1, + Seq2, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum TsDescendant { + Seq1, + Seq2, +} + +impl TsKind { + pub const TS11: Self = TsKind { + ancestor: TsAncestor::Seq1, + descendant: TsDescendant::Seq1, + }; + pub const TS12: Self = TsKind { + ancestor: TsAncestor::Seq1, + descendant: TsDescendant::Seq2, + }; + pub const TS21: Self = TsKind { + ancestor: TsAncestor::Seq2, + descendant: TsDescendant::Seq1, + }; + pub const TS22: Self = TsKind { + ancestor: TsAncestor::Seq2, + descendant: TsDescendant::Seq2, + }; +} diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs index 3c615d6..130952e 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs @@ -69,10 +69,9 @@ impl AStarContext for Context<'_, Cost> { cost, match_run, } = node; - let start = AlignmentCoordinates::new_primary(0, 0); let end = AlignmentCoordinates::new_primary(self.max_n, self.max_n); - if coordinates.can_increment_both(start, end) { + if coordinates.can_increment_both(end) { if *match_run < self.max_match_run { // Match let new_cost = *cost; @@ -101,7 +100,7 @@ impl AStarContext for Context<'_, Cost> { })); } - if coordinates.can_increment_a(start, end) { + if coordinates.can_increment_a(end) { // Gap in b let new_cost = *cost + match gap_type { diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs index 3986084..e43d9a0 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs @@ -23,13 +23,14 @@ impl GapAffineAlignment { end: 
AlignmentCoordinates, sequences: &AlignmentSequences, cost_table: &GapAffineCosts, + rc_fn: &dyn Fn(u8) -> u8, max_match_run: u32, ) -> Self { assert!( start.is_primary() && end.is_primary() || start.is_secondary() && end.is_secondary() ); - let context = Context::new(cost_table, sequences, start, end, max_match_run); + let context = Context::new(cost_table, sequences, rc_fn, start, end, max_match_run); let mut a_star = AStar::new(context); a_star.initialise(); match a_star.search() { diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs index c619f9c..c179ed3 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs @@ -14,9 +14,10 @@ use crate::{ costs::GapAffineCosts, }; -pub struct Context<'costs, 'sequences, Cost> { +pub struct Context<'costs, 'sequences, 'rc_fn, Cost> { costs: &'costs GapAffineCosts, sequences: &'sequences AlignmentSequences, + rc_fn: &'rc_fn dyn Fn(u8) -> u8, start: AlignmentCoordinates, end: AlignmentCoordinates, max_match_run: u32, @@ -37,10 +38,11 @@ pub struct Identifier { gap_type: GapType, } -impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { +impl<'costs, 'sequences, 'rc_fn, Cost> Context<'costs, 'sequences, 'rc_fn, Cost> { pub fn new( costs: &'costs GapAffineCosts, sequences: &'sequences AlignmentSequences, + rc_fn: &'rc_fn dyn Fn(u8) -> u8, start: AlignmentCoordinates, end: AlignmentCoordinates, max_match_run: u32, @@ -48,6 +50,7 @@ impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { Self { costs, sequences, + rc_fn, start, end, max_match_run, @@ -55,7 +58,7 @@ impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { } } -impl AStarContext for Context<'_, '_, Cost> { +impl AStarContext for Context<'_, '_, '_, Cost> { type Node = Node; fn create_root(&self) -> Self::Node { @@ -84,8 +87,8 @@ impl AStarContext for Context<'_, '_, Cost> { gap_type, } = 
*identifier; - if coordinates.can_increment_both(self.start, self.end) { - let (ca, cb) = self.sequences.characters(coordinates); + if coordinates.can_increment_both(self.end) { + let (ca, cb) = self.sequences.characters(coordinates, self.rc_fn); let is_match = ca == cb; if is_match { @@ -121,7 +124,7 @@ impl AStarContext for Context<'_, '_, Cost> { } } - if coordinates.can_increment_a(self.start, self.end) { + if coordinates.can_increment_a(self.end) { // Gap in b let new_cost = *cost + match gap_type { @@ -173,7 +176,7 @@ impl AStarContext for Context<'_, '_, Cost> { } } -impl Reset for Context<'_, '_, Cost> { +impl Reset for Context<'_, '_, '_, Cost> { fn reset(&mut self) { unimplemented!() } diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs index affbb2a..27c9d1c 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs @@ -1,9 +1,20 @@ use generic_a_star::cost::U32Cost; use crate::alignment::AlignmentType; +use crate::alignment::ts_kind::TsKind; use crate::exact_chaining::gap_affine::{AlignmentCoordinates, GapAffineAlignment}; use crate::{alignment::sequences::AlignmentSequences, costs::GapAffineCosts}; +fn rc_fn(c: u8) -> u8 { + match c { + b'A' => b'T', + b'C' => b'G', + b'G' => b'C', + b'T' => b'A', + c => unimplemented!("Unsupported character {c}"), + } +} + #[test] fn test_start_end() { let seq1 = b"ACGT".to_vec(); @@ -14,7 +25,7 @@ fn test_start_end() { let start = AlignmentCoordinates::new_primary(0, 0); let end = AlignmentCoordinates::new_primary(4, 5); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -35,7 +46,7 @@ fn test_partial_alignment() { let start = 
AlignmentCoordinates::new_primary(1, 1); let end = AlignmentCoordinates::new_primary(4, 4); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -60,7 +71,7 @@ fn test_gap_directions() { let start = AlignmentCoordinates::new_primary(1, 1); let end = AlignmentCoordinates::new_primary(11, 11); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -87,7 +98,7 @@ fn test_extremity_gaps() { let start = AlignmentCoordinates::new_primary(3, 3); let end = AlignmentCoordinates::new_primary(10, 10); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -112,7 +123,7 @@ fn test_extremity_substitutions() { let start = AlignmentCoordinates::new_primary(0, 0); let end = AlignmentCoordinates::new_primary(5, 5); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -137,7 +148,7 @@ fn test_substitutions_as_gaps() { let start = AlignmentCoordinates::new_primary(0, 0); let end = AlignmentCoordinates::new_primary(20, 20); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, u32::MAX); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); assert_eq!(alignment.start(), start); 
assert_eq!(alignment.end(), end); @@ -160,7 +171,7 @@ fn test_max_match_run_0() { let start = AlignmentCoordinates::new_primary(1, 1); let end = AlignmentCoordinates::new_primary(9, 9); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 0); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 0); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -183,7 +194,7 @@ fn test_max_match_run_1() { let start = AlignmentCoordinates::new_primary(1, 1); let end = AlignmentCoordinates::new_primary(9, 9); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 1); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 1); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -213,7 +224,7 @@ fn test_max_match_run_2() { let start = AlignmentCoordinates::new_primary(1, 1); let end = AlignmentCoordinates::new_primary(9, 9); - let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, 2); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); assert_eq!(alignment.start(), start); assert_eq!(alignment.end(), end); @@ -229,3 +240,45 @@ fn test_max_match_run_2() { ); assert_eq!(alignment.cost(), U32Cost::from(6u8)); } + +#[test] +fn test_secondary_12() { + let seq1 = b"AAAAAAAAAA".to_vec(); + let seq2 = b"TTTTTTTTTT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_secondary(9, 1, TsKind::TS12); + let end = AlignmentCoordinates::new_secondary(1, 9, TsKind::TS12); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![(8, 
AlignmentType::Match),] + ); + assert_eq!(alignment.cost(), U32Cost::from(0u8)); +} + +#[test] +fn test_secondary_21() { + let seq1 = b"AAAAAAAAAA".to_vec(); + let seq2 = b"TTTTTTTTTT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = + GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); + + let start = AlignmentCoordinates::new_secondary(9, 1, TsKind::TS21); + let end = AlignmentCoordinates::new_secondary(1, 9, TsKind::TS21); + let alignment = GapAffineAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![(8, AlignmentType::Match),] + ); + assert_eq!(alignment.cost(), U32Cost::from(0u8)); +} diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs index f5c7c1e..fdebc5c 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs @@ -23,12 +23,13 @@ impl Ts12JumpAlignment { end: AlignmentCoordinates, sequences: &AlignmentSequences, cost_table: &AlignmentCosts, + rc_fn: &dyn Fn(u8) -> u8, max_match_run: u32, ) -> Self { assert!(start.is_primary()); assert!(end.is_secondary()); - let context = Context::new(cost_table, sequences, start, end, max_match_run); + let context = Context::new(cost_table, sequences, rc_fn, start, end, max_match_run); let mut a_star = AStar::new(context); a_star.initialise(); match a_star.search() { diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs index 972ce28..3e73a28 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs @@ -14,9 +14,10 @@ use crate::{ costs::AlignmentCosts, }; -pub struct Context<'costs, 'sequences, Cost> { +pub struct Context<'costs, 
'sequences, 'rc_fn, Cost> { costs: &'costs AlignmentCosts, sequences: &'sequences AlignmentSequences, + rc_fn: &'rc_fn dyn Fn(u8) -> u8, start: AlignmentCoordinates, end: AlignmentCoordinates, max_match_run: u32, @@ -46,10 +47,11 @@ pub enum Identifier { }, } -impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { +impl<'costs, 'sequences, 'rc_fn, Cost> Context<'costs, 'sequences, 'rc_fn, Cost> { pub fn new( costs: &'costs AlignmentCosts, sequences: &'sequences AlignmentSequences, + rc_fn: &'rc_fn dyn Fn(u8) -> u8, start: AlignmentCoordinates, end: AlignmentCoordinates, max_match_run: u32, @@ -60,6 +62,7 @@ impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { Self { costs, sequences, + rc_fn, start, end, max_match_run, @@ -67,7 +70,7 @@ impl<'costs, 'sequences, Cost> Context<'costs, 'sequences, Cost> { } } -impl AStarContext for Context<'_, '_, Cost> { +impl AStarContext for Context<'_, '_, '_, Cost> { type Node = Node; fn create_root(&self) -> Self::Node { @@ -102,8 +105,8 @@ impl AStarContext for Context<'_, '_, Cost> { }; // Generate gap-affine successors. 
- if coordinates.can_increment_both(self.start, self.end) { - let (ca, cb) = self.sequences.characters(coordinates); + if coordinates.can_increment_both(self.end) { + let (ca, cb) = self.sequences.characters(coordinates, self.rc_fn); let is_match = ca == cb; if is_match { @@ -141,7 +144,7 @@ impl AStarContext for Context<'_, '_, Cost> { } } - if coordinates.can_increment_a(self.start, self.end) { + if coordinates.can_increment_a(self.end) { // Gap in b let new_cost = *cost + match gap_type { @@ -216,7 +219,7 @@ impl AStarContext for Context<'_, '_, Cost> { } } -impl Reset for Context<'_, '_, Cost> { +impl Reset for Context<'_, '_, '_, Cost> { fn reset(&mut self) { unimplemented!() } diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs index 8b13789..f3fa1bf 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs @@ -1 +1,71 @@ +use generic_a_star::cost::U32Cost; +use crate::{ + alignment::{ + AlignmentType, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, + ts_kind::TsKind, + }, + costs::{AlignmentCosts, GapAffineCosts, TsLimits}, + exact_chaining::ts_12_jump::Ts12JumpAlignment, +}; + +fn rc_fn(c: u8) -> u8 { + match c { + b'A' => b'T', + b'C' => b'G', + b'G' => b'C', + b'T' => b'A', + c => unimplemented!("Unsupported character {c}"), + } +} + +#[ignore] +#[test] +fn test_start_end() { + let seq1 = b"AAGT".to_vec(); + let seq2 = b"ACGTT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: 
-100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(0, 0); + let end = AlignmentCoordinates::new_secondary(4, 0, TsKind::TS21); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::Match), + (1, AlignmentType::Substitution), + (1, AlignmentType::Match), + ( + 1, + AlignmentType::TsStart { + jump: -1, + ts_kind: TsKind::TS21 + } + ), + (2, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(4u8)); +} From 21df6b7e8dc776ffcdcadb20de73aa81201396cb Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Tue, 2 Dec 2025 14:48:03 +0200 Subject: [PATCH 11/31] Fix TS ancestor coordinate interpretation. --- .../src/alignment/coordinates.rs | 57 ++++++++++++++----- lib_ts_chainalign/src/alignment/sequences.rs | 4 +- lib_ts_chainalign/src/alignment/ts_kind.rs | 19 +++++++ .../chaining_lower_bounds/gap_affine/algo.rs | 6 +- .../src/exact_chaining/gap_affine/algo.rs | 6 +- .../src/exact_chaining/gap_affine/tests.rs | 8 +-- .../src/exact_chaining/ts_12_jump/algo.rs | 21 +++++-- .../src/exact_chaining/ts_12_jump/tests.rs | 7 +-- 8 files changed, 95 insertions(+), 33 deletions(-) diff --git a/lib_ts_chainalign/src/alignment/coordinates.rs b/lib_ts_chainalign/src/alignment/coordinates.rs index 72eca5c..ca520cf 100644 --- a/lib_ts_chainalign/src/alignment/coordinates.rs +++ b/lib_ts_chainalign/src/alignment/coordinates.rs @@ -1,6 +1,9 @@ use std::fmt::Display; -use crate::alignment::ts_kind::{TsAncestor, TsDescendant, TsKind}; +use crate::alignment::{ + sequences::AlignmentSequences, + ts_kind::{TsAncestor, TsDescendant, TsKind}, +}; #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] pub enum AlignmentCoordinates { @@ -75,17 +78,30 @@ impl AlignmentCoordinates { /// In secondary alignments, ordinate a is the ancestor. 
/// /// If ordinate a and the boundary have the same TS kind (possibly `None`), then the check is performed normally. - /// If the TS kinds differ, then there is a jump before the boundary, and ordinate a can always be incremented. - pub fn can_increment_a(&self, end: AlignmentCoordinates) -> bool { + /// If the TS kinds differ, then there is a jump before the boundary, and ordinate a can be incremented until the end of the sequence. + pub fn can_increment_a( + &self, + end: AlignmentCoordinates, + sequences: Option<&AlignmentSequences>, + ) -> bool { if self.ts_kind() == end.ts_kind() { match self { AlignmentCoordinates::Primary { a, .. } => { // Incrementing primary ordinate a is always a plus operation, so we only need to check the upper bound. - (..end.primary_ordinate_a().unwrap()).contains(a) + *a < end.primary_ordinate_a().unwrap() } AlignmentCoordinates::Secondary { ancestor, .. } => { // Incrementing the secondary ancestor is always a minus operation, so we only need to check the lower bound. - (end.secondary_ordinate_ancestor().unwrap()..).contains(ancestor) + end.secondary_ordinate_ancestor().unwrap() < *ancestor + } + } + } else if let Some(sequences) = sequences { + match self { + AlignmentCoordinates::Primary { a, .. } => { + *a < sequences.end().primary_ordinate_a().unwrap() + } + AlignmentCoordinates::Secondary { ancestor, .. } => { + sequences.start().secondary_ordinate_ancestor().unwrap() < *ancestor } } } else { @@ -97,16 +113,27 @@ impl AlignmentCoordinates { /// In secondary alignments, ordinate b is the descendant. /// /// If ordinate b and the `end` boundary have the same TS kind (possibly `None`), then the check is performed normally. - /// If the TS kinds differ, then there is a jump before the `end` boundary, and ordinate b can always be incremented. 
- pub fn can_increment_b(&self, end: AlignmentCoordinates) -> bool { + /// If the TS kinds differ, then there is a jump before the `end` boundary, and ordinate b can be incremented until the end of the sequence. + pub fn can_increment_b( + &self, + end: AlignmentCoordinates, + sequences: Option<&AlignmentSequences>, + ) -> bool { if self.ts_kind() == end.ts_kind() { // Incrementing ordinate b is always a plus operation, so we only need to check the upper bound. + match self { + AlignmentCoordinates::Primary { b, .. } => *b < end.primary_ordinate_b().unwrap(), + AlignmentCoordinates::Secondary { descendant, .. } => { + *descendant < end.secondary_ordinate_descendant().unwrap() + } + } + } else if let Some(sequences) = sequences { match self { AlignmentCoordinates::Primary { b, .. } => { - (..end.primary_ordinate_b().unwrap()).contains(b) + *b < sequences.end().primary_ordinate_b().unwrap() } AlignmentCoordinates::Secondary { descendant, .. } => { - (..end.secondary_ordinate_descendant().unwrap()).contains(descendant) + *descendant < sequences.end().secondary_ordinate_descendant().unwrap() } } } else { @@ -114,8 +141,12 @@ impl AlignmentCoordinates { } } - pub fn can_increment_both(&self, end: AlignmentCoordinates) -> bool { - self.can_increment_a(end) && self.can_increment_b(end) + pub fn can_increment_both( + &self, + end: AlignmentCoordinates, + sequences: Option<&AlignmentSequences>, + ) -> bool { + self.can_increment_a(end, sequences) && self.can_increment_b(end, sequences) } pub fn increment_a(&self) -> Self { @@ -128,7 +159,7 @@ impl AlignmentCoordinates { descendant, ts_kind, } => AlignmentCoordinates::Secondary { - ancestor: ancestor - 1, + ancestor: ancestor.wrapping_sub(1), descendant: *descendant, ts_kind: *ts_kind, }, @@ -205,7 +236,7 @@ impl Display for AlignmentCoordinates { ancestor, descendant, ts_kind, - } => write!(f, "({}, {}, {:?})", ancestor, descendant, ts_kind), + } => write!(f, "({}, {}, {})", ancestor, descendant, ts_kind), } } } diff --git 
a/lib_ts_chainalign/src/alignment/sequences.rs b/lib_ts_chainalign/src/alignment/sequences.rs index 64419da..4e3de11 100644 --- a/lib_ts_chainalign/src/alignment/sequences.rs +++ b/lib_ts_chainalign/src/alignment/sequences.rs @@ -26,8 +26,8 @@ impl AlignmentSequences { ts_kind, } => ( match ts_kind.ancestor { - TsAncestor::Seq1 => self.seq1[ancestor], - TsAncestor::Seq2 => self.seq2[ancestor], + TsAncestor::Seq1 => self.seq1[ancestor - 1], + TsAncestor::Seq2 => self.seq2[ancestor - 1], }, rc_fn(match ts_kind.descendant { TsDescendant::Seq1 => self.seq1[descendant], diff --git a/lib_ts_chainalign/src/alignment/ts_kind.rs b/lib_ts_chainalign/src/alignment/ts_kind.rs index c7f8993..3e229ec 100644 --- a/lib_ts_chainalign/src/alignment/ts_kind.rs +++ b/lib_ts_chainalign/src/alignment/ts_kind.rs @@ -1,3 +1,5 @@ +use std::fmt::Display; + #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] pub struct TsKind { pub ancestor: TsAncestor, @@ -34,3 +36,20 @@ impl TsKind { descendant: TsDescendant::Seq2, }; } + +impl Display for TsKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "TS{}{}", + match self.ancestor { + TsAncestor::Seq1 => "1", + TsAncestor::Seq2 => "2", + }, + match self.descendant { + TsDescendant::Seq1 => "1", + TsDescendant::Seq2 => "2", + } + ) + } +} diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs index 130952e..1ff5e26 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine/algo.rs @@ -71,7 +71,7 @@ impl AStarContext for Context<'_, Cost> { } = node; let end = AlignmentCoordinates::new_primary(self.max_n, self.max_n); - if coordinates.can_increment_both(end) { + if coordinates.can_increment_both(end, None) { if *match_run < self.max_match_run { // Match let new_cost = *cost; @@ -100,7 +100,7 @@ impl AStarContext for Context<'_, 
Cost> { })); } - if coordinates.can_increment_a(end) { + if coordinates.can_increment_a(end, None) { // Gap in b let new_cost = *cost + match gap_type { @@ -118,7 +118,7 @@ impl AStarContext for Context<'_, Cost> { })); } - if coordinates.can_increment_b(end) { + if coordinates.can_increment_b(end, None) { // Gap in a let new_cost = *cost + match gap_type { diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs index c179ed3..f19f66d 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs @@ -87,7 +87,7 @@ impl AStarContext for Context<'_, '_, '_, Cost> { gap_type, } = *identifier; - if coordinates.can_increment_both(self.end) { + if coordinates.can_increment_both(self.end, Some(self.sequences)) { let (ca, cb) = self.sequences.characters(coordinates, self.rc_fn); let is_match = ca == cb; @@ -124,7 +124,7 @@ impl AStarContext for Context<'_, '_, '_, Cost> { } } - if coordinates.can_increment_a(self.end) { + if coordinates.can_increment_a(self.end, Some(self.sequences)) { // Gap in b let new_cost = *cost + match gap_type { @@ -143,7 +143,7 @@ impl AStarContext for Context<'_, '_, '_, Cost> { })); } - if coordinates.can_increment_b(self.end) { + if coordinates.can_increment_b(self.end, Some(self.sequences)) { // Gap in a let new_cost = *cost + match gap_type { diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs index 27c9d1c..41a7978 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/tests.rs @@ -243,8 +243,8 @@ fn test_max_match_run_2() { #[test] fn test_secondary_12() { - let seq1 = b"AAAAAAAAAA".to_vec(); - let seq2 = b"TTTTTTTTTT".to_vec(); + let seq1 = b"GAAAAAAAAG".to_vec(); + let seq2 = b"GTTTTTTTTG".to_vec(); let sequences = AlignmentSequences::new(seq1, seq2); let 
cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); @@ -264,8 +264,8 @@ fn test_secondary_12() { #[test] fn test_secondary_21() { - let seq1 = b"AAAAAAAAAA".to_vec(); - let seq2 = b"TTTTTTTTTT".to_vec(); + let seq1 = b"GAAAAAAAAG".to_vec(); + let seq2 = b"GTTTTTTTTG".to_vec(); let sequences = AlignmentSequences::new(seq1, seq2); let cost_table = GapAffineCosts::new(U32Cost::from(2u8), U32Cost::from(3u8), U32Cost::from(1u8)); diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs index 3e73a28..23d7360 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs @@ -104,8 +104,10 @@ impl AStarContext for Context<'_, '_, '_, Cost> { &self.costs.secondary_costs }; + println!("generate_successors of {node}"); + // Generate gap-affine successors. - if coordinates.can_increment_both(self.end) { + if coordinates.can_increment_both(self.end, Some(self.sequences)) { let (ca, cb) = self.sequences.characters(coordinates, self.rc_fn); let is_match = ca == cb; @@ -144,7 +146,7 @@ impl AStarContext for Context<'_, '_, '_, Cost> { } } - if coordinates.can_increment_a(self.end) { + if coordinates.can_increment_a(self.end, Some(self.sequences)) { // Gap in b let new_cost = *cost + match gap_type { @@ -164,7 +166,7 @@ impl AStarContext for Context<'_, '_, '_, Cost> { })); } - if coordinates.can_increment_b(self.end) { + if coordinates.can_increment_b(self.end, Some(self.sequences)) { // Gap in a let new_cost = *cost + match gap_type { @@ -296,7 +298,18 @@ impl Identifier { impl Display for Node { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}: {}, {}", self.identifier, self.cost, self.match_run) + write!( + f, + "{}{}: {}, {}", + self.identifier, + if let Some(predecessor) = &self.predecessor { + format!("<-{predecessor}") + } else { + "".to_string() + }, + 
self.cost, + self.match_run + ) } } diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs index f3fa1bf..22f4e63 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs @@ -19,10 +19,9 @@ fn rc_fn(c: u8) -> u8 { } } -#[ignore] #[test] fn test_start_end() { - let seq1 = b"AAGT".to_vec(); + let seq1 = b"AAGG".to_vec(); let seq2 = b"ACGTT".to_vec(); let sequences = AlignmentSequences::new(seq1, seq2); let cost_table = AlignmentCosts { @@ -46,7 +45,7 @@ fn test_start_end() { }; let start = AlignmentCoordinates::new_primary(0, 0); - let end = AlignmentCoordinates::new_secondary(4, 0, TsKind::TS21); + let end = AlignmentCoordinates::new_secondary(0, 5, TsKind::TS12); let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); assert_eq!(alignment.start(), start); @@ -61,7 +60,7 @@ fn test_start_end() { 1, AlignmentType::TsStart { jump: -1, - ts_kind: TsKind::TS21 + ts_kind: TsKind::TS12 } ), (2, AlignmentType::Match), From dff5724f574c35be06f523ce8ec602c47ec4c8b4 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Tue, 2 Dec 2025 15:44:04 +0200 Subject: [PATCH 12/31] Test 12-jump exact chaining algo. --- .../src/exact_chaining/ts_12_jump/algo.rs | 2 - .../src/exact_chaining/ts_12_jump/tests.rs | 260 ++++++++++++++++++ 2 files changed, 260 insertions(+), 2 deletions(-) diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs index 23d7360..e0af5ae 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs @@ -104,8 +104,6 @@ impl AStarContext for Context<'_, '_, '_, Cost> { &self.costs.secondary_costs }; - println!("generate_successors of {node}"); - // Generate gap-affine successors. 
if coordinates.can_increment_both(self.end, Some(self.sequences)) { let (ca, cb) = self.sequences.characters(coordinates, self.rc_fn); diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs index 22f4e63..613b562 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs @@ -68,3 +68,263 @@ fn test_start_end() { ); assert_eq!(alignment.cost(), U32Cost::from(4u8)); } + +#[test] +fn test_partial_alignment() { + let seq1 = b"AAGG".to_vec(); + let seq2 = b"ACGTT".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(1, 1); + let end = AlignmentCoordinates::new_secondary(1, 4, TsKind::TS12); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::Substitution), + (1, AlignmentType::Match), + ( + 1, + AlignmentType::TsStart { + jump: -1, + ts_kind: TsKind::TS12 + } + ), + (1, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(4u8)); +} + +#[test] +fn test_gap_directions() { + let seq1 = b"CCCCCCACCAACAAAAAA".to_vec(); + let seq2 = b"AAAAAACAAGGGGGGAGG".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(10u8), + U32Cost::from(1u8), + 
U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(10u8), + U32Cost::from(1u8), + U32Cost::from(1u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(9, 0); + let end = AlignmentCoordinates::new_secondary(0, 18, TsKind::TS12); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (2, AlignmentType::Match), + (1, AlignmentType::GapB), + (4, AlignmentType::Match), + (1, AlignmentType::GapA), + (2, AlignmentType::Match), + ( + 1, + AlignmentType::TsStart { + jump: -9, + ts_kind: TsKind::TS12 + } + ), + (2, AlignmentType::Match), + (1, AlignmentType::GapB), + (4, AlignmentType::Match), + (1, AlignmentType::GapA), + (2, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(6u8)); +} + +#[test] +fn test_max_match_run_0() { + let seq1 = b"GGAGGAGGAACAACAA".to_vec(); + let seq2 = b"AAAAAAAACCTCCTCC".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(8, 0); + let end = AlignmentCoordinates::new_secondary(0, 16, TsKind::TS12); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 0); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + 
alignment.alignment().alignment, + vec![ + (16, AlignmentType::GapA), + ( + 1, + AlignmentType::TsStart { + jump: -8, + ts_kind: TsKind::TS12 + } + ), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(20u8)); +} + +#[test] +fn test_max_match_run_1() { + let seq1 = b"GGAGGAGGAACAACAA".to_vec(); + let seq2 = b"AAAAAAAACCTCCTCC".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(8, 0); + let end = AlignmentCoordinates::new_secondary(0, 16, TsKind::TS12); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 1); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::Match), + (6, AlignmentType::GapA), + (1, AlignmentType::Match), + (6, AlignmentType::GapA), + (1, AlignmentType::Match), + ( + 1, + AlignmentType::TsStart { + jump: -10, + ts_kind: TsKind::TS12 + } + ), + (1, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(18u8)); +} + +#[test] +fn test_max_match_run_2() { + let seq1 = b"GGAGGAGGAACAACAA".to_vec(); + let seq2 = b"AAAAAAAACCCCCCCC".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: 
-100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(8, 0); + let end = AlignmentCoordinates::new_secondary(0, 16, TsKind::TS12); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (2, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + ( + 1, + AlignmentType::TsStart { + jump: -8, + ts_kind: TsKind::TS12 + } + ), + (2, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(14u8)); +} From 27294b9026f31caf0a88fda21be00be811a5153d Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Wed, 3 Dec 2025 09:56:50 +0200 Subject: [PATCH 13/31] Treat no target found as infinite costs. --- lib_ts_chainalign/src/exact_chaining/gap_affine.rs | 9 ++++++--- lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs index e43d9a0..bd6c7dd 100644 --- a/lib_ts_chainalign/src/exact_chaining/gap_affine.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine.rs @@ -42,9 +42,12 @@ impl GapAffineAlignment { }, AStarResult::ExceededCostLimit { .. } => unreachable!("Cost limit is None"), AStarResult::ExceededMemoryLimit { .. 
} => unreachable!("Cost limit is None"), - AStarResult::NoTarget => { - panic!("No gap-affine alignment found between the given coordinates") - } + AStarResult::NoTarget => Self { + start, + end, + alignment: Vec::new().into(), + cost: Cost::max_value(), + }, } } } diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs index fdebc5c..3a235d5 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs @@ -41,9 +41,12 @@ impl Ts12JumpAlignment { }, AStarResult::ExceededCostLimit { .. } => unreachable!("Cost limit is None"), AStarResult::ExceededMemoryLimit { .. } => unreachable!("Cost limit is None"), - AStarResult::NoTarget => { - panic!("No TS 12-jump alignment found between the given coordinates") - } + AStarResult::NoTarget => Self { + start, + end, + alignment: Vec::new().into(), + cost: Cost::max_value(), + }, } } } From e17eb7dc069be4cdaab7dbdc51f77b4b603dbf0d Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Wed, 3 Dec 2025 11:08:52 +0200 Subject: [PATCH 14/31] Implement TS 34-jump exact chaining. 
--- generic_a_star/src/lib.rs | 2 +- lib_ts_chainalign/src/alignment.rs | 2 +- .../src/alignment/coordinates.rs | 80 +++- lib_ts_chainalign/src/costs.rs | 16 + lib_ts_chainalign/src/exact_chaining.rs | 1 + .../src/exact_chaining/gap_affine/algo.rs | 4 + .../src/exact_chaining/ts_12_jump/algo.rs | 4 +- .../src/exact_chaining/ts_34_jump.rs | 74 ++++ .../src/exact_chaining/ts_34_jump/algo.rs | 341 ++++++++++++++++++ .../src/exact_chaining/ts_34_jump/tests.rs | 298 +++++++++++++++ 10 files changed, 804 insertions(+), 18 deletions(-) create mode 100644 lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs create mode 100644 lib_ts_chainalign/src/exact_chaining/ts_34_jump/algo.rs create mode 100644 lib_ts_chainalign/src/exact_chaining/ts_34_jump/tests.rs diff --git a/generic_a_star/src/lib.rs b/generic_a_star/src/lib.rs index ea825eb..75c77e3 100644 --- a/generic_a_star/src/lib.rs +++ b/generic_a_star/src/lib.rs @@ -345,7 +345,7 @@ impl AStar { debug_assert!( previous_visit.cost() + previous_visit.a_star_lower_bound() <= node.cost() + node.a_star_lower_bound(), - "{}", + "Revisiting node at lower costs:\n{}", { use std::fmt::Write; let mut previous_visit = previous_visit; diff --git a/lib_ts_chainalign/src/alignment.rs b/lib_ts_chainalign/src/alignment.rs index 5e57b99..4825125 100644 --- a/lib_ts_chainalign/src/alignment.rs +++ b/lib_ts_chainalign/src/alignment.rs @@ -15,7 +15,7 @@ pub enum AlignmentType { GapA, GapB, TsStart { jump: isize, ts_kind: TsKind }, - TsEnd, + TsEnd { jump: isize }, } #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] diff --git a/lib_ts_chainalign/src/alignment/coordinates.rs b/lib_ts_chainalign/src/alignment/coordinates.rs index ca520cf..dda432e 100644 --- a/lib_ts_chainalign/src/alignment/coordinates.rs +++ b/lib_ts_chainalign/src/alignment/coordinates.rs @@ -77,8 +77,8 @@ impl AlignmentCoordinates { /// Checks if ordinate a can be incremented. /// In secondary alignments, ordinate a is the ancestor. 
/// - /// If ordinate a and the boundary have the same TS kind (possibly `None`), then the check is performed normally. - /// If the TS kinds differ, then there is a jump before the boundary, and ordinate a can be incremented until the end of the sequence. + /// If ordinate a and the `end` coordinates are both primary or both secondary, then the check is performed normally. + /// If they differ, then there is a jump before the `end` boundary. pub fn can_increment_a( &self, end: AlignmentCoordinates, @@ -96,15 +96,20 @@ impl AlignmentCoordinates { } } } else if let Some(sequences) = sequences { + assert_ne!(self.is_primary(), end.is_primary()); match self { AlignmentCoordinates::Primary { a, .. } => { - *a < sequences.end().primary_ordinate_a().unwrap() - } - AlignmentCoordinates::Secondary { ancestor, .. } => { - sequences.start().secondary_ordinate_ancestor().unwrap() < *ancestor + match end.ts_kind().unwrap().descendant { + // Descendant is a, so it is limited by the descendant ordinate. + TsDescendant::Seq1 => *a < end.secondary_ordinate_descendant().unwrap(), + // Descendant is b, so a can go until the end of the sequence. + TsDescendant::Seq2 => *a < sequences.end().primary_ordinate_a().unwrap(), + } } + AlignmentCoordinates::Secondary { ancestor, .. } => 0 < *ancestor, } } else { + assert_ne!(self.is_primary(), end.is_primary()); true } } @@ -112,8 +117,8 @@ impl AlignmentCoordinates { /// Checks if ordinate b can be incremented. /// In secondary alignments, ordinate b is the descendant. /// - /// If ordinate b and the `end` boundary have the same TS kind (possibly `None`), then the check is performed normally. - /// If the TS kinds differ, then there is a jump before the `end` boundary, and ordinate b can be incremented until the end of the sequence. + /// If ordinate b and the `end` coordinates are both primary or both secondary, then the check is performed normally. + /// If they differ, then there is a jump before the `end` boundary. 
pub fn can_increment_b( &self, end: AlignmentCoordinates, @@ -128,15 +133,25 @@ impl AlignmentCoordinates { } } } else if let Some(sequences) = sequences { + assert_ne!(self.is_primary(), end.is_primary()); match self { AlignmentCoordinates::Primary { b, .. } => { - *b < sequences.end().primary_ordinate_b().unwrap() + match end.ts_kind().unwrap().descendant { + // Descendant is a, so b can go until the end of the sequence. + TsDescendant::Seq1 => *b < sequences.end().primary_ordinate_b().unwrap(), + // Descendant is b, so it is limited by the descendant ordinate. + TsDescendant::Seq2 => *b < end.secondary_ordinate_descendant().unwrap(), + } } AlignmentCoordinates::Secondary { descendant, .. } => { - *descendant < sequences.end().secondary_ordinate_descendant().unwrap() + match self.ts_kind().unwrap().descendant { + TsDescendant::Seq1 => *descendant < end.primary_ordinate_a().unwrap(), + TsDescendant::Seq2 => *descendant < end.primary_ordinate_b().unwrap(), + } } } } else { + assert_ne!(self.is_primary(), end.is_primary()); true } } @@ -189,10 +204,10 @@ impl AlignmentCoordinates { /// Generate all possible 12-jumps. /// - /// The TS kind is given by the start coordinates. - /// The left and right limits of the jump are given by the start and end coordinates. - /// The end coordinates must be in primary form and simply be the end of the aligned sequences. - /// The start coordinates are in secondary form. + /// The TS kind is given by the `start` coordinates. + /// The left and right limits of the jump are given by the `start` and `end` coordinates. + /// The `end` coordinates must be in primary form and simply be the end of the aligned sequences. + /// The `start` coordinates are in secondary form. pub fn generate_12_jumps( &self, start: AlignmentCoordinates, @@ -226,6 +241,43 @@ impl AlignmentCoordinates { ) }) } + + /// Generate all possible 34-jumps. + /// + /// The `end` coordinates are in primary form and limit the jump to the left of (or into) them. 
+ pub fn generate_34_jumps( + &self, + end: AlignmentCoordinates, + ) -> impl Iterator { + let Self::Secondary { + ancestor, + descendant, + ts_kind, + } = *self + else { + panic!("Can only generate 34-jumps from secondary coordinates"); + }; + + (0..match ts_kind.descendant { + TsDescendant::Seq1 => end.primary_ordinate_b().unwrap(), + TsDescendant::Seq2 => end.primary_ordinate_a().unwrap(), + }) + .map(move |new_ancestor| { + ( + new_ancestor as isize - ancestor as isize, + match ts_kind.descendant { + TsDescendant::Seq1 => Self::Primary { + a: descendant, + b: new_ancestor, + }, + TsDescendant::Seq2 => Self::Primary { + a: new_ancestor, + b: descendant, + }, + }, + ) + }) + } } impl Display for AlignmentCoordinates { diff --git a/lib_ts_chainalign/src/costs.rs b/lib_ts_chainalign/src/costs.rs index 2d6a075..a365c91 100644 --- a/lib_ts_chainalign/src/costs.rs +++ b/lib_ts_chainalign/src/costs.rs @@ -1,5 +1,7 @@ use std::ops::Range; +use num_traits::Zero; + pub struct GapAffineCosts { pub substitution: Cost, pub gap_open: Cost, @@ -30,6 +32,12 @@ impl GapAffineCosts { } } +impl GapAffineCosts { + pub fn has_zero_cost(&self) -> bool { + self.substitution.is_zero() || self.gap_open.is_zero() || self.gap_extend.is_zero() + } +} + impl AlignmentCosts { pub fn new( primary_costs: GapAffineCosts, @@ -45,3 +53,11 @@ impl AlignmentCosts { } } } + +impl AlignmentCosts { + pub fn has_zero_cost(&self) -> bool { + self.primary_costs.has_zero_cost() + || self.secondary_costs.has_zero_cost() + || self.ts_base_cost.is_zero() + } +} diff --git a/lib_ts_chainalign/src/exact_chaining.rs b/lib_ts_chainalign/src/exact_chaining.rs index 988526b..ac16be4 100644 --- a/lib_ts_chainalign/src/exact_chaining.rs +++ b/lib_ts_chainalign/src/exact_chaining.rs @@ -2,3 +2,4 @@ pub mod gap_affine; pub mod ts_12_jump; +pub mod ts_34_jump; diff --git a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs index f19f66d..53d5c2b 100644 --- 
a/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/gap_affine/algo.rs @@ -174,6 +174,10 @@ impl AStarContext for Context<'_, '_, '_, Cost> { fn memory_limit(&self) -> Option { None } + + fn is_label_setting(&self) -> bool { + !self.costs.has_zero_cost() + } } impl Reset for Context<'_, '_, '_, Cost> { diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs index e0af5ae..edbf686 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/algo.rs @@ -56,8 +56,8 @@ impl<'costs, 'sequences, 'rc_fn, Cost> Context<'costs, 'sequences, 'rc_fn, Cost> end: AlignmentCoordinates, max_match_run: u32, ) -> Self { - assert!(start.ts_kind().is_none()); - assert!(end.ts_kind().is_some()); + assert!(start.is_primary()); + assert!(end.is_secondary()); Self { costs, diff --git a/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs new file mode 100644 index 0000000..9f5eda5 --- /dev/null +++ b/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs @@ -0,0 +1,74 @@ +use generic_a_star::{AStar, AStarResult, cost::AStarCost}; + +use crate::{ + alignment::{Alignment, coordinates::AlignmentCoordinates, sequences::AlignmentSequences}, + costs::AlignmentCosts, + exact_chaining::ts_34_jump::algo::Context, +}; + +mod algo; +#[cfg(test)] +mod tests; + +pub struct Ts34JumpAlignment { + start: AlignmentCoordinates, + end: AlignmentCoordinates, + alignment: Alignment, + cost: Cost, +} + +impl Ts34JumpAlignment { + pub fn new( + start: AlignmentCoordinates, + end: AlignmentCoordinates, + sequences: &AlignmentSequences, + cost_table: &AlignmentCosts, + rc_fn: &dyn Fn(u8) -> u8, + max_match_run: u32, + ) -> Self { + assert!(start.is_secondary()); + assert!(end.is_primary()); + + let context = Context::new(cost_table, sequences, rc_fn, start, end, max_match_run); + let mut 
a_star = AStar::new(context); + a_star.initialise(); + match a_star.search() { + AStarResult::FoundTarget { cost, .. } => Self { + start, + end, + alignment: a_star.reconstruct_path().into(), + // The TS base cost is applied at the 12-jump, but we anyways apply it in this algorithm to make it label-setting if the base cost is non-zero. + // But since the 34-jump has zero cost, we subtract it again. + cost: cost.0 - cost_table.ts_base_cost, + }, + AStarResult::ExceededCostLimit { .. } => unreachable!("Cost limit is None"), + AStarResult::ExceededMemoryLimit { .. } => unreachable!("Cost limit is None"), + AStarResult::NoTarget => Self { + start, + end, + alignment: Vec::new().into(), + cost: Cost::max_value(), + }, + } + } +} + +impl Ts34JumpAlignment { + pub fn start(&self) -> AlignmentCoordinates { + self.start + } + + pub fn end(&self) -> AlignmentCoordinates { + self.end + } + + pub fn alignment(&self) -> &Alignment { + &self.alignment + } +} + +impl Ts34JumpAlignment { + pub fn cost(&self) -> Cost { + self.cost + } +} diff --git a/lib_ts_chainalign/src/exact_chaining/ts_34_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_34_jump/algo.rs new file mode 100644 index 0000000..0ad9531 --- /dev/null +++ b/lib_ts_chainalign/src/exact_chaining/ts_34_jump/algo.rs @@ -0,0 +1,341 @@ +use std::fmt::Display; + +use generic_a_star::{ + AStarContext, AStarNode, + cost::{AStarCost, OrderedPairCost, U32Cost}, + reset::Reset, +}; +use num_traits::Zero; + +use crate::{ + alignment::{ + AlignmentType, GapType, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, + }, + costs::AlignmentCosts, +}; + +pub struct Context<'costs, 'sequences, 'rc_fn, Cost> { + costs: &'costs AlignmentCosts, + sequences: &'sequences AlignmentSequences, + rc_fn: &'rc_fn dyn Fn(u8) -> u8, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + max_match_run: u32, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct Node { + pub identifier: Identifier, + pub predecessor: 
Option, + pub predecessor_alignment_type: Option, + pub cost: Cost, + pub match_run: u32, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum Identifier { + Primary { + coordinates: AlignmentCoordinates, + gap_type: GapType, + }, + Jump34 { + coordinates: AlignmentCoordinates, + }, + Secondary { + coordinates: AlignmentCoordinates, + gap_type: GapType, + }, +} + +impl<'costs, 'sequences, 'rc_fn, Cost> Context<'costs, 'sequences, 'rc_fn, Cost> { + pub fn new( + costs: &'costs AlignmentCosts, + sequences: &'sequences AlignmentSequences, + rc_fn: &'rc_fn dyn Fn(u8) -> u8, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + max_match_run: u32, + ) -> Self { + assert!(start.is_secondary()); + assert!(end.is_primary()); + + Self { + costs, + sequences, + rc_fn, + start, + end, + max_match_run, + } + } +} + +impl AStarContext for Context<'_, '_, '_, Cost> { + type Node = Node; + + fn create_root(&self) -> Self::Node { + Node { + identifier: Identifier::Secondary { + coordinates: self.start, + gap_type: GapType::None, + }, + predecessor: None, + predecessor_alignment_type: None, + cost: Cost::zero(), + match_run: 0, + } + } + + fn generate_successors(&mut self, node: &Self::Node, output: &mut impl Extend) { + let Node { + identifier, + cost, + match_run, + .. + } = node; + let predecessor = Some(*identifier); + + let coordinates = identifier.coordinates(); + let gap_type = identifier.gap_type(); + let is_secondary = matches!(identifier, Identifier::Secondary { .. }); + let gap_affine_costs = if is_secondary { + &self.costs.secondary_costs + } else { + &self.costs.primary_costs + }; + + // Generate gap-affine successors. + if coordinates.can_increment_both(self.end, Some(self.sequences)) { + let (ca, cb) = self.sequences.characters(coordinates, self.rc_fn); + let is_match = ca == cb; + + if is_match { + // Disallow runs of matches longer than the maximum. 
+ // This is because we do not want the exact chaining to find new anchors (which actually already exist). + if *match_run < self.max_match_run { + // Match + let new_cost = *cost; + output.extend(std::iter::once(Node { + identifier: Identifier::new_primary_secondary( + !is_secondary, + coordinates.increment_both(), + GapType::None, + ), + predecessor, + predecessor_alignment_type: Some(AlignmentType::Match), + cost: new_cost, + match_run: match_run + 1, + })); + } + } else { + // Substitution + let new_cost = *cost + gap_affine_costs.substitution; + output.extend(std::iter::once(Node { + identifier: Identifier::new_primary_secondary( + !is_secondary, + coordinates.increment_both(), + GapType::None, + ), + predecessor, + predecessor_alignment_type: Some(AlignmentType::Substitution), + cost: new_cost, + match_run: 0, + })); + } + } + + if coordinates.can_increment_a(self.end, Some(self.sequences)) { + // Gap in b + let new_cost = *cost + + match gap_type { + GapType::InB => gap_affine_costs.gap_extend, + _ => gap_affine_costs.gap_open, + }; + output.extend(std::iter::once(Node { + identifier: Identifier::new_primary_secondary( + !is_secondary, + coordinates.increment_a(), + GapType::InB, + ), + predecessor, + predecessor_alignment_type: Some(AlignmentType::GapB), + cost: new_cost, + match_run: 0, + })); + } + + if coordinates.can_increment_b(self.end, Some(self.sequences)) { + // Gap in a + let new_cost = *cost + + match gap_type { + GapType::InA => gap_affine_costs.gap_extend, + _ => gap_affine_costs.gap_open, + }; + output.extend(std::iter::once(Node { + identifier: Identifier::new_primary_secondary( + !is_secondary, + coordinates.increment_b(), + GapType::InA, + ), + predecessor, + predecessor_alignment_type: Some(AlignmentType::GapA), + cost: new_cost, + match_run: 0, + })); + } + + // Generate jump successors. + if is_secondary { + // TS base cost was applied at 12-jump already, but we anyways apply it to get a label-setting search if it's non-zero. 
+ // We subtract it later in the super module. + let new_cost = *cost + self.costs.ts_base_cost; + + // This generates too many jumps, most of these are gonna be much too far. + output.extend( + coordinates + .generate_34_jumps(self.end) + .map(|(jump, coordinates)| Node { + identifier: Identifier::Jump34 { coordinates }, + predecessor, + predecessor_alignment_type: Some(AlignmentType::TsEnd { jump }), + cost: new_cost, + match_run: 0, + }), + ); + } + } + + fn is_target(&self, node: &Self::Node) -> bool { + node.identifier.coordinates() == self.end + } + + fn cost_limit(&self) -> Option<::Cost> { + None + } + + fn memory_limit(&self) -> Option { + None + } +} + +impl Reset for Context<'_, '_, '_, Cost> { + fn reset(&mut self) { + unimplemented!() + } +} + +impl AStarNode for Node { + type Identifier = Identifier; + + type EdgeType = AlignmentType; + + // Use match run as secondary cost + type Cost = OrderedPairCost; + + fn identifier(&self) -> &Self::Identifier { + &self.identifier + } + + fn cost(&self) -> Self::Cost { + OrderedPairCost(self.cost, U32Cost::from_primitive(self.match_run)) + } + + fn a_star_lower_bound(&self) -> Self::Cost { + OrderedPairCost(Cost::zero(), U32Cost::zero()) + } + + fn secondary_maximisable_score(&self) -> usize { + 0 + } + + fn predecessor(&self) -> Option<&Self::Identifier> { + self.predecessor.as_ref() + } + + fn predecessor_edge_type(&self) -> Option { + self.predecessor_alignment_type + } +} + +impl Identifier { + pub fn new_primary_secondary( + is_primary: bool, + coordinates: AlignmentCoordinates, + gap_type: GapType, + ) -> Self { + if is_primary { + Identifier::Primary { + coordinates, + gap_type, + } + } else { + Identifier::Secondary { + coordinates, + gap_type, + } + } + } + + pub fn coordinates(&self) -> AlignmentCoordinates { + match self { + Identifier::Primary { coordinates, .. } => *coordinates, + Identifier::Jump34 { coordinates, .. } => *coordinates, + Identifier::Secondary { coordinates, .. 
} => *coordinates, + } + } + + pub fn gap_type(&self) -> GapType { + match self { + Identifier::Primary { gap_type, .. } => *gap_type, + Identifier::Jump34 { .. } => GapType::None, + Identifier::Secondary { gap_type, .. } => *gap_type, + } + } +} + +impl Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}{}: {}, {}", + self.identifier, + if let Some(predecessor) = &self.predecessor { + format!("<-{predecessor}") + } else { + "".to_string() + }, + self.cost, + self.match_run + ) + } +} + +impl Display for Identifier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}({}, {})", + match self { + Identifier::Primary { .. } => "P".to_string(), + Identifier::Jump34 { .. } => "J".to_string(), + Identifier::Secondary { .. } => "S".to_string(), + }, + self.coordinates(), + self.gap_type(), + ) + } +} + +impl PartialOrd for Node { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Node { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.cost + .cmp(&other.cost) + .then_with(|| self.match_run.cmp(&other.match_run)) + } +} diff --git a/lib_ts_chainalign/src/exact_chaining/ts_34_jump/tests.rs b/lib_ts_chainalign/src/exact_chaining/ts_34_jump/tests.rs new file mode 100644 index 0000000..6fac089 --- /dev/null +++ b/lib_ts_chainalign/src/exact_chaining/ts_34_jump/tests.rs @@ -0,0 +1,298 @@ +use generic_a_star::cost::U32Cost; + +use crate::{ + alignment::{ + AlignmentType, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, + ts_kind::TsKind, + }, + costs::{AlignmentCosts, GapAffineCosts, TsLimits}, + exact_chaining::ts_34_jump::Ts34JumpAlignment, +}; + +fn rc_fn(c: u8) -> u8 { + match c { + b'A' => b'T', + b'C' => b'G', + b'G' => b'C', + b'T' => b'A', + c => unimplemented!("Unsupported character {c}"), + } +} + +#[test] +fn test_start_end() { + let seq1 = b"AAGG".to_vec(); + let seq2 = b"TTACG".to_vec(); + let 
sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start: AlignmentCoordinates = AlignmentCoordinates::new_secondary(2, 0, TsKind::TS12); + let end = AlignmentCoordinates::new_primary(4, 5); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (2, AlignmentType::Match), + (1, AlignmentType::TsEnd { jump: 1 }), + (1, AlignmentType::Match), + (1, AlignmentType::Substitution), + (1, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(2u8)); +} + +#[test] +fn test_partial_alignment() { + let seq1 = b"AAGG".to_vec(); + let seq2 = b"TTACG".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start: AlignmentCoordinates = AlignmentCoordinates::new_secondary(1, 1, TsKind::TS12); + let end = AlignmentCoordinates::new_primary(3, 4); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + 
alignment.alignment().alignment, + vec![ + (1, AlignmentType::Match), + (1, AlignmentType::TsEnd { jump: 1 }), + (1, AlignmentType::Match), + (1, AlignmentType::Substitution), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(2u8)); +} + +#[test] +fn test_gap_directions() { + let seq1 = b"CCCCCCACCAACAAAAAA".to_vec(); + let seq2 = b"AAAAAACAAGGGGGGAGG".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(10u8), + U32Cost::from(1u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(10u8), + U32Cost::from(1u8), + U32Cost::from(1u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_secondary(18, 0, TsKind::TS21); + let end = AlignmentCoordinates::new_primary(18, 9); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, u32::MAX); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (2, AlignmentType::Match), + (1, AlignmentType::GapB), + (4, AlignmentType::Match), + (1, AlignmentType::GapA), + (2, AlignmentType::Match), + (1, AlignmentType::TsEnd { jump: -9 }), + (2, AlignmentType::Match), + (1, AlignmentType::GapB), + (4, AlignmentType::Match), + (1, AlignmentType::GapA), + (2, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(4u8)); +} + +#[test] +fn test_max_match_run_0() { + let seq1 = b"GGAGGAGGAACAACAA".to_vec(); + let seq2 = b"CCCCCCCCAAAAAAAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + 
U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_secondary(8, 0, TsKind::TS12); + let end = AlignmentCoordinates::new_primary(16, 16); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 0); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::TsEnd { jump: 6 }), + (2, AlignmentType::Substitution), + (14, AlignmentType::GapA), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(18u8)); +} + +#[test] +fn test_max_match_run_1() { + let seq1 = b"GGAGGAGGAACAACAA".to_vec(); + let seq2 = b"CCCCCCCCAAAAAAAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_secondary(8, 0, TsKind::TS12); + let end = AlignmentCoordinates::new_primary(16, 16); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 1); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (1, AlignmentType::Match), + (1, AlignmentType::TsEnd { jump: -2 }), + (5, AlignmentType::Substitution), + (1, AlignmentType::Match), + (1, AlignmentType::Substitution), + (1, AlignmentType::Match), + (1, AlignmentType::Substitution), + (1, AlignmentType::Match), + (4, AlignmentType::GapA), + (1, 
AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(13u8)); +} + +#[test] +fn test_max_match_run_2() { + let seq1 = b"GGAGGAGGAACAACAA".to_vec(); + let seq2 = b"CCCCCCCCAAAAAAAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_secondary(8, 0, TsKind::TS12); + let end = AlignmentCoordinates::new_primary(16, 16); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![ + (2, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + (1, AlignmentType::TsEnd { jump: 8 }), + (2, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + (1, AlignmentType::Substitution), + (2, AlignmentType::Match), + ] + ); + assert_eq!(alignment.cost(), U32Cost::from(4u8)); +} From d1b9fc7c6175897eeccdd49edb69fbb0ca39d34a Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Wed, 3 Dec 2025 13:55:23 +0200 Subject: [PATCH 15/31] Connect tschainalign to tsalign (except AlignmentResult). 
--- .gitignore | 1 + Cargo.lock | 54 ++++++++ lib_ts_chainalign/Cargo.toml | 5 +- .../src/chaining_cost_function.rs | 1 + .../src/chaining_lower_bounds.rs | 77 +++++++++++ .../src/chaining_lower_bounds/gap_affine.rs | 2 + .../src/chaining_lower_bounds/ts_jump.rs | 2 + lib_ts_chainalign/src/costs.rs | 6 + lib_ts_chainalign/src/costs/compat.rs | 60 +++++++++ lib_ts_chainalign/src/lib.rs | 43 +++--- lib_tsalign/src/costs/cost_function.rs | 42 +++++- lib_tsalign/src/costs/gap_affine.rs | 61 +++++++++ test_files/config/chainalign/config.tsa | 127 ++++++++++++++++++ test_files/test_chainalign.sh | 3 + test_files/test_chainalign_release.sh | 3 + tsalign/Cargo.toml | 3 + tsalign/src/align.rs | 26 +++- tsalign/src/align/a_star_chain_ts.rs | 109 +++++++++++++++ tsalign/src/main.rs | 4 +- 19 files changed, 608 insertions(+), 21 deletions(-) create mode 100644 lib_ts_chainalign/src/chaining_cost_function.rs create mode 100644 lib_ts_chainalign/src/costs/compat.rs create mode 100644 test_files/config/chainalign/config.tsa create mode 100755 test_files/test_chainalign.sh create mode 100755 test_files/test_chainalign_release.sh create mode 100644 tsalign/src/align/a_star_chain_ts.rs diff --git a/.gitignore b/.gitignore index 766c870..e27bb6f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ /tsa_config /*.log /callgrind.out.* +*.tsc \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 6cd833a..e7c0cd6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -103,6 +103,26 @@ dependencies = [ "compare", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" 
+dependencies = [ + "virtue", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -127,6 +147,12 @@ dependencies = [ "wyz", ] +[[package]] +name = "bswap" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3acc5ce9c60e68df21b877f13f908ef95c89f01cb6c656cf76ba95f10bc72f5" + [[package]] name = "bytemuck" version = "1.23.2" @@ -464,9 +490,12 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" name = "lib_ts_chainalign" version = "0.1.0" dependencies = [ + "compact-genome", "generic_a_star", + "lib_tsalign", "ndarray 0.17.1", "num-traits", + "serde", ] [[package]] @@ -596,6 +625,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "rawpointer", + "serde", ] [[package]] @@ -967,6 +997,15 @@ dependencies = [ "serde", ] +[[package]] +name = "sha" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4208d5a903276a9f3b797afdf6c5bc12a8da1344b053b100abf3565ecc80cb7e" +dependencies = [ + "bswap", +] + [[package]] name = "simd-adler32" version = "0.3.7" @@ -1267,12 +1306,15 @@ name = "tsalign" version = "0.19.1" dependencies = [ "anyhow", + "bincode", "clap", "compact-genome", + "lib_ts_chainalign", "lib_tsalign", "lib_tsshow", "log", "serde", + "sha", "simplelog", "toml", "traitsequence", @@ -1345,6 +1387,12 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "usvg" version = "0.45.1" @@ -1393,6 +1441,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "weezl" version = "0.1.10" diff --git a/lib_ts_chainalign/Cargo.toml b/lib_ts_chainalign/Cargo.toml index 2f4def0..c1e0672 100644 --- a/lib_ts_chainalign/Cargo.toml +++ b/lib_ts_chainalign/Cargo.toml @@ -10,5 +10,8 @@ repository.workspace = true [dependencies] generic_a_star = { version = "0.19.1", path = "../generic_a_star" } -ndarray = "0.17.1" +lib_tsalign = { version = "0.19.1", path = "../lib_tsalign" } +ndarray = { version = "0.17.1", features = ["serde"] } num-traits.workspace = true +serde.workspace = true +compact-genome.workspace = true diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs new file mode 100644 index 0000000..3f9ee32 --- /dev/null +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -0,0 +1 @@ +pub struct ChainingCostFunction {} diff --git a/lib_ts_chainalign/src/chaining_lower_bounds.rs b/lib_ts_chainalign/src/chaining_lower_bounds.rs index 032e8c7..e8c941d 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds.rs @@ -1,4 +1,81 @@ //! Compute lower bounds for chaining anchors with gaps. +use generic_a_star::cost::AStarCost; +use serde::{Deserialize, Serialize}; + +use crate::{ + chaining_lower_bounds::{gap_affine::GapAffineLowerBounds, ts_jump::TsJumpLowerBounds}, + costs::AlignmentCosts, +}; + pub mod gap_affine; pub mod ts_jump; + +#[derive(Serialize, Deserialize)] +pub struct ChainingLowerBounds { + primary: GapAffineLowerBounds, + secondary: GapAffineLowerBounds, + jump: TsJumpLowerBounds, + alignment_costs: AlignmentCosts, +} + +impl ChainingLowerBounds { + /// Compute chaining lower bounds. + /// + /// * `max_n` is the maximum sequence length that the lower bounds should support. + /// * `max_match_run` is the maximum consecutive sequence of matches that is allowed. 
+ /// Set this to `k-1`, if the anchors are `k`-mers. + /// * `alignment_costs` is the cost function for the alignment. + pub fn new(max_n: usize, max_match_run: u32, alignment_costs: AlignmentCosts) -> Self { + Self { + primary: GapAffineLowerBounds::new( + max_n, + max_match_run, + &alignment_costs.primary_costs, + ), + secondary: GapAffineLowerBounds::new( + max_n, + max_match_run, + &alignment_costs.secondary_costs, + ), + jump: TsJumpLowerBounds::new(max_n, max_match_run, &alignment_costs), + alignment_costs, + } + } +} + +impl ChainingLowerBounds { + pub fn primary_lower_bound(&self, gap1: usize, gap2: usize) -> Cost { + self.primary.lower_bound(gap1, gap2) + } + + pub fn secondary_lower_bound(&self, gap1: usize, gap2: usize) -> Cost { + self.secondary.lower_bound(gap1, gap2) + } + + pub fn jump_12_lower_bound(&self, descendant_gap: usize) -> Cost { + self.jump.lower_bound_12(descendant_gap) + } + + pub fn jump_34_lower_bound(&self, descendant_gap: usize) -> Cost { + self.jump.lower_bound_34(descendant_gap) + } +} + +impl ChainingLowerBounds { + pub fn primary(&self) -> &GapAffineLowerBounds { + &self.primary + } + + pub fn secondary(&self) -> &GapAffineLowerBounds { + &self.secondary + } + + pub fn jump(&self) -> &TsJumpLowerBounds { + &self.jump + } + + pub fn alignment_costs(&self) -> &AlignmentCosts { + &self.alignment_costs + } +} diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs index 8e1d94e..60ad212 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs @@ -1,5 +1,6 @@ use generic_a_star::{AStar, AStarNode, cost::AStarCost}; use ndarray::{Array1, Array2}; +use serde::{Deserialize, Serialize}; use crate::{chaining_lower_bounds::gap_affine::algo::Context, costs::GapAffineCosts}; @@ -7,6 +8,7 @@ mod algo; #[cfg(test)] mod tests; +#[derive(Serialize, Deserialize)] pub struct GapAffineLowerBounds { 
lower_bounds: Array2, variable_gap2_lower_bounds: Array1, diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs index a65e583..402d606 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs @@ -1,11 +1,13 @@ use generic_a_star::cost::AStarCost; use ndarray::Array1; +use serde::{Deserialize, Serialize}; use crate::{chaining_lower_bounds::gap_affine::GapAffineLowerBounds, costs::AlignmentCosts}; #[cfg(test)] mod tests; +#[derive(Serialize, Deserialize)] pub struct TsJumpLowerBounds { lower_bounds_12: Array1, lower_bounds_34: Array1, diff --git a/lib_ts_chainalign/src/costs.rs b/lib_ts_chainalign/src/costs.rs index a365c91..edbfc8c 100644 --- a/lib_ts_chainalign/src/costs.rs +++ b/lib_ts_chainalign/src/costs.rs @@ -1,13 +1,18 @@ use std::ops::Range; use num_traits::Zero; +use serde::{Deserialize, Serialize}; +mod compat; + +#[derive(Serialize, Deserialize)] pub struct GapAffineCosts { pub substitution: Cost, pub gap_open: Cost, pub gap_extend: Cost, } +#[derive(Serialize, Deserialize)] pub struct TsLimits { pub jump_12: Range, pub jump_34: Range, @@ -15,6 +20,7 @@ pub struct TsLimits { pub ancestor_gap: Range, } +#[derive(Serialize, Deserialize)] pub struct AlignmentCosts { pub primary_costs: GapAffineCosts, pub secondary_costs: GapAffineCosts, diff --git a/lib_ts_chainalign/src/costs/compat.rs b/lib_ts_chainalign/src/costs/compat.rs new file mode 100644 index 0000000..9999bc7 --- /dev/null +++ b/lib_ts_chainalign/src/costs/compat.rs @@ -0,0 +1,60 @@ +//! Functions for making tschainalign compatible with tsalign types. 
+ +use compact_genome::interface::alphabet::Alphabet; +use generic_a_star::cost::AStarCost; +use lib_tsalign::{ + a_star_aligner::template_switch_distance::TemplateSwitchDirection, + config::TemplateSwitchConfig, costs::gap_affine::GapAffineAlignmentCostTable, +}; + +use crate::costs::{AlignmentCosts, GapAffineCosts, TsLimits}; + +impl From> + for AlignmentCosts +{ + fn from(value: TemplateSwitchConfig) -> Self { + let value = &value; + assert!(value.left_flank_length == 0 && value.right_flank_length == 0); + + let ts_base_cost = value.base_cost.qqr; + assert_eq!(ts_base_cost, value.base_cost.qrr); + assert_eq!(ts_base_cost, value.base_cost.rqr); + assert_eq!(ts_base_cost, value.base_cost.rrr); + + Self { + primary_costs: (&value.primary_edit_costs).into(), + secondary_costs: value + .secondary_edit_costs(TemplateSwitchDirection::Reverse) + .into(), + ts_base_cost, + ts_limits: TsLimits { + jump_12: value.offset_costs.zero_range().unwrap(), + // tsalign costs do not support limiting this. 
+ jump_34: isize::MIN..isize::MAX, + length_23: value.length_costs.zero_range().unwrap(), + ancestor_gap: value.reverse_anti_primary_gap_costs.zero_range().unwrap(), + }, + } + } +} + +impl + From<&'_ GapAffineAlignmentCostTable> for GapAffineCosts +{ + fn from(value: &GapAffineAlignmentCostTable) -> Self { + assert_eq!(value.unique_match_cost(), Some(Cost::zero())); + Self { + substitution: value.unique_substitution_cost().unwrap(), + gap_open: value.unique_gap_open_cost().unwrap(), + gap_extend: value.unique_gap_extend_cost().unwrap(), + } + } +} + +impl From> + for GapAffineCosts +{ + fn from(value: GapAffineAlignmentCostTable) -> Self { + (&value).into() + } +} diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index bce1629..5f228bc 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -1,27 +1,38 @@ -use generic_a_star::cost::AStarCost; - -use crate::{ - chaining_lower_bounds::{gap_affine::GapAffineLowerBounds, ts_jump::TsJumpLowerBounds}, - costs::AlignmentCosts, +use generic_a_star::cost::U32Cost; +use lib_tsalign::a_star_aligner::{ + alignment_geometry::AlignmentRange, alignment_result::AlignmentResult, + template_switch_distance::AlignmentType, }; +use crate::{chaining_lower_bounds::ChainingLowerBounds, costs::AlignmentCosts}; + pub mod alignment; +pub mod chaining_cost_function; pub mod chaining_lower_bounds; pub mod costs; pub mod exact_chaining; -#[expect(dead_code)] -fn compute_lower_bounds( +/// Perform preprocessing for tschainalign. +/// +/// * `max_n` is the maximum sequence length that the lower bounds should support. +/// * `max_match_run` is the maximum consecutive sequence of matches that is allowed. +/// Set this to `k-1`, if the anchors are `k`-mers. +/// * `alignment_costs` is the cost function for the alignment. 
+pub fn preprocess( max_n: usize, max_match_run: u32, - costs: &AlignmentCosts, -) { - let gap_affine_lower_bounds = - GapAffineLowerBounds::new(max_n, max_match_run, &costs.primary_costs); - let ts_jump_lower_bounds = TsJumpLowerBounds::new(max_n, max_match_run, costs); + alignment_costs: AlignmentCosts, +) -> ChainingLowerBounds { + ChainingLowerBounds::new(max_n, max_match_run, alignment_costs) +} - // Remove dead code warnings - gap_affine_lower_bounds.lower_bound(0, 0); - ts_jump_lower_bounds.lower_bound_12(0); - ts_jump_lower_bounds.lower_bound_34(0); +pub fn align( + _reference: &[u8], + _query: &[u8], + _range: AlignmentRange, + _reference_name: &str, + _query_name: &str, + _chaining_lower_bounds: &ChainingLowerBounds, +) -> AlignmentResult { + todo!() } diff --git a/lib_tsalign/src/costs/cost_function.rs b/lib_tsalign/src/costs/cost_function.rs index f02b77e..2cf9ce0 100644 --- a/lib_tsalign/src/costs/cost_function.rs +++ b/lib_tsalign/src/costs/cost_function.rs @@ -1,4 +1,4 @@ -use std::ops::{Add, Bound, RangeBounds, Sub}; +use std::ops::{Add, Bound, Range, RangeBounds, Sub}; use num_traits::{Bounded, One, Zero}; @@ -175,6 +175,46 @@ impl CostFunction CostFunction { + /// Returns the unique range at which this cost function is zero, if it is unique. + /// + /// Also, if there is a cost value that is neither zero nor infinite, `None` is returned. 
+ pub fn zero_range(&self) -> Option> { + let mut function = self.function.iter(); + + let first = function.next().unwrap(); + let start = if first.1.is_zero() { + SourceType::min_value() + } else if first.1 == Cost::max_value() { + if let Some(first) = function.next() { + if first.1.is_zero() { + first.0.clone() + } else { + return None; + } + } else { + return None; + } + } else { + return None; + }; + let end = if let Some(last) = function.next() { + if last.1 == Cost::max_value() { + last.0.clone() + } else { + return None; + } + } else { + SourceType::max_value() + }; + if function.next().is_some() { + None + } else { + Some(start..end) + } + } +} + impl TryFrom> for CostFunction { type Error = Error; fn try_from(function: Vec<(SourceType, Cost)>) -> Result { diff --git a/lib_tsalign/src/costs/gap_affine.rs b/lib_tsalign/src/costs/gap_affine.rs index abe6bb3..dfd4f1b 100644 --- a/lib_tsalign/src/costs/gap_affine.rs +++ b/lib_tsalign/src/costs/gap_affine.rs @@ -228,6 +228,67 @@ impl GapAffineAlignmentCostTable Option { + let mut match_costs = + AlphabetType::iter().map(|character| self.match_cost(character.clone(), character)); + let match_cost = match_costs.next().unwrap(); + for other_match_cost in match_costs { + if other_match_cost != match_cost { + return None; + } + } + Some(match_cost) + } + + /// Return the unique substitution cost, if it is unique. + pub fn unique_substitution_cost(&self) -> Option { + let mut substitution_costs = AlphabetType::iter().flat_map(|c1| { + iter::repeat(c1) + .zip(AlphabetType::iter()) + .filter_map(|(c1, c2)| { + if c1 != c2 { + Some(self.substitution_cost(c1.clone(), c2)) + } else { + None + } + }) + }); + let substitution_cost = substitution_costs.next().unwrap(); + for other_substitution_cost in substitution_costs { + if other_substitution_cost != substitution_cost { + return None; + } + } + Some(substitution_cost) + } + + /// Return the unique gap open cost, if it is unique. 
+ pub fn unique_gap_open_cost(&self) -> Option { + let mut gap_open_costs = + AlphabetType::iter().map(|character| self.gap_open_cost(character)); + let gap_open_cost = gap_open_costs.next().unwrap(); + for other_gap_open_cost in gap_open_costs { + if other_gap_open_cost != gap_open_cost { + return None; + } + } + Some(gap_open_cost) + } + + /// Return the unique gap extend cost, if it is unique. + pub fn unique_gap_extend_cost(&self) -> Option { + let mut gap_extend_costs = + AlphabetType::iter().map(|character| self.gap_extend_cost(character)); + let gap_extend_cost = gap_extend_costs.next().unwrap(); + for other_gap_extend_cost in gap_extend_costs { + if other_gap_extend_cost != gap_extend_cost { + return None; + } + } + Some(gap_extend_cost) + } } fn vec_into_min(mut vec: Vec) -> Vec { diff --git a/test_files/config/chainalign/config.tsa b/test_files/config/chainalign/config.tsa new file mode 100644 index 0000000..5b7ab93 --- /dev/null +++ b/test_files/config/chainalign/config.tsa @@ -0,0 +1,127 @@ +# Limits + +left_flank_length = 0 +right_flank_length = 0 + +# Base Cost + +rrf_cost = 31 +rqf_cost = 30 +qrf_cost = 30 +qqf_cost = 31 +rrr_cost = 2 +rqr_cost = 2 +qrr_cost = 2 +qqr_cost = 2 + +# Jump Costs + +Offset + -inf -300 301 + inf 0 inf + +Length + 0 10 200 + inf 0 inf + +LengthDifference + -inf -100 -20 -10 11 21 101 + inf 0 0 0 0 0 inf + +ForwardAntiPrimaryGap + -inf 1 + 0 inf + +ReverseAntiPrimaryGap + -inf + 0 + +# Primary Edit Costs + +SubstitutionCostTable + | A C G T +--+------------ +A | 0 2 2 2 +C | 2 0 2 2 +G | 2 2 0 2 +T | 2 2 2 0 + +GapOpenCostVector + A C G T + 3 3 3 3 + +GapExtendCostVector + A C G T + 1 1 1 1 + +# Secondary Forward Edit Costs + +SubstitutionCostTable + | A C G T +--+------------ +A | 0 4 4 4 +C | 4 0 4 4 +G | 4 4 0 4 +T | 4 4 4 0 + +GapOpenCostVector + A C G T + 3 3 3 3 + +GapExtendCostVector + A C G T + 2 2 2 2 + +# Secondary Reverse Edit Costs + +SubstitutionCostTable + | A C G T +--+------------ +A | 0 4 4 4 +C | 4 0 4 4 +G 
| 4 4 0 4 +T | 4 4 4 0 + +GapOpenCostVector + A C G T + 3 3 3 3 + +GapExtendCostVector + A C G T + 2 2 2 2 + +# Left Flank Edit Costs + +SubstitutionCostTable + | A C G T +--+------------ +A | 0 3 3 3 +C | 3 0 3 3 +G | 3 3 0 3 +T | 3 3 3 0 + +GapOpenCostVector + A C G T + 4 4 4 4 + +GapExtendCostVector + A C G T + 1 1 1 1 + +# Right Flank Edit Costs + +SubstitutionCostTable + | A C G T +--+------------ +A | 0 3 3 3 +C | 3 0 3 3 +G | 3 3 0 3 +T | 3 3 3 0 + +GapOpenCostVector + A C G T + 4 4 4 4 + +GapExtendCostVector + A C G T + 1 1 1 1 diff --git a/test_files/test_chainalign.sh b/test_files/test_chainalign.sh new file mode 100755 index 0000000..0b4ffe6 --- /dev/null +++ b/test_files/test_chainalign.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +cargo run -- align --alignment-method a-star-chain-ts -c test_files/config/chainalign --alphabet dna -p "$1" \ No newline at end of file diff --git a/test_files/test_chainalign_release.sh b/test_files/test_chainalign_release.sh new file mode 100755 index 0000000..9bbb881 --- /dev/null +++ b/test_files/test_chainalign_release.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +cargo run --release -- align --alignment-method a-star-chain-ts -c test_files/config/chainalign --alphabet dna -p "$1" diff --git a/tsalign/Cargo.toml b/tsalign/Cargo.toml index 402cf07..9c33a69 100644 --- a/tsalign/Cargo.toml +++ b/tsalign/Cargo.toml @@ -12,6 +12,7 @@ repository.workspace = true lib_tsalign = { version = "0.19.1", path = "../lib_tsalign", features = [ "serde", ] } +lib_ts_chainalign = { version = "0.1.0", path = "../lib_ts_chainalign" } lib_tsshow = { version = "0.19.1", path = "../lib_tsshow" } clap = { version = "4.5.16", features = ["derive"] } compact-genome = { workspace = true, features = ["io"] } @@ -22,3 +23,5 @@ log.workspace = true simplelog = "0.12.2" anyhow = "1.0.97" utf8-chars = "3.0.5" +sha = "1.0.3" +bincode = { version = "2.0.1", features = ["serde"] } diff --git a/tsalign/src/align.rs b/tsalign/src/align.rs index eab1df8..c1c10c8 100644 --- 
a/tsalign/src/align.rs +++ b/tsalign/src/align.rs @@ -41,8 +41,12 @@ use template_switch_distance_type_selectors::{ align_a_star_template_switch_distance, }; -use crate::align::fasta_parser::{parse_pair_fasta_file, parse_single_fasta_file}; +use crate::align::{ + a_star_chain_ts::align_a_star_chain_ts, + fasta_parser::{parse_pair_fasta_file, parse_single_fasta_file}, +}; +mod a_star_chain_ts; mod fasta_parser; mod template_switch_distance_type_selectors; @@ -58,6 +62,10 @@ pub struct Cli { #[clap(long, short = 'o')] output: Option, + /// The directory in which preprocessed data is stored. + #[clap(long)] + cache_directory: Option, + /// The alphabet present in the input files. /// /// This must also match the alphabet used in the config. @@ -79,6 +87,12 @@ pub struct Cli { #[clap(long, default_value = "a-star-template-switch")] alignment_method: AlignmentMethod, + /// k-mer size for tschainalign. + /// + /// If it is not specified, it is inferred from the sequence lengths. + #[clap(short)] + k: Option, + #[clap(long, default_value = "anti-diagonal")] ts_node_ord_strategy: TemplateSwitchNodeOrdStrategySelector, @@ -197,6 +211,7 @@ enum AlignmentMethod { Matrix, AStarGapAffine, AStarTemplateSwitch, + AStarChainTS, } #[derive(Debug, Clone, Eq, PartialEq, ValueEnum)] @@ -219,6 +234,7 @@ pub fn cli(cli: Cli) -> Result<()> { .unwrap(); if cli.alignment_method != AlignmentMethod::AStarTemplateSwitch + && cli.alignment_method != AlignmentMethod::AStarChainTS && cli.alphabet != InputAlphabet::Dna { // Only A*-TS algo supports alphabets other than DNA. 
@@ -354,6 +370,14 @@ fn execute_with_alphabet( &format!("{} {}", reference_record.id, reference_record.comment), &format!("{} {}", query_record.id, query_record.comment), ), + AlignmentMethod::AStarChainTS => align_a_star_chain_ts( + cli, + reference_sequence, + query_sequence, + range, + &format!("{} {}", reference_record.id, reference_record.comment), + &format!("{} {}", query_record.id, query_record.comment), + ), } Ok(()) diff --git a/tsalign/src/align/a_star_chain_ts.rs b/tsalign/src/align/a_star_chain_ts.rs new file mode 100644 index 0000000..f9ccf34 --- /dev/null +++ b/tsalign/src/align/a_star_chain_ts.rs @@ -0,0 +1,109 @@ +use compact_genome::interface::{alphabet::Alphabet, sequence::GenomeSequence}; +use lib_ts_chainalign::costs::AlignmentCosts; +use lib_tsalign::{ + a_star_aligner::alignment_geometry::AlignmentRange, config::TemplateSwitchConfig, +}; +use log::{info, warn}; +use sha::{ + sha1::Sha1, + utils::{Digest, DigestExt}, +}; +use std::{fmt::Debug, fs::File, path::PathBuf}; + +use crate::align::Cli; + +pub fn align_a_star_chain_ts< + AlphabetType: Alphabet + Debug + Clone + Eq, + SubsequenceType: GenomeSequence + ?Sized, +>( + cli: Cli, + reference: &SubsequenceType, + query: &SubsequenceType, + range: AlignmentRange, + reference_name: &str, + query_name: &str, +) { + let mut config_path = cli.configuration_directory.clone(); + info!("Loading alignment config directory {config_path:?}"); + + config_path.push("config.tsa"); + let config_file = std::io::BufReader::new( + std::fs::File::open(&config_path) + .unwrap_or_else(|error| panic!("Error opening config file {config_path:?}: {error}")), + ); + let alignment_costs = TemplateSwitchConfig::::read_plain(config_file) + .unwrap_or_else(|error| panic!("Error parsing template switch config:\n{error}")); + let alignment_costs: AlignmentCosts<_> = alignment_costs.into(); + + let cache_directory = cli.cache_directory.clone().unwrap_or_else(|| { + warn!("No cache directory specified, dropping files into 
current working directory."); + PathBuf::new() + }); + let max_n = 1 << (usize::BITS - (reference.len().max(query.len()) - 1).leading_zeros()); + let k = cli.k.unwrap_or_else(|| { + // This evaluates to ceil(log_2(length_sum)). + // The motivation is that there are length_sum k-mers, + // so for each to be different, k needs to be at least ceil(log_4(length_sum)). + // However, the birthday paradoxon states that for avoiding collisions, + // the amount of possible k-mers needs to grow in the square of the amount available k-mers, + // so we square that and arrive at ceil(log_2(length_sum)). + usize::BITS - ((reference.len() + query.len()) - 1).leading_zeros() + }); + info!("Using max_n = {max_n}"); + info!("Using k = {k}"); + let max_match_run = k - 1; + let cost_hash = Sha1::default() + .digest( + &bincode::serde::encode_to_vec(&alignment_costs, bincode::config::standard()).unwrap(), + ) + .to_hex(); + info!("Using cost_hash = {cost_hash}"); + + let cache_file: PathBuf = [ + cache_directory, + PathBuf::from(format!("{cost_hash}-{k}-{max_n}.tsc")), + ] + .iter() + .collect(); + + let chaining_lower_bounds = if let Ok(mut file) = File::open(&cache_file) { + info!("Loading preprocessed data from cache at {cache_file:?}"); + bincode::serde::decode_from_std_read(&mut file, bincode::config::standard()).unwrap() + } else { + info!("Preprocessing..."); + let chaining_lower_bounds = + lib_ts_chainalign::preprocess(max_n, max_match_run, alignment_costs); + + info!("Storing preprocessed data into cache at {cache_file:?}"); + let mut file = File::create(&cache_file).unwrap(); + bincode::serde::encode_into_std_write( + &chaining_lower_bounds, + &mut file, + bincode::config::standard(), + ) + .unwrap(); + chaining_lower_bounds + }; + + let reference = reference.clone_as_vec(); + let query = query.clone_as_vec(); + info!("Aligning..."); + let alignment = lib_ts_chainalign::align( + &reference, + &query, + range, + reference_name, + query_name, + &chaining_lower_bounds, + ); + 
info!("Finished aligning"); + + if let Some(output) = cli.output { + info!("Outputting alignment statistics to {output:?}"); + use std::io::Write; + let mut output = std::io::BufWriter::new(std::fs::File::create(output).unwrap()); + write!(output, "{}", toml::to_string(&alignment).unwrap()).unwrap(); + } + + println!("{alignment}"); +} diff --git a/tsalign/src/main.rs b/tsalign/src/main.rs index ac3f029..573e4a4 100644 --- a/tsalign/src/main.rs +++ b/tsalign/src/main.rs @@ -15,7 +15,7 @@ struct Cli { #[derive(clap::Subcommand)] enum Subcommand { - Align(align::Cli), + Align(Box), Show(show::Cli), } @@ -23,7 +23,7 @@ fn main() -> Result<()> { let cli = Cli::parse(); match cli.subcommand { - Subcommand::Align(cli) => align::cli(cli), + Subcommand::Align(cli) => align::cli(*cli), Subcommand::Show(cli) => show::cli(cli), } } From 47c9b906800e7a2fac07ca63c3f5714fd2879d05 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Wed, 3 Dec 2025 15:55:27 +0200 Subject: [PATCH 16/31] Implement anchor finding. 
--- Cargo.lock | 2 + lib_ts_chainalign/Cargo.toml | 2 + lib_ts_chainalign/src/alignment/sequences.rs | 8 + lib_ts_chainalign/src/anchors.rs | 235 ++++++++++++++++++ lib_ts_chainalign/src/anchors/kmer_matches.rs | 76 ++++++ lib_ts_chainalign/src/anchors/kmers.rs | 93 +++++++ .../src/chaining_lower_bounds.rs | 6 + lib_ts_chainalign/src/lib.rs | 28 ++- test_files/test_chainalign.sh | 2 +- test_files/test_chainalign_release.sh | 2 +- test_files/twin_10_eq.fa | 4 + test_files/twin_10_ts.fa | 8 + tsalign/src/align/a_star_chain_ts.rs | 21 +- 13 files changed, 473 insertions(+), 14 deletions(-) create mode 100644 lib_ts_chainalign/src/anchors.rs create mode 100644 lib_ts_chainalign/src/anchors/kmer_matches.rs create mode 100644 lib_ts_chainalign/src/anchors/kmers.rs create mode 100644 test_files/twin_10_eq.fa create mode 100644 test_files/twin_10_ts.fa diff --git a/Cargo.lock b/Cargo.lock index e7c0cd6..55440f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -492,7 +492,9 @@ version = "0.1.0" dependencies = [ "compact-genome", "generic_a_star", + "itertools", "lib_tsalign", + "log", "ndarray 0.17.1", "num-traits", "serde", diff --git a/lib_ts_chainalign/Cargo.toml b/lib_ts_chainalign/Cargo.toml index c1e0672..871c166 100644 --- a/lib_ts_chainalign/Cargo.toml +++ b/lib_ts_chainalign/Cargo.toml @@ -15,3 +15,5 @@ ndarray = { version = "0.17.1", features = ["serde"] } num-traits.workspace = true serde.workspace = true compact-genome.workspace = true +itertools = "0.14.0" +log.workspace = true diff --git a/lib_ts_chainalign/src/alignment/sequences.rs b/lib_ts_chainalign/src/alignment/sequences.rs index 4e3de11..7082591 100644 --- a/lib_ts_chainalign/src/alignment/sequences.rs +++ b/lib_ts_chainalign/src/alignment/sequences.rs @@ -44,4 +44,12 @@ impl AlignmentSequences { pub fn end(&self) -> AlignmentCoordinates { AlignmentCoordinates::new_primary(self.seq1.len(), self.seq2.len()) } + + pub fn seq1(&self) -> &[u8] { + &self.seq1 + } + + pub fn seq2(&self) -> &[u8] { + &self.seq2 + } 
} diff --git a/lib_ts_chainalign/src/anchors.rs b/lib_ts_chainalign/src/anchors.rs new file mode 100644 index 0000000..63e5918 --- /dev/null +++ b/lib_ts_chainalign/src/anchors.rs @@ -0,0 +1,235 @@ +use std::fmt::Display; + +use lib_tsalign::a_star_aligner::alignment_geometry::AlignmentRange; +use log::debug; + +use crate::{ + alignment::sequences::AlignmentSequences, + anchors::{ + kmer_matches::find_kmer_matches, + kmers::{Kmer, KmerStore}, + }, +}; + +pub mod kmer_matches; +pub mod kmers; + +pub struct Anchors { + pub primary_anchors: Vec, + pub secondary_anchors_11: Vec, + pub secondary_anchors_12: Vec, + pub secondary_anchors_21: Vec, + pub secondary_anchors_22: Vec, +} + +pub struct PrimaryAnchor { + pub seq1: usize, + pub seq2: usize, +} + +pub struct SecondaryAnchor { + /// Ancestor index in the reverse complement. + pub ancestor: usize, + /// Descendant index in the primary sequence. + pub descendant: usize, +} + +impl Anchors { + pub fn new( + sequences: &AlignmentSequences, + range: AlignmentRange, + k: u32, + rc_fn: &dyn Fn(u8) -> u8, + ) -> Self { + if k <= 8 { + Self::new_with_kmer_store::(sequences, range, k, rc_fn) + } else if k <= 16 { + Self::new_with_kmer_store::(sequences, range, k, rc_fn) + } else if k <= 32 { + Self::new_with_kmer_store::(sequences, range, k, rc_fn) + } else if k <= 64 { + Self::new_with_kmer_store::(sequences, range, k, rc_fn) + } else { + panic!("Only k-mer sizes up to 64 are supported, but got {k}"); + } + } + + fn new_with_kmer_store( + sequences: &AlignmentSequences, + range: AlignmentRange, + k: u32, + rc_fn: &dyn Fn(u8) -> u8, + ) -> Self { + let k = usize::try_from(k).unwrap(); + let s1 = sequences.seq1(); + let s2 = sequences.seq2(); + let s1_rc: Vec<_> = s1.iter().copied().rev().map(rc_fn).collect(); + let s2_rc: Vec<_> = s2.iter().copied().rev().map(rc_fn).collect(); + + let s1_kmer_count = + (range.reference_limit() - range.reference_offset() + 1).saturating_sub(k); + let s2_kmer_count = (range.query_limit() - 
range.query_offset() + 1).saturating_sub(k); + + // Compute k-mers. + let mut s1_kmers: Vec<_> = (range.reference_offset() + ..range.reference_offset() + s1_kmer_count) + .map(|offset| (Kmer::::from(&s1[offset..offset + k]), offset)) + .collect(); + s1_kmers.sort(); + let s1_kmers = s1_kmers; + let mut s2_kmers: Vec<_> = (range.query_offset()..range.query_offset() + s2_kmer_count) + .map(|offset| (Kmer::::from(&s2[offset..offset + k]), offset)) + .collect(); + s2_kmers.sort(); + let s2_kmers = s2_kmers; + let mut s1_rc_kmers: Vec<_> = (0..s1_rc.len() + 1 - k) + .map(|offset| (Kmer::::from(&s1_rc[offset..offset + k]), offset)) + .collect(); + s1_rc_kmers.sort(); + let s1_rc_kmers = s1_rc_kmers; + let mut s2_rc_kmers: Vec<_> = (0..s2_rc.len() + 1 - k) + .map(|offset| (Kmer::::from(&s2_rc[offset..offset + k]), offset)) + .collect(); + s2_rc_kmers.sort(); + let s2_rc_kmers = s2_rc_kmers; + + debug!("s1_kmers: {s1_kmers:?}"); + debug!("s2_kmers: {s2_kmers:?}"); + + // Compute anchors. + let mut primary_anchors: Vec<_> = find_kmer_matches(&s1_kmers, &s2_kmers) + .into_iter() + .map(|(seq1, seq2)| PrimaryAnchor { seq1, seq2 }) + .collect(); + let mut secondary_anchors_11: Vec<_> = find_kmer_matches(&s1_rc_kmers, &s1_kmers) + .into_iter() + .map(|(ancestor, descendant)| SecondaryAnchor { + ancestor, + descendant, + }) + .collect(); + let mut secondary_anchors_12: Vec<_> = find_kmer_matches(&s1_rc_kmers, &s2_kmers) + .into_iter() + .map(|(ancestor, descendant)| SecondaryAnchor { + ancestor, + descendant, + }) + .collect(); + let mut secondary_anchors_21: Vec<_> = find_kmer_matches(&s2_rc_kmers, &s1_kmers) + .into_iter() + .map(|(ancestor, descendant)| SecondaryAnchor { + ancestor, + descendant, + }) + .collect(); + let mut secondary_anchors_22: Vec<_> = find_kmer_matches(&s2_rc_kmers, &s2_kmers) + .into_iter() + .map(|(ancestor, descendant)| SecondaryAnchor { + ancestor, + descendant, + }) + .collect(); + + // Sort anchors. 
+ primary_anchors + .sort_unstable_by_key(|primary_anchor| primary_anchor.seq1.min(primary_anchor.seq2)); + secondary_anchors_11.sort_unstable_by_key(|secondary_anchor| { + secondary_anchor.ancestor.min(secondary_anchor.descendant) + }); + secondary_anchors_12.sort_unstable_by_key(|secondary_anchor| { + secondary_anchor.ancestor.min(secondary_anchor.descendant) + }); + secondary_anchors_21.sort_unstable_by_key(|secondary_anchor| { + secondary_anchor.ancestor.min(secondary_anchor.descendant) + }); + secondary_anchors_22.sort_unstable_by_key(|secondary_anchor| { + secondary_anchor.ancestor.min(secondary_anchor.descendant) + }); + + Self { + primary_anchors, + secondary_anchors_11, + secondary_anchors_12, + secondary_anchors_21, + secondary_anchors_22, + } + } +} + +impl Display for Anchors { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "P: [")?; + let mut once = true; + for primary_anchor in &self.primary_anchors { + if once { + once = false; + } else { + write!(f, ", ")?; + } + write!(f, "{primary_anchor}")?; + } + writeln!(f, "]")?; + + write!(f, "S11: [")?; + let mut once = true; + for secondary_anchor in &self.secondary_anchors_11 { + if once { + once = false; + } else { + write!(f, ", ")?; + } + write!(f, "{secondary_anchor}")?; + } + writeln!(f, "]")?; + + write!(f, "S12: [")?; + let mut once = true; + for secondary_anchor in &self.secondary_anchors_12 { + if once { + once = false; + } else { + write!(f, ", ")?; + } + write!(f, "{secondary_anchor}")?; + } + writeln!(f, "]")?; + + write!(f, "S21: [")?; + let mut once = true; + for secondary_anchor in &self.secondary_anchors_21 { + if once { + once = false; + } else { + write!(f, ", ")?; + } + write!(f, "{secondary_anchor}")?; + } + writeln!(f, "]")?; + + write!(f, "S22: [")?; + let mut once = true; + for secondary_anchor in &self.secondary_anchors_22 { + if once { + once = false; + } else { + write!(f, ", ")?; + } + write!(f, "{secondary_anchor}")?; + } + write!(f, "]")?; + 
+ Ok(()) + } +} + +impl Display for PrimaryAnchor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "({}, {})", self.seq1, self.seq2) + } +} + +impl Display for SecondaryAnchor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "({}, {})", self.ancestor, self.descendant) + } +} diff --git a/lib_ts_chainalign/src/anchors/kmer_matches.rs b/lib_ts_chainalign/src/anchors/kmer_matches.rs new file mode 100644 index 0000000..4c16eeb --- /dev/null +++ b/lib_ts_chainalign/src/anchors/kmer_matches.rs @@ -0,0 +1,76 @@ +use std::mem; + +use itertools::iproduct; +use num_traits::bounds::LowerBounded; + +use crate::anchors::kmers::{Kmer, KmerStore}; + +struct Cluster { + indexes_a: Vec, + indexes_b: Vec, + kmer: Kmer, +} + +impl Cluster { + fn new(kmer: Kmer) -> Self { + Self { + indexes_a: Vec::new(), + indexes_b: Vec::new(), + kmer, + } + } + + fn reset(&mut self, kmer: Kmer) -> impl Iterator { + self.kmer = kmer; + let indexes_a = mem::take(&mut self.indexes_a); + let indexes_b = mem::take(&mut self.indexes_b); + + iproduct!(indexes_a, indexes_b) + } +} + +pub fn find_kmer_matches( + mut a: &[(Kmer, usize)], + mut b: &[(Kmer, usize)], +) -> Vec<(usize, usize)> { + debug_assert!(a.is_sorted()); + debug_assert!(b.is_sorted()); + + let mut result = Vec::new(); + let mut cluster = Cluster::new(Kmer::min_value()); + + while let (Some((kmer_a, index_a)), Some((kmer_b, index_b))) = (a.first(), b.first()) { + if kmer_a < kmer_b { + a = &a[1..]; + + if kmer_a != &cluster.kmer { + result.extend(cluster.reset(*kmer_a)); + } + cluster.indexes_a.push(*index_a); + } else { + b = &b[1..]; + + if kmer_b != &cluster.kmer { + result.extend(cluster.reset(*kmer_b)); + } + cluster.indexes_b.push(*index_b); + } + } + + for (kmer, index) in a { + if kmer != &cluster.kmer { + result.extend(cluster.reset(*kmer)); + } + cluster.indexes_a.push(*index); + } + + for (kmer, index) in b { + if kmer != &cluster.kmer { + 
result.extend(cluster.reset(*kmer)); + } + cluster.indexes_b.push(*index); + } + + result.extend(cluster.reset(cluster.kmer)); + result +} diff --git a/lib_ts_chainalign/src/anchors/kmers.rs b/lib_ts_chainalign/src/anchors/kmers.rs new file mode 100644 index 0000000..899d923 --- /dev/null +++ b/lib_ts_chainalign/src/anchors/kmers.rs @@ -0,0 +1,93 @@ +use std::{ + fmt::Debug, + mem, + ops::{BitOrAssign, ShlAssign, ShrAssign}, +}; + +use num_traits::{PrimInt, bounds::LowerBounded}; + +pub trait KmerStore: + PrimInt + ShlAssign + ShrAssign + BitOrAssign + From + Into +{ +} + +#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub struct Kmer { + kmer: Store, +} + +pub type Kmer64 = Kmer; +pub type Kmer32 = Kmer; +pub type Kmer16 = Kmer; +pub type Kmer8 = Kmer; + +impl KmerStore for u128 {} +impl KmerStore for u64 {} +impl KmerStore for u32 {} +impl KmerStore for u16 {} + +fn char_to_bits>(char: u8) -> Store { + match char { + b'A' | b'a' => 0, + b'C' | b'c' => 1, + b'G' | b'g' => 2, + b'T' | b't' => 3, + char => panic!("Unsupported character: {}", char as char), + } + .into() +} + +fn bits_to_char>(bits: Store) -> u8 { + match bits.into() & 3 { + 0 => b'A', + 1 => b'C', + 2 => b'G', + 3 => b'T', + _ => unreachable!(), + } +} + +impl From<&[u8]> for Kmer { + fn from(value: &[u8]) -> Self { + assert!(value.len() <= mem::size_of::() * 4); + + let mut store = Store::zero(); + for c in value { + store <<= 2; + store |= char_to_bits(*c); + } + Self { kmer: store } + } +} + +impl LowerBounded for Kmer { + fn min_value() -> Self { + Self { + kmer: Store::min_value(), + } + } +} + +impl Kmer { + fn to_vec(self, k: usize) -> Vec { + assert!(k <= mem::size_of::() * 4); + let mut result = Vec::new(); + let mut kmer = self.kmer; + for _ in 0..k { + result.push(bits_to_char(kmer)); + kmer >>= 2; + } + result.reverse(); + result + } + + fn to_string(self, k: usize) -> String { + String::from_utf8_lossy(&self.to_vec(k)).to_string() + } +} + +impl Debug for Kmer { + fn fmt(&self, 
f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string(mem::size_of::() * 4)) + } +} diff --git a/lib_ts_chainalign/src/chaining_lower_bounds.rs b/lib_ts_chainalign/src/chaining_lower_bounds.rs index e8c941d..7c0bc64 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds.rs @@ -17,6 +17,7 @@ pub struct ChainingLowerBounds { secondary: GapAffineLowerBounds, jump: TsJumpLowerBounds, alignment_costs: AlignmentCosts, + max_match_run: u32, } impl ChainingLowerBounds { @@ -40,6 +41,7 @@ impl ChainingLowerBounds { ), jump: TsJumpLowerBounds::new(max_n, max_match_run, &alignment_costs), alignment_costs, + max_match_run, } } } @@ -78,4 +80,8 @@ impl ChainingLowerBounds { pub fn alignment_costs(&self) -> &AlignmentCosts { &self.alignment_costs } + + pub fn max_match_run(&self) -> u32 { + self.max_match_run + } } diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index 5f228bc..a0bc442 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -3,10 +3,15 @@ use lib_tsalign::a_star_aligner::{ alignment_geometry::AlignmentRange, alignment_result::AlignmentResult, template_switch_distance::AlignmentType, }; +use log::{debug, info}; -use crate::{chaining_lower_bounds::ChainingLowerBounds, costs::AlignmentCosts}; +use crate::{ + alignment::sequences::AlignmentSequences, anchors::Anchors, + chaining_lower_bounds::ChainingLowerBounds, costs::AlignmentCosts, +}; pub mod alignment; +pub mod anchors; pub mod chaining_cost_function; pub mod chaining_lower_bounds; pub mod costs; @@ -27,12 +32,25 @@ pub fn preprocess( } pub fn align( - _reference: &[u8], - _query: &[u8], - _range: AlignmentRange, + reference: Vec, + query: Vec, + range: AlignmentRange, + rc_fn: &dyn Fn(u8) -> u8, _reference_name: &str, _query_name: &str, - _chaining_lower_bounds: &ChainingLowerBounds, + chaining_lower_bounds: &ChainingLowerBounds, ) -> AlignmentResult { + debug!( + "Reference 
sequence: {}", + String::from_utf8_lossy(&reference) + ); + debug!("Query sequence: {}", String::from_utf8_lossy(&query)); + info!("Aligning on subsequence {}", range); + + let sequences = AlignmentSequences::new(reference, query); + let k = chaining_lower_bounds.max_match_run() + 1; + + let anchors = Anchors::new(&sequences, range, k, rc_fn); + println!("Anchors:\n{anchors}"); todo!() } diff --git a/test_files/test_chainalign.sh b/test_files/test_chainalign.sh index 0b4ffe6..08f85fc 100755 --- a/test_files/test_chainalign.sh +++ b/test_files/test_chainalign.sh @@ -1,3 +1,3 @@ #!/bin/bash -cargo run -- align --alignment-method a-star-chain-ts -c test_files/config/chainalign --alphabet dna -p "$1" \ No newline at end of file +cargo run -- align --log-level debug --alignment-method a-star-chain-ts -c test_files/config/chainalign --alphabet dna -p "$@" \ No newline at end of file diff --git a/test_files/test_chainalign_release.sh b/test_files/test_chainalign_release.sh index 9bbb881..0f2f77b 100755 --- a/test_files/test_chainalign_release.sh +++ b/test_files/test_chainalign_release.sh @@ -1,3 +1,3 @@ #!/bin/bash -cargo run --release -- align --alignment-method a-star-chain-ts -c test_files/config/chainalign --alphabet dna -p "$1" +cargo run --release -- align --alignment-method a-star-chain-ts -c test_files/config/chainalign --alphabet dna -p "$@" diff --git a/test_files/twin_10_eq.fa b/test_files/twin_10_eq.fa new file mode 100644 index 0000000..59cde2b --- /dev/null +++ b/test_files/twin_10_eq.fa @@ -0,0 +1,4 @@ +>reference +ACGCAGATGA +>query +ACGCAGATGA \ No newline at end of file diff --git a/test_files/twin_10_ts.fa b/test_files/twin_10_ts.fa new file mode 100644 index 0000000..1643ec6 --- /dev/null +++ b/test_files/twin_10_ts.fa @@ -0,0 +1,8 @@ +>reference +AC +ATCTGC +GA +>query +AC +GCAGAT +GA \ No newline at end of file diff --git a/tsalign/src/align/a_star_chain_ts.rs b/tsalign/src/align/a_star_chain_ts.rs index f9ccf34..83b372c 100644 --- 
a/tsalign/src/align/a_star_chain_ts.rs +++ b/tsalign/src/align/a_star_chain_ts.rs @@ -1,9 +1,12 @@ -use compact_genome::interface::{alphabet::Alphabet, sequence::GenomeSequence}; +use compact_genome::interface::{ + alphabet::{Alphabet, AlphabetCharacter}, + sequence::GenomeSequence, +}; use lib_ts_chainalign::costs::AlignmentCosts; use lib_tsalign::{ a_star_aligner::alignment_geometry::AlignmentRange, config::TemplateSwitchConfig, }; -use log::{info, warn}; +use log::{debug, info, warn}; use sha::{ sha1::Sha1, utils::{Digest, DigestExt}, @@ -49,7 +52,7 @@ pub fn align_a_star_chain_ts< // so we square that and arrive at ceil(log_2(length_sum)). usize::BITS - ((reference.len() + query.len()) - 1).leading_zeros() }); - info!("Using max_n = {max_n}"); + debug!("Using max_n = {max_n}"); info!("Using k = {k}"); let max_match_run = k - 1; let cost_hash = Sha1::default() @@ -57,7 +60,7 @@ pub fn align_a_star_chain_ts< &bincode::serde::encode_to_vec(&alignment_costs, bincode::config::standard()).unwrap(), ) .to_hex(); - info!("Using cost_hash = {cost_hash}"); + debug!("Using cost_hash = {cost_hash}"); let cache_file: PathBuf = [ cache_directory, @@ -87,11 +90,15 @@ pub fn align_a_star_chain_ts< let reference = reference.clone_as_vec(); let query = query.clone_as_vec(); - info!("Aligning..."); let alignment = lib_ts_chainalign::align( - &reference, - &query, + reference, + query, range, + &|c| { + AlphabetType::character_to_ascii( + AlphabetType::ascii_to_character(c).unwrap().complement(), + ) + }, reference_name, query_name, &chaining_lower_bounds, From af33335bbb23029256f3e80408283161a0cee455 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Thu, 4 Dec 2025 12:38:03 +0200 Subject: [PATCH 17/31] Initialise chaining cost function with lower bounds. 
--- lib_ts_chainalign/src/anchors.rs | 242 +++++++++++++++--- lib_ts_chainalign/src/anchors/tests.rs | 62 +++++ .../src/chaining_cost_function.rs | 217 +++++++++++++++- 3 files changed, 479 insertions(+), 42 deletions(-) create mode 100644 lib_ts_chainalign/src/anchors/tests.rs diff --git a/lib_ts_chainalign/src/anchors.rs b/lib_ts_chainalign/src/anchors.rs index 63e5918..128a8db 100644 --- a/lib_ts_chainalign/src/anchors.rs +++ b/lib_ts_chainalign/src/anchors.rs @@ -4,7 +4,11 @@ use lib_tsalign::a_star_aligner::alignment_geometry::AlignmentRange; use log::debug; use crate::{ - alignment::sequences::AlignmentSequences, + alignment::{ + coordinates::AlignmentCoordinates, + sequences::AlignmentSequences, + ts_kind::{TsDescendant, TsKind}, + }, anchors::{ kmer_matches::find_kmer_matches, kmers::{Kmer, KmerStore}, @@ -13,25 +17,30 @@ use crate::{ pub mod kmer_matches; pub mod kmers; +#[cfg(test)] +mod tests; +#[derive(Debug, PartialEq, Eq)] pub struct Anchors { - pub primary_anchors: Vec, - pub secondary_anchors_11: Vec, - pub secondary_anchors_12: Vec, - pub secondary_anchors_21: Vec, - pub secondary_anchors_22: Vec, + pub primary: Vec, + pub secondary_11: Vec, + pub secondary_12: Vec, + pub secondary_21: Vec, + pub secondary_22: Vec, } +#[derive(Debug, PartialEq, Eq)] pub struct PrimaryAnchor { - pub seq1: usize, - pub seq2: usize, + seq1: usize, + seq2: usize, } +#[derive(Debug, PartialEq, Eq)] pub struct SecondaryAnchor { - /// Ancestor index in the reverse complement. - pub ancestor: usize, - /// Descendant index in the primary sequence. - pub descendant: usize, + /// Ancestor right index in the primary sequence. + ancestor: usize, + /// Descendant left index in the primary sequence. + descendant: usize, } impl Anchors { @@ -97,70 +106,209 @@ impl Anchors { debug!("s2_kmers: {s2_kmers:?}"); // Compute anchors. 
- let mut primary_anchors: Vec<_> = find_kmer_matches(&s1_kmers, &s2_kmers) + let mut primary: Vec<_> = find_kmer_matches(&s1_kmers, &s2_kmers) .into_iter() .map(|(seq1, seq2)| PrimaryAnchor { seq1, seq2 }) .collect(); - let mut secondary_anchors_11: Vec<_> = find_kmer_matches(&s1_rc_kmers, &s1_kmers) + let mut secondary_11: Vec<_> = find_kmer_matches(&s1_rc_kmers, &s1_kmers) .into_iter() .map(|(ancestor, descendant)| SecondaryAnchor { - ancestor, + ancestor: s1.len() - ancestor, descendant, }) .collect(); - let mut secondary_anchors_12: Vec<_> = find_kmer_matches(&s1_rc_kmers, &s2_kmers) + let mut secondary_12: Vec<_> = find_kmer_matches(&s1_rc_kmers, &s2_kmers) .into_iter() .map(|(ancestor, descendant)| SecondaryAnchor { - ancestor, + ancestor: s1.len() - ancestor, descendant, }) .collect(); - let mut secondary_anchors_21: Vec<_> = find_kmer_matches(&s2_rc_kmers, &s1_kmers) + let mut secondary_21: Vec<_> = find_kmer_matches(&s2_rc_kmers, &s1_kmers) .into_iter() .map(|(ancestor, descendant)| SecondaryAnchor { - ancestor, + ancestor: s2.len() - ancestor, descendant, }) .collect(); - let mut secondary_anchors_22: Vec<_> = find_kmer_matches(&s2_rc_kmers, &s2_kmers) + let mut secondary_22: Vec<_> = find_kmer_matches(&s2_rc_kmers, &s2_kmers) .into_iter() .map(|(ancestor, descendant)| SecondaryAnchor { - ancestor, + ancestor: s2.len() - ancestor, descendant, }) .collect(); // Sort anchors. 
- primary_anchors - .sort_unstable_by_key(|primary_anchor| primary_anchor.seq1.min(primary_anchor.seq2)); - secondary_anchors_11.sort_unstable_by_key(|secondary_anchor| { - secondary_anchor.ancestor.min(secondary_anchor.descendant) + primary.sort_unstable_by_key(|primary_anchor| { + ( + primary_anchor.seq1.min(primary_anchor.seq2), + primary_anchor.seq1, + primary_anchor.seq2, + ) + }); + secondary_11.sort_unstable_by_key(|secondary_anchor| { + ( + secondary_anchor.ancestor.min(secondary_anchor.descendant), + secondary_anchor.ancestor, + secondary_anchor.descendant, + ) }); - secondary_anchors_12.sort_unstable_by_key(|secondary_anchor| { - secondary_anchor.ancestor.min(secondary_anchor.descendant) + secondary_12.sort_unstable_by_key(|secondary_anchor| { + ( + secondary_anchor.ancestor.min(secondary_anchor.descendant), + secondary_anchor.ancestor, + secondary_anchor.descendant, + ) }); - secondary_anchors_21.sort_unstable_by_key(|secondary_anchor| { - secondary_anchor.ancestor.min(secondary_anchor.descendant) + secondary_21.sort_unstable_by_key(|secondary_anchor| { + ( + secondary_anchor.ancestor.min(secondary_anchor.descendant), + secondary_anchor.ancestor, + secondary_anchor.descendant, + ) }); - secondary_anchors_22.sort_unstable_by_key(|secondary_anchor| { - secondary_anchor.ancestor.min(secondary_anchor.descendant) + secondary_22.sort_unstable_by_key(|secondary_anchor| { + ( + secondary_anchor.ancestor.min(secondary_anchor.descendant), + secondary_anchor.ancestor, + secondary_anchor.descendant, + ) }); Self { - primary_anchors, - secondary_anchors_11, - secondary_anchors_12, - secondary_anchors_21, - secondary_anchors_22, + primary, + secondary_11, + secondary_12, + secondary_21, + secondary_22, + } + } +} + +impl PrimaryAnchor { + pub fn new(seq1: usize, seq2: usize) -> Self { + Self { seq1, seq2 } + } + + pub fn start(&self) -> AlignmentCoordinates { + AlignmentCoordinates::Primary { + a: self.seq1, + b: self.seq2, + } + } + + pub fn end(&self, k: usize) -> 
AlignmentCoordinates { + AlignmentCoordinates::Primary { + a: self.seq1 + k, + b: self.seq2 + k, + } + } + + pub fn chaining_gaps(&self, second: &Self, k: usize) -> Option<(usize, usize)> { + let gap_start = self.end(k); + let gap_end = second.start(); + + let gap1 = gap_end + .primary_ordinate_a() + .unwrap() + .checked_sub(gap_start.primary_ordinate_a().unwrap())?; + let gap2 = gap_end + .primary_ordinate_b() + .unwrap() + .checked_sub(gap_start.primary_ordinate_b().unwrap())?; + + Some((gap1, gap2)) + } + + pub fn chaining_jump_gap( + &self, + second: &SecondaryAnchor, + ts_kind: TsKind, + k: usize, + ) -> Option { + let gap_start = self.end(k); + let gap_end = second.start(ts_kind); + + let gap_start = match ts_kind.descendant { + TsDescendant::Seq1 => gap_start.primary_ordinate_a().unwrap(), + TsDescendant::Seq2 => gap_start.primary_ordinate_b().unwrap(), + }; + let gap_end = gap_end.secondary_ordinate_descendant().unwrap(); + + gap_end.checked_sub(gap_start) + } +} + +impl SecondaryAnchor { + pub fn new(ancestor: usize, descendant: usize) -> Self { + Self { + ancestor, + descendant, + } + } + + pub fn start(&self, ts_kind: TsKind) -> AlignmentCoordinates { + AlignmentCoordinates::Secondary { + ancestor: self.ancestor, + descendant: self.descendant, + ts_kind, } } + + pub fn end(&self, ts_kind: TsKind, k: usize) -> AlignmentCoordinates { + AlignmentCoordinates::Secondary { + ancestor: self.ancestor.checked_sub(k).unwrap(), + descendant: self.descendant + k, + ts_kind, + } + } + + pub fn chaining_gaps( + &self, + second: &Self, + ts_kind: TsKind, + k: usize, + ) -> Option<(usize, usize)> { + let gap_start = self.end(ts_kind, k); + let gap_end = second.start(ts_kind); + + let gap1 = gap_start + .secondary_ordinate_ancestor() + .unwrap() + .checked_sub(gap_end.secondary_ordinate_ancestor().unwrap())?; + let gap2 = gap_end + .secondary_ordinate_descendant() + .unwrap() + .checked_sub(gap_start.secondary_ordinate_descendant().unwrap())?; + + Some((gap1, gap2)) + } 
+ + pub fn chaining_jump_gap( + &self, + second: &PrimaryAnchor, + ts_kind: TsKind, + k: usize, + ) -> Option { + let gap_start = self.end(ts_kind, k); + let gap_end = second.start(); + + let gap_start = gap_start.secondary_ordinate_descendant().unwrap(); + let gap_end = match ts_kind.descendant { + TsDescendant::Seq1 => gap_end.primary_ordinate_a().unwrap(), + TsDescendant::Seq2 => gap_end.primary_ordinate_b().unwrap(), + }; + + gap_end.checked_sub(gap_start) + } } impl Display for Anchors { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "P: [")?; let mut once = true; - for primary_anchor in &self.primary_anchors { + for primary_anchor in &self.primary { if once { once = false; } else { @@ -172,7 +320,7 @@ impl Display for Anchors { write!(f, "S11: [")?; let mut once = true; - for secondary_anchor in &self.secondary_anchors_11 { + for secondary_anchor in &self.secondary_11 { if once { once = false; } else { @@ -184,7 +332,7 @@ impl Display for Anchors { write!(f, "S12: [")?; let mut once = true; - for secondary_anchor in &self.secondary_anchors_12 { + for secondary_anchor in &self.secondary_12 { if once { once = false; } else { @@ -196,7 +344,7 @@ impl Display for Anchors { write!(f, "S21: [")?; let mut once = true; - for secondary_anchor in &self.secondary_anchors_21 { + for secondary_anchor in &self.secondary_21 { if once { once = false; } else { @@ -208,7 +356,7 @@ impl Display for Anchors { write!(f, "S22: [")?; let mut once = true; - for secondary_anchor in &self.secondary_anchors_22 { + for secondary_anchor in &self.secondary_22 { if once { once = false; } else { @@ -233,3 +381,15 @@ impl Display for SecondaryAnchor { write!(f, "({}, {})", self.ancestor, self.descendant) } } + +impl From<(usize, usize)> for PrimaryAnchor { + fn from(value: (usize, usize)) -> Self { + Self::new(value.0, value.1) + } +} + +impl From<(usize, usize)> for SecondaryAnchor { + fn from(value: (usize, usize)) -> Self { + Self::new(value.0, value.1) + } 
+} diff --git a/lib_ts_chainalign/src/anchors/tests.rs b/lib_ts_chainalign/src/anchors/tests.rs new file mode 100644 index 0000000..03a8eab --- /dev/null +++ b/lib_ts_chainalign/src/anchors/tests.rs @@ -0,0 +1,62 @@ +use lib_tsalign::a_star_aligner::alignment_geometry::AlignmentRange; + +use crate::{ + alignment::sequences::AlignmentSequences, + anchors::{Anchors, PrimaryAnchor, SecondaryAnchor}, +}; + +fn rc_fn(c: u8) -> u8 { + match c { + b'A' => b'T', + b'C' => b'G', + b'G' => b'C', + b'T' => b'A', + c => unimplemented!("Unsupported character {c}"), + } +} + +#[test] +fn test_coordinates() { + let sequences = AlignmentSequences::new(b"ACAC".to_vec(), b"ACGT".to_vec()); + let range = AlignmentRange::new_complete(sequences.seq1().len(), sequences.seq2().len()); + let k = 2; + + let anchors = Anchors::new(&sequences, range, k, &rc_fn); + assert_eq!(anchors.primary, [(0, 0), (2, 0)].map(PrimaryAnchor::from)); + assert!(anchors.secondary_11.is_empty()); + assert_eq!( + anchors.secondary_12, + [(2, 2), (4, 2)].map(SecondaryAnchor::from) + ); + assert_eq!( + anchors.secondary_21, + [(4, 0), (4, 2)].map(SecondaryAnchor::from) + ); + assert_eq!( + anchors.secondary_22, + [(4, 0), (3, 1), (2, 2)].map(SecondaryAnchor::from) + ); +} + +#[test] +fn test_coordinates_rev() { + let sequences = AlignmentSequences::new(b"ACGT".to_vec(), b"ACAC".to_vec()); + let range = AlignmentRange::new_complete(sequences.seq1().len(), sequences.seq2().len()); + let k = 2; + + let anchors = Anchors::new(&sequences, range, k, &rc_fn); + assert_eq!(anchors.primary, [(0, 0), (0, 2)].map(PrimaryAnchor::from)); + assert!(anchors.secondary_22.is_empty()); + assert_eq!( + anchors.secondary_21, + [(2, 2), (4, 2)].map(SecondaryAnchor::from) + ); + assert_eq!( + anchors.secondary_12, + [(4, 0), (4, 2)].map(SecondaryAnchor::from) + ); + assert_eq!( + anchors.secondary_11, + [(4, 0), (3, 1), (2, 2)].map(SecondaryAnchor::from) + ); +} diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs 
b/lib_ts_chainalign/src/chaining_cost_function.rs index 3f9ee32..bdb969c 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -1 +1,216 @@ -pub struct ChainingCostFunction {} +use generic_a_star::cost::AStarCost; +use ndarray::{Array, Array2}; + +use crate::{ + alignment::ts_kind::TsKind, anchors::Anchors, chaining_lower_bounds::ChainingLowerBounds, +}; + +pub struct ChainingCostFunction { + primary: Array2, + secondary_11: Array2, + secondary_12: Array2, + secondary_21: Array2, + secondary_22: Array2, + jump_12_to_11: Array2, + jump_12_to_12: Array2, + jump_12_to_21: Array2, + jump_12_to_22: Array2, + jump_34_from_11: Array2, + jump_34_from_12: Array2, + jump_34_from_21: Array2, + jump_34_from_22: Array2, +} + +impl ChainingCostFunction { + pub fn new_from_lower_bounds( + chaining_lower_bounds: &ChainingLowerBounds, + anchors: &Anchors, + ) -> Self { + let k = usize::try_from(chaining_lower_bounds.max_match_run() + 1).unwrap(); + + let mut primary = Array2::from_elem( + (anchors.primary.len(), anchors.primary.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.primary.iter().enumerate() { + for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + if let Some((gap1, gap2)) = from_anchor.chaining_gaps(to_anchor, k) { + primary[[from_index, to_index]] = + chaining_lower_bounds.primary_lower_bound(gap1, gap2); + } + } + } + + let mut secondary_11 = Array2::from_elem( + (anchors.secondary_11.len(), anchors.secondary_11.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.secondary_11.iter().enumerate() { + for (to_index, to_anchor) in anchors.secondary_11.iter().enumerate() { + if let Some((gap1, gap2)) = from_anchor.chaining_gaps(to_anchor, TsKind::TS11, k) { + secondary_11[[from_index, to_index]] = + chaining_lower_bounds.secondary_lower_bound(gap1, gap2); + } + } + } + + let mut secondary_12 = Array2::from_elem( + (anchors.secondary_12.len(), 
anchors.secondary_12.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.secondary_12.iter().enumerate() { + for (to_index, to_anchor) in anchors.secondary_12.iter().enumerate() { + if let Some((gap1, gap2)) = from_anchor.chaining_gaps(to_anchor, TsKind::TS12, k) { + secondary_12[[from_index, to_index]] = + chaining_lower_bounds.secondary_lower_bound(gap1, gap2); + } + } + } + + let mut secondary_21 = Array2::from_elem( + (anchors.secondary_21.len(), anchors.secondary_21.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.secondary_21.iter().enumerate() { + for (to_index, to_anchor) in anchors.secondary_21.iter().enumerate() { + if let Some((gap1, gap2)) = from_anchor.chaining_gaps(to_anchor, TsKind::TS21, k) { + secondary_21[[from_index, to_index]] = + chaining_lower_bounds.secondary_lower_bound(gap1, gap2); + } + } + } + + let mut secondary_22 = Array2::from_elem( + (anchors.secondary_22.len(), anchors.secondary_22.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.secondary_22.iter().enumerate() { + for (to_index, to_anchor) in anchors.secondary_22.iter().enumerate() { + if let Some((gap1, gap2)) = from_anchor.chaining_gaps(to_anchor, TsKind::TS22, k) { + secondary_22[[from_index, to_index]] = + chaining_lower_bounds.secondary_lower_bound(gap1, gap2); + } + } + } + + let mut jump_12_to_11 = Array::from_elem( + (anchors.primary.len(), anchors.secondary_11.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.primary.iter().enumerate() { + for (to_index, to_anchor) in anchors.secondary_11.iter().enumerate() { + if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS11, k) { + jump_12_to_11[[from_index, to_index]] = + chaining_lower_bounds.jump_12_lower_bound(gap); + } + } + } + + let mut jump_12_to_12 = Array::from_elem( + (anchors.primary.len(), anchors.secondary_12.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in 
anchors.primary.iter().enumerate() { + for (to_index, to_anchor) in anchors.secondary_12.iter().enumerate() { + if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS12, k) { + jump_12_to_12[[from_index, to_index]] = + chaining_lower_bounds.jump_12_lower_bound(gap); + } + } + } + + let mut jump_12_to_21 = Array::from_elem( + (anchors.primary.len(), anchors.secondary_21.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.primary.iter().enumerate() { + for (to_index, to_anchor) in anchors.secondary_21.iter().enumerate() { + if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS21, k) { + jump_12_to_21[[from_index, to_index]] = + chaining_lower_bounds.jump_12_lower_bound(gap); + } + } + } + + let mut jump_12_to_22 = Array::from_elem( + (anchors.primary.len(), anchors.secondary_22.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.primary.iter().enumerate() { + for (to_index, to_anchor) in anchors.secondary_22.iter().enumerate() { + if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS22, k) { + jump_12_to_22[[from_index, to_index]] = + chaining_lower_bounds.jump_12_lower_bound(gap); + } + } + } + + let mut jump_34_from_11 = Array::from_elem( + (anchors.secondary_11.len(), anchors.primary.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.secondary_11.iter().enumerate() { + for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS11, k) { + jump_34_from_11[[from_index, to_index]] = + chaining_lower_bounds.jump_34_lower_bound(gap); + } + } + } + + let mut jump_34_from_12 = Array::from_elem( + (anchors.secondary_12.len(), anchors.primary.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.secondary_12.iter().enumerate() { + for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + if let Some(gap) = 
from_anchor.chaining_jump_gap(to_anchor, TsKind::TS12, k) { + jump_34_from_12[[from_index, to_index]] = + chaining_lower_bounds.jump_34_lower_bound(gap); + } + } + } + + let mut jump_34_from_21 = Array::from_elem( + (anchors.secondary_21.len(), anchors.primary.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.secondary_21.iter().enumerate() { + for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS21, k) { + jump_34_from_21[[from_index, to_index]] = + chaining_lower_bounds.jump_34_lower_bound(gap); + } + } + } + + let mut jump_34_from_22 = Array::from_elem( + (anchors.secondary_22.len(), anchors.primary.len()), + Cost::max_value(), + ); + for (from_index, from_anchor) in anchors.secondary_22.iter().enumerate() { + for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS22, k) { + jump_34_from_22[[from_index, to_index]] = + chaining_lower_bounds.jump_34_lower_bound(gap); + } + } + } + + Self { + primary, + secondary_11, + secondary_12, + secondary_21, + secondary_22, + jump_12_to_11, + jump_12_to_12, + jump_12_to_21, + jump_12_to_22, + jump_34_from_11, + jump_34_from_12, + jump_34_from_21, + jump_34_from_22, + } + } +} From 5b1157864dd48225553f351933b49cb0dbfba306 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Thu, 4 Dec 2025 14:58:12 +0200 Subject: [PATCH 18/31] Compute initial chain. 
--- lib_ts_chainalign/src/anchors.rs | 76 +++++- lib_ts_chainalign/src/chain_align.rs | 76 ++++++ lib_ts_chainalign/src/chain_align/chainer.rs | 219 ++++++++++++++++++ .../src/chaining_cost_function.rs | 107 ++++++++- lib_ts_chainalign/src/lib.rs | 24 +- tsalign/src/align/a_star_chain_ts.rs | 2 + 6 files changed, 481 insertions(+), 23 deletions(-) create mode 100644 lib_ts_chainalign/src/chain_align.rs create mode 100644 lib_ts_chainalign/src/chain_align/chainer.rs diff --git a/lib_ts_chainalign/src/anchors.rs b/lib_ts_chainalign/src/anchors.rs index 128a8db..c1db19e 100644 --- a/lib_ts_chainalign/src/anchors.rs +++ b/lib_ts_chainalign/src/anchors.rs @@ -208,17 +208,19 @@ impl PrimaryAnchor { pub fn chaining_gaps(&self, second: &Self, k: usize) -> Option<(usize, usize)> { let gap_start = self.end(k); let gap_end = second.start(); + primary_chaining_gaps(gap_start, gap_end) + } - let gap1 = gap_end - .primary_ordinate_a() - .unwrap() - .checked_sub(gap_start.primary_ordinate_a().unwrap())?; - let gap2 = gap_end - .primary_ordinate_b() - .unwrap() - .checked_sub(gap_start.primary_ordinate_b().unwrap())?; + pub fn chaining_gaps_from_start(&self, start: AlignmentCoordinates) -> (usize, usize) { + let gap_end = self.start(); + primary_chaining_gaps(start, gap_end) + .unwrap_or_else(|| panic!("self: {self}, start: {start}")) + } - Some((gap1, gap2)) + pub fn chaining_gaps_to_end(&self, end: AlignmentCoordinates, k: usize) -> (usize, usize) { + let gap_start = self.end(k); + primary_chaining_gaps(gap_start, end) + .unwrap_or_else(|| panic!("self: {self}, end: {end}, k: {k}")) } pub fn chaining_jump_gap( @@ -238,6 +240,26 @@ impl PrimaryAnchor { gap_end.checked_sub(gap_start) } + + pub fn is_direct_predecessor_of(&self, successor: &Self) -> bool { + self.seq1 + 1 == successor.seq1 && self.seq2 + 1 == successor.seq2 + } +} + +fn primary_chaining_gaps( + gap_start: AlignmentCoordinates, + gap_end: AlignmentCoordinates, +) -> Option<(usize, usize)> { + let gap1 = gap_end + 
.primary_ordinate_a() + .unwrap() + .checked_sub(gap_start.primary_ordinate_a().unwrap())?; + let gap2 = gap_end + .primary_ordinate_b() + .unwrap() + .checked_sub(gap_start.primary_ordinate_b().unwrap())?; + + Some((gap1, gap2)) } impl SecondaryAnchor { @@ -302,6 +324,42 @@ impl SecondaryAnchor { gap_end.checked_sub(gap_start) } + + pub fn chaining_jump_gap_from_start( + &self, + start: AlignmentCoordinates, + ts_kind: TsKind, + ) -> usize { + let gap_start = match ts_kind.descendant { + TsDescendant::Seq1 => start.primary_ordinate_a().unwrap(), + TsDescendant::Seq2 => start.primary_ordinate_b().unwrap(), + }; + let gap_end = self.start(ts_kind).secondary_ordinate_descendant().unwrap(); + + gap_end.checked_sub(gap_start).unwrap() + } + + pub fn chaining_jump_gap_to_end( + &self, + end: AlignmentCoordinates, + ts_kind: TsKind, + k: usize, + ) -> usize { + let gap_start = self + .end(ts_kind, k) + .secondary_ordinate_descendant() + .unwrap(); + let gap_end = match ts_kind.descendant { + TsDescendant::Seq1 => end.primary_ordinate_a().unwrap(), + TsDescendant::Seq2 => end.primary_ordinate_b().unwrap(), + }; + + gap_end.checked_sub(gap_start).unwrap() + } + + pub fn is_direct_predecessor_of(&self, successor: &Self) -> bool { + self.ancestor - 1 == successor.ancestor && self.descendant + 1 == successor.descendant + } } impl Display for Anchors { diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs new file mode 100644 index 0000000..a3d67e4 --- /dev/null +++ b/lib_ts_chainalign/src/chain_align.rs @@ -0,0 +1,76 @@ +use generic_a_star::{AStar, AStarResult, cost::AStarCost}; +use log::debug; +use std::fmt::Write; + +use crate::{ + alignment::{coordinates::AlignmentCoordinates, sequences::AlignmentSequences}, + anchors::Anchors, + chain_align::chainer::{Context, Identifier}, + chaining_cost_function::ChainingCostFunction, +}; + +mod chainer; + +pub fn align( + sequences: &AlignmentSequences, + start: AlignmentCoordinates, + end: 
AlignmentCoordinates, + anchors: &Anchors, + chaining_cost_function: &mut ChainingCostFunction, +) { + let context = Context::new(anchors, chaining_cost_function); + let mut astar = AStar::new(context); + astar.initialise(); + + loop { + let chain = match astar.search() { + AStarResult::FoundTarget { cost, .. } => { + debug!("Found chain with cost {cost}"); + let mut chain = astar.reconstruct_path(); + chain.push(Identifier::End); + astar.reset(); + debug!("Chain (len: {}):\n{}", chain.len(), { + let mut s = String::new(); + let mut once = true; + for identifier in &chain { + if once { + once = false; + } else { + writeln!(s).unwrap(); + } + match identifier { + Identifier::Start => write!(s, "start").unwrap(), + Identifier::Primary { index } => { + write!(s, "P{}", anchors.primary[*index]).unwrap() + } + Identifier::End => write!(s, "end").unwrap(), + } + } + s + }); + chain + } + AStarResult::ExceededCostLimit { .. } => unreachable!("Cost limit is None"), + AStarResult::ExceededMemoryLimit { .. 
} => unreachable!("Memory limit is None"), + AStarResult::NoTarget => panic!("No chain found"), + }; + + for window in chain.windows(2) { + let from_anchor = window[0]; + let to_anchor = window[1]; + + match (from_anchor, to_anchor) { + (Identifier::Start, Identifier::End) => todo!(), + (Identifier::Start, Identifier::Primary { index }) => todo!(), + (Identifier::Primary { index }, Identifier::End) => todo!(), + ( + Identifier::Primary { index: from_index }, + Identifier::Primary { index: to_index }, + ) => todo!(), + (Identifier::End, _) | (_, Identifier::Start) => unreachable!(), + } + } + + todo!() + } +} diff --git a/lib_ts_chainalign/src/chain_align/chainer.rs b/lib_ts_chainalign/src/chain_align/chainer.rs new file mode 100644 index 0000000..547f503 --- /dev/null +++ b/lib_ts_chainalign/src/chain_align/chainer.rs @@ -0,0 +1,219 @@ +use std::{fmt::Display, iter}; + +use generic_a_star::{AStarContext, AStarNode, cost::AStarCost, reset::Reset}; + +use crate::{anchors::Anchors, chaining_cost_function::ChainingCostFunction}; + +const DEBUG_CHAINER: bool = false; + +pub struct Context<'anchors, 'chaining_cost_function, Cost> { + anchors: &'anchors Anchors, + chaining_cost_function: &'chaining_cost_function mut ChainingCostFunction, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct Node { + identifier: Identifier, + predecessor: Option, + cost: Cost, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub enum Identifier { + Start, + Primary { index: usize }, + End, +} + +impl<'anchors, 'chaining_cost_function, Cost> Context<'anchors, 'chaining_cost_function, Cost> { + pub fn new( + anchors: &'anchors Anchors, + chaining_cost_function: &'chaining_cost_function mut ChainingCostFunction, + ) -> Self { + Self { + anchors, + chaining_cost_function, + } + } +} + +impl AStarContext for Context<'_, '_, Cost> { + type Node = Node; + + fn create_root(&self) -> Self::Node { + Node { + identifier: Identifier::Start, + predecessor: None, + 
cost: Cost::zero(), + } + } + + fn generate_successors(&mut self, node: &Self::Node, output: &mut impl Extend) { + let predecessor = Some(node.identifier); + let predecessor_cost = node.cost; + + if DEBUG_CHAINER { + println!("Generating successors of {node}"); + } + + match node.identifier { + Identifier::Start => { + output.extend( + (0..self.anchors.primary.len()) + .flat_map(|successor_index| { + if DEBUG_CHAINER { + println!( + "Checking anchor P-{successor_index}: {}", + self.anchors.primary[successor_index] + ); + } + + let cost = predecessor_cost.checked_add( + &self + .chaining_cost_function + .primary_from_start(successor_index), + )?; + if DEBUG_CHAINER { + println!("Cost: {cost}"); + } + + (cost != Cost::max_value()).then_some(Node { + identifier: Identifier::Primary { + index: successor_index, + }, + predecessor, + cost, + }) + }) + .chain(iter::once({ + let cost = predecessor_cost + .checked_add(&self.chaining_cost_function.start_to_end()) + .unwrap(); + debug_assert_ne!(cost, Cost::max_value()); + Node { + identifier: Identifier::End, + predecessor, + cost, + } + })), + ); + } + Identifier::Primary { index } => output.extend( + (0..self.anchors.primary.len()) + .flat_map(|successor_index| { + if DEBUG_CHAINER { + println!( + "Checking anchor P-{successor_index}: {}", + self.anchors.primary[successor_index] + ); + } + + let cost = predecessor_cost.checked_add( + &self.chaining_cost_function.primary(index, successor_index), + )?; + if DEBUG_CHAINER { + println!("Cost: {cost}"); + } + + (cost != Cost::max_value()).then_some(Node { + identifier: Identifier::Primary { + index: successor_index, + }, + predecessor, + cost, + }) + }) + .chain(iter::once({ + let cost = predecessor_cost + .checked_add(&self.chaining_cost_function.primary_to_end(index)) + .unwrap(); + debug_assert_ne!(cost, Cost::max_value()); + Node { + identifier: Identifier::End, + predecessor, + cost, + } + })), + ), + Identifier::End => { /* Has no successors */ } + } + } + + fn 
is_target(&self, node: &Self::Node) -> bool { + node.identifier == Identifier::End + } + + fn cost_limit(&self) -> Option<::Cost> { + None + } + + fn memory_limit(&self) -> Option { + None + } +} + +impl Reset for Context<'_, '_, Cost> { + fn reset(&mut self) { + // Nothing to do. + } +} + +impl AStarNode for Node { + type Identifier = Identifier; + + type EdgeType = Identifier; + + type Cost = Cost; + + fn identifier(&self) -> &Self::Identifier { + &self.identifier + } + + fn cost(&self) -> Self::Cost { + self.cost + } + + fn a_star_lower_bound(&self) -> Self::Cost { + Cost::zero() + } + + fn secondary_maximisable_score(&self) -> usize { + 0 + } + + fn predecessor(&self) -> Option<&Self::Identifier> { + self.predecessor.as_ref() + } + + fn predecessor_edge_type(&self) -> Option { + self.predecessor + } +} + +impl Ord for Node { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.cost.cmp(&other.cost) + } +} + +impl PartialOrd for Node { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}: {}", self.identifier, self.cost) + } +} + +impl Display for Identifier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Identifier::Start => write!(f, "start"), + Identifier::Primary { index } => write!(f, "P-{index}"), + Identifier::End => write!(f, "end"), + } + } +} diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs index bdb969c..6f6d87e 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -2,7 +2,9 @@ use generic_a_star::cost::AStarCost; use ndarray::{Array, Array2}; use crate::{ - alignment::ts_kind::TsKind, anchors::Anchors, chaining_lower_bounds::ChainingLowerBounds, + alignment::{coordinates::AlignmentCoordinates, ts_kind::TsKind}, + anchors::Anchors, + 
chaining_lower_bounds::ChainingLowerBounds, }; pub struct ChainingCostFunction { @@ -25,19 +27,36 @@ impl ChainingCostFunction { pub fn new_from_lower_bounds( chaining_lower_bounds: &ChainingLowerBounds, anchors: &Anchors, + start: AlignmentCoordinates, + end: AlignmentCoordinates, ) -> Self { let k = usize::try_from(chaining_lower_bounds.max_match_run() + 1).unwrap(); let mut primary = Array2::from_elem( - (anchors.primary.len(), anchors.primary.len()), + (anchors.primary.len() + 2, anchors.primary.len() + 2), Cost::max_value(), ); + let gap1 = end.primary_ordinate_a().unwrap() - start.primary_ordinate_a().unwrap(); + let gap2 = end.primary_ordinate_b().unwrap() - start.primary_ordinate_b().unwrap(); + primary[[0, anchors.primary.len() + 1]] = + chaining_lower_bounds.primary_lower_bound(gap1, gap2); for (from_index, from_anchor) in anchors.primary.iter().enumerate() { + let from_index = from_index + 1; + let (gap1, gap2) = from_anchor.chaining_gaps_from_start(start); + primary[[0, from_index]] = chaining_lower_bounds.primary_lower_bound(gap1, gap2); + let (gap1, gap2) = from_anchor.chaining_gaps_to_end(end, k); + primary[[from_index, anchors.primary.len() + 1]] = + chaining_lower_bounds.primary_lower_bound(gap1, gap2); + for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + let to_index = to_index + 1; if let Some((gap1, gap2)) = from_anchor.chaining_gaps(to_anchor, k) { primary[[from_index, to_index]] = chaining_lower_bounds.primary_lower_bound(gap1, gap2); } + if from_anchor.is_direct_predecessor_of(to_anchor) { + primary[[from_index, to_index]] = Cost::zero(); + } } } @@ -51,6 +70,9 @@ impl ChainingCostFunction { secondary_11[[from_index, to_index]] = chaining_lower_bounds.secondary_lower_bound(gap1, gap2); } + if from_anchor.is_direct_predecessor_of(to_anchor) { + secondary_11[[from_index, to_index]] = Cost::zero(); + } } } @@ -64,6 +86,9 @@ impl ChainingCostFunction { secondary_12[[from_index, to_index]] = 
chaining_lower_bounds.secondary_lower_bound(gap1, gap2); } + if from_anchor.is_direct_predecessor_of(to_anchor) { + secondary_12[[from_index, to_index]] = Cost::zero(); + } } } @@ -77,6 +102,9 @@ impl ChainingCostFunction { secondary_21[[from_index, to_index]] = chaining_lower_bounds.secondary_lower_bound(gap1, gap2); } + if from_anchor.is_direct_predecessor_of(to_anchor) { + secondary_21[[from_index, to_index]] = Cost::zero(); + } } } @@ -90,14 +118,18 @@ impl ChainingCostFunction { secondary_22[[from_index, to_index]] = chaining_lower_bounds.secondary_lower_bound(gap1, gap2); } + if from_anchor.is_direct_predecessor_of(to_anchor) { + secondary_22[[from_index, to_index]] = Cost::zero(); + } } } let mut jump_12_to_11 = Array::from_elem( - (anchors.primary.len(), anchors.secondary_11.len()), + (anchors.primary.len() + 2, anchors.secondary_11.len()), Cost::max_value(), ); for (from_index, from_anchor) in anchors.primary.iter().enumerate() { + let from_index = from_index + 1; for (to_index, to_anchor) in anchors.secondary_11.iter().enumerate() { if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS11, k) { jump_12_to_11[[from_index, to_index]] = @@ -107,10 +139,11 @@ impl ChainingCostFunction { } let mut jump_12_to_12 = Array::from_elem( - (anchors.primary.len(), anchors.secondary_12.len()), + (anchors.primary.len() + 2, anchors.secondary_12.len()), Cost::max_value(), ); for (from_index, from_anchor) in anchors.primary.iter().enumerate() { + let from_index = from_index + 1; for (to_index, to_anchor) in anchors.secondary_12.iter().enumerate() { if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS12, k) { jump_12_to_12[[from_index, to_index]] = @@ -120,10 +153,11 @@ impl ChainingCostFunction { } let mut jump_12_to_21 = Array::from_elem( - (anchors.primary.len(), anchors.secondary_21.len()), + (anchors.primary.len() + 2, anchors.secondary_21.len()), Cost::max_value(), ); for (from_index, from_anchor) in 
anchors.primary.iter().enumerate() { + let from_index = from_index + 1; for (to_index, to_anchor) in anchors.secondary_21.iter().enumerate() { if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS21, k) { jump_12_to_21[[from_index, to_index]] = @@ -133,10 +167,11 @@ impl ChainingCostFunction { } let mut jump_12_to_22 = Array::from_elem( - (anchors.primary.len(), anchors.secondary_22.len()), + (anchors.primary.len() + 2, anchors.secondary_22.len()), Cost::max_value(), ); for (from_index, from_anchor) in anchors.primary.iter().enumerate() { + let from_index = from_index + 1; for (to_index, to_anchor) in anchors.secondary_22.iter().enumerate() { if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS22, k) { jump_12_to_22[[from_index, to_index]] = @@ -146,11 +181,12 @@ impl ChainingCostFunction { } let mut jump_34_from_11 = Array::from_elem( - (anchors.secondary_11.len(), anchors.primary.len()), + (anchors.secondary_11.len(), anchors.primary.len() + 2), Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_11.iter().enumerate() { for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + let to_index = to_index + 1; if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS11, k) { jump_34_from_11[[from_index, to_index]] = chaining_lower_bounds.jump_34_lower_bound(gap); @@ -159,11 +195,12 @@ impl ChainingCostFunction { } let mut jump_34_from_12 = Array::from_elem( - (anchors.secondary_12.len(), anchors.primary.len()), + (anchors.secondary_12.len(), anchors.primary.len() + 2), Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_12.iter().enumerate() { for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + let to_index = to_index + 1; if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS12, k) { jump_34_from_12[[from_index, to_index]] = chaining_lower_bounds.jump_34_lower_bound(gap); @@ -172,11 +209,12 @@ impl ChainingCostFunction { } let mut 
jump_34_from_21 = Array::from_elem( - (anchors.secondary_21.len(), anchors.primary.len()), + (anchors.secondary_21.len(), anchors.primary.len() + 2), Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_21.iter().enumerate() { for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + let to_index = to_index + 1; if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS21, k) { jump_34_from_21[[from_index, to_index]] = chaining_lower_bounds.jump_34_lower_bound(gap); @@ -185,11 +223,12 @@ impl ChainingCostFunction { } let mut jump_34_from_22 = Array::from_elem( - (anchors.secondary_22.len(), anchors.primary.len()), + (anchors.secondary_22.len(), anchors.primary.len() + 2), Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_22.iter().enumerate() { for (to_index, to_anchor) in anchors.primary.iter().enumerate() { + let to_index = to_index + 1; if let Some(gap) = from_anchor.chaining_jump_gap(to_anchor, TsKind::TS22, k) { jump_34_from_22[[from_index, to_index]] = chaining_lower_bounds.jump_34_lower_bound(gap); @@ -197,6 +236,38 @@ impl ChainingCostFunction { } } + for (to_index, to_anchor) in anchors.secondary_11.iter().enumerate() { + let gap = to_anchor.chaining_jump_gap_from_start(start, TsKind::TS11); + jump_12_to_11[[0, to_index]] = chaining_lower_bounds.jump_12_lower_bound(gap); + let gap = to_anchor.chaining_jump_gap_to_end(end, TsKind::TS11, k); + jump_34_from_11[[to_index, anchors.primary.len() + 1]] = + chaining_lower_bounds.jump_34_lower_bound(gap); + } + + for (to_index, to_anchor) in anchors.secondary_12.iter().enumerate() { + let gap = to_anchor.chaining_jump_gap_from_start(start, TsKind::TS12); + jump_12_to_12[[0, to_index]] = chaining_lower_bounds.jump_12_lower_bound(gap); + let gap = to_anchor.chaining_jump_gap_to_end(end, TsKind::TS12, k); + jump_34_from_12[[to_index, anchors.primary.len() + 1]] = + chaining_lower_bounds.jump_34_lower_bound(gap); + } + + for (to_index, to_anchor) in 
anchors.secondary_21.iter().enumerate() { + let gap = to_anchor.chaining_jump_gap_from_start(start, TsKind::TS21); + jump_12_to_21[[0, to_index]] = chaining_lower_bounds.jump_12_lower_bound(gap); + let gap = to_anchor.chaining_jump_gap_to_end(end, TsKind::TS21, k); + jump_34_from_21[[to_index, anchors.primary.len() + 1]] = + chaining_lower_bounds.jump_34_lower_bound(gap); + } + + for (to_index, to_anchor) in anchors.secondary_22.iter().enumerate() { + let gap = to_anchor.chaining_jump_gap_from_start(start, TsKind::TS22); + jump_12_to_22[[0, to_index]] = chaining_lower_bounds.jump_12_lower_bound(gap); + let gap = to_anchor.chaining_jump_gap_to_end(end, TsKind::TS22, k); + jump_34_from_22[[to_index, anchors.primary.len() + 1]] = + chaining_lower_bounds.jump_34_lower_bound(gap); + } + Self { primary, secondary_11, @@ -213,4 +284,20 @@ impl ChainingCostFunction { jump_34_from_22, } } + + pub fn primary(&self, from_primary_index: usize, to_primary_index: usize) -> Cost { + self.primary[[from_primary_index + 1, to_primary_index + 1]] + } + + pub fn primary_from_start(&self, primary_index: usize) -> Cost { + self.primary[[0, primary_index + 1]] + } + + pub fn primary_to_end(&self, primary_index: usize) -> Cost { + self.primary[[primary_index + 1, self.primary.dim().1 - 1]] + } + + pub fn start_to_end(&self) -> Cost { + self.primary[[0, self.primary.dim().1 - 1]] + } } diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index a0bc442..a894c7b 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -6,12 +6,16 @@ use lib_tsalign::a_star_aligner::{ use log::{debug, info}; use crate::{ - alignment::sequences::AlignmentSequences, anchors::Anchors, - chaining_lower_bounds::ChainingLowerBounds, costs::AlignmentCosts, + alignment::{coordinates::AlignmentCoordinates, sequences::AlignmentSequences}, + anchors::Anchors, + chaining_cost_function::ChainingCostFunction, + chaining_lower_bounds::ChainingLowerBounds, + costs::AlignmentCosts, 
}; pub mod alignment; pub mod anchors; +pub mod chain_align; pub mod chaining_cost_function; pub mod chaining_lower_bounds; pub mod costs; @@ -50,7 +54,19 @@ pub fn align( let sequences = AlignmentSequences::new(reference, query); let k = chaining_lower_bounds.max_match_run() + 1; - let anchors = Anchors::new(&sequences, range, k, rc_fn); - println!("Anchors:\n{anchors}"); + let anchors = Anchors::new(&sequences, range.clone(), k, rc_fn); + let start = AlignmentCoordinates::new_primary(range.reference_offset(), range.query_offset()); + let end = AlignmentCoordinates::new_primary(range.reference_limit(), range.query_limit()); + let mut chaining_cost_function = + ChainingCostFunction::new_from_lower_bounds(chaining_lower_bounds, &anchors, start, end); + + chain_align::align( + &sequences, + start, + end, + &anchors, + &mut chaining_cost_function, + ); + todo!() } diff --git a/tsalign/src/align/a_star_chain_ts.rs b/tsalign/src/align/a_star_chain_ts.rs index 83b372c..84cc2ac 100644 --- a/tsalign/src/align/a_star_chain_ts.rs +++ b/tsalign/src/align/a_star_chain_ts.rs @@ -52,6 +52,8 @@ pub fn align_a_star_chain_ts< // so we square that and arrive at ceil(log_2(length_sum)). usize::BITS - ((reference.len() + query.len()) - 1).leading_zeros() }); + // Decrease k a little, because we can hopefully afford a few more anchors. + let k = (k.saturating_sub(3)).max(2); debug!("Using max_n = {max_n}"); info!("Using k = {k}"); let max_match_run = k - 1; From 82e329061c44f39fd508fa6e2559897d38460995 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Thu, 4 Dec 2025 15:28:32 +0200 Subject: [PATCH 19/31] Refine chain with primary alignments. 
--- generic_a_star/src/lib.rs | 4 + lib_ts_chainalign/src/anchors.rs | 6 +- lib_ts_chainalign/src/chain_align.rs | 99 +++++++++++++++++-- lib_ts_chainalign/src/chain_align/chainer.rs | 4 +- .../src/chaining_cost_function.rs | 39 ++++++++ lib_ts_chainalign/src/lib.rs | 3 + 6 files changed, 140 insertions(+), 15 deletions(-) diff --git a/generic_a_star/src/lib.rs b/generic_a_star/src/lib.rs index 75c77e3..9f0baa1 100644 --- a/generic_a_star/src/lib.rs +++ b/generic_a_star/src/lib.rs @@ -209,6 +209,10 @@ impl AStar { &self.context } + pub fn context_mut(&mut self) -> &mut Context { + &mut self.context + } + pub fn into_context(self) -> Context { self.context } diff --git a/lib_ts_chainalign/src/anchors.rs b/lib_ts_chainalign/src/anchors.rs index c1db19e..58829bc 100644 --- a/lib_ts_chainalign/src/anchors.rs +++ b/lib_ts_chainalign/src/anchors.rs @@ -1,7 +1,7 @@ use std::fmt::Display; use lib_tsalign::a_star_aligner::alignment_geometry::AlignmentRange; -use log::debug; +use log::trace; use crate::{ alignment::{ @@ -102,8 +102,8 @@ impl Anchors { s2_rc_kmers.sort(); let s2_rc_kmers = s2_rc_kmers; - debug!("s1_kmers: {s1_kmers:?}"); - debug!("s2_kmers: {s2_kmers:?}"); + trace!("s1_kmers: {s1_kmers:?}"); + trace!("s2_kmers: {s2_kmers:?}"); // Compute anchors. 
let mut primary: Vec<_> = find_kmer_matches(&s1_kmers, &s2_kmers) diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index a3d67e4..e72e240 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -1,5 +1,5 @@ use generic_a_star::{AStar, AStarResult, cost::AStarCost}; -use log::debug; +use log::{debug, trace}; use std::fmt::Write; use crate::{ @@ -7,29 +7,36 @@ use crate::{ anchors::Anchors, chain_align::chainer::{Context, Identifier}, chaining_cost_function::ChainingCostFunction, + costs::AlignmentCosts, + exact_chaining::gap_affine::GapAffineAlignment, }; mod chainer; +#[expect(clippy::too_many_arguments)] pub fn align( sequences: &AlignmentSequences, start: AlignmentCoordinates, end: AlignmentCoordinates, + alignment_costs: &AlignmentCosts, + rc_fn: &dyn Fn(u8) -> u8, + max_match_run: u32, anchors: &Anchors, chaining_cost_function: &mut ChainingCostFunction, ) { + let k = usize::try_from(max_match_run + 1).unwrap(); let context = Context::new(anchors, chaining_cost_function); let mut astar = AStar::new(context); - astar.initialise(); loop { + astar.reset(); + astar.initialise(); let chain = match astar.search() { AStarResult::FoundTarget { cost, .. 
} => { debug!("Found chain with cost {cost}"); let mut chain = astar.reconstruct_path(); chain.push(Identifier::End); - astar.reset(); - debug!("Chain (len: {}):\n{}", chain.len(), { + trace!("Chain (len: {}):\n{}", chain.len(), { let mut s = String::new(); let mut once = true; for identifier in &chain { @@ -55,22 +62,94 @@ pub fn align( AStarResult::NoTarget => panic!("No chain found"), }; - for window in chain.windows(2) { + let mut cost_increased = false; + 'update_chain: for window in chain.windows(2) { let from_anchor = window[0]; let to_anchor = window[1]; match (from_anchor, to_anchor) { - (Identifier::Start, Identifier::End) => todo!(), - (Identifier::Start, Identifier::Primary { index }) => todo!(), - (Identifier::Primary { index }, Identifier::End) => todo!(), + (Identifier::Start, Identifier::End) => { + let alignment = GapAffineAlignment::new( + start, + end, + sequences, + &alignment_costs.primary_costs, + rc_fn, + max_match_run, + ); + cost_increased = cost_increased + || astar + .context_mut() + .chaining_cost_function + .update_start_to_end(alignment.cost()); + } + (Identifier::Start, Identifier::Primary { index }) => { + let end = anchors.primary[index].start(); + let alignment = GapAffineAlignment::new( + start, + end, + sequences, + &alignment_costs.primary_costs, + rc_fn, + max_match_run, + ); + cost_increased = cost_increased + || astar + .context_mut() + .chaining_cost_function + .update_primary_from_start(index, alignment.cost()); + } + (Identifier::Primary { index }, Identifier::End) => { + let start = anchors.primary[index].end(k); + let alignment = GapAffineAlignment::new( + start, + end, + sequences, + &alignment_costs.primary_costs, + rc_fn, + max_match_run, + ); + cost_increased = cost_increased + || astar + .context_mut() + .chaining_cost_function + .update_primary_to_end(index, alignment.cost()); + } ( Identifier::Primary { index: from_index }, Identifier::Primary { index: to_index }, - ) => todo!(), + ) => { + if 
anchors.primary[from_index] + .is_direct_predecessor_of(&anchors.primary[to_index]) + { + continue 'update_chain; + } + + let start = anchors.primary[from_index].end(k); + let end = anchors.primary[to_index].start(); + let alignment = GapAffineAlignment::new( + start, + end, + sequences, + &alignment_costs.primary_costs, + rc_fn, + max_match_run, + ); + cost_increased = cost_increased + || astar.context_mut().chaining_cost_function.update_primary( + from_index, + to_index, + alignment.cost(), + ); + } (Identifier::End, _) | (_, Identifier::Start) => unreachable!(), } } - todo!() + if !cost_increased { + break; + } } + + todo!("alignment found") } diff --git a/lib_ts_chainalign/src/chain_align/chainer.rs b/lib_ts_chainalign/src/chain_align/chainer.rs index 547f503..83c074e 100644 --- a/lib_ts_chainalign/src/chain_align/chainer.rs +++ b/lib_ts_chainalign/src/chain_align/chainer.rs @@ -7,8 +7,8 @@ use crate::{anchors::Anchors, chaining_cost_function::ChainingCostFunction}; const DEBUG_CHAINER: bool = false; pub struct Context<'anchors, 'chaining_cost_function, Cost> { - anchors: &'anchors Anchors, - chaining_cost_function: &'chaining_cost_function mut ChainingCostFunction, + pub anchors: &'anchors Anchors, + pub chaining_cost_function: &'chaining_cost_function mut ChainingCostFunction, } #[derive(Debug, Clone, Copy, Eq, PartialEq)] diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs index 6f6d87e..8ee5be4 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -300,4 +300,43 @@ impl ChainingCostFunction { pub fn start_to_end(&self) -> Cost { self.primary[[0, self.primary.dim().1 - 1]] } + + pub fn update_primary( + &mut self, + from_primary_index: usize, + to_primary_index: usize, + cost: Cost, + ) -> bool { + let target = &mut self.primary[[from_primary_index + 1, to_primary_index + 1]]; + assert!(*target <= cost); + let result = *target < cost; + 
*target = cost; + result + } + + pub fn update_primary_from_start(&mut self, primary_index: usize, cost: Cost) -> bool { + let target = &mut self.primary[[0, primary_index + 1]]; + assert!(*target <= cost); + let result = *target < cost; + *target = cost; + result + } + + pub fn update_primary_to_end(&mut self, primary_index: usize, cost: Cost) -> bool { + let end_index = self.primary.dim().1 - 1; + let target = &mut self.primary[[primary_index + 1, end_index]]; + assert!(*target <= cost); + let result = *target < cost; + *target = cost; + result + } + + pub fn update_start_to_end(&mut self, cost: Cost) -> bool { + let end_index = self.primary.dim().1 - 1; + let target = &mut self.primary[[0, end_index]]; + assert!(*target <= cost); + let result = *target < cost; + *target = cost; + result + } } diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index a894c7b..fb70f16 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -64,6 +64,9 @@ pub fn align( &sequences, start, end, + chaining_lower_bounds.alignment_costs(), + rc_fn, + chaining_lower_bounds.max_match_run(), &anchors, &mut chaining_cost_function, ); From 8f3ad66749cae2043d5ce636d48a872e4205afeb Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Fri, 5 Dec 2025 13:21:36 +0200 Subject: [PATCH 20/31] Buggy complete version of tschainalign. 
--- lib_ts_chainalign/src/alignment.rs | 1 + lib_ts_chainalign/src/alignment/sequences.rs | 26 +- lib_ts_chainalign/src/alignment/ts_kind.rs | 35 ++ lib_ts_chainalign/src/anchors.rs | 11 +- lib_ts_chainalign/src/chain_align.rs | 297 ++++++++++++++++- lib_ts_chainalign/src/chain_align/chainer.rs | 139 +++++++- .../src/chaining_cost_function.rs | 314 ++++++++++++++++-- .../src/exact_chaining/ts_12_jump.rs | 4 +- .../src/exact_chaining/ts_34_jump.rs | 6 +- lib_ts_chainalign/src/lib.rs | 20 +- .../src/a_star_aligner/alignment_result.rs | 11 +- .../alignment_result/alignment.rs | 19 +- test_files/test_chainalign.sh | 2 +- test_files/twin_10_ts.fa | 4 +- tsalign/src/align/a_star_chain_ts.rs | 2 +- 15 files changed, 833 insertions(+), 58 deletions(-) diff --git a/lib_ts_chainalign/src/alignment.rs b/lib_ts_chainalign/src/alignment.rs index 4825125..0f1a3fe 100644 --- a/lib_ts_chainalign/src/alignment.rs +++ b/lib_ts_chainalign/src/alignment.rs @@ -27,6 +27,7 @@ pub enum GapType { InB, } +#[derive(Debug, Clone)] pub struct Alignment { pub alignment: Vec<(usize, AlignmentType)>, } diff --git a/lib_ts_chainalign/src/alignment/sequences.rs b/lib_ts_chainalign/src/alignment/sequences.rs index 7082591..76e4481 100644 --- a/lib_ts_chainalign/src/alignment/sequences.rs +++ b/lib_ts_chainalign/src/alignment/sequences.rs @@ -6,11 +6,27 @@ use crate::alignment::{ pub struct AlignmentSequences { seq1: Vec, seq2: Vec, + seq1_name: String, + seq2_name: String, } impl AlignmentSequences { pub fn new(seq1: Vec, seq2: Vec) -> Self { - Self { seq1, seq2 } + Self { + seq1, + seq2, + seq1_name: "seq1".to_string(), + seq2_name: "seq2".to_string(), + } + } + + pub fn new_named(seq1: Vec, seq2: Vec, seq1_name: String, seq2_name: String) -> Self { + Self { + seq1, + seq2, + seq1_name, + seq2_name, + } } pub fn characters( @@ -52,4 +68,12 @@ impl AlignmentSequences { pub fn seq2(&self) -> &[u8] { &self.seq2 } + + pub fn seq1_name(&self) -> &str { + &self.seq1_name + } + + pub fn seq2_name(&self) 
-> &str { + &self.seq2_name + } } diff --git a/lib_ts_chainalign/src/alignment/ts_kind.rs b/lib_ts_chainalign/src/alignment/ts_kind.rs index 3e229ec..d9e987d 100644 --- a/lib_ts_chainalign/src/alignment/ts_kind.rs +++ b/lib_ts_chainalign/src/alignment/ts_kind.rs @@ -1,5 +1,9 @@ use std::fmt::Display; +use lib_tsalign::a_star_aligner::template_switch_distance::{ + TemplateSwitchPrimary, TemplateSwitchSecondary, +}; + #[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Hash)] pub struct TsKind { pub ancestor: TsAncestor, @@ -35,6 +39,37 @@ impl TsKind { ancestor: TsAncestor::Seq2, descendant: TsDescendant::Seq2, }; + + pub fn iter() -> impl Iterator { + [Self::TS11, Self::TS12, Self::TS21, Self::TS22].into_iter() + } + + pub fn digits(&self) -> &'static str { + match (self.ancestor, self.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => "11", + (TsAncestor::Seq1, TsDescendant::Seq2) => "12", + (TsAncestor::Seq2, TsDescendant::Seq1) => "21", + (TsAncestor::Seq2, TsDescendant::Seq2) => "22", + } + } +} + +impl TsAncestor { + pub fn into_tsalign_secondary(self) -> TemplateSwitchSecondary { + match self { + TsAncestor::Seq1 => TemplateSwitchSecondary::Reference, + TsAncestor::Seq2 => TemplateSwitchSecondary::Query, + } + } +} + +impl TsDescendant { + pub fn into_tsalign_primary(self) -> TemplateSwitchPrimary { + match self { + TsDescendant::Seq1 => TemplateSwitchPrimary::Reference, + TsDescendant::Seq2 => TemplateSwitchPrimary::Query, + } + } } impl Display for TsKind { diff --git a/lib_ts_chainalign/src/anchors.rs b/lib_ts_chainalign/src/anchors.rs index 58829bc..b50aa77 100644 --- a/lib_ts_chainalign/src/anchors.rs +++ b/lib_ts_chainalign/src/anchors.rs @@ -7,7 +7,7 @@ use crate::{ alignment::{ coordinates::AlignmentCoordinates, sequences::AlignmentSequences, - ts_kind::{TsDescendant, TsKind}, + ts_kind::{TsAncestor, TsDescendant, TsKind}, }, anchors::{ kmer_matches::find_kmer_matches, @@ -184,6 +184,15 @@ impl Anchors { secondary_22, } } + + pub fn 
secondary(&self, ts_kind: TsKind) -> &[SecondaryAnchor] { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => &self.secondary_11, + (TsAncestor::Seq1, TsDescendant::Seq2) => &self.secondary_12, + (TsAncestor::Seq2, TsDescendant::Seq1) => &self.secondary_21, + (TsAncestor::Seq2, TsDescendant::Seq2) => &self.secondary_22, + } + } } impl PrimaryAnchor { diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index e72e240..f27c234 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -1,20 +1,40 @@ +use compact_genome::{ + implementation::vec_sequence::VectorGenome, + interface::{ + alphabet::Alphabet, + sequence::{GenomeSequence, OwnedGenomeSequence}, + }, +}; use generic_a_star::{AStar, AStarResult, cost::AStarCost}; +use lib_tsalign::a_star_aligner::{ + alignment_result::AlignmentResult, + template_switch_distance::{EqualCostRange, TemplateSwitchDirection}, +}; use log::{debug, trace}; -use std::fmt::Write; +use std::{ + fmt::Write, + iter, + time::{Duration, Instant}, +}; use crate::{ - alignment::{coordinates::AlignmentCoordinates, sequences::AlignmentSequences}, + alignment::{ + Alignment, AlignmentType, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, + }, anchors::Anchors, chain_align::chainer::{Context, Identifier}, chaining_cost_function::ChainingCostFunction, costs::AlignmentCosts, - exact_chaining::gap_affine::GapAffineAlignment, + exact_chaining::{ + gap_affine::GapAffineAlignment, ts_12_jump::Ts12JumpAlignment, + ts_34_jump::Ts34JumpAlignment, + }, }; mod chainer; #[expect(clippy::too_many_arguments)] -pub fn align( +pub fn align( sequences: &AlignmentSequences, start: AlignmentCoordinates, end: AlignmentCoordinates, @@ -23,15 +43,22 @@ pub fn align( max_match_run: u32, anchors: &Anchors, chaining_cost_function: &mut ChainingCostFunction, -) { +) -> AlignmentResult { + let start_time = Instant::now(); + let mut 
chaining_duration = Duration::default(); + let mut evaluation_duration = Duration::default(); + let k = usize::try_from(max_match_run + 1).unwrap(); let context = Context::new(anchors, chaining_cost_function); let mut astar = AStar::new(context); - loop { + let (alignments, result) = loop { + let chaining_start_time = Instant::now(); + astar.reset(); astar.initialise(); - let chain = match astar.search() { + let result = astar.search(); + let chain = match result { AStarResult::FoundTarget { cost, .. } => { debug!("Found chain with cost {cost}"); let mut chain = astar.reconstruct_path(); @@ -50,6 +77,13 @@ pub fn align( Identifier::Primary { index } => { write!(s, "P{}", anchors.primary[*index]).unwrap() } + Identifier::Secondary { index, ts_kind } => write!( + s, + "S{}{}", + ts_kind.digits(), + anchors.secondary(*ts_kind)[*index] + ) + .unwrap(), Identifier::End => write!(s, "end").unwrap(), } } @@ -62,7 +96,13 @@ pub fn align( AStarResult::NoTarget => panic!("No chain found"), }; + let chaining_end_time = Instant::now(); + chaining_duration += chaining_end_time - chaining_start_time; + + let evaluation_start_time = Instant::now(); + let mut cost_increased = false; + let mut alignments = Vec::new(); 'update_chain: for window in chain.windows(2) { let from_anchor = window[0]; let to_anchor = window[1]; @@ -82,6 +122,7 @@ pub fn align( .context_mut() .chaining_cost_function .update_start_to_end(alignment.cost()); + alignments.push(alignment.alignment().clone()); } (Identifier::Start, Identifier::Primary { index }) => { let end = anchors.primary[index].start(); @@ -98,6 +139,24 @@ pub fn align( .context_mut() .chaining_cost_function .update_primary_from_start(index, alignment.cost()); + alignments.push(alignment.alignment().clone()); + } + (Identifier::Start, Identifier::Secondary { index, ts_kind }) => { + let end = anchors.secondary(ts_kind)[index].start(ts_kind); + let alignment = Ts12JumpAlignment::new( + start, + end, + sequences, + alignment_costs, + rc_fn, + 
max_match_run, + ); + cost_increased = cost_increased + || astar + .context_mut() + .chaining_cost_function + .update_jump_12_from_start(index, ts_kind, alignment.cost()); + alignments.push(alignment.alignment().clone()); } (Identifier::Primary { index }, Identifier::End) => { let start = anchors.primary[index].end(k); @@ -114,6 +173,26 @@ pub fn align( .context_mut() .chaining_cost_function .update_primary_to_end(index, alignment.cost()); + alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); + alignments.push(alignment.alignment().clone()); + } + (Identifier::Secondary { index, ts_kind }, Identifier::End) => { + let start = anchors.secondary(ts_kind)[index].end(ts_kind, k); + let alignment = Ts34JumpAlignment::new( + start, + end, + sequences, + alignment_costs, + rc_fn, + max_match_run, + ); + cost_increased = cost_increased + || astar + .context_mut() + .chaining_cost_function + .update_jump_34_to_end(index, ts_kind, alignment.cost()); + alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); + alignments.push(alignment.alignment().clone()); } ( Identifier::Primary { index: from_index }, @@ -122,6 +201,7 @@ pub fn align( if anchors.primary[from_index] .is_direct_predecessor_of(&anchors.primary[to_index]) { + alignments.push(Alignment::from(vec![AlignmentType::Match])); continue 'update_chain; } @@ -141,15 +221,214 @@ pub fn align( to_index, alignment.cost(), ); + alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); + alignments.push(alignment.alignment().clone()); + } + ( + Identifier::Primary { index: from_index }, + Identifier::Secondary { + index: to_index, + ts_kind, + }, + ) => { + let start = anchors.primary[from_index].end(k); + let end = anchors.secondary(ts_kind)[to_index].start(ts_kind); + let alignment = Ts12JumpAlignment::new( + start, + end, + sequences, + alignment_costs, + rc_fn, + max_match_run, + ); + cost_increased = cost_increased + || astar.context_mut().chaining_cost_function.update_jump_12( + 
from_index, + to_index, + ts_kind, + alignment.cost(), + ); + alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); + alignments.push(alignment.alignment().clone()); + } + ( + Identifier::Secondary { + index: from_index, + ts_kind, + }, + Identifier::Secondary { + index: to_index, + ts_kind: to_ts_kind, + }, + ) => { + assert_eq!(ts_kind, to_ts_kind); + if anchors.secondary(ts_kind)[from_index] + .is_direct_predecessor_of(&anchors.secondary(ts_kind)[to_index]) + { + alignments.push(Alignment::from(vec![AlignmentType::Match])); + continue 'update_chain; + } + let start = anchors.secondary(ts_kind)[from_index].end(ts_kind, k); + let end = anchors.secondary(ts_kind)[to_index].start(ts_kind); + let alignment = GapAffineAlignment::new( + start, + end, + sequences, + &alignment_costs.secondary_costs, + rc_fn, + max_match_run, + ); + cost_increased = cost_increased + || astar.context_mut().chaining_cost_function.update_secondary( + from_index, + to_index, + ts_kind, + alignment.cost(), + ); + alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); + alignments.push(alignment.alignment().clone()); + } + ( + Identifier::Secondary { + index: from_index, + ts_kind, + }, + Identifier::Primary { index: to_index }, + ) => { + let start = anchors.secondary(ts_kind)[from_index].end(ts_kind, k); + let end = anchors.primary[to_index].start(); + let alignment = Ts34JumpAlignment::new( + start, + end, + sequences, + alignment_costs, + rc_fn, + max_match_run, + ); + cost_increased = cost_increased + || astar.context_mut().chaining_cost_function.update_jump_34( + from_index, + to_index, + ts_kind, + alignment.cost(), + ); + alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); + alignments.push(alignment.alignment().clone()); } (Identifier::End, _) | (_, Identifier::Start) => unreachable!(), } } + let evaluation_end_time = Instant::now(); + evaluation_duration += evaluation_end_time - evaluation_start_time; + if !cost_increased { - break; + break 
(alignments, result); + } + }; + + debug!("Chaining took {:.1}s", chaining_duration.as_secs_f64()); + debug!("Evaluation took {:.1}s", evaluation_duration.as_secs_f64()); + + let mut tsalign_alignment = + lib_tsalign::a_star_aligner::alignment_result::alignment::Alignment::new(); + let mut is_primary = true; + let mut anti_primary_gap = 0; + + for alignment in alignments { + use lib_tsalign::a_star_aligner::template_switch_distance::AlignmentType as TsAlignAlignmentType; + + for (multiplicity, alignment_type) in alignment.alignment { + match alignment_type { + AlignmentType::Match => tsalign_alignment.push_n( + multiplicity, + if is_primary { + TsAlignAlignmentType::PrimaryMatch + } else { + anti_primary_gap -= 1; + TsAlignAlignmentType::SecondaryMatch + }, + ), + AlignmentType::Substitution => tsalign_alignment.push_n( + multiplicity, + if is_primary { + TsAlignAlignmentType::PrimarySubstitution + } else { + anti_primary_gap -= 1; + TsAlignAlignmentType::SecondarySubstitution + }, + ), + AlignmentType::GapA => tsalign_alignment.push_n( + multiplicity, + if is_primary { + TsAlignAlignmentType::PrimaryInsertion + } else { + TsAlignAlignmentType::SecondaryInsertion + }, + ), + AlignmentType::GapB => tsalign_alignment.push_n( + multiplicity, + if is_primary { + TsAlignAlignmentType::PrimaryDeletion + } else { + anti_primary_gap -= 1; + TsAlignAlignmentType::SecondaryDeletion + }, + ), + AlignmentType::TsStart { jump, ts_kind } => { + assert!(is_primary); + assert_eq!(multiplicity, 1); + is_primary = false; + anti_primary_gap = jump; + + tsalign_alignment.push_n( + multiplicity, + TsAlignAlignmentType::TemplateSwitchEntrance { + first_offset: jump, + equal_cost_range: EqualCostRange::new_invalid(), + primary: ts_kind.descendant.into_tsalign_primary(), + secondary: ts_kind.ancestor.into_tsalign_secondary(), + direction: TemplateSwitchDirection::Reverse, + }, + ); + } + AlignmentType::TsEnd { jump } => { + assert!(!is_primary); + assert_eq!(multiplicity, 1); + is_primary 
= true; + + tsalign_alignment.push_n( + multiplicity, + TsAlignAlignmentType::TemplateSwitchExit { + anti_primary_gap: anti_primary_gap + jump, + }, + ) + } + } } } - todo!("alignment found") + let end_time = Instant::now(); + let duration_seconds = (end_time - start_time).as_secs_f64(); + + AlignmentResult::new_with_target::( + tsalign_alignment.into_inner(), + VectorGenome::from_slice_u8(sequences.seq1()) + .unwrap() + .as_genome_subsequence(), + VectorGenome::from_slice_u8(sequences.seq2()) + .unwrap() + .as_genome_subsequence(), + sequences.seq1_name(), + sequences.seq2_name(), + start.primary_ordinate_a().unwrap(), + start.primary_ordinate_b().unwrap(), + result.without_node_identifier(), + duration_seconds, + 0, + 0, + 0, + sequences.seq1().len(), + sequences.seq2().len(), + ) } diff --git a/lib_ts_chainalign/src/chain_align/chainer.rs b/lib_ts_chainalign/src/chain_align/chainer.rs index 83c074e..f30c94c 100644 --- a/lib_ts_chainalign/src/chain_align/chainer.rs +++ b/lib_ts_chainalign/src/chain_align/chainer.rs @@ -2,7 +2,9 @@ use std::{fmt::Display, iter}; use generic_a_star::{AStarContext, AStarNode, cost::AStarCost, reset::Reset}; -use crate::{anchors::Anchors, chaining_cost_function::ChainingCostFunction}; +use crate::{ + alignment::ts_kind::TsKind, anchors::Anchors, chaining_cost_function::ChainingCostFunction, +}; const DEBUG_CHAINER: bool = false; @@ -22,6 +24,7 @@ pub struct Node { pub enum Identifier { Start, Primary { index: usize }, + Secondary { index: usize, ts_kind: TsKind }, End, } @@ -85,6 +88,37 @@ impl AStarContext for Context<'_, '_, Cost> { cost, }) }) + .chain(TsKind::iter().flat_map(|ts_kind| { + (0..self.anchors.secondary(ts_kind).len()) + .zip(iter::repeat(&self)) + .flat_map(move |(successor_index, context)| { + if DEBUG_CHAINER { + println!( + "Checking anchor S{}-{successor_index}: {}", + ts_kind.digits(), + context.anchors.secondary(ts_kind)[successor_index] + ); + } + + let cost = predecessor_cost.checked_add( + &context + 
.chaining_cost_function + .jump_12_from_start(successor_index, ts_kind), + )?; + if DEBUG_CHAINER { + println!("Cost: {cost}"); + } + + (cost != Cost::max_value()).then_some(Node { + identifier: Identifier::Secondary { + index: successor_index, + ts_kind, + }, + predecessor, + cost, + }) + }) + })) .chain(iter::once({ let cost = predecessor_cost .checked_add(&self.chaining_cost_function.start_to_end()) @@ -123,6 +157,39 @@ impl AStarContext for Context<'_, '_, Cost> { cost, }) }) + .chain(TsKind::iter().flat_map(|ts_kind| { + (0..self.anchors.secondary(ts_kind).len()) + .zip(iter::repeat(&self)) + .flat_map(move |(successor_index, context)| { + if DEBUG_CHAINER { + println!( + "Checking anchor S{}-{successor_index}: {}", + ts_kind.digits(), + context.anchors.secondary(ts_kind)[successor_index] + ); + } + + let cost = predecessor_cost.checked_add( + &context.chaining_cost_function.jump_12( + index, + successor_index, + ts_kind, + ), + )?; + if DEBUG_CHAINER { + println!("Cost: {cost}"); + } + + (cost != Cost::max_value()).then_some(Node { + identifier: Identifier::Secondary { + index: successor_index, + ts_kind, + }, + predecessor, + cost, + }) + }) + })) .chain(iter::once({ let cost = predecessor_cost .checked_add(&self.chaining_cost_function.primary_to_end(index)) @@ -135,6 +202,75 @@ impl AStarContext for Context<'_, '_, Cost> { } })), ), + Identifier::Secondary { index, ts_kind } => output.extend( + (0..self.anchors.secondary(ts_kind).len()) + .flat_map(|successor_index| { + if DEBUG_CHAINER { + println!( + "Checking anchor S{}-{successor_index}: {}", + ts_kind.digits(), + self.anchors.secondary(ts_kind)[successor_index] + ); + } + + let cost = predecessor_cost.checked_add( + &self + .chaining_cost_function + .secondary(index, successor_index, ts_kind), + )?; + if DEBUG_CHAINER { + println!("Cost: {cost}"); + } + + (cost != Cost::max_value()).then_some(Node { + identifier: Identifier::Secondary { + index: successor_index, + ts_kind, + }, + predecessor, + cost, + 
}) + }) + .chain((0..self.anchors.primary.len()).flat_map(|successor_index| { + if DEBUG_CHAINER { + println!( + "Checking anchor P-{successor_index}: {}", + self.anchors.primary[successor_index] + ); + } + + let cost = predecessor_cost.checked_add( + &self + .chaining_cost_function + .jump_34(index, successor_index, ts_kind), + )?; + if DEBUG_CHAINER { + println!("Cost: {cost}"); + } + + (cost != Cost::max_value()).then_some(Node { + identifier: Identifier::Primary { + index: successor_index, + }, + predecessor, + cost, + }) + })) + .chain(iter::once({ + let cost = predecessor_cost + .checked_add( + &self.chaining_cost_function.jump_34_to_end(index, ts_kind), + ) + .unwrap(); + debug_assert_ne!(cost, Cost::max_value()); + Node { + identifier: Identifier::End, + predecessor, + cost, + } + })), + ), + Identifier::End => { /* Has no successors */ } } } @@ -213,6 +349,7 @@ impl Display for Identifier { match self { Identifier::Start => write!(f, "start"), Identifier::Primary { index } => write!(f, "P-{index}"), + Identifier::Secondary { index, ts_kind } => write!(f, "S{}-{index}", ts_kind.digits()), Identifier::End => write!(f, "end"), } } diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs index 8ee5be4..dfa4eed 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -2,7 +2,10 @@ use generic_a_star::cost::AStarCost; use ndarray::{Array, Array2}; use crate::{ - alignment::{coordinates::AlignmentCoordinates, ts_kind::TsKind}, + alignment::{ + coordinates::AlignmentCoordinates, + ts_kind::{TsAncestor, TsDescendant, TsKind}, + }, anchors::Anchors, chaining_lower_bounds::ChainingLowerBounds, }; @@ -236,35 +239,35 @@ impl ChainingCostFunction { } } - for (to_index, to_anchor) in anchors.secondary_11.iter().enumerate() { - let gap = to_anchor.chaining_jump_gap_from_start(start, TsKind::TS11); - jump_12_to_11[[0, to_index]] = 
chaining_lower_bounds.jump_12_lower_bound(gap); - let gap = to_anchor.chaining_jump_gap_to_end(end, TsKind::TS11, k); - jump_34_from_11[[to_index, anchors.primary.len() + 1]] = + for (index, anchor) in anchors.secondary_11.iter().enumerate() { + let gap = anchor.chaining_jump_gap_from_start(start, TsKind::TS11); + jump_12_to_11[[0, index]] = chaining_lower_bounds.jump_12_lower_bound(gap); + let gap = anchor.chaining_jump_gap_to_end(end, TsKind::TS11, k); + jump_34_from_11[[index, anchors.primary.len() + 1]] = chaining_lower_bounds.jump_34_lower_bound(gap); } - for (to_index, to_anchor) in anchors.secondary_12.iter().enumerate() { - let gap = to_anchor.chaining_jump_gap_from_start(start, TsKind::TS12); - jump_12_to_12[[0, to_index]] = chaining_lower_bounds.jump_12_lower_bound(gap); - let gap = to_anchor.chaining_jump_gap_to_end(end, TsKind::TS12, k); - jump_34_from_12[[to_index, anchors.primary.len() + 1]] = + for (index, anchor) in anchors.secondary_12.iter().enumerate() { + let gap = anchor.chaining_jump_gap_from_start(start, TsKind::TS12); + jump_12_to_12[[0, index]] = chaining_lower_bounds.jump_12_lower_bound(gap); + let gap = anchor.chaining_jump_gap_to_end(end, TsKind::TS12, k); + jump_34_from_12[[index, anchors.primary.len() + 1]] = chaining_lower_bounds.jump_34_lower_bound(gap); } - for (to_index, to_anchor) in anchors.secondary_21.iter().enumerate() { - let gap = to_anchor.chaining_jump_gap_from_start(start, TsKind::TS21); - jump_12_to_21[[0, to_index]] = chaining_lower_bounds.jump_12_lower_bound(gap); - let gap = to_anchor.chaining_jump_gap_to_end(end, TsKind::TS21, k); - jump_34_from_21[[to_index, anchors.primary.len() + 1]] = + for (index, anchor) in anchors.secondary_21.iter().enumerate() { + let gap = anchor.chaining_jump_gap_from_start(start, TsKind::TS21); + jump_12_to_21[[0, index]] = chaining_lower_bounds.jump_12_lower_bound(gap); + let gap = anchor.chaining_jump_gap_to_end(end, TsKind::TS21, k); + jump_34_from_21[[index, anchors.primary.len() + 
1]] = chaining_lower_bounds.jump_34_lower_bound(gap); } - for (to_index, to_anchor) in anchors.secondary_22.iter().enumerate() { - let gap = to_anchor.chaining_jump_gap_from_start(start, TsKind::TS22); - jump_12_to_22[[0, to_index]] = chaining_lower_bounds.jump_12_lower_bound(gap); - let gap = to_anchor.chaining_jump_gap_to_end(end, TsKind::TS22, k); - jump_34_from_22[[to_index, anchors.primary.len() + 1]] = + for (index, anchor) in anchors.secondary_22.iter().enumerate() { + let gap = anchor.chaining_jump_gap_from_start(start, TsKind::TS22); + jump_12_to_22[[0, index]] = chaining_lower_bounds.jump_12_lower_bound(gap); + let gap = anchor.chaining_jump_gap_to_end(end, TsKind::TS22, k); + jump_34_from_22[[index, anchors.primary.len() + 1]] = chaining_lower_bounds.jump_34_lower_bound(gap); } @@ -301,6 +304,138 @@ impl ChainingCostFunction { self.primary[[0, self.primary.dim().1 - 1]] } + pub fn jump_12_to_11(&self, from_primary_index: usize, to_secondary_11_index: usize) -> Cost { + self.jump_12_to_11[[from_primary_index + 1, to_secondary_11_index]] + } + + pub fn jump_12_to_12(&self, from_primary_index: usize, to_secondary_12_index: usize) -> Cost { + self.jump_12_to_12[[from_primary_index + 1, to_secondary_12_index]] + } + + pub fn jump_12_to_21(&self, from_primary_index: usize, to_secondary_21_index: usize) -> Cost { + self.jump_12_to_21[[from_primary_index + 1, to_secondary_21_index]] + } + + pub fn jump_12_to_22(&self, from_primary_index: usize, to_secondary_22_index: usize) -> Cost { + self.jump_12_to_22[[from_primary_index + 1, to_secondary_22_index]] + } + + pub fn jump_12( + &self, + from_primary_index: usize, + to_secondary_index: usize, + ts_kind: TsKind, + ) -> Cost { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + self.jump_12_to_11(from_primary_index, to_secondary_index) + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + self.jump_12_to_12(from_primary_index, to_secondary_index) + } + (TsAncestor::Seq2, 
TsDescendant::Seq1) => { + self.jump_12_to_21(from_primary_index, to_secondary_index) + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + self.jump_12_to_22(from_primary_index, to_secondary_index) + } + } + } + + pub fn jump_12_to_11_from_start(&self, to_secondary_11_index: usize) -> Cost { + self.jump_12_to_11[[0, to_secondary_11_index]] + } + + pub fn jump_12_to_12_from_start(&self, to_secondary_12_index: usize) -> Cost { + self.jump_12_to_12[[0, to_secondary_12_index]] + } + + pub fn jump_12_to_21_from_start(&self, to_secondary_21_index: usize) -> Cost { + self.jump_12_to_21[[0, to_secondary_21_index]] + } + + pub fn jump_12_to_22_from_start(&self, to_secondary_22_index: usize) -> Cost { + self.jump_12_to_22[[0, to_secondary_22_index]] + } + + pub fn jump_12_from_start(&self, to_secondary_index: usize, ts_kind: TsKind) -> Cost { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + self.jump_12_to_11_from_start(to_secondary_index) + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + self.jump_12_to_12_from_start(to_secondary_index) + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + self.jump_12_to_21_from_start(to_secondary_index) + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + self.jump_12_to_22_from_start(to_secondary_index) + } + } + } + + pub fn secondary( + &self, + from_secondary_index: usize, + to_secondary_index: usize, + ts_kind: TsKind, + ) -> Cost { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + self.secondary_11[[from_secondary_index, to_secondary_index]] + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + self.secondary_12[[from_secondary_index, to_secondary_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + self.secondary_21[[from_secondary_index, to_secondary_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + self.secondary_22[[from_secondary_index, to_secondary_index]] + } + } + } + + pub fn jump_34( + &self, + from_secondary_index: 
usize, + to_primary_index: usize, + ts_kind: TsKind, + ) -> Cost { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + self.jump_34_from_11[[from_secondary_index, to_primary_index + 1]] + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + self.jump_34_from_12[[from_secondary_index, to_primary_index + 1]] + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + self.jump_34_from_21[[from_secondary_index, to_primary_index + 1]] + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + self.jump_34_from_22[[from_secondary_index, to_primary_index + 1]] + } + } + } + + pub fn jump_34_to_end(&self, from_secondary_index: usize, ts_kind: TsKind) -> Cost { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + self.jump_34_from_11[[from_secondary_index, self.jump_34_from_11.dim().1 - 1]] + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + self.jump_34_from_12[[from_secondary_index, self.jump_34_from_12.dim().1 - 1]] + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + self.jump_34_from_21[[from_secondary_index, self.jump_34_from_21.dim().1 - 1]] + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + self.jump_34_from_22[[from_secondary_index, self.jump_34_from_22.dim().1 - 1]] + } + } + } + pub fn update_primary( &mut self, from_primary_index: usize, @@ -339,4 +474,141 @@ impl ChainingCostFunction { *target = cost; result } + + pub fn update_jump_12( + &mut self, + from_primary_index: usize, + to_secondary_index: usize, + ts_kind: TsKind, + cost: Cost, + ) -> bool { + let target = match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + &mut self.jump_12_to_11[[from_primary_index + 1, to_secondary_index]] + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + &mut self.jump_12_to_12[[from_primary_index + 1, to_secondary_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + &mut self.jump_12_to_21[[from_primary_index + 1, to_secondary_index]] + } + 
(TsAncestor::Seq2, TsDescendant::Seq2) => { + &mut self.jump_12_to_22[[from_primary_index + 1, to_secondary_index]] + } + }; + assert!(*target <= cost); + let result = *target < cost; + *target = cost; + result + } + + pub fn update_jump_12_from_start( + &mut self, + to_secondary_index: usize, + ts_kind: TsKind, + cost: Cost, + ) -> bool { + let target = match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + &mut self.jump_12_to_11[[0, to_secondary_index]] + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + &mut self.jump_12_to_12[[0, to_secondary_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + &mut self.jump_12_to_21[[0, to_secondary_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + &mut self.jump_12_to_22[[0, to_secondary_index]] + } + }; + assert!(*target <= cost); + let result = *target < cost; + *target = cost; + result + } + + pub fn update_secondary( + &mut self, + from_secondary_index: usize, + to_secondary_index: usize, + ts_kind: TsKind, + cost: Cost, + ) -> bool { + let target = match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + &mut self.secondary_11[[from_secondary_index, to_secondary_index]] + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + &mut self.secondary_12[[from_secondary_index, to_secondary_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + &mut self.secondary_21[[from_secondary_index, to_secondary_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + &mut self.secondary_22[[from_secondary_index, to_secondary_index]] + } + }; + assert!(*target <= cost); + let result = *target < cost; + *target = cost; + result + } + + pub fn update_jump_34( + &mut self, + from_secondary_index: usize, + to_primary_index: usize, + ts_kind: TsKind, + cost: Cost, + ) -> bool { + let target = match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + &mut self.jump_34_from_11[[from_secondary_index, 
to_primary_index + 1]] + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + &mut self.jump_34_from_12[[from_secondary_index, to_primary_index + 1]] + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + &mut self.jump_34_from_21[[from_secondary_index, to_primary_index + 1]] + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + &mut self.jump_34_from_22[[from_secondary_index, to_primary_index + 1]] + } + }; + assert!(*target <= cost); + let result = *target < cost; + *target = cost; + result + } + + pub fn update_jump_34_to_end( + &mut self, + from_secondary_index: usize, + ts_kind: TsKind, + cost: Cost, + ) -> bool { + let target = match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + let end_index = self.jump_34_from_11.dim().1 - 1; + &mut self.jump_34_from_11[[from_secondary_index, end_index]] + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + let end_index = self.jump_34_from_12.dim().1 - 1; + &mut self.jump_34_from_12[[from_secondary_index, end_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + let end_index = self.jump_34_from_21.dim().1 - 1; + &mut self.jump_34_from_21[[from_secondary_index, end_index]] + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + let end_index = self.jump_34_from_22.dim().1 - 1; + &mut self.jump_34_from_22[[from_secondary_index, end_index]] + } + }; + assert!(*target <= cost); + let result = *target < cost; + *target = cost; + result + } } diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs index 3a235d5..be13024 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump.rs @@ -22,14 +22,14 @@ impl Ts12JumpAlignment { start: AlignmentCoordinates, end: AlignmentCoordinates, sequences: &AlignmentSequences, - cost_table: &AlignmentCosts, + alignment_costs: &AlignmentCosts, rc_fn: &dyn Fn(u8) -> u8, max_match_run: u32, ) -> Self { assert!(start.is_primary()); 
assert!(end.is_secondary()); - let context = Context::new(cost_table, sequences, rc_fn, start, end, max_match_run); + let context = Context::new(alignment_costs, sequences, rc_fn, start, end, max_match_run); let mut a_star = AStar::new(context); a_star.initialise(); match a_star.search() { diff --git a/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs b/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs index 9f5eda5..ecaa3fe 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_34_jump.rs @@ -22,14 +22,14 @@ impl Ts34JumpAlignment { start: AlignmentCoordinates, end: AlignmentCoordinates, sequences: &AlignmentSequences, - cost_table: &AlignmentCosts, + alignment_costs: &AlignmentCosts, rc_fn: &dyn Fn(u8) -> u8, max_match_run: u32, ) -> Self { assert!(start.is_secondary()); assert!(end.is_primary()); - let context = Context::new(cost_table, sequences, rc_fn, start, end, max_match_run); + let context = Context::new(alignment_costs, sequences, rc_fn, start, end, max_match_run); let mut a_star = AStar::new(context); a_star.initialise(); match a_star.search() { @@ -39,7 +39,7 @@ impl Ts34JumpAlignment { alignment: a_star.reconstruct_path().into(), // The TS base cost is applied at the 12-jump, but we anyways apply it in this algorithm to make it label-setting if the base cost is non-zero. // But since the 34-jump has zero cost, we subtract it again. - cost: cost.0 - cost_table.ts_base_cost, + cost: cost.0 - alignment_costs.ts_base_cost, }, AStarResult::ExceededCostLimit { .. } => unreachable!("Cost limit is None"), AStarResult::ExceededMemoryLimit { .. 
} => unreachable!("Cost limit is None"), diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index fb70f16..71615ba 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -1,3 +1,4 @@ +use compact_genome::interface::alphabet::Alphabet; use generic_a_star::cost::U32Cost; use lib_tsalign::a_star_aligner::{ alignment_geometry::AlignmentRange, alignment_result::AlignmentResult, @@ -35,13 +36,13 @@ pub fn preprocess( ChainingLowerBounds::new(max_n, max_match_run, alignment_costs) } -pub fn align( +pub fn align( reference: Vec, query: Vec, range: AlignmentRange, rc_fn: &dyn Fn(u8) -> u8, - _reference_name: &str, - _query_name: &str, + reference_name: &str, + query_name: &str, chaining_lower_bounds: &ChainingLowerBounds, ) -> AlignmentResult { debug!( @@ -51,7 +52,12 @@ pub fn align( debug!("Query sequence: {}", String::from_utf8_lossy(&query)); info!("Aligning on subsequence {}", range); - let sequences = AlignmentSequences::new(reference, query); + let sequences = AlignmentSequences::new_named( + reference, + query, + reference_name.to_string(), + query_name.to_string(), + ); let k = chaining_lower_bounds.max_match_run() + 1; let anchors = Anchors::new(&sequences, range.clone(), k, rc_fn); @@ -60,7 +66,7 @@ pub fn align( let mut chaining_cost_function = ChainingCostFunction::new_from_lower_bounds(chaining_lower_bounds, &anchors, start, end); - chain_align::align( + chain_align::align::( &sequences, start, end, @@ -69,7 +75,5 @@ pub fn align( chaining_lower_bounds.max_match_run(), &anchors, &mut chaining_cost_function, - ); - - todo!() + ) } diff --git a/lib_tsalign/src/a_star_aligner/alignment_result.rs b/lib_tsalign/src/a_star_aligner/alignment_result.rs index 30825e7..43db1a7 100644 --- a/lib_tsalign/src/a_star_aligner/alignment_result.rs +++ b/lib_tsalign/src/a_star_aligner/alignment_result.rs @@ -206,10 +206,13 @@ impl AlignmentResult Alignment { where AlignmentType: Eq, { - if let Some((multiplicity, 
last_alignment_type)) = self.alignment.last_mut() { + self.push_n(1, alignment_type); + } + + pub fn push_n(&mut self, multiplicity: usize, alignment_type: AlignmentType) + where + AlignmentType: Eq, + { + if let Some((existing_multiplicity, last_alignment_type)) = self.alignment.last_mut() { if *last_alignment_type == alignment_type { - *multiplicity += 1; + *existing_multiplicity += multiplicity; } else { - self.alignment.push((1, alignment_type)); + self.alignment.push((multiplicity, alignment_type)); } } else { - self.alignment.push((1, alignment_type)); + self.alignment.push((multiplicity, alignment_type)); } } pub fn inner_mut(&mut self) -> &mut Vec<(usize, AlignmentType)> { &mut self.alignment } + + pub fn into_inner(self) -> Vec<(usize, AlignmentType)> { + self.alignment + } } impl Alignment { diff --git a/test_files/test_chainalign.sh b/test_files/test_chainalign.sh index 08f85fc..efc29d2 100755 --- a/test_files/test_chainalign.sh +++ b/test_files/test_chainalign.sh @@ -1,3 +1,3 @@ #!/bin/bash -cargo run -- align --log-level debug --alignment-method a-star-chain-ts -c test_files/config/chainalign --alphabet dna -p "$@" \ No newline at end of file +cargo run -- align --alignment-method a-star-chain-ts -c test_files/config/chainalign --alphabet dna -p "$@" \ No newline at end of file diff --git a/test_files/twin_10_ts.fa b/test_files/twin_10_ts.fa index 1643ec6..b141381 100644 --- a/test_files/twin_10_ts.fa +++ b/test_files/twin_10_ts.fa @@ -1,8 +1,8 @@ >reference AC ATCTGC -GA +AA >query AC GCAGAT -GA \ No newline at end of file +AA \ No newline at end of file diff --git a/tsalign/src/align/a_star_chain_ts.rs b/tsalign/src/align/a_star_chain_ts.rs index 84cc2ac..05e3787 100644 --- a/tsalign/src/align/a_star_chain_ts.rs +++ b/tsalign/src/align/a_star_chain_ts.rs @@ -92,7 +92,7 @@ pub fn align_a_star_chain_ts< let reference = reference.clone_as_vec(); let query = query.clone_as_vec(); - let alignment = lib_ts_chainalign::align( + let alignment = 
lib_ts_chainalign::align::( reference, query, range, From 46cc26bd130594f8b9c57f4a42253a41113ed7e5 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Fri, 5 Dec 2025 14:33:25 +0200 Subject: [PATCH 21/31] Fix empty jumps. --- .../src/alignment/coordinates.rs | 4 +- lib_ts_chainalign/src/chain_align.rs | 153 ++++++++++++------ lib_ts_chainalign/src/chain_align/chainer.rs | 32 +++- .../src/exact_chaining/ts_12_jump/tests.rs | 141 +++++++++++++++- .../src/exact_chaining/ts_34_jump/algo.rs | 22 ++- .../src/exact_chaining/ts_34_jump/tests.rs | 119 +++++++++++++- lib_ts_chainalign/src/lib.rs | 3 +- 7 files changed, 399 insertions(+), 75 deletions(-) diff --git a/lib_ts_chainalign/src/alignment/coordinates.rs b/lib_ts_chainalign/src/alignment/coordinates.rs index dda432e..4b02100 100644 --- a/lib_ts_chainalign/src/alignment/coordinates.rs +++ b/lib_ts_chainalign/src/alignment/coordinates.rs @@ -230,7 +230,7 @@ impl AlignmentCoordinates { TsDescendant::Seq2 => b, }; - (start.secondary_ordinate_ancestor().unwrap()..ancestor_limit).map(move |ancestor| { + (start.secondary_ordinate_ancestor().unwrap()..=ancestor_limit).map(move |ancestor| { ( ancestor as isize - ancestor_zero, Self::Secondary { @@ -258,7 +258,7 @@ impl AlignmentCoordinates { panic!("Can only generate 34-jumps from secondary coordinates"); }; - (0..match ts_kind.descendant { + (0..=match ts_kind.descendant { TsDescendant::Seq1 => end.primary_ordinate_b().unwrap(), TsDescendant::Seq2 => end.primary_ordinate_a().unwrap(), }) diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index f27c234..599927d 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -117,11 +117,12 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = cost_increased - || astar - .context_mut() - .chaining_cost_function - .update_start_to_end(alignment.cost()); + trace!("Aligning from start to end costs {}", alignment.cost()); + cost_increased = astar + 
.context_mut() + .chaining_cost_function + .update_start_to_end(alignment.cost()) + || cost_increased; alignments.push(alignment.alignment().clone()); } (Identifier::Start, Identifier::Primary { index }) => { @@ -134,11 +135,16 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = cost_increased - || astar - .context_mut() - .chaining_cost_function - .update_primary_from_start(index, alignment.cost()); + trace!( + "Aligning from start to P{index}{} costs {}", + anchors.primary[index], + alignment.cost() + ); + cost_increased = astar + .context_mut() + .chaining_cost_function + .update_primary_from_start(index, alignment.cost()) + || cost_increased; alignments.push(alignment.alignment().clone()); } (Identifier::Start, Identifier::Secondary { index, ts_kind }) => { @@ -151,11 +157,17 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = cost_increased - || astar - .context_mut() - .chaining_cost_function - .update_jump_12_from_start(index, ts_kind, alignment.cost()); + trace!( + "Aligning from start to S{}[{index}]{} costs {}", + ts_kind.digits(), + anchors.secondary(ts_kind)[index], + alignment.cost() + ); + cost_increased = astar + .context_mut() + .chaining_cost_function + .update_jump_12_from_start(index, ts_kind, alignment.cost()) + || cost_increased; alignments.push(alignment.alignment().clone()); } (Identifier::Primary { index }, Identifier::End) => { @@ -168,11 +180,16 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = cost_increased - || astar - .context_mut() - .chaining_cost_function - .update_primary_to_end(index, alignment.cost()); + trace!( + "Aligning from P{index}{} to end costs {}", + anchors.primary[index], + alignment.cost() + ); + cost_increased = astar + .context_mut() + .chaining_cost_function + .update_primary_to_end(index, alignment.cost()) + || cost_increased; alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -186,11 +203,17 @@ pub fn align( rc_fn, 
max_match_run, ); - cost_increased = cost_increased - || astar - .context_mut() - .chaining_cost_function - .update_jump_34_to_end(index, ts_kind, alignment.cost()); + trace!( + "Aligning from S{}[{index}]{} to end costs {}", + ts_kind.digits(), + anchors.secondary(ts_kind)[index], + alignment.cost() + ); + cost_increased = astar + .context_mut() + .chaining_cost_function + .update_jump_34_to_end(index, ts_kind, alignment.cost()) + || cost_increased; alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -215,12 +238,17 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = cost_increased - || astar.context_mut().chaining_cost_function.update_primary( - from_index, - to_index, - alignment.cost(), - ); + trace!( + "Aligning from P{from_index}{} to P{to_index}{} costs {}", + anchors.primary[from_index], + anchors.primary[to_index], + alignment.cost() + ); + cost_increased = astar.context_mut().chaining_cost_function.update_primary( + from_index, + to_index, + alignment.cost(), + ) || cost_increased; alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -241,13 +269,19 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = cost_increased - || astar.context_mut().chaining_cost_function.update_jump_12( - from_index, - to_index, - ts_kind, - alignment.cost(), - ); + cost_increased = astar.context_mut().chaining_cost_function.update_jump_12( + from_index, + to_index, + ts_kind, + alignment.cost(), + ) || cost_increased; + trace!( + "Aligning from P{from_index}{} to S{}[{to_index}]{} costs {}", + anchors.primary[from_index], + ts_kind.digits(), + anchors.secondary(ts_kind)[to_index], + alignment.cost() + ); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -278,13 +312,20 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = cost_increased - || 
astar.context_mut().chaining_cost_function.update_secondary( - from_index, - to_index, - ts_kind, - alignment.cost(), - ); + trace!( + "Aligning from S{}[{from_index}]{} to S{}[{to_index}]{} costs {}", + ts_kind.digits(), + anchors.secondary(ts_kind)[from_index], + ts_kind.digits(), + anchors.secondary(ts_kind)[to_index], + alignment.cost() + ); + cost_increased = astar.context_mut().chaining_cost_function.update_secondary( + from_index, + to_index, + ts_kind, + alignment.cost(), + ) || cost_increased; alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -305,13 +346,21 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = cost_increased - || astar.context_mut().chaining_cost_function.update_jump_34( - from_index, - to_index, - ts_kind, - alignment.cost(), - ); + cost_increased = astar.context_mut().chaining_cost_function.update_jump_34( + from_index, + to_index, + ts_kind, + alignment.cost(), + ) || cost_increased; + trace!( + "Aligning from S{}[{from_index}]{} to P{to_index}{} (S{} to P{}) costs {}", + ts_kind.digits(), + anchors.secondary(ts_kind)[from_index], + anchors.primary[to_index], + start, + end, + alignment.cost() + ); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } diff --git a/lib_ts_chainalign/src/chain_align/chainer.rs b/lib_ts_chainalign/src/chain_align/chainer.rs index f30c94c..956f9fd 100644 --- a/lib_ts_chainalign/src/chain_align/chainer.rs +++ b/lib_ts_chainalign/src/chain_align/chainer.rs @@ -77,7 +77,7 @@ impl AStarContext for Context<'_, '_, Cost> { .primary_from_start(successor_index), )?; if DEBUG_CHAINER { - println!("Cost: {cost}"); + println!("Cost: {}+{}", predecessor_cost, cost - predecessor_cost); } (cost != Cost::max_value()).then_some(Node { @@ -106,7 +106,11 @@ impl AStarContext for Context<'_, '_, Cost> { .jump_12_from_start(successor_index, ts_kind), )?; if DEBUG_CHAINER { - 
println!("Cost: {cost}"); + println!( + "Cost: {}+{}", + predecessor_cost, + cost - predecessor_cost + ); } (cost != Cost::max_value()).then_some(Node { @@ -123,6 +127,10 @@ impl AStarContext for Context<'_, '_, Cost> { let cost = predecessor_cost .checked_add(&self.chaining_cost_function.start_to_end()) .unwrap(); + if DEBUG_CHAINER { + println!("Checking anchor end"); + println!("Cost: {}+{}", predecessor_cost, cost - predecessor_cost); + } debug_assert_ne!(cost, Cost::max_value()); Node { identifier: Identifier::End, @@ -146,7 +154,7 @@ impl AStarContext for Context<'_, '_, Cost> { &self.chaining_cost_function.primary(index, successor_index), )?; if DEBUG_CHAINER { - println!("Cost: {cost}"); + println!("Cost: {}+{}", predecessor_cost, cost - predecessor_cost); } (cost != Cost::max_value()).then_some(Node { @@ -177,7 +185,11 @@ impl AStarContext for Context<'_, '_, Cost> { ), )?; if DEBUG_CHAINER { - println!("Cost: {cost}"); + println!( + "Cost: {}+{}", + predecessor_cost, + cost - predecessor_cost + ); } (cost != Cost::max_value()).then_some(Node { @@ -194,6 +206,10 @@ impl AStarContext for Context<'_, '_, Cost> { let cost = predecessor_cost .checked_add(&self.chaining_cost_function.primary_to_end(index)) .unwrap(); + if DEBUG_CHAINER { + println!("Checking anchor end"); + println!("Cost: {}+{}", predecessor_cost, cost - predecessor_cost); + } debug_assert_ne!(cost, Cost::max_value()); Node { identifier: Identifier::End, @@ -219,7 +235,7 @@ impl AStarContext for Context<'_, '_, Cost> { .secondary(index, successor_index, ts_kind), )?; if DEBUG_CHAINER { - println!("Cost: {cost}"); + println!("Cost: {}+{}", predecessor_cost, cost - predecessor_cost); } (cost != Cost::max_value()).then_some(Node { @@ -245,7 +261,7 @@ impl AStarContext for Context<'_, '_, Cost> { .jump_34(index, successor_index, ts_kind), )?; if DEBUG_CHAINER { - println!("Cost: {cost}"); + println!("Cost: {}+{}", predecessor_cost, cost - predecessor_cost); } (cost != 
Cost::max_value()).then_some(Node { @@ -262,6 +278,10 @@ impl AStarContext for Context<'_, '_, Cost> { &self.chaining_cost_function.jump_34_to_end(index, ts_kind), ) .unwrap(); + if DEBUG_CHAINER { + println!("Checking anchor end"); + println!("Cost: {}+{}", predecessor_cost, cost - predecessor_cost); + } debug_assert_ne!(cost, Cost::max_value()); Node { identifier: Identifier::End, diff --git a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs index 613b562..9c1fd30 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_12_jump/tests.rs @@ -256,9 +256,8 @@ fn test_max_match_run_1() { alignment.alignment().alignment, vec![ (1, AlignmentType::Match), - (6, AlignmentType::GapA), - (1, AlignmentType::Match), - (6, AlignmentType::GapA), + (12, AlignmentType::GapA), + (1, AlignmentType::Substitution), (1, AlignmentType::Match), ( 1, @@ -281,12 +280,12 @@ fn test_max_match_run_2() { let cost_table = AlignmentCosts { primary_costs: GapAffineCosts::new( U32Cost::from(2u8), - U32Cost::from(3u8), + U32Cost::from(30u8), U32Cost::from(1u8), ), secondary_costs: GapAffineCosts::new( U32Cost::from(4u8), - U32Cost::from(6u8), + U32Cost::from(60u8), U32Cost::from(2u8), ), ts_base_cost: U32Cost::from(2u8), @@ -328,3 +327,135 @@ fn test_max_match_run_2() { ); assert_eq!(alignment.cost(), U32Cost::from(14u8)); } + +#[test] +fn test_only_jump() { + let seq1 = b"ACGTACGTAC".to_vec(); + let seq2 = b"ACGTACGTAC".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + 
ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(5, 6); + let end = AlignmentCoordinates::new_secondary(3, 6, TsKind::TS12); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![( + 1, + AlignmentType::TsStart { + jump: -2, + ts_kind: TsKind::TS12 + } + ),] + ); + assert_eq!(alignment.cost(), U32Cost::from(2u8)); +} + +#[test] +fn test_only_jump_start() { + let seq1 = b"ACGTACGTAC".to_vec(); + let seq2 = b"ACGTACGTAC".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(0, 0); + let end = AlignmentCoordinates::new_secondary(0, 0, TsKind::TS12); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![( + 1, + AlignmentType::TsStart { + jump: 0, + ts_kind: TsKind::TS12 + } + ),] + ); + assert_eq!(alignment.cost(), U32Cost::from(2u8)); +} + +#[test] +fn test_only_jump_end() { + let seq1 = b"ACGTACGTAC".to_vec(); + let seq2 = b"ACGTACGTAC".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(2u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(4u8), + U32Cost::from(6u8), + 
U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_primary(10, 10); + let end = AlignmentCoordinates::new_secondary(10, 10, TsKind::TS12); + let alignment = Ts12JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![( + 1, + AlignmentType::TsStart { + jump: 0, + ts_kind: TsKind::TS12 + } + ),] + ); + assert_eq!(alignment.cost(), U32Cost::from(2u8)); +} diff --git a/lib_ts_chainalign/src/exact_chaining/ts_34_jump/algo.rs b/lib_ts_chainalign/src/exact_chaining/ts_34_jump/algo.rs index 0ad9531..c33193a 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_34_jump/algo.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_34_jump/algo.rs @@ -14,6 +14,8 @@ use crate::{ costs::AlignmentCosts, }; +const DEBUG_EXACT_34_JUMP: bool = false; + pub struct Context<'costs, 'sequences, 'rc_fn, Cost> { costs: &'costs AlignmentCosts, sequences: &'sequences AlignmentSequences, @@ -194,12 +196,20 @@ impl AStarContext for Context<'_, '_, '_, Cost> { output.extend( coordinates .generate_34_jumps(self.end) - .map(|(jump, coordinates)| Node { - identifier: Identifier::Jump34 { coordinates }, - predecessor, - predecessor_alignment_type: Some(AlignmentType::TsEnd { jump }), - cost: new_cost, - match_run: 0, + .map(|(jump, coordinates)| { + if DEBUG_EXACT_34_JUMP { + println!( + "Jump from {} to {coordinates} by {jump}", + node.identifier.coordinates() + ); + } + Node { + identifier: Identifier::Jump34 { coordinates }, + predecessor, + predecessor_alignment_type: Some(AlignmentType::TsEnd { jump }), + cost: new_cost, + match_run: 0, + } }), ); } diff --git a/lib_ts_chainalign/src/exact_chaining/ts_34_jump/tests.rs b/lib_ts_chainalign/src/exact_chaining/ts_34_jump/tests.rs index 
6fac089..193c5ab 100644 --- a/lib_ts_chainalign/src/exact_chaining/ts_34_jump/tests.rs +++ b/lib_ts_chainalign/src/exact_chaining/ts_34_jump/tests.rs @@ -190,9 +190,8 @@ fn test_max_match_run_0() { assert_eq!( alignment.alignment().alignment, vec![ - (1, AlignmentType::TsEnd { jump: 6 }), - (2, AlignmentType::Substitution), - (14, AlignmentType::GapA), + (1, AlignmentType::TsEnd { jump: 8 }), + (16, AlignmentType::GapA), ] ); assert_eq!(alignment.cost(), U32Cost::from(18u8)); @@ -296,3 +295,117 @@ fn test_max_match_run_2() { ); assert_eq!(alignment.cost(), U32Cost::from(4u8)); } + +#[test] +fn test_only_jump() { + let seq1 = b"ACATCTGCAA".to_vec(); + let seq2 = b"ACGCAGATAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_secondary(2, 8, TsKind::TS21); + let end = AlignmentCoordinates::new_primary(8, 8); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![(1, AlignmentType::TsEnd { jump: 6 })] + ); + assert_eq!(alignment.cost(), U32Cost::from(0u8)); +} + +#[test] +fn test_only_jump_start() { + let seq1 = b"ACATCTGCAA".to_vec(); + let seq2 = b"ACGCAGATAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + 
U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_secondary(0, 0, TsKind::TS21); + let end = AlignmentCoordinates::new_primary(0, 0); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![(1, AlignmentType::TsEnd { jump: 0 })] + ); + assert_eq!(alignment.cost(), U32Cost::from(0u8)); +} + +#[test] +fn test_only_jump_end() { + let seq1 = b"ACATCTGCAA".to_vec(); + let seq2 = b"ACGCAGATAA".to_vec(); + let sequences = AlignmentSequences::new(seq1, seq2); + let cost_table = AlignmentCosts { + primary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(3u8), + U32Cost::from(1u8), + ), + secondary_costs: GapAffineCosts::new( + U32Cost::from(1u8), + U32Cost::from(6u8), + U32Cost::from(2u8), + ), + ts_base_cost: U32Cost::from(2u8), + ts_limits: TsLimits { + jump_12: -100..100, + jump_34: -100..100, + length_23: 0..100, + ancestor_gap: -100..100, + }, + }; + + let start = AlignmentCoordinates::new_secondary(10, 10, TsKind::TS21); + let end = AlignmentCoordinates::new_primary(10, 10); + let alignment = Ts34JumpAlignment::new(start, end, &sequences, &cost_table, &rc_fn, 2); + + assert_eq!(alignment.start(), start); + assert_eq!(alignment.end(), end); + assert_eq!( + alignment.alignment().alignment, + vec![(1, AlignmentType::TsEnd { jump: 0 })] + ); + assert_eq!(alignment.cost(), U32Cost::from(0u8)); +} diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index 71615ba..cbfe68e 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -4,7 +4,7 @@ use lib_tsalign::a_star_aligner::{ alignment_geometry::AlignmentRange, alignment_result::AlignmentResult, 
template_switch_distance::AlignmentType, }; -use log::{debug, info}; +use log::{debug, info, trace}; use crate::{ alignment::{coordinates::AlignmentCoordinates, sequences::AlignmentSequences}, @@ -61,6 +61,7 @@ pub fn align( let k = chaining_lower_bounds.max_match_run() + 1; let anchors = Anchors::new(&sequences, range.clone(), k, rc_fn); + trace!("Anchors:\n{anchors}"); let start = AlignmentCoordinates::new_primary(range.reference_offset(), range.query_offset()); let end = AlignmentCoordinates::new_primary(range.reference_limit(), range.query_limit()); let mut chaining_cost_function = From 7ece69a89f4a4af2d017347e3be943456a43e507 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Fri, 5 Dec 2025 14:52:57 +0200 Subject: [PATCH 22/31] Fix k and more diagnostics. --- lib_ts_chainalign/src/anchors.rs | 11 ++++++++++- lib_ts_chainalign/src/chain_align.rs | 5 ++++- test_files/twin_1000_5ts.fa | 4 ++++ tsalign/src/align/a_star_chain_ts.rs | 6 +++--- 4 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 test_files/twin_1000_5ts.fa diff --git a/lib_ts_chainalign/src/anchors.rs b/lib_ts_chainalign/src/anchors.rs index b50aa77..6c49298 100644 --- a/lib_ts_chainalign/src/anchors.rs +++ b/lib_ts_chainalign/src/anchors.rs @@ -1,7 +1,7 @@ use std::fmt::Display; use lib_tsalign::a_star_aligner::alignment_geometry::AlignmentRange; -use log::trace; +use log::{debug, trace}; use crate::{ alignment::{ @@ -176,6 +176,15 @@ impl Anchors { ) }); + debug!( + "Found {} anchors", + primary.len() + + secondary_11.len() + + secondary_12.len() + + secondary_21.len() + + secondary_22.len() + ); + Self { primary, secondary_11, diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index 599927d..154de19 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -51,6 +51,7 @@ pub fn align( let k = usize::try_from(max_match_run + 1).unwrap(); let context = Context::new(anchors, chaining_cost_function); let 
mut astar = AStar::new(context); + let mut chaining_execution_count = 0; let (alignments, result) = loop { let chaining_start_time = Instant::now(); @@ -58,9 +59,10 @@ pub fn align( astar.reset(); astar.initialise(); let result = astar.search(); + chaining_execution_count += 1; let chain = match result { AStarResult::FoundTarget { cost, .. } => { - debug!("Found chain with cost {cost}"); + trace!("Found chain with cost {cost}"); let mut chain = astar.reconstruct_path(); chain.push(Identifier::End); trace!("Chain (len: {}):\n{}", chain.len(), { @@ -376,6 +378,7 @@ pub fn align( } }; + debug!("Computed {chaining_execution_count} chains"); debug!("Chaining took {:.1}s", chaining_duration.as_secs_f64()); debug!("Evaluation took {:.1}s", evaluation_duration.as_secs_f64()); diff --git a/test_files/twin_1000_5ts.fa b/test_files/twin_1000_5ts.fa new file mode 100644 index 0000000..e0a7502 --- /dev/null +++ b/test_files/twin_1000_5ts.fa @@ -0,0 +1,4 @@ +>reference +AGTTAACATCTAGCCCGGCTCTATCAGTACACCAGTGCCTTGAATGACATACTCATCATTAAACTTTCTCAACAGTCAAACGACCAAGTGCATTTCCAAGGAGTGCGATGGAGATTCATTCTCTCGCCAGCACTGTAATAGGCACTAAAAGAGTGATGATAATCATGAGTGCCGTGCTAAGACGGTGTCGGAACAAAGCGGTCTTACGGTCAGTCGTATTTCCTCTCGAGTCCCGTCCAGTTGAGCGTATCACTCCCAGTGTACTAGCAAGCCGAGAAGGCTGTGCTTGGAGTCAATCGGATGTAGGATGGTCTCCAATTGACTCCAAGCCTTCACGCCTAAAGCATAAACGTCGAGCAGTCATGAAAGTCTTAGTACCGGACGTGCCGTTTCACTGCGAATATTACCTGAAGCTGTACCGTTATTGCGGAGCAAAGATGCAGTGCTGCTCTTATCATATTTGTATTGACGACAGCCGCCTTCGCGGTTTCCTCAGACACTTAAGAATAAGGGCTTATTGTAGGCAGGGGCACGCCCTTTTAGTGGCTGCGGCAAAATATCTTCGGATCCCCTTGTCTAACCAAATTAATCGAATTCTCTCATTTAAGACCCTAATATGTCATCATTAGTGTTTAAATGCCACCCCGAAAATACCGCCTAGAAATGTCTATGATTGGTCCACTAAAGTTGATTAAAACGACTGCTAAATCCGCGTGATAGGGCATTTGAAGTTTAATTTTGTATCGCAAGGTACTCCCGATCTTAATGGATGGCCGGAAGTGGTACGGATGCAATAAGCGCGGGTGAGAGGGTAATTAGGCGCGTTCACCTACGCTACGCTAACGGGCGATTCTATAAGAATGCACATTGCGTCTCTTATGAATCTGTCTCGACCGCATGCGCAACTTGTGAAGTGTCTACTATCCCTAAGCGCATATCTCGCACAGTAACCCCCGAATATGTCGGCATCTGATGTTACCCGGGTTGAGTTAGTGT +>query 
+AGTTAACATCTAGCCCGGCTCTATCAGTACACCAGTGCCTTGAATGACATACTCATCATTAAACTTTCTCAACAGTCAAACGACCAAGAGAAAGTTTAATCCAAGTGCATTTCCAAGGAGTGCGATGGAGATTCATTCTCTCGCCAGCACTGTAATAGGCACTAAAAGAGTGATGATAATCATGAGTGCCGTGCTAAGACGGTGTCGGAACAAAGCGGTCTTACGGTCAGTCGTATTTCCTCTCGAGTCCCGTCCAGTTGAGCGTATCACTCCCAGTGTACTAGCAAGCCGAGAAGGCTGTGCTTGGGGTCAATCGGATGTAGGATGGTCTCCAGACACCGGGCCACCACTCTTCACGCCTAAAGCATAAACGTCGAGCAGTCATGAAAGTCTTAGTACCGGACGTGCCGTTTCACTGCGAATATTACCTGAAGCTGTACCGTTATTGCGGAGCAAAGATGCAGTGCTGCTCTTATCATATTTGTATTGACGACAGCCGCCTTCGCGGTTTCCTCAGACACTTAAGAATAAGGGCTTATTGTAGGCAGAGGCACGCCCTTTTAGTGGCTGCGTGCCTCTGCCTTCGGATCCCCTTGTCTAACCAAATTAATCGAATTCTCTCATTTAAGACCCTAATATGTCATCATTAGTGTTTAAATGCCACCCCGAAAATACCGCCTAGAAATGTCTATGATTGGTCCACTAAAGTTGATTATTTAGCAGTCGTTTTCCGCGTGATAGGGCATTTGAAGTTTAATTTTGTATCGCAAGGTACTCCCGATCTTAATGGATGGCCGGAAGTGGTACGGATGCAATAAGCGCGGGTGAGAGGGTAATTAGGCGCGTTCACCTACGCTACGCTAACGGGCGATTCTATAAGAATGCACATTGCGTCGATTCATAAGATGTCTCGACCGCATGCGCAACTTGTGAAGTGTCTACTATCCCTAAGCGCATATCTCGCACAGTAACCCCCGAATATGTCGGCATCTGATGTTACCCGGGTTGAGTTAGTGT \ No newline at end of file diff --git a/tsalign/src/align/a_star_chain_ts.rs b/tsalign/src/align/a_star_chain_ts.rs index 05e3787..9f5b5c4 100644 --- a/tsalign/src/align/a_star_chain_ts.rs +++ b/tsalign/src/align/a_star_chain_ts.rs @@ -50,10 +50,10 @@ pub fn align_a_star_chain_ts< // However, the birthday paradoxon states that for avoiding collisions, // the amount of possible k-mers needs to grow in the square of the amount available k-mers, // so we square that and arrive at ceil(log_2(length_sum)). - usize::BITS - ((reference.len() + query.len()) - 1).leading_zeros() + let k = usize::BITS - ((reference.len() + query.len()) - 1).leading_zeros(); + // Decrease k a little, because we can hopefully afford a few more anchors. + k.saturating_sub(3).max(2) }); - // Decrease k a little, because we can hopefully afford a few more anchors. 
- let k = (k.saturating_sub(3)).max(2); debug!("Using max_n = {max_n}"); info!("Using k = {k}"); let max_match_run = k - 1; From e9f2cb04ff37693316c54053170c0859c7fd1e21 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 09:57:50 +0200 Subject: [PATCH 23/31] Add progress indicator to chainalign. --- Cargo.lock | 122 +++++++++++++++++++++ lib_ts_chainalign/Cargo.toml | 1 + lib_ts_chainalign/src/chain_align.rs | 14 +++ test_files/config/chainalignn/config.tsa | 132 +++++++++++++++++++++++ 4 files changed, 269 insertions(+) create mode 100644 test_files/config/chainalignn/config.tsa diff --git a/Cargo.lock b/Cargo.lock index 55440f2..82c5b32 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -153,6 +153,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3acc5ce9c60e68df21b877f13f908ef95c89f01cb6c656cf76ba95f10bc72f5" +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + [[package]] name = "bytemuck" version = "1.23.2" @@ -245,6 +251,19 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "120133d4db2ec47efe2e26502ee984747630c67f51974fca0b6c1340cf2368d3" +[[package]] +name = "console" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.61.0", +] + [[package]] name = "core_maths" version = "0.1.1" @@ -293,6 +312,12 @@ dependencies = [ "log", ] +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "enum-iterator" version = "2.3.0" @@ -448,6 +473,19 
@@ dependencies = [ "hashbrown", ] +[[package]] +name = "indicatif" +version = "0.18.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88" +dependencies = [ + "console", + "portable-atomic", + "unicode-width", + "unit-prefix", + "web-time", +] + [[package]] name = "indoc" version = "2.0.6" @@ -469,6 +507,16 @@ dependencies = [ "either", ] +[[package]] +name = "js-sys" +version = "0.3.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "kurbo" version = "0.11.3" @@ -492,6 +540,7 @@ version = "0.1.0" dependencies = [ "compact-genome", "generic_a_star", + "indicatif", "itertools", "lib_tsalign", "log", @@ -932,6 +981,12 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = "rustybuzz" version = "0.20.1" @@ -1383,12 +1438,24 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1d386ff53b415b7fe27b50bb44679e2cc4660272694b7b6f3326d8480823a94" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unindent" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "unty" version = "0.0.4" @@ -1449,6 +1516,61 @@ version = "0.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" +[[package]] +name = "wasm-bindgen" +version = "0.2.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "weezl" version = "0.1.10" diff --git a/lib_ts_chainalign/Cargo.toml b/lib_ts_chainalign/Cargo.toml index 871c166..41e2d07 100644 --- a/lib_ts_chainalign/Cargo.toml +++ b/lib_ts_chainalign/Cargo.toml @@ -17,3 
+17,4 @@ serde.workspace = true compact-genome.workspace = true itertools = "0.14.0" log.workspace = true +indicatif = "0.18.3" diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index 154de19..6214627 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -6,6 +6,7 @@ use compact_genome::{ }, }; use generic_a_star::{AStar, AStarResult, cost::AStarCost}; +use indicatif::ProgressBar; use lib_tsalign::a_star_aligner::{ alignment_result::AlignmentResult, template_switch_distance::{EqualCostRange, TemplateSwitchDirection}, @@ -44,6 +45,9 @@ pub fn align( anchors: &Anchors, chaining_cost_function: &mut ChainingCostFunction, ) -> AlignmentResult { + let progress_bar = ProgressBar::new_spinner(); + progress_bar.enable_steady_tick(Duration::from_millis(200)); + let start_time = Instant::now(); let mut chaining_duration = Duration::default(); let mut evaluation_duration = Duration::default(); @@ -52,8 +56,15 @@ pub fn align( let context = Context::new(anchors, chaining_cost_function); let mut astar = AStar::new(context); let mut chaining_execution_count = 0; + let mut current_cost = Cost::zero(); let (alignments, result) = loop { + progress_bar.inc(1); + progress_bar.set_message(format!( + "Computing chain number {} (cost {})", + chaining_execution_count + 1, + current_cost, + )); let chaining_start_time = Instant::now(); astar.reset(); @@ -63,6 +74,8 @@ pub fn align( let chain = match result { AStarResult::FoundTarget { cost, .. 
} => { trace!("Found chain with cost {cost}"); + current_cost = cost; + let mut chain = astar.reconstruct_path(); chain.push(Identifier::End); trace!("Chain (len: {}):\n{}", chain.len(), { @@ -378,6 +391,7 @@ pub fn align( } }; + progress_bar.finish_and_clear(); debug!("Computed {chaining_execution_count} chains"); debug!("Chaining took {:.1}s", chaining_duration.as_secs_f64()); debug!("Evaluation took {:.1}s", evaluation_duration.as_secs_f64()); diff --git a/test_files/config/chainalignn/config.tsa b/test_files/config/chainalignn/config.tsa new file mode 100644 index 0000000..c9d2a9d --- /dev/null +++ b/test_files/config/chainalignn/config.tsa @@ -0,0 +1,132 @@ +# Limits + +left_flank_length = 0 +right_flank_length = 0 + +# Base Cost + +rrf_cost = 31 +rqf_cost = 30 +qrf_cost = 30 +qqf_cost = 31 +rrr_cost = 2 +rqr_cost = 2 +qrr_cost = 2 +qqr_cost = 2 + +# Jump Costs + +Offset + -inf -300 301 + inf 0 inf + +Length + 0 10 200 + inf 0 inf + +LengthDifference + -inf -100 -20 -10 11 21 101 + inf 0 0 0 0 0 inf + +ForwardAntiPrimaryGap + -inf 1 + 0 inf + +ReverseAntiPrimaryGap + -inf + 0 + +# Primary Edit Costs + +SubstitutionCostTable + | A C G T N +--+--------------- +A | 0 2 2 2 2 +C | 2 0 2 2 2 +G | 2 2 0 2 2 +T | 2 2 2 0 2 +N | 2 2 2 2 0 + +GapOpenCostVector + A C G T N + 3 3 3 3 3 + +GapExtendCostVector + A C G T N + 1 1 1 1 1 + +# Secondary Forward Edit Costs + +SubstitutionCostTable + | A C G T N +--+--------------- +A | 0 4 4 4 4 +C | 4 0 4 4 4 +G | 4 4 0 4 4 +T | 4 4 4 0 4 +N | 4 4 4 4 0 + +GapOpenCostVector + A C G T N + 3 3 3 3 3 + +GapExtendCostVector + A C G T N + 2 2 2 2 2 + +# Secondary Reverse Edit Costs + +SubstitutionCostTable + | A C G T N +--+--------------- +A | 0 4 4 4 4 +C | 4 0 4 4 4 +G | 4 4 0 4 4 +T | 4 4 4 0 4 +N | 4 4 4 4 0 + +GapOpenCostVector + A C G T N + 3 3 3 3 3 + +GapExtendCostVector + A C G T N + 2 2 2 2 2 + +# Left Flank Edit Costs + +SubstitutionCostTable + | A C G T N +--+--------------- +A | 0 2 2 2 2 +C | 2 0 2 2 2 +G | 2 2 0 2 2 
+T | 2 2 2 0 2 +N | 2 2 2 2 0 + +GapOpenCostVector + A C G T N + 3 3 3 3 3 + +GapExtendCostVector + A C G T N + 1 1 1 1 1 + +# Right Flank Edit Costs + +SubstitutionCostTable + | A C G T N +--+--------------- +A | 0 2 2 2 2 +C | 2 0 2 2 2 +G | 2 2 0 2 2 +T | 2 2 2 0 2 +N | 2 2 2 2 0 + +GapOpenCostVector + A C G T N + 3 3 3 3 3 + +GapExtendCostVector + A C G T N + 1 1 1 1 1 From 15203ae5d0eef1bedeccb994382e6cc64527a86b Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 10:22:41 +0200 Subject: [PATCH 24/31] More diagnostics. --- lib_ts_chainalign/src/anchors.rs | 13 +++-- lib_ts_chainalign/src/chain_align.rs | 57 ++++++++++++++++++-- lib_ts_chainalign/src/chain_align/chainer.rs | 15 +++--- 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/lib_ts_chainalign/src/anchors.rs b/lib_ts_chainalign/src/anchors.rs index 6c49298..9c033cd 100644 --- a/lib_ts_chainalign/src/anchors.rs +++ b/lib_ts_chainalign/src/anchors.rs @@ -1,7 +1,7 @@ use std::fmt::Display; use lib_tsalign::a_star_aligner::alignment_geometry::AlignmentRange; -use log::{debug, trace}; +use log::{info, trace}; use crate::{ alignment::{ @@ -176,13 +176,18 @@ impl Anchors { ) }); - debug!( - "Found {} anchors", + info!( + "Found {} anchors ({} + {} + {} + {} + {})", primary.len() + secondary_11.len() + secondary_12.len() + secondary_21.len() - + secondary_22.len() + + secondary_22.len(), + primary.len(), + secondary_11.len(), + secondary_12.len(), + secondary_21.len(), + secondary_22.len(), ); Self { diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index 6214627..afa75e7 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -56,25 +56,35 @@ pub fn align( let context = Context::new(anchors, chaining_cost_function); let mut astar = AStar::new(context); let mut chaining_execution_count = 0; - let mut current_cost = Cost::zero(); + let mut current_lower_bound = Cost::zero(); + let mut 
current_upper_bound = Cost::max_value(); + let mut total_chaining_opened_nodes = 0; + let mut total_chaining_suboptimal_opened_nodes = 0; + let mut total_chaining_closed_nodes = 0; let (alignments, result) = loop { progress_bar.inc(1); progress_bar.set_message(format!( - "Computing chain number {} (cost {})", + "Computing chain number {} (cost {}->{})", chaining_execution_count + 1, - current_cost, + current_lower_bound, + current_upper_bound, )); let chaining_start_time = Instant::now(); astar.reset(); astar.initialise(); let result = astar.search(); + total_chaining_opened_nodes += astar.performance_counters().opened_nodes; + total_chaining_suboptimal_opened_nodes += + astar.performance_counters().suboptimal_opened_nodes; + total_chaining_closed_nodes += astar.performance_counters().closed_nodes; + chaining_execution_count += 1; let chain = match result { AStarResult::FoundTarget { cost, .. } => { trace!("Found chain with cost {cost}"); - current_cost = cost; + current_lower_bound = cost; let mut chain = astar.reconstruct_path(); chain.push(Identifier::End); @@ -117,6 +127,7 @@ pub fn align( let evaluation_start_time = Instant::now(); let mut cost_increased = false; + current_upper_bound = Cost::zero(); let mut alignments = Vec::new(); 'update_chain: for window in chain.windows(2) { let from_anchor = window[0]; @@ -138,6 +149,7 @@ pub fn align( .chaining_cost_function .update_start_to_end(alignment.cost()) || cost_increased; + current_upper_bound += astar.context().chaining_cost_function.start_to_end(); alignments.push(alignment.alignment().clone()); } (Identifier::Start, Identifier::Primary { index }) => { @@ -160,6 +172,10 @@ pub fn align( .chaining_cost_function .update_primary_from_start(index, alignment.cost()) || cost_increased; + current_upper_bound += astar + .context() + .chaining_cost_function + .primary_from_start(index); alignments.push(alignment.alignment().clone()); } (Identifier::Start, Identifier::Secondary { index, ts_kind }) => { @@ -183,6 
+199,10 @@ pub fn align( .chaining_cost_function .update_jump_12_from_start(index, ts_kind, alignment.cost()) || cost_increased; + current_upper_bound += astar + .context() + .chaining_cost_function + .jump_12_from_start(index, ts_kind); alignments.push(alignment.alignment().clone()); } (Identifier::Primary { index }, Identifier::End) => { @@ -205,6 +225,8 @@ pub fn align( .chaining_cost_function .update_primary_to_end(index, alignment.cost()) || cost_increased; + current_upper_bound += + astar.context().chaining_cost_function.primary_to_end(index); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -229,6 +251,10 @@ pub fn align( .chaining_cost_function .update_jump_34_to_end(index, ts_kind, alignment.cost()) || cost_increased; + current_upper_bound += astar + .context() + .chaining_cost_function + .jump_34_to_end(index, ts_kind); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -264,6 +290,10 @@ pub fn align( to_index, alignment.cost(), ) || cost_increased; + current_upper_bound += astar + .context() + .chaining_cost_function + .primary(from_index, to_index); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -297,6 +327,10 @@ pub fn align( anchors.secondary(ts_kind)[to_index], alignment.cost() ); + current_upper_bound += astar + .context() + .chaining_cost_function + .jump_12(from_index, to_index, ts_kind); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -341,6 +375,10 @@ pub fn align( ts_kind, alignment.cost(), ) || cost_increased; + current_upper_bound += astar + .context() + .chaining_cost_function + .secondary(from_index, to_index, ts_kind); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -376,6 +414,10 @@ 
pub fn align( end, alignment.cost() ); + current_upper_bound += astar + .context() + .chaining_cost_function + .jump_34(from_index, to_index, ts_kind); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -395,6 +437,13 @@ pub fn align( debug!("Computed {chaining_execution_count} chains"); debug!("Chaining took {:.1}s", chaining_duration.as_secs_f64()); debug!("Evaluation took {:.1}s", evaluation_duration.as_secs_f64()); + debug!("Chaining opened nodes: {total_chaining_opened_nodes}"); + debug!( + "Chaining suboptimal openend nodes: {} ({:.0}%)", + total_chaining_suboptimal_opened_nodes, + total_chaining_suboptimal_opened_nodes as f64 / total_chaining_opened_nodes as f64 * 100.0, + ); + debug!("Chaining closed nodes: {total_chaining_closed_nodes}"); let mut tsalign_alignment = lib_tsalign::a_star_aligner::alignment_result::alignment::Alignment::new(); diff --git a/lib_ts_chainalign/src/chain_align/chainer.rs b/lib_ts_chainalign/src/chain_align/chainer.rs index 956f9fd..156bb20 100644 --- a/lib_ts_chainalign/src/chain_align/chainer.rs +++ b/lib_ts_chainalign/src/chain_align/chainer.rs @@ -71,16 +71,19 @@ impl AStarContext for Context<'_, '_, Cost> { ); } - let cost = predecessor_cost.checked_add( - &self - .chaining_cost_function - .primary_from_start(successor_index), - )?; + let cost = predecessor_cost + .checked_add( + &self + .chaining_cost_function + .primary_from_start(successor_index), + ) + .unwrap(); if DEBUG_CHAINER { println!("Cost: {}+{}", predecessor_cost, cost - predecessor_cost); } - (cost != Cost::max_value()).then_some(Node { + debug_assert_ne!(cost, Cost::max_value()); + Some(Node { identifier: Identifier::Primary { index: successor_index, }, From 8a630a84ac1e232db1fc8395bcbead0d59c52e4b Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 10:49:15 +0200 Subject: [PATCH 25/31] Automatically convert sequences to upper case. 
--- tsalign/src/align.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tsalign/src/align.rs b/tsalign/src/align.rs index c1c10c8..1bda7dc 100644 --- a/tsalign/src/align.rs +++ b/tsalign/src/align.rs @@ -286,6 +286,10 @@ fn execute_with_alphabet( .sequence_handle .retain(|c| !skip_characters.contains(&c)); + // Convert sequences to upper case. + reference_record.sequence_handle.make_ascii_uppercase(); + query_record.sequence_handle.make_ascii_uppercase(); + // Parse RQ ranges. let range = if cli.use_embedded_rq_ranges { ensure!( From 120b144ca5f70fb8603af3475786dac8f14b0e54 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 11:03:51 +0200 Subject: [PATCH 26/31] Fix anti-primary gap computation in chainalign. Closes #110 --- lib_ts_chainalign/src/chain_align.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index afa75e7..2219b88 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -448,7 +448,7 @@ pub fn align( let mut tsalign_alignment = lib_tsalign::a_star_aligner::alignment_result::alignment::Alignment::new(); let mut is_primary = true; - let mut anti_primary_gap = 0; + let mut anti_primary_gap = 0isize; for alignment in alignments { use lib_tsalign::a_star_aligner::template_switch_distance::AlignmentType as TsAlignAlignmentType; @@ -460,7 +460,7 @@ pub fn align( if is_primary { TsAlignAlignmentType::PrimaryMatch } else { - anti_primary_gap -= 1; + anti_primary_gap -= multiplicity as isize; TsAlignAlignmentType::SecondaryMatch }, ), @@ -469,7 +469,7 @@ pub fn align( if is_primary { TsAlignAlignmentType::PrimarySubstitution } else { - anti_primary_gap -= 1; + anti_primary_gap -= multiplicity as isize; TsAlignAlignmentType::SecondarySubstitution }, ), @@ -486,7 +486,7 @@ pub fn align( if is_primary { TsAlignAlignmentType::PrimaryDeletion } else { - anti_primary_gap -= 1; + anti_primary_gap -= 
multiplicity as isize; TsAlignAlignmentType::SecondaryDeletion }, ), From 6bf5e96be15a0bd9ef53bfbaa74ce1c90ad93375 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 12:52:50 +0200 Subject: [PATCH 27/31] Evaluate each chained pair only once. --- Cargo.lock | 1 + lib_ts_chainalign/Cargo.toml | 1 + lib_ts_chainalign/src/chain_align.rs | 554 ++++++++++-------- .../src/chaining_cost_function.rs | 363 ++++++++++-- .../src/chaining_cost_function/cost_array.rs | 109 ++++ 5 files changed, 733 insertions(+), 295 deletions(-) create mode 100644 lib_ts_chainalign/src/chaining_cost_function/cost_array.rs diff --git a/Cargo.lock b/Cargo.lock index 82c5b32..1d72299 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -538,6 +538,7 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" name = "lib_ts_chainalign" version = "0.1.0" dependencies = [ + "bitvec", "compact-genome", "generic_a_star", "indicatif", diff --git a/lib_ts_chainalign/Cargo.toml b/lib_ts_chainalign/Cargo.toml index 41e2d07..0b2f6b1 100644 --- a/lib_ts_chainalign/Cargo.toml +++ b/lib_ts_chainalign/Cargo.toml @@ -18,3 +18,4 @@ compact-genome.workspace = true itertools = "0.14.0" log.workspace = true indicatif = "0.18.3" +bitvec = "1.0.1" diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index 2219b88..0f9cc65 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -52,7 +52,6 @@ pub fn align( let mut chaining_duration = Duration::default(); let mut evaluation_duration = Duration::default(); - let k = usize::try_from(max_match_run + 1).unwrap(); let context = Context::new(anchors, chaining_cost_function); let mut astar = AStar::new(context); let mut chaining_execution_count = 0; @@ -62,7 +61,7 @@ pub fn align( let mut total_chaining_suboptimal_opened_nodes = 0; let mut total_chaining_closed_nodes = 0; - let (alignments, result) = loop { + let (chain, result) = loop { progress_bar.inc(1); 
progress_bar.set_message(format!( "Computing chain number {} (cost {}->{})", @@ -126,15 +125,182 @@ pub fn align( let evaluation_start_time = Instant::now(); - let mut cost_increased = false; - current_upper_bound = Cost::zero(); - let mut alignments = Vec::new(); - 'update_chain: for window in chain.windows(2) { - let from_anchor = window[0]; - let to_anchor = window[1]; + let (evaluated_cost, _) = evaluate_chain( + anchors, + &chain, + sequences, + start, + end, + alignment_costs, + rc_fn, + max_match_run, + astar.context_mut().chaining_cost_function, + false, + ); + let cost_increased = evaluated_cost > current_lower_bound; + current_upper_bound = evaluated_cost; + + let evaluation_end_time = Instant::now(); + evaluation_duration += evaluation_end_time - evaluation_start_time; + + if !cost_increased { + break (chain, result); + } + }; + + progress_bar.finish_and_clear(); + debug!("Computed {chaining_execution_count} chains"); + debug!("Chaining took {:.1}s", chaining_duration.as_secs_f64()); + debug!("Evaluation took {:.1}s", evaluation_duration.as_secs_f64()); + debug!("Chaining opened nodes: {total_chaining_opened_nodes}"); + debug!( + "Chaining suboptimal openend nodes: {} ({:.0}%)", + total_chaining_suboptimal_opened_nodes, + total_chaining_suboptimal_opened_nodes as f64 / total_chaining_opened_nodes as f64 * 100.0, + ); + debug!("Chaining closed nodes: {total_chaining_closed_nodes}"); + + let mut tsalign_alignment = + lib_tsalign::a_star_aligner::alignment_result::alignment::Alignment::new(); + let mut is_primary = true; + let mut anti_primary_gap = 0isize; + + debug!("Evaluating final chain"); + let (evaluated_cost, alignments) = evaluate_chain( + anchors, + &chain, + sequences, + start, + end, + alignment_costs, + rc_fn, + max_match_run, + astar.context_mut().chaining_cost_function, + true, + ); + assert_eq!(evaluated_cost, result.cost()); + + for alignment in alignments { + use lib_tsalign::a_star_aligner::template_switch_distance::AlignmentType as 
TsAlignAlignmentType; + + for (multiplicity, alignment_type) in alignment.alignment { + match alignment_type { + AlignmentType::Match => tsalign_alignment.push_n( + multiplicity, + if is_primary { + TsAlignAlignmentType::PrimaryMatch + } else { + anti_primary_gap -= multiplicity as isize; + TsAlignAlignmentType::SecondaryMatch + }, + ), + AlignmentType::Substitution => tsalign_alignment.push_n( + multiplicity, + if is_primary { + TsAlignAlignmentType::PrimarySubstitution + } else { + anti_primary_gap -= multiplicity as isize; + TsAlignAlignmentType::SecondarySubstitution + }, + ), + AlignmentType::GapA => tsalign_alignment.push_n( + multiplicity, + if is_primary { + TsAlignAlignmentType::PrimaryInsertion + } else { + TsAlignAlignmentType::SecondaryInsertion + }, + ), + AlignmentType::GapB => tsalign_alignment.push_n( + multiplicity, + if is_primary { + TsAlignAlignmentType::PrimaryDeletion + } else { + anti_primary_gap -= multiplicity as isize; + TsAlignAlignmentType::SecondaryDeletion + }, + ), + AlignmentType::TsStart { jump, ts_kind } => { + assert!(is_primary); + assert_eq!(multiplicity, 1); + is_primary = false; + anti_primary_gap = jump; + + tsalign_alignment.push_n( + multiplicity, + TsAlignAlignmentType::TemplateSwitchEntrance { + first_offset: jump, + equal_cost_range: EqualCostRange::new_invalid(), + primary: ts_kind.descendant.into_tsalign_primary(), + secondary: ts_kind.ancestor.into_tsalign_secondary(), + direction: TemplateSwitchDirection::Reverse, + }, + ); + } + AlignmentType::TsEnd { jump } => { + assert!(!is_primary); + assert_eq!(multiplicity, 1); + is_primary = true; + + tsalign_alignment.push_n( + multiplicity, + TsAlignAlignmentType::TemplateSwitchExit { + anti_primary_gap: anti_primary_gap + jump, + }, + ) + } + } + } + } + + let end_time = Instant::now(); + let duration_seconds = (end_time - start_time).as_secs_f64(); - match (from_anchor, to_anchor) { - (Identifier::Start, Identifier::End) => { + AlignmentResult::new_with_target::( + 
tsalign_alignment.into_inner(), + VectorGenome::from_slice_u8(sequences.seq1()) + .unwrap() + .as_genome_subsequence(), + VectorGenome::from_slice_u8(sequences.seq2()) + .unwrap() + .as_genome_subsequence(), + sequences.seq1_name(), + sequences.seq2_name(), + start.primary_ordinate_a().unwrap(), + start.primary_ordinate_b().unwrap(), + result.without_node_identifier(), + duration_seconds, + 0, + 0, + 0, + sequences.seq1().len(), + sequences.seq2().len(), + ) +} + +#[expect(clippy::too_many_arguments)] +fn evaluate_chain( + anchors: &Anchors, + chain: &[Identifier], + sequences: &AlignmentSequences, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + alignment_costs: &AlignmentCosts, + rc_fn: &dyn Fn(u8) -> u8, + max_match_run: u32, + chaining_cost_function: &mut ChainingCostFunction, + complete_evaluation: bool, +) -> (Cost, Vec) { + let k = usize::try_from(max_match_run + 1).unwrap(); + let mut current_upper_bound = Cost::zero(); + let mut alignments = Vec::new(); + for window in chain.windows(2) { + let from_anchor = window[0]; + let to_anchor = window[1]; + + match (from_anchor, to_anchor) { + (Identifier::Start, Identifier::End) => { + if complete_evaluation || !chaining_cost_function.is_start_to_end_exact() { let alignment = GapAffineAlignment::new( start, end, @@ -144,16 +310,15 @@ pub fn align( max_match_run, ); trace!("Aligning from start to end costs {}", alignment.cost()); - cost_increased = astar - .context_mut() - .chaining_cost_function - .update_start_to_end(alignment.cost()) - || cost_increased; - current_upper_bound += astar.context().chaining_cost_function.start_to_end(); + chaining_cost_function.update_start_to_end(alignment.cost(), true); alignments.push(alignment.alignment().clone()); } - (Identifier::Start, Identifier::Primary { index }) => { - let end = anchors.primary[index].start(); + current_upper_bound += chaining_cost_function.start_to_end(); + } + (Identifier::Start, Identifier::Primary { index }) => { + let end = 
anchors.primary[index].start(); + if complete_evaluation || !chaining_cost_function.is_primary_from_start_exact(index) + { let alignment = GapAffineAlignment::new( start, end, @@ -167,19 +332,16 @@ pub fn align( anchors.primary[index], alignment.cost() ); - cost_increased = astar - .context_mut() - .chaining_cost_function - .update_primary_from_start(index, alignment.cost()) - || cost_increased; - current_upper_bound += astar - .context() - .chaining_cost_function - .primary_from_start(index); + chaining_cost_function.update_primary_from_start(index, alignment.cost(), true); alignments.push(alignment.alignment().clone()); } - (Identifier::Start, Identifier::Secondary { index, ts_kind }) => { - let end = anchors.secondary(ts_kind)[index].start(ts_kind); + current_upper_bound += chaining_cost_function.primary_from_start(index); + } + (Identifier::Start, Identifier::Secondary { index, ts_kind }) => { + let end = anchors.secondary(ts_kind)[index].start(ts_kind); + if complete_evaluation + || !chaining_cost_function.is_jump_12_from_start_exact(index, ts_kind) + { let alignment = Ts12JumpAlignment::new( start, end, @@ -194,19 +356,19 @@ pub fn align( anchors.secondary(ts_kind)[index], alignment.cost() ); - cost_increased = astar - .context_mut() - .chaining_cost_function - .update_jump_12_from_start(index, ts_kind, alignment.cost()) - || cost_increased; - current_upper_bound += astar - .context() - .chaining_cost_function - .jump_12_from_start(index, ts_kind); + chaining_cost_function.update_jump_12_from_start( + index, + ts_kind, + alignment.cost(), + true, + ); alignments.push(alignment.alignment().clone()); } - (Identifier::Primary { index }, Identifier::End) => { - let start = anchors.primary[index].end(k); + current_upper_bound += chaining_cost_function.jump_12_from_start(index, ts_kind); + } + (Identifier::Primary { index }, Identifier::End) => { + let start = anchors.primary[index].end(k); + if complete_evaluation || 
!chaining_cost_function.is_primary_to_end_exact(index) { let alignment = GapAffineAlignment::new( start, end, @@ -220,18 +382,17 @@ pub fn align( anchors.primary[index], alignment.cost() ); - cost_increased = astar - .context_mut() - .chaining_cost_function - .update_primary_to_end(index, alignment.cost()) - || cost_increased; - current_upper_bound += - astar.context().chaining_cost_function.primary_to_end(index); + chaining_cost_function.update_primary_to_end(index, alignment.cost(), true); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } - (Identifier::Secondary { index, ts_kind }, Identifier::End) => { - let start = anchors.secondary(ts_kind)[index].end(ts_kind, k); + current_upper_bound += chaining_cost_function.primary_to_end(index); + } + (Identifier::Secondary { index, ts_kind }, Identifier::End) => { + let start = anchors.secondary(ts_kind)[index].end(ts_kind, k); + if complete_evaluation + || !chaining_cost_function.is_jump_34_to_end_exact(index, ts_kind) + { let alignment = Ts34JumpAlignment::new( start, end, @@ -246,31 +407,32 @@ pub fn align( anchors.secondary(ts_kind)[index], alignment.cost() ); - cost_increased = astar - .context_mut() - .chaining_cost_function - .update_jump_34_to_end(index, ts_kind, alignment.cost()) - || cost_increased; - current_upper_bound += astar - .context() - .chaining_cost_function - .jump_34_to_end(index, ts_kind); + chaining_cost_function.update_jump_34_to_end( + index, + ts_kind, + alignment.cost(), + true, + ); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } - ( - Identifier::Primary { index: from_index }, - Identifier::Primary { index: to_index }, - ) => { - if anchors.primary[from_index] - .is_direct_predecessor_of(&anchors.primary[to_index]) - { - alignments.push(Alignment::from(vec![AlignmentType::Match])); - continue 'update_chain; - } + current_upper_bound += 
chaining_cost_function.jump_34_to_end(index, ts_kind); + } + ( + Identifier::Primary { index: from_index }, + Identifier::Primary { index: to_index }, + ) => { + if anchors.primary[from_index].is_direct_predecessor_of(&anchors.primary[to_index]) + { + alignments.push(Alignment::from(vec![AlignmentType::Match])); + continue; + } - let start = anchors.primary[from_index].end(k); - let end = anchors.primary[to_index].start(); + let start = anchors.primary[from_index].end(k); + let end = anchors.primary[to_index].start(); + if complete_evaluation + || !chaining_cost_function.is_primary_exact(from_index, to_index) + { let alignment = GapAffineAlignment::new( start, end, @@ -285,27 +447,29 @@ pub fn align( anchors.primary[to_index], alignment.cost() ); - cost_increased = astar.context_mut().chaining_cost_function.update_primary( + chaining_cost_function.update_primary( from_index, to_index, alignment.cost(), - ) || cost_increased; - current_upper_bound += astar - .context() - .chaining_cost_function - .primary(from_index, to_index); + true, + ); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } - ( - Identifier::Primary { index: from_index }, - Identifier::Secondary { - index: to_index, - ts_kind, - }, - ) => { - let start = anchors.primary[from_index].end(k); - let end = anchors.secondary(ts_kind)[to_index].start(ts_kind); + current_upper_bound += chaining_cost_function.primary(from_index, to_index); + } + ( + Identifier::Primary { index: from_index }, + Identifier::Secondary { + index: to_index, + ts_kind, + }, + ) => { + let start = anchors.primary[from_index].end(k); + let end = anchors.secondary(ts_kind)[to_index].start(ts_kind); + if complete_evaluation + || !chaining_cost_function.is_jump_12_exact(from_index, to_index, ts_kind) + { let alignment = Ts12JumpAlignment::new( start, end, @@ -314,12 +478,13 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = 
astar.context_mut().chaining_cost_function.update_jump_12( + chaining_cost_function.update_jump_12( from_index, to_index, ts_kind, alignment.cost(), - ) || cost_increased; + true, + ); trace!( "Aligning from P{from_index}{} to S{}[{to_index}]{} costs {}", anchors.primary[from_index], @@ -327,32 +492,34 @@ pub fn align( anchors.secondary(ts_kind)[to_index], alignment.cost() ); - current_upper_bound += astar - .context() - .chaining_cost_function - .jump_12(from_index, to_index, ts_kind); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } - ( - Identifier::Secondary { - index: from_index, - ts_kind, - }, - Identifier::Secondary { - index: to_index, - ts_kind: to_ts_kind, - }, - ) => { - assert_eq!(ts_kind, to_ts_kind); - if anchors.secondary(ts_kind)[from_index] - .is_direct_predecessor_of(&anchors.secondary(ts_kind)[to_index]) - { - alignments.push(Alignment::from(vec![AlignmentType::Match])); - continue 'update_chain; - } - let start = anchors.secondary(ts_kind)[from_index].end(ts_kind, k); - let end = anchors.secondary(ts_kind)[to_index].start(ts_kind); + current_upper_bound += + chaining_cost_function.jump_12(from_index, to_index, ts_kind); + } + ( + Identifier::Secondary { + index: from_index, + ts_kind, + }, + Identifier::Secondary { + index: to_index, + ts_kind: to_ts_kind, + }, + ) => { + assert_eq!(ts_kind, to_ts_kind); + if anchors.secondary(ts_kind)[from_index] + .is_direct_predecessor_of(&anchors.secondary(ts_kind)[to_index]) + { + alignments.push(Alignment::from(vec![AlignmentType::Match])); + continue; + } + let start = anchors.secondary(ts_kind)[from_index].end(ts_kind, k); + let end = anchors.secondary(ts_kind)[to_index].start(ts_kind); + if complete_evaluation + || !chaining_cost_function.is_secondary_exact(from_index, to_index, ts_kind) + { let alignment = GapAffineAlignment::new( start, end, @@ -369,28 +536,31 @@ pub fn align( anchors.secondary(ts_kind)[to_index], alignment.cost() 
); - cost_increased = astar.context_mut().chaining_cost_function.update_secondary( + chaining_cost_function.update_secondary( from_index, to_index, ts_kind, alignment.cost(), - ) || cost_increased; - current_upper_bound += astar - .context() - .chaining_cost_function - .secondary(from_index, to_index, ts_kind); + true, + ); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } - ( - Identifier::Secondary { - index: from_index, - ts_kind, - }, - Identifier::Primary { index: to_index }, - ) => { - let start = anchors.secondary(ts_kind)[from_index].end(ts_kind, k); - let end = anchors.primary[to_index].start(); + current_upper_bound += + chaining_cost_function.secondary(from_index, to_index, ts_kind); + } + ( + Identifier::Secondary { + index: from_index, + ts_kind, + }, + Identifier::Primary { index: to_index }, + ) => { + let start = anchors.secondary(ts_kind)[from_index].end(ts_kind, k); + let end = anchors.primary[to_index].start(); + if complete_evaluation + || !chaining_cost_function.is_jump_34_exact(from_index, to_index, ts_kind) + { let alignment = Ts34JumpAlignment::new( start, end, @@ -399,12 +569,13 @@ pub fn align( rc_fn, max_match_run, ); - cost_increased = astar.context_mut().chaining_cost_function.update_jump_34( + chaining_cost_function.update_jump_34( from_index, to_index, ts_kind, alignment.cost(), - ) || cost_increased; + true, + ); trace!( "Aligning from S{}[{from_index}]{} to P{to_index}{} (S{} to P{}) costs {}", ts_kind.digits(), @@ -414,136 +585,15 @@ pub fn align( end, alignment.cost() ); - current_upper_bound += astar - .context() - .chaining_cost_function - .jump_34(from_index, to_index, ts_kind); alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } - (Identifier::End, _) | (_, Identifier::Start) => unreachable!(), - } - } - - let evaluation_end_time = Instant::now(); - evaluation_duration += evaluation_end_time - 
evaluation_start_time; - - if !cost_increased { - break (alignments, result); - } - }; - - progress_bar.finish_and_clear(); - debug!("Computed {chaining_execution_count} chains"); - debug!("Chaining took {:.1}s", chaining_duration.as_secs_f64()); - debug!("Evaluation took {:.1}s", evaluation_duration.as_secs_f64()); - debug!("Chaining opened nodes: {total_chaining_opened_nodes}"); - debug!( - "Chaining suboptimal openend nodes: {} ({:.0}%)", - total_chaining_suboptimal_opened_nodes, - total_chaining_suboptimal_opened_nodes as f64 / total_chaining_opened_nodes as f64 * 100.0, - ); - debug!("Chaining closed nodes: {total_chaining_closed_nodes}"); - - let mut tsalign_alignment = - lib_tsalign::a_star_aligner::alignment_result::alignment::Alignment::new(); - let mut is_primary = true; - let mut anti_primary_gap = 0isize; - - for alignment in alignments { - use lib_tsalign::a_star_aligner::template_switch_distance::AlignmentType as TsAlignAlignmentType; - - for (multiplicity, alignment_type) in alignment.alignment { - match alignment_type { - AlignmentType::Match => tsalign_alignment.push_n( - multiplicity, - if is_primary { - TsAlignAlignmentType::PrimaryMatch - } else { - anti_primary_gap -= multiplicity as isize; - TsAlignAlignmentType::SecondaryMatch - }, - ), - AlignmentType::Substitution => tsalign_alignment.push_n( - multiplicity, - if is_primary { - TsAlignAlignmentType::PrimarySubstitution - } else { - anti_primary_gap -= multiplicity as isize; - TsAlignAlignmentType::SecondarySubstitution - }, - ), - AlignmentType::GapA => tsalign_alignment.push_n( - multiplicity, - if is_primary { - TsAlignAlignmentType::PrimaryInsertion - } else { - TsAlignAlignmentType::SecondaryInsertion - }, - ), - AlignmentType::GapB => tsalign_alignment.push_n( - multiplicity, - if is_primary { - TsAlignAlignmentType::PrimaryDeletion - } else { - anti_primary_gap -= multiplicity as isize; - TsAlignAlignmentType::SecondaryDeletion - }, - ), - AlignmentType::TsStart { jump, ts_kind } => { 
- assert!(is_primary); - assert_eq!(multiplicity, 1); - is_primary = false; - anti_primary_gap = jump; - - tsalign_alignment.push_n( - multiplicity, - TsAlignAlignmentType::TemplateSwitchEntrance { - first_offset: jump, - equal_cost_range: EqualCostRange::new_invalid(), - primary: ts_kind.descendant.into_tsalign_primary(), - secondary: ts_kind.ancestor.into_tsalign_secondary(), - direction: TemplateSwitchDirection::Reverse, - }, - ); - } - AlignmentType::TsEnd { jump } => { - assert!(!is_primary); - assert_eq!(multiplicity, 1); - is_primary = true; - - tsalign_alignment.push_n( - multiplicity, - TsAlignAlignmentType::TemplateSwitchExit { - anti_primary_gap: anti_primary_gap + jump, - }, - ) - } + current_upper_bound += + chaining_cost_function.jump_34(from_index, to_index, ts_kind); } + (Identifier::End, _) | (_, Identifier::Start) => unreachable!(), } } - let end_time = Instant::now(); - let duration_seconds = (end_time - start_time).as_secs_f64(); - - AlignmentResult::new_with_target::( - tsalign_alignment.into_inner(), - VectorGenome::from_slice_u8(sequences.seq1()) - .unwrap() - .as_genome_subsequence(), - VectorGenome::from_slice_u8(sequences.seq2()) - .unwrap() - .as_genome_subsequence(), - sequences.seq1_name(), - sequences.seq2_name(), - start.primary_ordinate_a().unwrap(), - start.primary_ordinate_b().unwrap(), - result.without_node_identifier(), - duration_seconds, - 0, - 0, - 0, - sequences.seq1().len(), - sequences.seq2().len(), - ) + (current_upper_bound, alignments) } diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs index dfa4eed..da71c0c 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -1,5 +1,4 @@ use generic_a_star::cost::AStarCost; -use ndarray::{Array, Array2}; use crate::{ alignment::{ @@ -7,23 +6,26 @@ use crate::{ ts_kind::{TsAncestor, TsDescendant, TsKind}, }, anchors::Anchors, + 
chaining_cost_function::cost_array::CostArray2D, chaining_lower_bounds::ChainingLowerBounds, }; +mod cost_array; + pub struct ChainingCostFunction { - primary: Array2, - secondary_11: Array2, - secondary_12: Array2, - secondary_21: Array2, - secondary_22: Array2, - jump_12_to_11: Array2, - jump_12_to_12: Array2, - jump_12_to_21: Array2, - jump_12_to_22: Array2, - jump_34_from_11: Array2, - jump_34_from_12: Array2, - jump_34_from_21: Array2, - jump_34_from_22: Array2, + primary: CostArray2D, + secondary_11: CostArray2D, + secondary_12: CostArray2D, + secondary_21: CostArray2D, + secondary_22: CostArray2D, + jump_12_to_11: CostArray2D, + jump_12_to_12: CostArray2D, + jump_12_to_21: CostArray2D, + jump_12_to_22: CostArray2D, + jump_34_from_11: CostArray2D, + jump_34_from_12: CostArray2D, + jump_34_from_21: CostArray2D, + jump_34_from_22: CostArray2D, } impl ChainingCostFunction { @@ -35,8 +37,8 @@ impl ChainingCostFunction { ) -> Self { let k = usize::try_from(chaining_lower_bounds.max_match_run() + 1).unwrap(); - let mut primary = Array2::from_elem( - (anchors.primary.len() + 2, anchors.primary.len() + 2), + let mut primary = CostArray2D::new_from_cost( + [anchors.primary.len() + 2, anchors.primary.len() + 2], Cost::max_value(), ); let gap1 = end.primary_ordinate_a().unwrap() - start.primary_ordinate_a().unwrap(); @@ -63,8 +65,8 @@ impl ChainingCostFunction { } } - let mut secondary_11 = Array2::from_elem( - (anchors.secondary_11.len(), anchors.secondary_11.len()), + let mut secondary_11 = CostArray2D::new_from_cost( + [anchors.secondary_11.len(), anchors.secondary_11.len()], Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_11.iter().enumerate() { @@ -79,8 +81,8 @@ impl ChainingCostFunction { } } - let mut secondary_12 = Array2::from_elem( - (anchors.secondary_12.len(), anchors.secondary_12.len()), + let mut secondary_12 = CostArray2D::new_from_cost( + [anchors.secondary_12.len(), anchors.secondary_12.len()], Cost::max_value(), ); for 
(from_index, from_anchor) in anchors.secondary_12.iter().enumerate() { @@ -95,8 +97,8 @@ impl ChainingCostFunction { } } - let mut secondary_21 = Array2::from_elem( - (anchors.secondary_21.len(), anchors.secondary_21.len()), + let mut secondary_21 = CostArray2D::new_from_cost( + [anchors.secondary_21.len(), anchors.secondary_21.len()], Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_21.iter().enumerate() { @@ -111,8 +113,8 @@ impl ChainingCostFunction { } } - let mut secondary_22 = Array2::from_elem( - (anchors.secondary_22.len(), anchors.secondary_22.len()), + let mut secondary_22 = CostArray2D::new_from_cost( + [anchors.secondary_22.len(), anchors.secondary_22.len()], Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_22.iter().enumerate() { @@ -127,8 +129,8 @@ impl ChainingCostFunction { } } - let mut jump_12_to_11 = Array::from_elem( - (anchors.primary.len() + 2, anchors.secondary_11.len()), + let mut jump_12_to_11 = CostArray2D::new_from_cost( + [anchors.primary.len() + 2, anchors.secondary_11.len()], Cost::max_value(), ); for (from_index, from_anchor) in anchors.primary.iter().enumerate() { @@ -141,8 +143,8 @@ impl ChainingCostFunction { } } - let mut jump_12_to_12 = Array::from_elem( - (anchors.primary.len() + 2, anchors.secondary_12.len()), + let mut jump_12_to_12 = CostArray2D::new_from_cost( + [anchors.primary.len() + 2, anchors.secondary_12.len()], Cost::max_value(), ); for (from_index, from_anchor) in anchors.primary.iter().enumerate() { @@ -155,8 +157,8 @@ impl ChainingCostFunction { } } - let mut jump_12_to_21 = Array::from_elem( - (anchors.primary.len() + 2, anchors.secondary_21.len()), + let mut jump_12_to_21 = CostArray2D::new_from_cost( + [anchors.primary.len() + 2, anchors.secondary_21.len()], Cost::max_value(), ); for (from_index, from_anchor) in anchors.primary.iter().enumerate() { @@ -169,8 +171,8 @@ impl ChainingCostFunction { } } - let mut jump_12_to_22 = Array::from_elem( - 
(anchors.primary.len() + 2, anchors.secondary_22.len()), + let mut jump_12_to_22 = CostArray2D::new_from_cost( + [anchors.primary.len() + 2, anchors.secondary_22.len()], Cost::max_value(), ); for (from_index, from_anchor) in anchors.primary.iter().enumerate() { @@ -183,8 +185,8 @@ impl ChainingCostFunction { } } - let mut jump_34_from_11 = Array::from_elem( - (anchors.secondary_11.len(), anchors.primary.len() + 2), + let mut jump_34_from_11 = CostArray2D::new_from_cost( + [anchors.secondary_11.len(), anchors.primary.len() + 2], Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_11.iter().enumerate() { @@ -197,8 +199,8 @@ impl ChainingCostFunction { } } - let mut jump_34_from_12 = Array::from_elem( - (anchors.secondary_12.len(), anchors.primary.len() + 2), + let mut jump_34_from_12 = CostArray2D::new_from_cost( + [anchors.secondary_12.len(), anchors.primary.len() + 2], Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_12.iter().enumerate() { @@ -211,8 +213,8 @@ impl ChainingCostFunction { } } - let mut jump_34_from_21 = Array::from_elem( - (anchors.secondary_21.len(), anchors.primary.len() + 2), + let mut jump_34_from_21 = CostArray2D::new_from_cost( + [anchors.secondary_21.len(), anchors.primary.len() + 2], Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_21.iter().enumerate() { @@ -225,8 +227,8 @@ impl ChainingCostFunction { } } - let mut jump_34_from_22 = Array::from_elem( - (anchors.secondary_22.len(), anchors.primary.len() + 2), + let mut jump_34_from_22 = CostArray2D::new_from_cost( + [anchors.secondary_22.len(), anchors.primary.len() + 2], Cost::max_value(), ); for (from_index, from_anchor) in anchors.secondary_22.iter().enumerate() { @@ -436,12 +438,187 @@ impl ChainingCostFunction { } } + pub fn is_primary_exact(&self, from_primary_index: usize, to_primary_index: usize) -> bool { + self.primary + .is_exact(from_primary_index + 1, to_primary_index + 1) + } + + pub fn 
is_primary_from_start_exact(&self, primary_index: usize) -> bool { + self.primary.is_exact(0, primary_index + 1) + } + + pub fn is_primary_to_end_exact(&self, primary_index: usize) -> bool { + self.primary + .is_exact(primary_index + 1, self.primary.dim().1 - 1) + } + + pub fn is_start_to_end_exact(&self) -> bool { + self.primary.is_exact(0, self.primary.dim().1 - 1) + } + + pub fn is_jump_12_to_11_exact( + &self, + from_primary_index: usize, + to_secondary_11_index: usize, + ) -> bool { + self.jump_12_to_11 + .is_exact(from_primary_index + 1, to_secondary_11_index) + } + + pub fn is_jump_12_to_12_exact( + &self, + from_primary_index: usize, + to_secondary_12_index: usize, + ) -> bool { + self.jump_12_to_12 + .is_exact(from_primary_index + 1, to_secondary_12_index) + } + + pub fn is_jump_12_to_21_exact( + &self, + from_primary_index: usize, + to_secondary_21_index: usize, + ) -> bool { + self.jump_12_to_21 + .is_exact(from_primary_index + 1, to_secondary_21_index) + } + + pub fn is_jump_12_to_22_exact( + &self, + from_primary_index: usize, + to_secondary_22_index: usize, + ) -> bool { + self.jump_12_to_22 + .is_exact(from_primary_index + 1, to_secondary_22_index) + } + + pub fn is_jump_12_exact( + &self, + from_primary_index: usize, + to_secondary_index: usize, + ts_kind: TsKind, + ) -> bool { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + self.is_jump_12_to_11_exact(from_primary_index, to_secondary_index) + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + self.is_jump_12_to_12_exact(from_primary_index, to_secondary_index) + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + self.is_jump_12_to_21_exact(from_primary_index, to_secondary_index) + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + self.is_jump_12_to_22_exact(from_primary_index, to_secondary_index) + } + } + } + + pub fn is_jump_12_to_11_from_start_exact(&self, to_secondary_11_index: usize) -> bool { + self.jump_12_to_11.is_exact(0, to_secondary_11_index) 
+ } + + pub fn is_jump_12_to_12_from_start_exact(&self, to_secondary_12_index: usize) -> bool { + self.jump_12_to_12.is_exact(0, to_secondary_12_index) + } + + pub fn is_jump_12_to_21_from_start_exact(&self, to_secondary_21_index: usize) -> bool { + self.jump_12_to_21.is_exact(0, to_secondary_21_index) + } + + pub fn is_jump_12_to_22_from_start_exact(&self, to_secondary_22_index: usize) -> bool { + self.jump_12_to_22.is_exact(0, to_secondary_22_index) + } + + pub fn is_jump_12_from_start_exact(&self, to_secondary_index: usize, ts_kind: TsKind) -> bool { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => { + self.is_jump_12_to_11_from_start_exact(to_secondary_index) + } + (TsAncestor::Seq1, TsDescendant::Seq2) => { + self.is_jump_12_to_12_from_start_exact(to_secondary_index) + } + (TsAncestor::Seq2, TsDescendant::Seq1) => { + self.is_jump_12_to_21_from_start_exact(to_secondary_index) + } + (TsAncestor::Seq2, TsDescendant::Seq2) => { + self.is_jump_12_to_22_from_start_exact(to_secondary_index) + } + } + } + + pub fn is_secondary_exact( + &self, + from_secondary_index: usize, + to_secondary_index: usize, + ts_kind: TsKind, + ) -> bool { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => self + .secondary_11 + .is_exact(from_secondary_index, to_secondary_index), + (TsAncestor::Seq1, TsDescendant::Seq2) => self + .secondary_12 + .is_exact(from_secondary_index, to_secondary_index), + (TsAncestor::Seq2, TsDescendant::Seq1) => self + .secondary_21 + .is_exact(from_secondary_index, to_secondary_index), + (TsAncestor::Seq2, TsDescendant::Seq2) => self + .secondary_22 + .is_exact(from_secondary_index, to_secondary_index), + } + } + + pub fn is_jump_34_exact( + &self, + from_secondary_index: usize, + to_primary_index: usize, + ts_kind: TsKind, + ) -> bool { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => self + .jump_34_from_11 + 
.is_exact(from_secondary_index, to_primary_index + 1), + (TsAncestor::Seq1, TsDescendant::Seq2) => self + .jump_34_from_12 + .is_exact(from_secondary_index, to_primary_index + 1), + (TsAncestor::Seq2, TsDescendant::Seq1) => self + .jump_34_from_21 + .is_exact(from_secondary_index, to_primary_index + 1), + (TsAncestor::Seq2, TsDescendant::Seq2) => self + .jump_34_from_22 + .is_exact(from_secondary_index, to_primary_index + 1), + } + } + + pub fn is_jump_34_to_end_exact(&self, from_secondary_index: usize, ts_kind: TsKind) -> bool { + match (ts_kind.ancestor, ts_kind.descendant) { + (TsAncestor::Seq1, TsDescendant::Seq1) => self + .jump_34_from_11 + .is_exact(from_secondary_index, self.jump_34_from_11.dim().1 - 1), + (TsAncestor::Seq1, TsDescendant::Seq2) => self + .jump_34_from_12 + .is_exact(from_secondary_index, self.jump_34_from_12.dim().1 - 1), + (TsAncestor::Seq2, TsDescendant::Seq1) => self + .jump_34_from_21 + .is_exact(from_secondary_index, self.jump_34_from_21.dim().1 - 1), + (TsAncestor::Seq2, TsDescendant::Seq2) => self + .jump_34_from_22 + .is_exact(from_secondary_index, self.jump_34_from_22.dim().1 - 1), + } + } + pub fn update_primary( &mut self, from_primary_index: usize, to_primary_index: usize, cost: Cost, + is_exact: bool, ) -> bool { + if is_exact { + self.primary + .set_exact(from_primary_index + 1, to_primary_index + 1); + } let target = &mut self.primary[[from_primary_index + 1, to_primary_index + 1]]; assert!(*target <= cost); let result = *target < cost; @@ -449,7 +626,15 @@ impl ChainingCostFunction { result } - pub fn update_primary_from_start(&mut self, primary_index: usize, cost: Cost) -> bool { + pub fn update_primary_from_start( + &mut self, + primary_index: usize, + cost: Cost, + is_exact: bool, + ) -> bool { + if is_exact { + self.primary.set_exact(0, primary_index + 1); + } let target = &mut self.primary[[0, primary_index + 1]]; assert!(*target <= cost); let result = *target < cost; @@ -457,8 +642,16 @@ impl ChainingCostFunction { 
result } - pub fn update_primary_to_end(&mut self, primary_index: usize, cost: Cost) -> bool { + pub fn update_primary_to_end( + &mut self, + primary_index: usize, + cost: Cost, + is_exact: bool, + ) -> bool { let end_index = self.primary.dim().1 - 1; + if is_exact { + self.primary.set_exact(primary_index + 1, end_index); + } let target = &mut self.primary[[primary_index + 1, end_index]]; assert!(*target <= cost); let result = *target < cost; @@ -466,8 +659,11 @@ impl ChainingCostFunction { result } - pub fn update_start_to_end(&mut self, cost: Cost) -> bool { + pub fn update_start_to_end(&mut self, cost: Cost, is_exact: bool) -> bool { let end_index = self.primary.dim().1 - 1; + if is_exact { + self.primary.set_exact(0, end_index); + } let target = &mut self.primary[[0, end_index]]; assert!(*target <= cost); let result = *target < cost; @@ -481,18 +677,35 @@ impl ChainingCostFunction { to_secondary_index: usize, ts_kind: TsKind, cost: Cost, + is_exact: bool, ) -> bool { let target = match (ts_kind.ancestor, ts_kind.descendant) { (TsAncestor::Seq1, TsDescendant::Seq1) => { + if is_exact { + self.jump_12_to_11 + .set_exact(from_primary_index + 1, to_secondary_index); + } &mut self.jump_12_to_11[[from_primary_index + 1, to_secondary_index]] } (TsAncestor::Seq1, TsDescendant::Seq2) => { + if is_exact { + self.jump_12_to_12 + .set_exact(from_primary_index + 1, to_secondary_index); + } &mut self.jump_12_to_12[[from_primary_index + 1, to_secondary_index]] } (TsAncestor::Seq2, TsDescendant::Seq1) => { + if is_exact { + self.jump_12_to_21 + .set_exact(from_primary_index + 1, to_secondary_index); + } &mut self.jump_12_to_21[[from_primary_index + 1, to_secondary_index]] } (TsAncestor::Seq2, TsDescendant::Seq2) => { + if is_exact { + self.jump_12_to_22 + .set_exact(from_primary_index + 1, to_secondary_index); + } &mut self.jump_12_to_22[[from_primary_index + 1, to_secondary_index]] } }; @@ -507,18 +720,31 @@ impl ChainingCostFunction { to_secondary_index: usize, ts_kind: 
TsKind, cost: Cost, + is_exact: bool, ) -> bool { let target = match (ts_kind.ancestor, ts_kind.descendant) { (TsAncestor::Seq1, TsDescendant::Seq1) => { + if is_exact { + self.jump_12_to_11.set_exact(0, to_secondary_index); + } &mut self.jump_12_to_11[[0, to_secondary_index]] } (TsAncestor::Seq1, TsDescendant::Seq2) => { + if is_exact { + self.jump_12_to_12.set_exact(0, to_secondary_index); + } &mut self.jump_12_to_12[[0, to_secondary_index]] } (TsAncestor::Seq2, TsDescendant::Seq1) => { + if is_exact { + self.jump_12_to_21.set_exact(0, to_secondary_index); + } &mut self.jump_12_to_21[[0, to_secondary_index]] } (TsAncestor::Seq2, TsDescendant::Seq2) => { + if is_exact { + self.jump_12_to_22.set_exact(0, to_secondary_index); + } &mut self.jump_12_to_22[[0, to_secondary_index]] } }; @@ -534,18 +760,35 @@ impl ChainingCostFunction { to_secondary_index: usize, ts_kind: TsKind, cost: Cost, + is_exact: bool, ) -> bool { let target = match (ts_kind.ancestor, ts_kind.descendant) { (TsAncestor::Seq1, TsDescendant::Seq1) => { + if is_exact { + self.jump_12_to_11 + .set_exact(from_secondary_index, to_secondary_index); + } &mut self.secondary_11[[from_secondary_index, to_secondary_index]] } (TsAncestor::Seq1, TsDescendant::Seq2) => { + if is_exact { + self.jump_12_to_12 + .set_exact(from_secondary_index, to_secondary_index); + } &mut self.secondary_12[[from_secondary_index, to_secondary_index]] } (TsAncestor::Seq2, TsDescendant::Seq1) => { + if is_exact { + self.jump_12_to_21 + .set_exact(from_secondary_index, to_secondary_index); + } &mut self.secondary_21[[from_secondary_index, to_secondary_index]] } (TsAncestor::Seq2, TsDescendant::Seq2) => { + if is_exact { + self.jump_12_to_22 + .set_exact(from_secondary_index, to_secondary_index); + } &mut self.secondary_22[[from_secondary_index, to_secondary_index]] } }; @@ -561,18 +804,35 @@ impl ChainingCostFunction { to_primary_index: usize, ts_kind: TsKind, cost: Cost, + is_exact: bool, ) -> bool { let target = match 
(ts_kind.ancestor, ts_kind.descendant) { (TsAncestor::Seq1, TsDescendant::Seq1) => { + if is_exact { + self.jump_12_to_11 + .set_exact(from_secondary_index, to_primary_index + 1); + } &mut self.jump_34_from_11[[from_secondary_index, to_primary_index + 1]] } (TsAncestor::Seq1, TsDescendant::Seq2) => { + if is_exact { + self.jump_12_to_12 + .set_exact(from_secondary_index, to_primary_index + 1); + } &mut self.jump_34_from_12[[from_secondary_index, to_primary_index + 1]] } (TsAncestor::Seq2, TsDescendant::Seq1) => { + if is_exact { + self.jump_12_to_21 + .set_exact(from_secondary_index, to_primary_index + 1); + } &mut self.jump_34_from_21[[from_secondary_index, to_primary_index + 1]] } (TsAncestor::Seq2, TsDescendant::Seq2) => { + if is_exact { + self.jump_12_to_22 + .set_exact(from_secondary_index, to_primary_index + 1); + } &mut self.jump_34_from_22[[from_secondary_index, to_primary_index + 1]] } }; @@ -587,22 +847,39 @@ impl ChainingCostFunction { from_secondary_index: usize, ts_kind: TsKind, cost: Cost, + is_exact: bool, ) -> bool { let target = match (ts_kind.ancestor, ts_kind.descendant) { (TsAncestor::Seq1, TsDescendant::Seq1) => { let end_index = self.jump_34_from_11.dim().1 - 1; + if is_exact { + self.jump_12_to_11 + .set_exact(from_secondary_index, end_index); + } &mut self.jump_34_from_11[[from_secondary_index, end_index]] } (TsAncestor::Seq1, TsDescendant::Seq2) => { let end_index = self.jump_34_from_12.dim().1 - 1; + if is_exact { + self.jump_12_to_12 + .set_exact(from_secondary_index, end_index); + } &mut self.jump_34_from_12[[from_secondary_index, end_index]] } (TsAncestor::Seq2, TsDescendant::Seq1) => { let end_index = self.jump_34_from_21.dim().1 - 1; + if is_exact { + self.jump_12_to_21 + .set_exact(from_secondary_index, end_index); + } &mut self.jump_34_from_21[[from_secondary_index, end_index]] } (TsAncestor::Seq2, TsDescendant::Seq2) => { let end_index = self.jump_34_from_22.dim().1 - 1; + if is_exact { + self.jump_12_to_22 + 
.set_exact(from_secondary_index, end_index); + } &mut self.jump_34_from_22[[from_secondary_index, end_index]] } }; diff --git a/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs b/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs new file mode 100644 index 0000000..afd597f --- /dev/null +++ b/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs @@ -0,0 +1,109 @@ +use std::{ + io::{Read, Write}, + mem, + ops::{Index, IndexMut}, + slice, +}; + +use bitvec::{bitvec, order::LocalBits, vec::BitVec}; + +pub struct CostArray2D { + len: [usize; 2], + data: Vec, + is_exact: BitVec, +} + +impl CostArray2D { + pub fn new_from_cost(len: [usize; 2], cost: Cost) -> Self + where + Cost: Clone, + { + Self { + len, + data: vec![cost; len[0] * len[1]], + is_exact: bitvec![usize, LocalBits; 0; len[0] * len[1]], + } + } + + pub fn is_exact(&self, c1: usize, c2: usize) -> bool { + self.is_exact[coordinates_to_index(c1, c2, self.len)] + } + + pub fn set_exact(&mut self, c1: usize, c2: usize) { + debug_assert!(!self.is_exact(c1, c2)); + self.is_exact + .set(coordinates_to_index(c1, c2, self.len), true); + } + + pub fn dim(&self) -> (usize, usize) { + (self.len[0], self.len[1]) + } + + #[expect(dead_code)] + pub fn write(&self, mut write: impl Write) -> std::io::Result<()> + where + Cost: Copy, + { + write.write_all(&self.len[0].to_ne_bytes())?; + write.write_all(&self.len[1].to_ne_bytes())?; + + let cost_size = mem::size_of::(); + let data: &[u8] = unsafe { + slice::from_raw_parts(self.data.as_ptr() as *const u8, self.data.len() * cost_size) + }; + write.write_all(data) + } + + #[expect(dead_code)] + pub fn read(mut read: impl Read) -> std::io::Result + where + Cost: Copy, + { + let mut buffer = [0; mem::size_of::()]; + read.read_exact(&mut buffer)?; + let len1 = usize::from_ne_bytes(buffer); + read.read_exact(&mut buffer)?; + let len2 = usize::from_ne_bytes(buffer); + let len = [len1, len2]; + + let cost_size = mem::size_of::(); + let data_len_bytes = cost_size * 
len1 * len2; + + let mut data = Vec::::with_capacity(data_len_bytes); + let mut data_bytes = unsafe { + Vec::from_raw_parts(data.as_mut_ptr() as *mut u8, 0, data.capacity() * cost_size) + }; + read.by_ref() + .take(data_len_bytes.try_into().unwrap()) + .read_to_end(&mut data_bytes)?; + unsafe { + data.set_len(len1 * len2); + }; + data_bytes.leak(); + + Ok(Self { + len, + data, + is_exact: bitvec![usize, LocalBits; 0; len[0] * len[1]], + }) + } +} + +impl Index<[usize; 2]> for CostArray2D { + type Output = Cost; + + fn index(&self, index: [usize; 2]) -> &Self::Output { + &self.data[coordinates_to_index(index[0], index[1], self.len)] + } +} + +impl IndexMut<[usize; 2]> for CostArray2D { + fn index_mut(&mut self, index: [usize; 2]) -> &mut Self::Output { + &mut self.data[coordinates_to_index(index[0], index[1], self.len)] + } +} + +#[inline] +fn coordinates_to_index(c1: usize, c2: usize, len: [usize; 2]) -> usize { + c1 * len[1] + c2 +} From d01f41611f63e22f4b16df3869d8b8aa7c723c06 Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 14:04:19 +0200 Subject: [PATCH 28/31] Custom I/O implementation for chaining lower bounds. 
--- Cargo.lock | 1 + lib_ts_chainalign/Cargo.toml | 1 + .../src/chaining_cost_function/cost_array.rs | 56 +-------- .../src/chaining_lower_bounds.rs | 53 ++++++++- .../src/chaining_lower_bounds/cost_array.rs | 111 ++++++++++++++++++ .../src/chaining_lower_bounds/gap_affine.rs | 39 ++++-- .../src/chaining_lower_bounds/ts_jump.rs | 40 +++++-- lib_ts_chainalign/src/costs.rs | 6 +- tsalign/src/align/a_star_chain_ts.rs | 14 +-- 9 files changed, 237 insertions(+), 84 deletions(-) create mode 100644 lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs diff --git a/Cargo.lock b/Cargo.lock index 1d72299..40e754d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -538,6 +538,7 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" name = "lib_ts_chainalign" version = "0.1.0" dependencies = [ + "bincode", "bitvec", "compact-genome", "generic_a_star", diff --git a/lib_ts_chainalign/Cargo.toml b/lib_ts_chainalign/Cargo.toml index 0b2f6b1..f48dd20 100644 --- a/lib_ts_chainalign/Cargo.toml +++ b/lib_ts_chainalign/Cargo.toml @@ -19,3 +19,4 @@ itertools = "0.14.0" log.workspace = true indicatif = "0.18.3" bitvec = "1.0.1" +bincode = { version = "2.0.1", features = ["serde"] } diff --git a/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs b/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs index afd597f..8189c0a 100644 --- a/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs +++ b/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs @@ -1,9 +1,4 @@ -use std::{ - io::{Read, Write}, - mem, - ops::{Index, IndexMut}, - slice, -}; +use std::ops::{Index, IndexMut}; use bitvec::{bitvec, order::LocalBits, vec::BitVec}; @@ -38,55 +33,6 @@ impl CostArray2D { pub fn dim(&self) -> (usize, usize) { (self.len[0], self.len[1]) } - - #[expect(dead_code)] - pub fn write(&self, mut write: impl Write) -> std::io::Result<()> - where - Cost: Copy, - { - write.write_all(&self.len[0].to_ne_bytes())?; - write.write_all(&self.len[1].to_ne_bytes())?; - 
- let cost_size = mem::size_of::(); - let data: &[u8] = unsafe { - slice::from_raw_parts(self.data.as_ptr() as *const u8, self.data.len() * cost_size) - }; - write.write_all(data) - } - - #[expect(dead_code)] - pub fn read(mut read: impl Read) -> std::io::Result - where - Cost: Copy, - { - let mut buffer = [0; mem::size_of::()]; - read.read_exact(&mut buffer)?; - let len1 = usize::from_ne_bytes(buffer); - read.read_exact(&mut buffer)?; - let len2 = usize::from_ne_bytes(buffer); - let len = [len1, len2]; - - let cost_size = mem::size_of::(); - let data_len_bytes = cost_size * len1 * len2; - - let mut data = Vec::::with_capacity(data_len_bytes); - let mut data_bytes = unsafe { - Vec::from_raw_parts(data.as_mut_ptr() as *mut u8, 0, data.capacity() * cost_size) - }; - read.by_ref() - .take(data_len_bytes.try_into().unwrap()) - .read_to_end(&mut data_bytes)?; - unsafe { - data.set_len(len1 * len2); - }; - data_bytes.leak(); - - Ok(Self { - len, - data, - is_exact: bitvec![usize, LocalBits; 0; len[0] * len[1]], - }) - } } impl Index<[usize; 2]> for CostArray2D { diff --git a/lib_ts_chainalign/src/chaining_lower_bounds.rs b/lib_ts_chainalign/src/chaining_lower_bounds.rs index 7c0bc64..06e603e 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds.rs @@ -1,17 +1,19 @@ //! Compute lower bounds for chaining anchors with gaps. 
+use std::io::{Read, Write}; + use generic_a_star::cost::AStarCost; -use serde::{Deserialize, Serialize}; +use serde::{Serialize, de::DeserializeOwned}; use crate::{ chaining_lower_bounds::{gap_affine::GapAffineLowerBounds, ts_jump::TsJumpLowerBounds}, costs::AlignmentCosts, }; +mod cost_array; pub mod gap_affine; pub mod ts_jump; -#[derive(Serialize, Deserialize)] pub struct ChainingLowerBounds { primary: GapAffineLowerBounds, secondary: GapAffineLowerBounds, @@ -44,6 +46,53 @@ impl ChainingLowerBounds { max_match_run, } } + + pub fn write(&self, mut write: impl Write) -> std::io::Result<()> + where + Cost: Copy + Serialize, + { + self.primary.write(&mut write)?; + self.secondary.write(&mut write)?; + self.jump.write(&mut write)?; + bincode::serde::encode_into_std_write( + &self.alignment_costs, + &mut write, + bincode::config::standard(), + ) + .map_err(|error| match error { + bincode::error::EncodeError::Io { inner, .. } => inner, + error => panic!("I/O error: {error}"), + })?; + write.write_all(&self.max_match_run.to_ne_bytes()) + } + + pub fn read(mut read: impl Read) -> std::io::Result + where + Cost: Copy + DeserializeOwned, + { + let primary = GapAffineLowerBounds::read(&mut read)?; + let secondary = GapAffineLowerBounds::read(&mut read)?; + let jump = TsJumpLowerBounds::read(&mut read)?; + let alignment_costs = + bincode::serde::decode_from_std_read(&mut read, bincode::config::standard()).map_err( + |error| match error { + bincode::error::DecodeError::Io { inner, .. 
} => inner, + error => panic!("I/O error: {error}"), + }, + )?; + + let mut buffer = [0; std::mem::size_of::()]; + read.read_exact(&mut buffer)?; + let max_match_run = u32::from_ne_bytes(buffer); + + Ok(Self { + primary, + secondary, + jump, + alignment_costs, + max_match_run, + }) + } } impl ChainingLowerBounds { diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs b/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs new file mode 100644 index 0000000..7e90063 --- /dev/null +++ b/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs @@ -0,0 +1,111 @@ +use std::{ + io::{Read, Write}, + mem, + ops::{Index, IndexMut}, + slice, +}; + +pub struct LowerBoundCostArray { + dim: [usize; DIMENSION], + data: Vec, +} + +impl LowerBoundCostArray { + pub fn new_from_cost(dim: [usize; DIMENSION], cost: Cost) -> Self + where + Cost: Clone, + { + Self { + dim, + data: vec![cost; dim[0] * dim[1]], + } + } + + pub fn write(&self, mut write: impl Write) -> std::io::Result<()> + where + Cost: Copy, + { + for dimension in self.dim { + write.write_all(&dimension.to_ne_bytes())?; + } + + let cost_size = mem::size_of::(); + let data: &[u8] = unsafe { + slice::from_raw_parts(self.data.as_ptr() as *const u8, self.data.len() * cost_size) + }; + write.write_all(data) + } + + pub fn read(mut read: impl Read) -> std::io::Result + where + Cost: Copy, + { + let mut buffer = [0; mem::size_of::()]; + let mut dim = [0usize; DIMENSION]; + for dimension in &mut dim { + read.read_exact(&mut buffer)?; + *dimension = usize::from_ne_bytes(buffer); + } + let dim = dim; + + let cost_size = mem::size_of::(); + let data_len = dim.into_iter().product(); + let data_bytes_len = cost_size * data_len; + + let mut data = Vec::::with_capacity(data_bytes_len); + let mut data_bytes = unsafe { + Vec::from_raw_parts(data.as_mut_ptr() as *mut u8, 0, data.capacity() * cost_size) + }; + read.by_ref() + .take(data_bytes_len.try_into().unwrap()) + .read_to_end(&mut data_bytes)?; + unsafe { + 
data.set_len(data_len); + }; + data_bytes.leak(); + + Ok(Self { dim, data }) + } +} + +impl Index<[usize; DIMENSION]> + for LowerBoundCostArray +{ + type Output = Cost; + + fn index(&self, index: [usize; DIMENSION]) -> &Self::Output { + &self.data[coordinates_to_index(index, self.dim)] + } +} + +impl IndexMut<[usize; DIMENSION]> + for LowerBoundCostArray +{ + fn index_mut(&mut self, index: [usize; DIMENSION]) -> &mut Self::Output { + &mut self.data[coordinates_to_index(index, self.dim)] + } +} + +impl FromIterator for LowerBoundCostArray<1, Cost> { + fn from_iter>(iter: T) -> Self { + let data: Vec<_> = iter.into_iter().collect(); + let dim = [data.len()]; + Self { dim, data } + } +} + +#[inline] +fn coordinates_to_index( + coordinates: [usize; DIMENSION], + dim: [usize; DIMENSION], +) -> usize { + let mut result = 0; + for (index, ordinate) in coordinates.into_iter().enumerate() { + let mut factor = 1; + for dimension in dim.into_iter().skip(index + 1) { + factor *= dimension; + } + result += factor * ordinate; + } + result +} diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs index 60ad212..a6370e0 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/gap_affine.rs @@ -1,17 +1,19 @@ +use std::io::{Read, Write}; + use generic_a_star::{AStar, AStarNode, cost::AStarCost}; -use ndarray::{Array1, Array2}; -use serde::{Deserialize, Serialize}; -use crate::{chaining_lower_bounds::gap_affine::algo::Context, costs::GapAffineCosts}; +use crate::{ + chaining_lower_bounds::{cost_array::LowerBoundCostArray, gap_affine::algo::Context}, + costs::GapAffineCosts, +}; mod algo; #[cfg(test)] mod tests; -#[derive(Serialize, Deserialize)] pub struct GapAffineLowerBounds { - lower_bounds: Array2, - variable_gap2_lower_bounds: Array1, + lower_bounds: LowerBoundCostArray<2, Cost>, + variable_gap2_lower_bounds: LowerBoundCostArray<1, Cost>, } 
impl GapAffineLowerBounds { @@ -33,7 +35,8 @@ impl GapAffineLowerBounds { cost_table: &GapAffineCosts, allow_all_match_run: bool, ) -> Self { - let mut lower_bounds = Array2::from_elem((max_n + 1, max_n + 1), Cost::max_value()); + let mut lower_bounds = + LowerBoundCostArray::new_from_cost([max_n + 1, max_n + 1], Cost::max_value()); lower_bounds[[0, 0]] = Cost::zero(); let context = Context::new(cost_table, max_match_run, max_n); let mut a_star = AStar::new(context); @@ -48,7 +51,7 @@ impl GapAffineLowerBounds { } false }); - let variable_gap2_lower_bounds = Array1::from_iter((0..=max_n).map(|gap1| { + let variable_gap2_lower_bounds = LowerBoundCostArray::from_iter((0..=max_n).map(|gap1| { (0..=max_n) .map(|gap2| lower_bounds[[gap1, gap2]]) .min() @@ -60,6 +63,26 @@ impl GapAffineLowerBounds { variable_gap2_lower_bounds, } } + + pub fn write(&self, mut write: impl Write) -> std::io::Result<()> + where + Cost: Copy, + { + self.lower_bounds.write(&mut write)?; + self.variable_gap2_lower_bounds.write(write) + } + + pub fn read(mut read: impl Read) -> std::io::Result + where + Cost: Copy, + { + let lower_bounds = LowerBoundCostArray::read(&mut read)?; + let variable_gap2_lower_bounds = LowerBoundCostArray::read(read)?; + Ok(Self { + lower_bounds, + variable_gap2_lower_bounds, + }) + } } impl GapAffineLowerBounds { diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs index 402d606..326a6b4 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/ts_jump.rs @@ -1,16 +1,18 @@ +use std::io::{Read, Write}; + use generic_a_star::cost::AStarCost; -use ndarray::Array1; -use serde::{Deserialize, Serialize}; -use crate::{chaining_lower_bounds::gap_affine::GapAffineLowerBounds, costs::AlignmentCosts}; +use crate::{ + chaining_lower_bounds::{cost_array::LowerBoundCostArray, gap_affine::GapAffineLowerBounds}, + costs::AlignmentCosts, +}; #[cfg(test)] 
mod tests; -#[derive(Serialize, Deserialize)] pub struct TsJumpLowerBounds { - lower_bounds_12: Array1, - lower_bounds_34: Array1, + lower_bounds_12: LowerBoundCostArray<1, Cost>, + lower_bounds_34: LowerBoundCostArray<1, Cost>, } impl TsJumpLowerBounds { @@ -28,7 +30,8 @@ impl TsJumpLowerBounds { // This way of calculating the lower bound for the 12-jump does not take the shape limits of the template switch into account. // However, most of the time these limits are gonna be big, so they should not have a big impact on the lower bound. - let mut lower_bounds_12 = Array1::from_elem(max_n + 1, Cost::max_value()); + let mut lower_bounds_12 = + LowerBoundCostArray::new_from_cost([max_n + 1], Cost::max_value()); for primary_descendant_gap in 0..=max_n { for secondary_descendant_gap in 0..=max_n - primary_descendant_gap { let lower_bound = primary_lower_bounds @@ -41,7 +44,8 @@ impl TsJumpLowerBounds { } } - let mut lower_bounds_34 = Array1::from_elem(max_n + 1, Cost::max_value()); + let mut lower_bounds_34 = + LowerBoundCostArray::new_from_cost([max_n + 1], Cost::max_value()); for secondary_descendant_gap in 0..=max_n { for primary_descendant_gap in 0..=max_n - secondary_descendant_gap { let lower_bound = secondary_lower_bounds @@ -58,6 +62,26 @@ impl TsJumpLowerBounds { lower_bounds_34, } } + + pub fn write(&self, mut write: impl Write) -> std::io::Result<()> + where + Cost: Copy, + { + self.lower_bounds_12.write(&mut write)?; + self.lower_bounds_34.write(write) + } + + pub fn read(mut read: impl Read) -> std::io::Result + where + Cost: Copy, + { + let lower_bounds_12 = LowerBoundCostArray::read(&mut read)?; + let lower_bounds_34 = LowerBoundCostArray::read(read)?; + Ok(Self { + lower_bounds_12, + lower_bounds_34, + }) + } } impl TsJumpLowerBounds { diff --git a/lib_ts_chainalign/src/costs.rs b/lib_ts_chainalign/src/costs.rs index edbfc8c..e9e744a 100644 --- a/lib_ts_chainalign/src/costs.rs +++ b/lib_ts_chainalign/src/costs.rs @@ -5,14 +5,14 @@ use 
serde::{Deserialize, Serialize}; mod compat; -#[derive(Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Eq, PartialEq)] pub struct GapAffineCosts { pub substitution: Cost, pub gap_open: Cost, pub gap_extend: Cost, } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Eq, PartialEq)] pub struct TsLimits { pub jump_12: Range, pub jump_34: Range, @@ -20,7 +20,7 @@ pub struct TsLimits { pub ancestor_gap: Range, } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Eq, PartialEq)] pub struct AlignmentCosts { pub primary_costs: GapAffineCosts, pub secondary_costs: GapAffineCosts, diff --git a/tsalign/src/align/a_star_chain_ts.rs b/tsalign/src/align/a_star_chain_ts.rs index 9f5b5c4..9620611 100644 --- a/tsalign/src/align/a_star_chain_ts.rs +++ b/tsalign/src/align/a_star_chain_ts.rs @@ -2,7 +2,7 @@ use compact_genome::interface::{ alphabet::{Alphabet, AlphabetCharacter}, sequence::GenomeSequence, }; -use lib_ts_chainalign::costs::AlignmentCosts; +use lib_ts_chainalign::{chaining_lower_bounds::ChainingLowerBounds, costs::AlignmentCosts}; use lib_tsalign::{ a_star_aligner::alignment_geometry::AlignmentRange, config::TemplateSwitchConfig, }; @@ -73,7 +73,10 @@ pub fn align_a_star_chain_ts< let chaining_lower_bounds = if let Ok(mut file) = File::open(&cache_file) { info!("Loading preprocessed data from cache at {cache_file:?}"); - bincode::serde::decode_from_std_read(&mut file, bincode::config::standard()).unwrap() + let chaining_lower_bounds = ChainingLowerBounds::read(&mut file).unwrap(); + assert_eq!(chaining_lower_bounds.alignment_costs(), &alignment_costs); + assert_eq!(chaining_lower_bounds.max_match_run(), max_match_run); + chaining_lower_bounds } else { info!("Preprocessing..."); let chaining_lower_bounds = @@ -81,12 +84,7 @@ pub fn align_a_star_chain_ts< info!("Storing preprocessed data into cache at {cache_file:?}"); let mut file = File::create(&cache_file).unwrap(); - 
bincode::serde::encode_into_std_write( - &chaining_lower_bounds, - &mut file, - bincode::config::standard(), - ) - .unwrap(); + chaining_lower_bounds.write(&mut file).unwrap(); chaining_lower_bounds }; From 70a3664ae87021a6b0f3dcef57e6490fccef441c Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 14:07:57 +0200 Subject: [PATCH 29/31] Use rust-analyzer from toolchain. --- rust-toolchain.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 69e34fe..6017514 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,3 @@ [toolchain] -channel = "1.85.1" \ No newline at end of file +channel = "1.85.1" +components = ["rust-analyzer"] From ee205c8dd608aa414ced411338831dd2688d526c Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 14:12:04 +0200 Subject: [PATCH 30/31] Fix LowerBoundCostArray constructor. --- lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs b/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs index 7e90063..82c6271 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs @@ -17,7 +17,7 @@ impl LowerBoundCostArray { { Self { dim, - data: vec![cost; dim[0] * dim[1]], + data: vec![cost; dim.into_iter().product()], } } @@ -101,11 +101,7 @@ fn coordinates_to_index( ) -> usize { let mut result = 0; for (index, ordinate) in coordinates.into_iter().enumerate() { - let mut factor = 1; - for dimension in dim.into_iter().skip(index + 1) { - factor *= dimension; - } - result += factor * ordinate; + result += dim.into_iter().skip(index + 1).product::() * ordinate; } result } From 790570f05bf63376f44317468f27aa9340efd14c Mon Sep 17 00:00:00 2001 From: Sebastian Schmidt Date: Mon, 8 Dec 2025 14:25:04 +0200 Subject: [PATCH 31/31] Fix. 
--- lib_ts_chainalign/src/chain_align.rs | 127 ++++++++++-------- .../src/chaining_cost_function.rs | 16 +-- .../src/chaining_lower_bounds/cost_array.rs | 3 + 3 files changed, 85 insertions(+), 61 deletions(-) diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index 0f9cc65..e7666fe 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -289,7 +289,7 @@ fn evaluate_chain( rc_fn: &dyn Fn(u8) -> u8, max_match_run: u32, chaining_cost_function: &mut ChainingCostFunction, - complete_evaluation: bool, + final_evaluation: bool, ) -> (Cost, Vec) { let k = usize::try_from(max_match_run + 1).unwrap(); let mut current_upper_bound = Cost::zero(); @@ -300,7 +300,7 @@ fn evaluate_chain( match (from_anchor, to_anchor) { (Identifier::Start, Identifier::End) => { - if complete_evaluation || !chaining_cost_function.is_start_to_end_exact() { + if final_evaluation || !chaining_cost_function.is_start_to_end_exact() { let alignment = GapAffineAlignment::new( start, end, @@ -310,15 +310,16 @@ fn evaluate_chain( max_match_run, ); trace!("Aligning from start to end costs {}", alignment.cost()); - chaining_cost_function.update_start_to_end(alignment.cost(), true); + if !final_evaluation { + chaining_cost_function.update_start_to_end(alignment.cost(), true); + } alignments.push(alignment.alignment().clone()); } current_upper_bound += chaining_cost_function.start_to_end(); } (Identifier::Start, Identifier::Primary { index }) => { let end = anchors.primary[index].start(); - if complete_evaluation || !chaining_cost_function.is_primary_from_start_exact(index) - { + if final_evaluation || !chaining_cost_function.is_primary_from_start_exact(index) { let alignment = GapAffineAlignment::new( start, end, @@ -332,14 +333,20 @@ fn evaluate_chain( anchors.primary[index], alignment.cost() ); - chaining_cost_function.update_primary_from_start(index, alignment.cost(), true); + if !final_evaluation { + 
chaining_cost_function.update_primary_from_start( + index, + alignment.cost(), + true, + ); + } alignments.push(alignment.alignment().clone()); } current_upper_bound += chaining_cost_function.primary_from_start(index); } (Identifier::Start, Identifier::Secondary { index, ts_kind }) => { let end = anchors.secondary(ts_kind)[index].start(ts_kind); - if complete_evaluation + if final_evaluation || !chaining_cost_function.is_jump_12_from_start_exact(index, ts_kind) { let alignment = Ts12JumpAlignment::new( @@ -356,19 +363,21 @@ fn evaluate_chain( anchors.secondary(ts_kind)[index], alignment.cost() ); - chaining_cost_function.update_jump_12_from_start( - index, - ts_kind, - alignment.cost(), - true, - ); + if !final_evaluation { + chaining_cost_function.update_jump_12_from_start( + index, + ts_kind, + alignment.cost(), + true, + ); + } alignments.push(alignment.alignment().clone()); } current_upper_bound += chaining_cost_function.jump_12_from_start(index, ts_kind); } (Identifier::Primary { index }, Identifier::End) => { let start = anchors.primary[index].end(k); - if complete_evaluation || !chaining_cost_function.is_primary_to_end_exact(index) { + if final_evaluation || !chaining_cost_function.is_primary_to_end_exact(index) { let alignment = GapAffineAlignment::new( start, end, @@ -382,7 +391,9 @@ fn evaluate_chain( anchors.primary[index], alignment.cost() ); - chaining_cost_function.update_primary_to_end(index, alignment.cost(), true); + if !final_evaluation { + chaining_cost_function.update_primary_to_end(index, alignment.cost(), true); + } alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -390,7 +401,7 @@ fn evaluate_chain( } (Identifier::Secondary { index, ts_kind }, Identifier::End) => { let start = anchors.secondary(ts_kind)[index].end(ts_kind, k); - if complete_evaluation + if final_evaluation || !chaining_cost_function.is_jump_34_to_end_exact(index, ts_kind) { let alignment = 
Ts34JumpAlignment::new( @@ -407,12 +418,14 @@ fn evaluate_chain( anchors.secondary(ts_kind)[index], alignment.cost() ); - chaining_cost_function.update_jump_34_to_end( - index, - ts_kind, - alignment.cost(), - true, - ); + if !final_evaluation { + chaining_cost_function.update_jump_34_to_end( + index, + ts_kind, + alignment.cost(), + true, + ); + } alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -430,7 +443,7 @@ fn evaluate_chain( let start = anchors.primary[from_index].end(k); let end = anchors.primary[to_index].start(); - if complete_evaluation + if final_evaluation || !chaining_cost_function.is_primary_exact(from_index, to_index) { let alignment = GapAffineAlignment::new( @@ -447,12 +460,14 @@ fn evaluate_chain( anchors.primary[to_index], alignment.cost() ); - chaining_cost_function.update_primary( - from_index, - to_index, - alignment.cost(), - true, - ); + if !final_evaluation { + chaining_cost_function.update_primary( + from_index, + to_index, + alignment.cost(), + true, + ); + } alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -467,7 +482,7 @@ fn evaluate_chain( ) => { let start = anchors.primary[from_index].end(k); let end = anchors.secondary(ts_kind)[to_index].start(ts_kind); - if complete_evaluation + if final_evaluation || !chaining_cost_function.is_jump_12_exact(from_index, to_index, ts_kind) { let alignment = Ts12JumpAlignment::new( @@ -478,13 +493,15 @@ fn evaluate_chain( rc_fn, max_match_run, ); - chaining_cost_function.update_jump_12( - from_index, - to_index, - ts_kind, - alignment.cost(), - true, - ); + if !final_evaluation { + chaining_cost_function.update_jump_12( + from_index, + to_index, + ts_kind, + alignment.cost(), + true, + ); + } trace!( "Aligning from P{from_index}{} to S{}[{to_index}]{} costs {}", anchors.primary[from_index], @@ -517,7 +534,7 @@ fn evaluate_chain( } let start = 
anchors.secondary(ts_kind)[from_index].end(ts_kind, k); let end = anchors.secondary(ts_kind)[to_index].start(ts_kind); - if complete_evaluation + if final_evaluation || !chaining_cost_function.is_secondary_exact(from_index, to_index, ts_kind) { let alignment = GapAffineAlignment::new( @@ -536,13 +553,15 @@ fn evaluate_chain( anchors.secondary(ts_kind)[to_index], alignment.cost() ); - chaining_cost_function.update_secondary( - from_index, - to_index, - ts_kind, - alignment.cost(), - true, - ); + if !final_evaluation { + chaining_cost_function.update_secondary( + from_index, + to_index, + ts_kind, + alignment.cost(), + true, + ); + } alignments.push(iter::repeat_n(AlignmentType::Match, k).collect()); alignments.push(alignment.alignment().clone()); } @@ -558,7 +577,7 @@ fn evaluate_chain( ) => { let start = anchors.secondary(ts_kind)[from_index].end(ts_kind, k); let end = anchors.primary[to_index].start(); - if complete_evaluation + if final_evaluation || !chaining_cost_function.is_jump_34_exact(from_index, to_index, ts_kind) { let alignment = Ts34JumpAlignment::new( @@ -569,13 +588,15 @@ fn evaluate_chain( rc_fn, max_match_run, ); - chaining_cost_function.update_jump_34( - from_index, - to_index, - ts_kind, - alignment.cost(), - true, - ); + if !final_evaluation { + chaining_cost_function.update_jump_34( + from_index, + to_index, + ts_kind, + alignment.cost(), + true, + ); + } trace!( "Aligning from S{}[{from_index}]{} to P{to_index}{} (S{} to P{}) costs {}", ts_kind.digits(), diff --git a/lib_ts_chainalign/src/chaining_cost_function.rs b/lib_ts_chainalign/src/chaining_cost_function.rs index da71c0c..7202924 100644 --- a/lib_ts_chainalign/src/chaining_cost_function.rs +++ b/lib_ts_chainalign/src/chaining_cost_function.rs @@ -809,28 +809,28 @@ impl ChainingCostFunction { let target = match (ts_kind.ancestor, ts_kind.descendant) { (TsAncestor::Seq1, TsDescendant::Seq1) => { if is_exact { - self.jump_12_to_11 + self.jump_34_from_11 .set_exact(from_secondary_index, 
to_primary_index + 1); } &mut self.jump_34_from_11[[from_secondary_index, to_primary_index + 1]] } (TsAncestor::Seq1, TsDescendant::Seq2) => { if is_exact { - self.jump_12_to_12 + self.jump_34_from_12 .set_exact(from_secondary_index, to_primary_index + 1); } &mut self.jump_34_from_12[[from_secondary_index, to_primary_index + 1]] } (TsAncestor::Seq2, TsDescendant::Seq1) => { if is_exact { - self.jump_12_to_21 + self.jump_34_from_21 .set_exact(from_secondary_index, to_primary_index + 1); } &mut self.jump_34_from_21[[from_secondary_index, to_primary_index + 1]] } (TsAncestor::Seq2, TsDescendant::Seq2) => { if is_exact { - self.jump_12_to_22 + self.jump_34_from_22 .set_exact(from_secondary_index, to_primary_index + 1); } &mut self.jump_34_from_22[[from_secondary_index, to_primary_index + 1]] @@ -853,7 +853,7 @@ impl ChainingCostFunction { (TsAncestor::Seq1, TsDescendant::Seq1) => { let end_index = self.jump_34_from_11.dim().1 - 1; if is_exact { - self.jump_12_to_11 + self.jump_34_from_11 .set_exact(from_secondary_index, end_index); } &mut self.jump_34_from_11[[from_secondary_index, end_index]] @@ -861,7 +861,7 @@ impl ChainingCostFunction { (TsAncestor::Seq1, TsDescendant::Seq2) => { let end_index = self.jump_34_from_12.dim().1 - 1; if is_exact { - self.jump_12_to_12 + self.jump_34_from_12 .set_exact(from_secondary_index, end_index); } &mut self.jump_34_from_12[[from_secondary_index, end_index]] @@ -869,7 +869,7 @@ impl ChainingCostFunction { (TsAncestor::Seq2, TsDescendant::Seq1) => { let end_index = self.jump_34_from_21.dim().1 - 1; if is_exact { - self.jump_12_to_21 + self.jump_34_from_21 .set_exact(from_secondary_index, end_index); } &mut self.jump_34_from_21[[from_secondary_index, end_index]] @@ -877,7 +877,7 @@ impl ChainingCostFunction { (TsAncestor::Seq2, TsDescendant::Seq2) => { let end_index = self.jump_34_from_22.dim().1 - 1; if is_exact { - self.jump_12_to_22 + self.jump_34_from_22 .set_exact(from_secondary_index, end_index); } &mut 
self.jump_34_from_22[[from_secondary_index, end_index]] diff --git a/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs b/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs index 82c6271..02e3558 100644 --- a/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs +++ b/lib_ts_chainalign/src/chaining_lower_bounds/cost_array.rs @@ -5,6 +5,8 @@ use std::{ slice, }; +use log::trace; + pub struct LowerBoundCostArray { dim: [usize; DIMENSION], data: Vec, @@ -47,6 +49,7 @@ impl LowerBoundCostArray { *dimension = usize::from_ne_bytes(buffer); } let dim = dim; + trace!("Read dimensions: {dim:?}"); let cost_size = mem::size_of::(); let data_len = dim.into_iter().product();