diff --git a/Cargo.lock b/Cargo.lock index 40e754d7..53edba61 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -538,8 +538,10 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" name = "lib_ts_chainalign" version = "0.1.0" dependencies = [ + "binary-heap-plus", "bincode", "bitvec", + "clap", "compact-genome", "generic_a_star", "indicatif", @@ -548,6 +550,7 @@ dependencies = [ "log", "ndarray 0.17.1", "num-traits", + "rustc-hash", "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 35cc994a..dd4f11c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ package.rust-version = "1.85.1" package.repository = "https://github.com/sebschmi/template-switch-aligner" [workspace.dependencies] +clap = { version = "4.5.16", features = ["derive"] } serde = "1.0.219" compact-genome = "12.5.0" traitsequence = "8.1.2" diff --git a/generic_a_star/src/closed_lists.rs b/generic_a_star/src/closed_lists.rs index a43b024c..7e35b7e1 100644 --- a/generic_a_star/src/closed_lists.rs +++ b/generic_a_star/src/closed_lists.rs @@ -1,6 +1,36 @@ use rustc_hash::{FxHashMapSeed, FxSeededState}; -use crate::{AStarClosedList, AStarIdentifier, AStarNode, reset::Reset}; +use crate::{AStarIdentifier, AStarNode, reset::Reset}; + +/// The closed list for the A* algorithm. +pub trait AStarClosedList: Reset { + /// Create a new empty closed list. + fn new() -> Self; + + /// Returns the number of nodes in the closed list. + fn len(&self) -> usize; + + /// Returns true if the closed list is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Insert a node with the given identifier into the closed list. + /// + /// If there was a node previously mapped to this identifier, then it is returned. + /// Otherwise, `None` is returned. + fn insert(&mut self, identifier: Identifier, node: Node) -> Option; + + /// Return a reference to the node specified by `identifier`. + /// + /// Returns `None` if no node with the given identifier exists. + fn get(&self, identifier: &Identifier) -> Option<&Node>; + + /// Returns true if the given identifier is mapped to a node. + fn contains_identifier(&self, identifier: &Identifier) -> bool { + self.get(identifier).is_some() + } +} impl AStarClosedList for FxHashMapSeed diff --git a/generic_a_star/src/cost.rs b/generic_a_star/src/cost.rs index a1cf4e65..f7b67fa3 100644 --- a/generic_a_star/src/cost.rs +++ b/generic_a_star/src/cost.rs @@ -36,6 +36,10 @@ pub trait AStarCost: fn as_primitive(&self) -> Self::CostType; fn from_primitive(value: Self::CostType) -> Self; + + fn as_usize(&self) -> usize; + + fn from_usize(value: usize) -> Self; } macro_rules! primitive_cost { @@ -63,6 +67,14 @@ macro_rules! primitive_cost { fn from_primitive(value: Self::CostType) -> Self { Self(value) } + + fn as_usize(&self) -> usize { + self.as_primitive().try_into().unwrap() + } + + fn from_usize(value: usize) -> Self { + Self::from_primitive(value.try_into().unwrap()) + } } impl From<$primitive> for $name { @@ -188,6 +200,14 @@ impl AStarCost for OrderedPairCost { fn from_primitive(value: Self::CostType) -> Self { Self(A::from_primitive(value), B::zero()) } + + fn as_usize(&self) -> usize { + self.0.as_usize() + } + + fn from_usize(value: usize) -> Self { + Self(A::from_usize(value), B::zero()) + } } impl Display for OrderedPairCost { diff --git a/generic_a_star/src/lib.rs b/generic_a_star/src/lib.rs index 6bc0b461..3a333557 100644 --- a/generic_a_star/src/lib.rs +++ b/generic_a_star/src/lib.rs @@ -16,6 +16,8 @@ use num_traits::{Bounded, Zero}; use reset::Reset; use rustc_hash::FxHashMapSeed; +use crate::{closed_lists::AStarClosedList, open_lists::AStarOpenList}; + pub mod closed_lists; pub mod comparator; pub mod cost; @@ -106,60 +108,6 @@ pub trait AStarContext: Reset { } } -/// The closed list for the A* algorithm. -pub trait AStarClosedList: Reset { - /// Create a new empty closed list. - fn new() -> Self; - - /// Returns the number of nodes in the closed list. - fn len(&self) -> usize; - - /// Returns true if the closed list is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Insert a node with the given identifier into the closed list. - /// - /// If there was a node previously mapped to this identifier, then it is returned. - /// Otherwise, `None` is returned. - fn insert(&mut self, identifier: Identifier, node: Node) -> Option; - - /// Return a reference to the node specified by `identifier`. - /// - /// Returns `None` if no node with the given identifier exists. - fn get(&self, identifier: &Identifier) -> Option<&Node>; - - /// Returns true if the given identifier is mapped to a node. - fn contains_identifier(&self, identifier: &Identifier) -> bool { - self.get(identifier).is_some() - } -} - -/// The open list for the A* algorithm. -/// -/// This is a priority queue that must sort nodes with [`AStarNodeComparator`]. -pub trait AStarOpenList: Reset + Extend { - /// Create a new empty open list. - fn new() -> Self; - - /// Returns the number of nodes in the open list. - fn len(&self) -> usize; - - /// Returns true if the open list is empty. - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Add the given node to the priority queue. - fn push(&mut self, node: Node); - - /// Removes and returns a minimum element from the priority queue. - /// - /// Returns `None` if the priority queue is empty. - fn pop_min(&mut self) -> Option; -} - #[derive(Debug, Default)] pub struct AStarPerformanceCounters { pub opened_nodes: usize, diff --git a/generic_a_star/src/open_lists.rs b/generic_a_star/src/open_lists.rs index b5f0c87f..85f5a280 100644 --- a/generic_a_star/src/open_lists.rs +++ b/generic_a_star/src/open_lists.rs @@ -1,6 +1,32 @@ use binary_heap_plus::BinaryHeap; -use crate::{AStarNode, AStarOpenList, comparator::AStarNodeComparator, reset::Reset}; +use crate::{AStarNode, comparator::AStarNodeComparator, reset::Reset}; + +pub mod linear_heap; + +/// The open list for the A* algorithm. +/// +/// This is a priority queue that must sort nodes with [`AStarNodeComparator`]. +pub trait AStarOpenList: Reset + Extend { + /// Create a new empty open list. + fn new() -> Self; + + /// Returns the number of nodes in the open list. + fn len(&self) -> usize; + + /// Returns true if the open list is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Add the given node to the priority queue. + fn push(&mut self, node: Node); + + /// Removes and returns a minimum element from the priority queue. + /// + /// Returns `None` if the priority queue is empty. + fn pop_min(&mut self) -> Option; +} impl AStarOpenList for BinaryHeap { fn new() -> Self { diff --git a/generic_a_star/src/open_lists/linear_heap.rs b/generic_a_star/src/open_lists/linear_heap.rs new file mode 100644 index 00000000..3837bd60 --- /dev/null +++ b/generic_a_star/src/open_lists/linear_heap.rs @@ -0,0 +1,87 @@ +use std::collections::VecDeque; + +use num_traits::{Bounded, Zero}; + +use crate::{AStarNode, cost::AStarCost, open_lists::AStarOpenList, reset::Reset}; + +pub struct LinearHeap { + heap: VecDeque>, + cost_offset: Node::Cost, + len: usize, +} + +impl AStarOpenList for LinearHeap { + fn new() -> Self { + Self { + heap: Default::default(), + cost_offset: Zero::zero(), + len: 0, + } + } + + fn len(&self) -> usize { + self.len + } + + fn push(&mut self, node: Node) { + let cost = node.cost(); + assert_ne!(cost, Node::Cost::max_value()); + + if cost < self.cost_offset { + let prefix_len = (self.cost_offset - cost).as_usize(); + self.heap.reserve(prefix_len); + for _ in 0..prefix_len { + self.heap.push_front(Vec::new()); + } + self.cost_offset = cost; + + self.heap[0].push(node); + } else if cost >= self.cost_offset + Node::Cost::from_usize(self.heap.len()) { + let suffix_len = (cost + Node::Cost::from(1u8) + - (self.cost_offset + Node::Cost::from_usize(self.heap.len()))) + .as_usize(); + self.heap.reserve(suffix_len); + for _ in 0..suffix_len { + self.heap.push_back(Vec::new()); + } + + self.heap.back_mut().unwrap().push(node); + } else { + let index = (cost - self.cost_offset).as_usize(); + self.heap[index].push(node); + } + + self.len += 1; + } + + fn pop_min(&mut self) -> Option { + while let Some(front) = self.heap.front_mut() { + if front.is_empty() { + // Lazily pop the front of the heap. + self.heap.pop_front(); + self.cost_offset += Node::Cost::from(1u8); + } else { + self.len -= 1; + return front.pop(); + } + } + + None + } +} + +impl Extend for LinearHeap { + fn extend>(&mut self, iter: T) { + for node in iter { + self.push(node); + } + } +} + +impl Reset for LinearHeap { + fn reset(&mut self) { + self.heap.clear(); + self.cost_offset = Zero::zero(); + self.len = 0; + } +} diff --git a/lib_ts_chainalign/Cargo.toml b/lib_ts_chainalign/Cargo.toml index f48dd206..9358d0db 100644 --- a/lib_ts_chainalign/Cargo.toml +++ b/lib_ts_chainalign/Cargo.toml @@ -20,3 +20,6 @@ log.workspace = true indicatif = "0.18.3" bitvec = "1.0.1" bincode = { version = "2.0.1", features = ["serde"] } +rustc-hash.workspace = true +binary-heap-plus.workspace = true +clap.workspace = true diff --git a/lib_ts_chainalign/src/chain_align.rs b/lib_ts_chainalign/src/chain_align.rs index bdb94a28..a7189f07 100644 --- a/lib_ts_chainalign/src/chain_align.rs +++ b/lib_ts_chainalign/src/chain_align.rs @@ -1,3 +1,5 @@ +use binary_heap_plus::BinaryHeap; +use clap::ValueEnum; use compact_genome::{ implementation::vec_sequence::VectorGenome, interface::{ @@ -5,7 +7,12 @@ use compact_genome::{ sequence::{GenomeSequence, OwnedGenomeSequence}, }, }; -use generic_a_star::{AStar, AStarResult, cost::AStarCost}; +use generic_a_star::{ + AStar, AStarResult, + comparator::AStarNodeComparator, + cost::AStarCost, + open_lists::{AStarOpenList, linear_heap::LinearHeap}, +}; use indicatif::ProgressBar; use lib_tsalign::a_star_aligner::{ alignment_result::AlignmentResult, @@ -13,6 +20,7 @@ use lib_tsalign::a_star_aligner::{ }; use log::{debug, trace}; use num_traits::Zero; +use rustc_hash::FxHashMapSeed; use std::{ fmt::Write, iter, @@ -24,7 +32,7 @@ use crate::{ Alignment, AlignmentType, coordinates::AlignmentCoordinates, sequences::AlignmentSequences, }, anchors::Anchors, - chain_align::chainer::{Context, Identifier}, + chain_align::chainer::{Context, Identifier, Node}, chaining_cost_function::ChainingCostFunction, costs::AlignmentCosts, exact_chaining::{ @@ -40,6 +48,18 @@ pub struct AlignmentParameters { /// /// At most `max_successors` will be generated at a time, but at least all with minimum chaining cost. pub max_successors: usize, + + /// The open list type to use for chaining. + pub open_list: ChainingOpenList, +} + +/// The open list type to use for chaining. +#[derive(Debug, Clone, ValueEnum)] +pub enum ChainingOpenList { + /// Use [`BinaryHeap`](std::collections::BinaryHeap) as open list. + StdHeap, + /// Use [`LinearHeap`](generic_a_star::open_lists::linear_heap::LinearHeap) as open list. + LinearHeap, } #[expect(clippy::too_many_arguments)] @@ -53,6 +73,46 @@ pub fn align( max_match_run: u32, anchors: &Anchors, chaining_cost_function: &mut ChainingCostFunction, +) -> AlignmentResult { + match parameters.open_list { + ChainingOpenList::StdHeap => { + actually_align::, AStarNodeComparator>>( + sequences, + start, + end, + parameters, + alignment_costs, + rc_fn, + max_match_run, + anchors, + chaining_cost_function, + ) + } + ChainingOpenList::LinearHeap => actually_align::>( + sequences, + start, + end, + parameters, + alignment_costs, + rc_fn, + max_match_run, + anchors, + chaining_cost_function, + ), + } +} + +#[expect(clippy::too_many_arguments)] +fn actually_align>>( + sequences: &AlignmentSequences, + start: AlignmentCoordinates, + end: AlignmentCoordinates, + parameters: &AlignmentParameters, + alignment_costs: &AlignmentCosts, + rc_fn: &dyn Fn(u8) -> u8, + max_match_run: u32, + anchors: &Anchors, + chaining_cost_function: &mut ChainingCostFunction, ) -> AlignmentResult { let progress_bar = ProgressBar::new_spinner(); progress_bar.enable_steady_tick(Duration::from_millis(200)); @@ -69,7 +129,7 @@ pub fn align( k, parameters.max_successors, ); - let mut astar = AStar::<_>::new(context); + let mut astar = AStar::<_, FxHashMapSeed<_, _>, OpenList>::new(context); let mut chaining_execution_count = 0; let mut current_lower_bound = Cost::zero(); let mut current_upper_bound = Cost::max_value(); diff --git a/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs b/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs index 6f75e327..9696bceb 100644 --- a/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs +++ b/lib_ts_chainalign/src/chaining_cost_function/cost_array.rs @@ -49,7 +49,7 @@ impl ChainingCostArray { if self.cost_order_permutation[c1.as_usize()].is_empty() { self.cost_order_permutation[c1.as_usize()].extend( ((0..c1.as_usize()).chain(c1.as_usize() + 1..)) - .take(self.len[1].as_usize() - 1) + .take(self.len[1].as_usize().saturating_sub(1)) .map(AnchorIndex::from), ); self.cost_order_permutation[c1.as_usize()] diff --git a/lib_ts_chainalign/src/lib.rs b/lib_ts_chainalign/src/lib.rs index d0086366..ad8cc6e9 100644 --- a/lib_ts_chainalign/src/lib.rs +++ b/lib_ts_chainalign/src/lib.rs @@ -1,3 +1,5 @@ +#![allow(rustdoc::redundant_explicit_links)] + use compact_genome::interface::alphabet::Alphabet; use generic_a_star::cost::U32Cost; use lib_tsalign::a_star_aligner::{ diff --git a/tsalign/Cargo.toml b/tsalign/Cargo.toml index 9c33a69b..f415770e 100644 --- a/tsalign/Cargo.toml +++ b/tsalign/Cargo.toml @@ -14,7 +14,7 @@ lib_tsalign = { version = "0.19.1", path = "../lib_tsalign", features = [ ] } lib_ts_chainalign = { version = "0.1.0", path = "../lib_ts_chainalign" } lib_tsshow = { version = "0.19.1", path = "../lib_tsshow" } -clap = { version = "4.5.16", features = ["derive"] } +clap.workspace = true compact-genome = { workspace = true, features = ["io"] } traitsequence.workspace = true serde.workspace = true diff --git a/tsalign/src/align.rs b/tsalign/src/align.rs index 1793a9e6..dbd46579 100644 --- a/tsalign/src/align.rs +++ b/tsalign/src/align.rs @@ -24,6 +24,7 @@ use compact_genome::{ sequence_store::SequenceStore, }, }; +use lib_ts_chainalign::chain_align::ChainingOpenList; use lib_tsalign::{ a_star_aligner::{ alignment_geometry::{AlignmentCoordinates, AlignmentRange}, @@ -116,6 +117,12 @@ pub struct Cli { #[clap(long, default_value = "5")] max_chaining_successors: usize, + /// The open list type to use for chaining. + /// + /// This applies only to tschainalign. + #[clap(long, default_value = "linear-heap")] + chaining_open_list: ChainingOpenList, + /// If set, template switches are not allowed. /// /// Use this to compare a template switch alignment against an alignment with out template switches. diff --git a/tsalign/src/align/a_star_chain_ts.rs b/tsalign/src/align/a_star_chain_ts.rs index 1d0a9caa..498062eb 100644 --- a/tsalign/src/align/a_star_chain_ts.rs +++ b/tsalign/src/align/a_star_chain_ts.rs @@ -95,6 +95,7 @@ pub fn align_a_star_chain_ts< let query = query.clone_as_vec(); let parameters = AlignmentParameters { max_successors: cli.max_chaining_successors, + open_list: cli.chaining_open_list, }; let alignment = lib_ts_chainalign::align::(