diff --git a/Cargo.lock b/Cargo.lock index c1e8b9af..05f65364 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -364,6 +364,7 @@ name = "generic_a_star" version = "0.18.0" dependencies = [ "binary-heap-plus", + "compare", "deterministic_default_hasher", "extend_map", "num-traits", diff --git a/generic_a_star/Cargo.toml b/generic_a_star/Cargo.toml index 6cdd44d4..1ef21db1 100644 --- a/generic_a_star/Cargo.toml +++ b/generic_a_star/Cargo.toml @@ -16,3 +16,4 @@ deterministic_default_hasher = "0.14.2" num-traits.workspace = true serde = { workspace = true, features = ["derive"], optional = true } extend_map = "0.14.4" +compare = "0.1.0" diff --git a/generic_a_star/src/comparator.rs b/generic_a_star/src/comparator.rs new file mode 100644 index 00000000..20373333 --- /dev/null +++ b/generic_a_star/src/comparator.rs @@ -0,0 +1,111 @@ +use std::cmp::Ordering; + +use compare::Compare; + +use crate::AStarNode; + +#[derive(Debug, Default)] +pub struct AStarNodeComparator; + +impl Compare for AStarNodeComparator { + fn compare(&self, l: &T, r: &T) -> Ordering { + l.cmp(r).reverse().then_with(|| { + l.secondary_maximisable_score() + .cmp(&r.secondary_maximisable_score()) + }) + } +} + +#[cfg(test)] +mod tests { + use std::fmt::Display; + + use compare::Compare; + + use crate::{AStarNode, cost::U64Cost}; + + use super::AStarNodeComparator; + + #[derive(Debug, PartialEq, Eq)] + struct Node { + cost: U64Cost, + lower_bound: U64Cost, + secondary_maximisable_score: usize, + } + + impl Node { + fn new(cost: u64, lower_bound: u64, secondary_maximisable_score: usize) -> Self { + Self { + cost: cost.into(), + lower_bound: lower_bound.into(), + secondary_maximisable_score, + } + } + } + + impl AStarNode for Node { + type Identifier = (); + + type EdgeType = (); + + type Cost = U64Cost; + + fn identifier(&self) -> &Self::Identifier { + unimplemented!() + } + + fn cost(&self) -> Self::Cost { + self.cost + } + + fn a_star_lower_bound(&self) -> Self::Cost { + self.lower_bound + } + + fn secondary_maximisable_score(&self) -> usize { + self.secondary_maximisable_score + } + + fn predecessor(&self) -> Option<&Self::Identifier> { + unimplemented!() + } + + fn predecessor_edge_type(&self) -> Option { + unimplemented!() + } + } + + impl Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} + {}; {}", + self.cost, self.lower_bound, self.secondary_maximisable_score + ) + } + } + + impl Ord for Node { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + (self.cost + self.lower_bound).cmp(&(other.cost + other.lower_bound)) + } + } + + impl PartialOrd for Node { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + #[test] + fn compare() { + // Heap is a max heap, hence smaller nodes need to be bigger. + assert!(AStarNodeComparator.compares_eq(&Node::new(4, 5, 6), &Node::new(4, 5, 6))); + assert!(AStarNodeComparator.compares_gt(&Node::new(4, 4, 6), &Node::new(4, 5, 6))); + assert!(AStarNodeComparator.compares_gt(&Node::new(4, 4, 6), &Node::new(4, 5, 6))); + assert!(AStarNodeComparator.compares_gt(&Node::new(4, 5, 7), &Node::new(4, 5, 6))); + assert!(AStarNodeComparator.compares_lt(&Node::new(4, 5, 5), &Node::new(4, 5, 6))); + assert!(AStarNodeComparator.compares_lt(&Node::new(4, 6, 6), &Node::new(4, 5, 6))); + assert!(AStarNodeComparator.compares_lt(&Node::new(5, 5, 6), &Node::new(4, 5, 6))); + } +} diff --git a/generic_a_star/src/lib.rs b/generic_a_star/src/lib.rs index a4d5e472..0d98825a 100644 --- a/generic_a_star/src/lib.rs +++ b/generic_a_star/src/lib.rs @@ -1,18 +1,22 @@ #![forbid(clippy::mod_module_files)] use std::{ + cmp::Ordering, collections::HashMap, fmt::{Debug, Display}, hash::Hash, }; -use binary_heap_plus::{BinaryHeap, MinComparator}; +use binary_heap_plus::BinaryHeap; +use comparator::AStarNodeComparator; +use compare::Compare; use cost::AStarCost; use deterministic_default_hasher::DeterministicDefaultHasher; use extend_map::ExtendFilter; -use num_traits::Bounded; +use num_traits::{Bounded, Zero}; use reset::Reset; +mod comparator; pub mod cost; pub mod reset; @@ -43,6 +47,11 @@ pub trait AStarNode: Sized + Ord + Debug + Display { /// Returns the A* lower bound of this node. fn a_star_lower_bound(&self) -> Self::Cost; + /// Returns a score that is used to order nodes of the same cost. + /// + /// This score should be maximised, which is done via complete search. + fn secondary_maximisable_score(&self) -> usize; + /// Returns the identifier of the predecessor of this node. fn predecessor(&self) -> Option<&Self::Identifier>; @@ -116,14 +125,14 @@ pub struct AStar { Context::Node, DeterministicDefaultHasher, >, - open_list: BinaryHeap, + open_list: BinaryHeap, performance_counters: AStarPerformanceCounters, } #[derive(Debug)] pub struct AStarBuffers { closed_list: HashMap, - open_list: BinaryHeap, + open_list: BinaryHeap, } #[derive(Debug, Clone, Ord, PartialOrd, PartialEq, Eq, Hash)] @@ -166,7 +175,7 @@ impl AStar { state: AStarState::Empty, context, closed_list: Default::default(), - open_list: BinaryHeap::new_min(), + open_list: BinaryHeap::from_vec(Vec::new()), performance_counters: Default::default(), } } @@ -275,8 +284,11 @@ impl AStar { self.state = AStarState::Searching; let mut last_node = None; + let mut target_identifier = None; + let mut target_cost = ::Cost::max_value(); + let mut target_secondary_maximisable_score = 0; - let target_identifier = loop { + loop { let Some(node) = self.open_list.pop() else { if last_node.is_none() { unreachable!("Open list was empty."); @@ -294,6 +306,7 @@ impl AStar { } }; + // Check cost limit. // Nodes are ordered by cost plus lower bound. if node.cost() + node.a_star_lower_bound() > cost_limit { self.state = AStarState::Terminated { @@ -302,6 +315,7 @@ impl AStar { return AStarResult::ExceededCostLimit { cost_limit }; } + // Check memory limit. if self.closed_list.len() + self.open_list.len() > node_count_limit { self.state = AStarState::Terminated { result: AStarResult::ExceededMemoryLimit { @@ -313,11 +327,19 @@ impl AStar { }; } + // If label-correcting, abort when the first node more expensive than the cheapest target is visited. + if node.cost() + node.a_star_lower_bound() > target_cost { + debug_assert!(!self.context.is_label_setting()); + break; + } + last_node = Some(node.identifier().clone()); if let Some(previous_visit) = self.closed_list.get(node.identifier()) { + self.performance_counters.suboptimal_opened_nodes += 1; + if self.context.is_label_setting() { - // In label-setting mode, if we have already visited the node, we now must be visiting it with a higher cost. + // In label-setting mode, if we have already visited the node, we now must be visiting it with a higher or equal cost. debug_assert!( previous_visit.cost() + previous_visit.a_star_lower_bound() <= node.cost() + node.a_star_lower_bound(), @@ -343,10 +365,13 @@ impl AStar { out } ); - } - self.performance_counters.suboptimal_opened_nodes += 1; - continue; + continue; + } else if AStarNodeComparator.compare(&node, previous_visit) != Ordering::Greater { + // If we are label-correcting, we may still find a better node later on. + // Skip if equal or worse. + continue; + } } let open_nodes_without_new_successors = self.open_list.len(); @@ -361,20 +386,41 @@ impl AStar { self.performance_counters.opened_nodes += self.open_list.len() - open_nodes_without_new_successors; - if is_target(&self.context, &node) { - let identifier = node.identifier().clone(); - let previous_visit = self.closed_list.insert(node.identifier().clone(), node); - self.performance_counters.closed_nodes += 1; - debug_assert!(previous_visit.is_none() || !self.context.is_label_setting()); - break identifier; + let is_target = is_target(&self.context, &node); + debug_assert!(!is_target || node.a_star_lower_bound().is_zero()); + + if is_target + && (node.cost() < target_cost + || (node.cost() == target_cost + && node.secondary_maximisable_score() > target_secondary_maximisable_score)) + { + target_identifier = Some(node.identifier().clone()); + target_cost = node.cost(); + target_secondary_maximisable_score = node.secondary_maximisable_score(); + + if self.context.is_label_setting() { + let previous_visit = self.closed_list.insert(node.identifier().clone(), node); + self.performance_counters.closed_nodes += 1; + debug_assert!(previous_visit.is_none() || !self.context.is_label_setting()); + break; + } } let previous_visit = self.closed_list.insert(node.identifier().clone(), node); self.performance_counters.closed_nodes += 1; debug_assert!(previous_visit.is_none() || !self.context.is_label_setting()); + } + + let Some(target_identifier) = target_identifier else { + debug_assert!(!self.context.is_label_setting()); + self.state = AStarState::Terminated { + result: AStarResult::NoTarget, + }; + return AStarResult::NoTarget; }; let cost = self.closed_list.get(&target_identifier).unwrap().cost(); + debug_assert_eq!(cost, target_cost); self.state = AStarState::Terminated { result: AStarResult::FoundTarget { identifier: target_identifier.clone(), @@ -549,11 +595,11 @@ impl Iterator for BacktrackingIteratorWithCost<'_, Contex } } -impl Default for AStarBuffers { +impl Default for AStarBuffers { fn default() -> Self { Self { closed_list: Default::default(), - open_list: BinaryHeap::new_min(), + open_list: BinaryHeap::from_vec(Vec::new()), } } } @@ -599,6 +645,10 @@ impl AStarNode for Box { ::a_star_lower_bound(self) } + fn secondary_maximisable_score(&self) -> usize { + ::secondary_maximisable_score(self) + } + fn predecessor(&self) -> Option<&Self::Identifier> { ::predecessor(self) } diff --git a/lib_tsalign/src/a_star_aligner.rs b/lib_tsalign/src/a_star_aligner.rs index cedd10d5..97a841fc 100644 --- a/lib_tsalign/src/a_star_aligner.rs +++ b/lib_tsalign/src/a_star_aligner.rs @@ -52,6 +52,7 @@ where ::EdgeType: IAlignmentType, { info!("Aligning on subsequence {}", context.range()); + debug!("Is label setting: {}", context.is_label_setting()); let start_time = Instant::now(); @@ -159,6 +160,7 @@ pub fn template_switch_distance_a_star_align< >, cost_limit: Option, memory_limit: Option, + force_label_correcting: bool, template_switch_count_memory: ::Memory, ) -> AlignmentResult where @@ -187,6 +189,7 @@ where memory, cost_limit, memory_limit, + force_label_correcting, )); debug!("CIGAR before extending: {}", result.cigar()); diff --git a/lib_tsalign/src/a_star_aligner/alignment_result.rs b/lib_tsalign/src/a_star_aligner/alignment_result.rs index ca6454d3..101a15b6 100644 --- a/lib_tsalign/src/a_star_aligner/alignment_result.rs +++ b/lib_tsalign/src/a_star_aligner/alignment_result.rs @@ -270,7 +270,13 @@ impl> range.query_offset(), config, ); - assert_eq!(initial_cost, (statistics.cost.round().raw() as u64).into()); + let alignment_cost = (statistics.cost.round().raw() as u64).into(); + assert_eq!( + initial_cost, + alignment_cost, + "computed cost {initial_cost} != alignment cost {alignment_cost}; {}", + alignment.cigar() + ); // Extend left with equal cost. while range.reference_offset() > 0 && range.query_offset() > 0 { diff --git a/lib_tsalign/src/a_star_aligner/alignment_result/alignment/template_switch_specifics.rs b/lib_tsalign/src/a_star_aligner/alignment_result/alignment/template_switch_specifics.rs index 43d5c831..51b65f2a 100644 --- a/lib_tsalign/src/a_star_aligner/alignment_result/alignment/template_switch_specifics.rs +++ b/lib_tsalign/src/a_star_aligner/alignment_result/alignment/template_switch_specifics.rs @@ -1113,7 +1113,7 @@ mod tests { LazyLock::new(|| TemplateSwitchConfig { left_flank_length: 0, right_flank_length: 0, - min_length: 3, + template_switch_min_length: 3, base_cost: BaseCost { rrf: 10u64.into(), rqf: 100u64.into(), diff --git a/lib_tsalign/src/a_star_aligner/configurable_a_star_align.rs b/lib_tsalign/src/a_star_aligner/configurable_a_star_align.rs index c523422e..00002013 100644 --- a/lib_tsalign/src/a_star_aligner/configurable_a_star_align.rs +++ b/lib_tsalign/src/a_star_aligner/configurable_a_star_align.rs @@ -34,11 +34,8 @@ use super::{ template_switch_distance::{ AlignmentType, strategies::{ - chaining::{ - ChainingStrategy, LowerBoundChainingStrategy, NoChainingStrategy, - PrecomputeOnlyChainingStrategy, - }, - node_ord::{AntiDiagonalNodeOrdStrategy, CostOnlyNodeOrdStrategy, NodeOrdStrategy}, + chaining::{ChainingStrategy, LowerBoundChainingStrategy, NoChainingStrategy}, + node_ord::{AntiDiagonalNodeOrdStrategy, NodeOrdStrategy}, primary_match::AllowPrimaryMatchStrategy, template_switch_count::{ MaxTemplateSwitchCountStrategy, NoTemplateSwitchCountStrategy, @@ -48,6 +45,10 @@ use super::{ LookaheadTemplateSwitchMinLengthStrategy, NoTemplateSwitchMinLengthStrategy, TemplateSwitchMinLengthStrategy, }, + template_switch_total_length::{ + MaxTemplateSwitchTotalLengthStrategy, NoTemplateSwitchTotalLengthStrategy, + TemplateSwitchTotalLengthStrategy, + }, }, }, }; @@ -67,6 +68,7 @@ pub struct Config { pub node_ord_strategy: NodeOrdStrategySelector, pub min_length_strategy: MinLengthStrategySelector, pub chaining_strategy: ChainingStrategySelector, + pub total_length_strategy: TotalLengthStrategySelector, pub no_ts: bool, pub cost_limit: Option, @@ -85,6 +87,7 @@ impl Default for Config { node_ord_strategy: NodeOrdStrategySelector::AntiDiagonal, min_length_strategy: MinLengthStrategySelector::Lookahead, chaining_strategy: ChainingStrategySelector::None, + total_length_strategy: TotalLengthStrategySelector::Maximise, no_ts: false, cost_limit: None, memory_limit: None, @@ -130,6 +133,14 @@ pub enum ChainingStrategySelector { LowerBound, } +#[derive(Debug)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +#[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))] +pub enum TotalLengthStrategySelector { + None, + Maximise, +} + /// Align `query` to `reference` with the given `config`. /// /// `query` and `reference` must be ASCII strings restricted to the characters specified by `config.alphabet`. @@ -171,12 +182,13 @@ fn a_star_align_select_node_ord_strategy { - a_star_align_select_template_switch_min_length_strategy::<_, _, CostOnlyNodeOrdStrategy>( + /*a_star_align_select_template_switch_min_length_strategy::<_, _, CostOnlyNodeOrdStrategy>( reference.as_genome_subsequence(), query.as_genome_subsequence(), config, costs, - ) + )*/ + unimplemented!("The other option appears to always be better."); } NodeOrdStrategySelector::AntiDiagonal => { a_star_align_select_template_switch_min_length_strategy::< @@ -204,24 +216,22 @@ fn a_star_align_select_template_switch_min_length_strategy< costs: TemplateSwitchConfig, ) -> AlignmentResult { match config.min_length_strategy { - MinLengthStrategySelector::None => align_a_star_template_switch_select_chaining_strategy::< + MinLengthStrategySelector::None => a_star_align_select_chaining_strategy::< _, _, NodeOrd, NoTemplateSwitchMinLengthStrategy, >(reference, query, config, costs), - MinLengthStrategySelector::Lookahead => { - align_a_star_template_switch_select_chaining_strategy::< - _, - _, - NodeOrd, - LookaheadTemplateSwitchMinLengthStrategy, - >(reference, query, config, costs) - } + MinLengthStrategySelector::Lookahead => a_star_align_select_chaining_strategy::< + _, + _, + NodeOrd, + LookaheadTemplateSwitchMinLengthStrategy, + >(reference, query, config, costs), } } -fn align_a_star_template_switch_select_chaining_strategy< +fn a_star_align_select_chaining_strategy< AlphabetType: Alphabet + Debug + Clone + Eq, SubsequenceType: GenomeSequence + ?Sized, NodeOrd: NodeOrdStrategy, @@ -233,7 +243,7 @@ fn align_a_star_template_switch_select_chaining_strategy< costs: TemplateSwitchConfig, ) -> AlignmentResult { match config.chaining_strategy { - ChainingStrategySelector::None => align_a_star_template_switch_select_no_ts_strategy::< + ChainingStrategySelector::None => a_star_align_select_no_ts_strategy::< _, _, NodeOrd, @@ -241,27 +251,26 @@ fn align_a_star_template_switch_select_chaining_strategy< NoChainingStrategy, >(reference, query, config, costs), ChainingStrategySelector::PrecomputeOnly => { - align_a_star_template_switch_select_no_ts_strategy::< + /*a_star_align_select_no_ts_strategy::< _, _, NodeOrd, TemplateSwitchMinLength, PrecomputeOnlyChainingStrategy, - >(reference, query, config, costs) - } - ChainingStrategySelector::LowerBound => { - align_a_star_template_switch_select_no_ts_strategy::< - _, - _, - NodeOrd, - TemplateSwitchMinLength, - LowerBoundChainingStrategy, - >(reference, query, config, costs) + >(reference, query, config, costs)*/ + unimplemented!("No reason to precompute without using the information."); } + ChainingStrategySelector::LowerBound => a_star_align_select_no_ts_strategy::< + _, + _, + NodeOrd, + TemplateSwitchMinLength, + LowerBoundChainingStrategy, + >(reference, query, config, costs), } } -fn align_a_star_template_switch_select_no_ts_strategy< +fn a_star_align_select_no_ts_strategy< AlphabetType: Alphabet + Debug + Clone + Eq, SubsequenceType: GenomeSequence + ?Sized, NodeOrd: NodeOrdStrategy, @@ -274,7 +283,7 @@ fn align_a_star_template_switch_select_no_ts_strategy< costs: TemplateSwitchConfig, ) -> AlignmentResult { if config.no_ts { - align_a_star_template_switch_distance_call::< + a_star_align_select_total_length_strategy::< _, _, NodeOrd, @@ -283,7 +292,7 @@ fn align_a_star_template_switch_select_no_ts_strategy< MaxTemplateSwitchCountStrategy, >(reference, query, config, costs, 0) } else { - align_a_star_template_switch_distance_call::< + a_star_align_select_total_length_strategy::< _, _, NodeOrd, @@ -294,13 +303,62 @@ fn align_a_star_template_switch_select_no_ts_strategy< } } -fn align_a_star_template_switch_distance_call< +fn a_star_align_select_total_length_strategy< + AlphabetType: Alphabet + Debug + Clone + Eq, + SubsequenceType: GenomeSequence + ?Sized, + NodeOrd: NodeOrdStrategy, + TemplateSwitchMinLength: TemplateSwitchMinLengthStrategy, + Chaining: ChainingStrategy, + TemplateSwitchCount: TemplateSwitchCountStrategy, +>( + reference: &SubsequenceType, + query: &SubsequenceType, + config: &Config, + costs: TemplateSwitchConfig, + template_switch_count_memory: ::Memory, +) -> AlignmentResult { + match config.total_length_strategy { + TotalLengthStrategySelector::None => a_star_align_call::< + _, + _, + NodeOrd, + TemplateSwitchMinLength, + Chaining, + TemplateSwitchCount, + NoTemplateSwitchTotalLengthStrategy, + >( + reference, + query, + config, + costs, + template_switch_count_memory, + ), + TotalLengthStrategySelector::Maximise => a_star_align_call::< + _, + _, + NodeOrd, + TemplateSwitchMinLength, + Chaining, + TemplateSwitchCount, + MaxTemplateSwitchTotalLengthStrategy, + >( + reference, + query, + config, + costs, + template_switch_count_memory, + ), + } +} + +fn a_star_align_call< AlphabetType: Alphabet + Debug + Clone + Eq, SubsequenceType: GenomeSequence + ?Sized, NodeOrd: NodeOrdStrategy, TemplateSwitchMinLength: TemplateSwitchMinLengthStrategy, Chaining: ChainingStrategy, TemplateSwitchCount: TemplateSwitchCountStrategy, + TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy, >( reference: &SubsequenceType, query: &SubsequenceType, @@ -320,6 +378,7 @@ fn align_a_star_template_switch_distance_call< NoShortcutStrategy, AllowPrimaryMatchStrategy, NoPrunePrimaryRangeStrategy, + TemplateSwitchTotalLength, >, _, >( @@ -331,6 +390,7 @@ fn align_a_star_template_switch_distance_call< costs, config.cost_limit, config.memory_limit, + false, template_switch_count_memory, ) } diff --git a/lib_tsalign/src/a_star_aligner/gap_affine_edit_distance.rs b/lib_tsalign/src/a_star_aligner/gap_affine_edit_distance.rs index 13cfc01a..fc6d7f63 100644 --- a/lib_tsalign/src/a_star_aligner/gap_affine_edit_distance.rs +++ b/lib_tsalign/src/a_star_aligner/gap_affine_edit_distance.rs @@ -87,6 +87,10 @@ impl AStarNode for Node { Cost::zero() } + fn secondary_maximisable_score(&self) -> usize { + 0 + } + fn predecessor(&self) -> Option<&Self::Identifier> { self.predecessor.as_ref() } @@ -286,7 +290,7 @@ impl std::fmt::Display for Node { if let Some(predecessor) = predecessor { write!(f, "predecessor: {predecessor}; ")?; } - write!(f, "alignment_type: {predecessor_edge_type}")?; + write!(f, "alignment_type: {predecessor_edge_type}; ")?; write!(f, "cost: {cost}") } } diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance.rs index 9d019289..8f22e076 100644 --- a/lib_tsalign/src/a_star_aligner/template_switch_distance.rs +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance.rs @@ -8,6 +8,7 @@ use strategies::{ AlignmentStrategiesNodeMemory, AlignmentStrategySelector, node_ord::NodeOrdStrategy, primary_match::PrimaryMatchStrategy, template_switch_min_length::TemplateSwitchMinLengthStrategy, + template_switch_total_length::TemplateSwitchTotalLengthStrategy, }; mod alignment_type; @@ -68,6 +69,12 @@ impl AStarNode for Node { self.node_data.a_star_lower_bound } + fn secondary_maximisable_score(&self) -> usize { + self.strategies + .template_switch_total_length + .template_switch_total_length() + } + fn predecessor(&self) -> Option<&Self::Identifier> { self.node_data.predecessor.as_ref() } diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance/context.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance/context.rs index 850c1b24..8804d420 100644 --- a/lib_tsalign/src/a_star_aligner/template_switch_distance/context.rs +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance/context.rs @@ -19,6 +19,7 @@ use super::strategies::secondary_deletion::SecondaryDeletionStrategy; use super::strategies::shortcut::ShortcutStrategy; use super::strategies::template_switch_count::TemplateSwitchCountStrategy; use super::strategies::template_switch_min_length::TemplateSwitchMinLengthStrategy; +use super::strategies::template_switch_total_length::TemplateSwitchTotalLengthStrategy; use super::strategies::{AlignmentStrategiesNodeMemory, AlignmentStrategySelector}; use super::{AlignmentType, Identifier, NodeData}; @@ -50,6 +51,10 @@ pub struct Context< cost_limit: Option, memory_limit: Option, + /// Force the search to run in label-correcting mode. + /// + /// This is for debug purposes only. + force_label_correcting: bool, } pub struct Memory { @@ -80,6 +85,7 @@ impl< memory: Memory, cost_limit: Option, memory_limit: Option, + force_label_correcting: bool, ) -> Self { Self { reference, @@ -93,6 +99,7 @@ impl< memory, cost_limit, memory_limit, + force_label_correcting, } } } @@ -386,7 +393,7 @@ impl< if template_switch_first_offset >= 0 && match template_switch_direction { TemplateSwitchDirection::Forward => { - (secondary_index + self.config.min_length as isize) + (secondary_index + self.config.template_switch_min_length as isize) < secondary_length as isize } TemplateSwitchDirection::Reverse => { @@ -418,7 +425,7 @@ impl< && match template_switch_direction { TemplateSwitchDirection::Forward => secondary_index > 0, TemplateSwitchDirection::Reverse => { - secondary_index > self.config.min_length as isize + secondary_index > self.config.template_switch_min_length as isize } } { @@ -445,11 +452,11 @@ impl< if match template_switch_direction { TemplateSwitchDirection::Forward => { secondary_index >= 0 - && (secondary_index + self.config.min_length as isize) + && (secondary_index + self.config.template_switch_min_length as isize) <= secondary_length as isize } TemplateSwitchDirection::Reverse => { - secondary_index >= self.config.min_length as isize + secondary_index >= self.config.template_switch_min_length as isize && secondary_index <= secondary_length as isize } } { @@ -717,6 +724,10 @@ impl< fn memory_limit(&self) -> Option { self.memory_limit } + + fn is_label_setting(&self) -> bool { + !::makes_label_correcting() && !self.force_label_correcting + } } fn generate_output_mapper_function< diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch.rs index 3ae9cccb..23f38ff8 100644 --- a/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch.rs +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch.rs @@ -25,6 +25,7 @@ use crate::{ secondary_deletion::ForbidSecondaryDeletionStrategy, shortcut::NoShortcutStrategy, template_switch_count::MaxTemplateSwitchCountStrategy, template_switch_min_length::NoTemplateSwitchMinLengthStrategy, + template_switch_total_length::NoTemplateSwitchTotalLengthStrategy, }, }, }, @@ -58,6 +59,7 @@ type TSLBAlignmentStrategies = AlignmentStrategySelection< NoShortcutStrategy, AllowPrimaryMatchStrategy, NoPrunePrimaryRangeStrategy, + NoTemplateSwitchTotalLengthStrategy, >; impl TemplateSwitchLowerBoundMatrix { @@ -102,6 +104,7 @@ impl TemplateSwitchLowerBoundMatrix { }, None, None, + false, ), ); let root_xy = genome_length / 2; @@ -314,7 +317,7 @@ fn generate_template_switch_lower_bound_config = AlignmentStrategySelection< TemplateSwitchLowerBoundShortcutStrategy, MaxConsecutivePrimaryMatchStrategy, NoPrunePrimaryRangeStrategy, + NoTemplateSwitchTotalLengthStrategy, >; impl TemplateSwitchAlignmentLowerBoundMatrix { @@ -111,6 +113,7 @@ impl TemplateSwitchAlignmentLowerBoundMatrix { }, None, None, + false, ), ); a_star.initialise(); @@ -241,7 +244,7 @@ fn generate_template_switch_alignment_lower_bound_config< TemplateSwitchConfig { left_flank_length: config.left_flank_length, right_flank_length: config.right_flank_length, - min_length: usize::MAX, + template_switch_min_length: usize::MAX, base_cost: BaseCost::new_max(), diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies.rs index 51c1d0db..09485d9b 100644 --- a/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies.rs +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies.rs @@ -10,6 +10,7 @@ use secondary_deletion::SecondaryDeletionStrategy; use shortcut::ShortcutStrategy; use template_switch_count::TemplateSwitchCountStrategy; use template_switch_min_length::TemplateSwitchMinLengthStrategy; +use template_switch_total_length::TemplateSwitchTotalLengthStrategy; use super::{AlignmentType, Context, Identifier}; @@ -21,6 +22,7 @@ pub mod secondary_deletion; pub mod shortcut; pub mod template_switch_count; pub mod template_switch_min_length; +pub mod template_switch_total_length; pub trait AlignmentStrategySelector: Eq + Clone + std::fmt::Debug { type Alphabet: Alphabet; @@ -33,6 +35,7 @@ pub trait AlignmentStrategySelector: Eq + Clone + std::fmt::Debug { type Shortcut: ShortcutStrategy; type PrimaryMatch: PrimaryMatchStrategy; type PrimaryRange: PrimaryRangeStrategy; + type TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy; } #[derive(Debug, Clone, Eq, PartialEq)] @@ -41,6 +44,7 @@ pub struct AlignmentStrategiesNodeMemory { pub template_switch_min_length_strategy: Selector::TemplateSwitchMinLength, pub template_switch_count: Selector::TemplateSwitchCount, pub primary_match: Selector::PrimaryMatch, + pub template_switch_total_length: Selector::TemplateSwitchTotalLength, } pub trait AlignmentStrategy: Eq + Clone + std::fmt::Debug { @@ -79,6 +83,9 @@ impl AlignmentStrategiesNodeMemory AlignmentStrategiesNodeMemory, PrimaryMatch: PrimaryMatchStrategy, PrimaryRange: PrimaryRangeStrategy, + TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy, > { #[allow(clippy::type_complexity)] phantom_data: PhantomData<( @@ -141,6 +154,7 @@ pub struct AlignmentStrategySelection< Shortcut, PrimaryMatch, PrimaryRange, + TemplateSwitchTotalLength, )>, } @@ -155,6 +169,7 @@ impl< Shortcut: ShortcutStrategy, PrimaryMatch: PrimaryMatchStrategy, PrimaryRange: PrimaryRangeStrategy, + TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy, > AlignmentStrategySelector for AlignmentStrategySelection< AlphabetType, @@ -167,6 +182,7 @@ impl< Shortcut, PrimaryMatch, PrimaryRange, + TemplateSwitchTotalLength, > { type Alphabet = AlphabetType; @@ -179,6 +195,7 @@ impl< type Shortcut = Shortcut; type PrimaryMatch = PrimaryMatch; type PrimaryRange = PrimaryRange; + type TemplateSwitchTotalLength = TemplateSwitchTotalLength; } impl< @@ -192,6 +209,7 @@ impl< Shortcut: ShortcutStrategy, PrimaryMatch: PrimaryMatchStrategy, PrimaryRange: PrimaryRangeStrategy, + TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy, > Debug for AlignmentStrategySelection< AlphabetType, @@ -204,6 +222,7 @@ impl< Shortcut, PrimaryMatch, PrimaryRange, + TemplateSwitchTotalLength, > { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -222,6 +241,7 @@ impl< Shortcut: ShortcutStrategy, PrimaryMatch: PrimaryMatchStrategy, PrimaryRange: PrimaryRangeStrategy, + TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy, > Clone for AlignmentStrategySelection< AlphabetType, @@ -234,6 +254,7 @@ impl< Shortcut, PrimaryMatch, PrimaryRange, + TemplateSwitchTotalLength, > { fn clone(&self) -> Self { @@ -254,6 +275,7 @@ impl< Shortcut: ShortcutStrategy, PrimaryMatch: PrimaryMatchStrategy, PrimaryRange: PrimaryRangeStrategy, + TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy, > PartialEq for AlignmentStrategySelection< AlphabetType, @@ -266,6 +288,7 @@ impl< Shortcut, PrimaryMatch, PrimaryRange, + TemplateSwitchTotalLength, > { fn eq(&self, other: &Self) -> bool { @@ -284,6 +307,7 @@ impl< Shortcut: ShortcutStrategy, PrimaryMatch: PrimaryMatchStrategy, PrimaryRange: PrimaryRangeStrategy, + TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy, > Eq for AlignmentStrategySelection< AlphabetType, @@ -296,6 +320,7 @@ impl< Shortcut, PrimaryMatch, PrimaryRange, + TemplateSwitchTotalLength, > { } diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies/template_switch_min_length.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies/template_switch_min_length.rs index 6e9d75f6..ec0b4334 100644 --- a/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies/template_switch_min_length.rs +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies/template_switch_min_length.rs @@ -145,7 +145,10 @@ impl TemplateSwitchMinLengthStrategy .memory .template_switch_min_length .insert(memory_key, lower_bound); - secondary_root_node.node_data.a_star_lower_bound += lower_bound; + secondary_root_node.node_data.a_star_lower_bound = secondary_root_node + .node_data + .a_star_lower_bound + .max(lower_bound); secondary_root_node } else { @@ -255,9 +258,9 @@ impl< let Identifier::Secondary { length, .. } = node.node_data.identifier else { unreachable!("A non-secondary node was closed before a target was closed.") }; - debug_assert!(length <= self.context.config.min_length); + debug_assert!(length <= self.context.config.template_switch_min_length); - length == self.context.config.min_length + length == self.context.config.template_switch_min_length } fn cost_limit(&self) -> Option { diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies/template_switch_total_length.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies/template_switch_total_length.rs new file mode 100644 index 00000000..7992c4ee --- /dev/null +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance/strategies/template_switch_total_length.rs @@ -0,0 +1,109 @@ +use compact_genome::interface::sequence::GenomeSequence; + +use crate::a_star_aligner::template_switch_distance::{AlignmentType, Context, Identifier}; + +use super::{AlignmentStrategy, AlignmentStrategySelector, primary_match::PrimaryMatchStrategy}; + +pub trait TemplateSwitchTotalLengthStrategy: AlignmentStrategy { + fn template_switch_total_length(&self) -> usize; + + /// True if this choice of strategy makes the search label correcting. + fn makes_label_correcting() -> bool; +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct NoTemplateSwitchTotalLengthStrategy; + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub struct MaxTemplateSwitchTotalLengthStrategy { + template_switch_total_length: usize, +} + +impl TemplateSwitchTotalLengthStrategy for NoTemplateSwitchTotalLengthStrategy { + fn template_switch_total_length(&self) -> usize { + 0 + } + + fn makes_label_correcting() -> bool { + false + } +} + +impl TemplateSwitchTotalLengthStrategy for MaxTemplateSwitchTotalLengthStrategy { + fn template_switch_total_length(&self) -> usize { + self.template_switch_total_length + } + + fn makes_label_correcting() -> bool { + true + } +} + +impl AlignmentStrategy for NoTemplateSwitchTotalLengthStrategy { + fn create_root< + SubsequenceType: GenomeSequence + ?Sized, + Strategies: AlignmentStrategySelector, + >( + _context: &Context<'_, '_, SubsequenceType, Strategies>, + ) -> Self { + Self + } + + fn generate_successor< + SubsequenceType: GenomeSequence + ?Sized, + Strategies: AlignmentStrategySelector, + >( + &self, + _identifier: Identifier< + <::PrimaryMatch as PrimaryMatchStrategy< + ::Cost, + >>::IdentifierPrimaryExtraData, + >, + _alignment_type: AlignmentType, + _context: &Context<'_, '_, SubsequenceType, Strategies>, + ) -> Self { + *self + } +} + +impl AlignmentStrategy for MaxTemplateSwitchTotalLengthStrategy { + fn create_root< + SubsequenceType: GenomeSequence + ?Sized, + Strategies: AlignmentStrategySelector, + >( + _context: &Context<'_, '_, SubsequenceType, Strategies>, + ) -> Self { + Self { + template_switch_total_length: 0, + } + } + + fn generate_successor< + SubsequenceType: GenomeSequence + ?Sized, + Strategies: AlignmentStrategySelector, + >( + &self, + _identifier: Identifier< + <::PrimaryMatch as PrimaryMatchStrategy< + ::Cost, + >>::IdentifierPrimaryExtraData, + >, + alignment_type: AlignmentType, + _context: &Context<'_, '_, SubsequenceType, Strategies>, + ) -> Self { + let increment = if matches!( + alignment_type, + AlignmentType::SecondaryMatch + | AlignmentType::SecondarySubstitution + | AlignmentType::SecondaryInsertion + ) { + 1 + } else { + 0 + }; + + Self { + template_switch_total_length: self.template_switch_total_length + increment, + } + } +} diff --git a/lib_tsalign/src/config.rs b/lib_tsalign/src/config.rs index 92db95fa..391a235c 100644 --- a/lib_tsalign/src/config.rs +++ b/lib_tsalign/src/config.rs @@ -17,7 +17,7 @@ pub struct TemplateSwitchConfig { // Limits pub left_flank_length: isize, pub right_flank_length: isize, - pub min_length: usize, + pub template_switch_min_length: usize, // Base cost pub base_cost: BaseCost, @@ -164,7 +164,7 @@ impl Clone for TemplateSwitchConfig Default for TemplateSwitchConfig TemplateSwitchConfig AStarNode for Node { Cost::zero() } + fn secondary_maximisable_score(&self) -> usize { + 0 + } + fn predecessor(&self) -> Option<&Self::Identifier> { self.predecessor.as_ref() } diff --git a/test_files/twin_in_place_inversion_large.fa b/test_files/twin_in_place_inversion_large.fa new file mode 100644 index 00000000..517bc83f --- /dev/null +++ b/test_files/twin_in_place_inversion_large.fa @@ -0,0 +1,12 @@ +>reference +ATGACCCAGT +AAAAAAAAAA +GTAAACACCCGTTGGTGATCAGTTGTAAGGTGGACTTGCTTGTGGCATCAGCGACTAATG +TTTTTTTTTT +CTAACCCCAT +>query +ATGACCCAGT +AAAAAAAAAA +CATTAGTCGCTGATGCCACAAGCAAGTCCACCTTACAACTGATCACCAACGGGTGTTTAC +TTTTTTTTTT +CTAACCCCAT \ No newline at end of file diff --git a/tsalign/src/align.rs b/tsalign/src/align.rs index 8859b1fa..13da07be 100644 --- a/tsalign/src/align.rs +++ b/tsalign/src/align.rs @@ -30,7 +30,8 @@ use log::{LevelFilter, debug, info}; use simplelog::{ColorChoice, TermLogger, TerminalMode}; use template_switch_distance_type_selectors::{ TemplateSwitchChainingStrategySelector, TemplateSwitchMinLengthStrategySelector, - TemplateSwitchNodeOrdStrategySelector, align_a_star_template_switch_distance, + TemplateSwitchNodeOrdStrategySelector, TemplateSwitchTotalLengthStrategySelector, + align_a_star_template_switch_distance, }; mod template_switch_distance_type_selectors; @@ -77,6 +78,13 @@ pub struct Cli { #[clap(long, default_value = "none")] ts_chaining_strategy: TemplateSwitchChainingStrategySelector, + /// If set to maximise, the total length of template switches becomes a secondary search criterion. + /// + /// As a result, instead of reporting an arbitrary alignment with optimal cost, + /// tsalign selects an alignment with maximal total TS length out of all the alignments with optimal cost. + #[clap(long, default_value = "maximise")] + ts_total_length_strategy: TemplateSwitchTotalLengthStrategySelector, + /// If set, template switches are not allowed. /// /// Use this to compare a template switch alignment against an alignment with out template switches. @@ -85,7 +93,7 @@ pub struct Cli { /// A cost limit for the alignment. /// - /// If there is no alignment with that cost, the aligner will abort without result. + /// If there is no alignment with at most that cost, the aligner will abort without result. #[clap(long)] cost_limit: Option, @@ -95,6 +103,12 @@ pub struct Cli { #[clap(long)] memory_limit: Option, + /// Force the search to run in label-correcting mode. + /// + /// This is for debug purposes only. + #[clap(long)] + force_label_correcting: bool, + /// First character in the reference to start the alignment from. /// /// Skipped characters are ignored for computing this index. diff --git a/tsalign/src/align/template_switch_distance_type_selectors.rs b/tsalign/src/align/template_switch_distance_type_selectors.rs index 299b3614..4926ab9f 100644 --- a/tsalign/src/align/template_switch_distance_type_selectors.rs +++ b/tsalign/src/align/template_switch_distance_type_selectors.rs @@ -7,11 +7,8 @@ use lib_tsalign::{ alignment_geometry::{AlignmentCoordinates, AlignmentRange}, template_switch_distance::strategies::{ AlignmentStrategySelection, - chaining::{ - ChainingStrategy, LowerBoundChainingStrategy, NoChainingStrategy, - PrecomputeOnlyChainingStrategy, - }, - node_ord::{AntiDiagonalNodeOrdStrategy, CostOnlyNodeOrdStrategy, NodeOrdStrategy}, + chaining::{ChainingStrategy, LowerBoundChainingStrategy, NoChainingStrategy}, + node_ord::{AntiDiagonalNodeOrdStrategy, NodeOrdStrategy}, primary_match::AllowPrimaryMatchStrategy, primary_range::NoPrunePrimaryRangeStrategy, secondary_deletion::AllowSecondaryDeletionStrategy, @@ -24,6 +21,10 @@ use lib_tsalign::{ LookaheadTemplateSwitchMinLengthStrategy, NoTemplateSwitchMinLengthStrategy, TemplateSwitchMinLengthStrategy, }, + template_switch_total_length::{ + MaxTemplateSwitchTotalLengthStrategy, NoTemplateSwitchTotalLengthStrategy, + TemplateSwitchTotalLengthStrategy, + }, }, template_switch_distance_a_star_align, }, @@ -53,6 +54,12 @@ pub enum TemplateSwitchChainingStrategySelector { LowerBound, } +#[derive(Clone, ValueEnum)] +pub enum TemplateSwitchTotalLengthStrategySelector { + None, + Maximise, +} + pub fn align_a_star_template_switch_distance< AlphabetType: Alphabet + Debug + Clone + Eq, SubsequenceType: GenomeSequence + ?Sized, @@ -84,11 +91,12 @@ fn align_a_star_template_switch_distance_select_node_ord_strategy< ) { match cli.ts_node_ord_strategy { TemplateSwitchNodeOrdStrategySelector::CostOnly => { - align_a_star_template_switch_distance_select_template_switch_min_length_strategy::< + /*align_a_star_template_switch_distance_select_template_switch_min_length_strategy::< _, _, CostOnlyNodeOrdStrategy, - >(cli, reference, query, reference_name, query_name) + >(cli, reference, query, reference_name, query_name)*/ + unimplemented!("The other option appears to always be better."); } TemplateSwitchNodeOrdStrategySelector::AntiDiagonal => { align_a_star_template_switch_distance_select_template_switch_min_length_strategy::< @@ -154,13 +162,14 @@ fn align_a_star_template_switch_select_chaining_strategy< >(cli, reference, query, reference_name, query_name) } TemplateSwitchChainingStrategySelector::PrecomputeOnly => { - align_a_star_template_switch_select_no_ts_strategy::< + /*align_a_star_template_switch_select_no_ts_strategy::< _, _, NodeOrd, TemplateSwitchMinLength, PrecomputeOnlyChainingStrategy, - >(cli, reference, query, reference_name, query_name) + >(cli, reference, query, reference_name, query_name)*/ + unimplemented!("No reason to precompute without using the information."); } TemplateSwitchChainingStrategySelector::LowerBound => { align_a_star_template_switch_select_no_ts_strategy::< @@ -188,7 +197,7 @@ fn align_a_star_template_switch_select_no_ts_strategy< query_name: &str, ) { if cli.no_ts { - align_a_star_template_switch_distance_call::< + align_a_star_template_switch_select_template_switch_total_length_strategy::< _, _, NodeOrd, @@ -197,7 +206,7 @@ fn align_a_star_template_switch_select_no_ts_strategy< MaxTemplateSwitchCountStrategy, >(cli, reference, query, reference_name, query_name, 0) } else { - align_a_star_template_switch_distance_call::< + align_a_star_template_switch_select_template_switch_total_length_strategy::< _, _, NodeOrd, @@ -208,6 +217,61 @@ fn align_a_star_template_switch_select_no_ts_strategy< } } +fn align_a_star_template_switch_select_template_switch_total_length_strategy< + AlphabetType: Alphabet + Debug + Clone + Eq, + SubsequenceType: GenomeSequence + ?Sized, + NodeOrd: NodeOrdStrategy, + TemplateSwitchMinLength: TemplateSwitchMinLengthStrategy, + Chaining: ChainingStrategy, + TemplateSwitchCount: TemplateSwitchCountStrategy, +>( + cli: Cli, + reference: &SubsequenceType, + query: &SubsequenceType, + reference_name: &str, + query_name: &str, + template_switch_count_memory: ::Memory, +) { + match cli.ts_total_length_strategy { + TemplateSwitchTotalLengthStrategySelector::None => { + align_a_star_template_switch_distance_call::< + _, + _, + NodeOrd, + TemplateSwitchMinLength, + Chaining, + TemplateSwitchCount, + NoTemplateSwitchTotalLengthStrategy, + >( + cli, + reference, + query, + reference_name, + query_name, + template_switch_count_memory, + ) + } + TemplateSwitchTotalLengthStrategySelector::Maximise => { + align_a_star_template_switch_distance_call::< + _, + _, + NodeOrd, + TemplateSwitchMinLength, + Chaining, + TemplateSwitchCount, + MaxTemplateSwitchTotalLengthStrategy, + >( + cli, + reference, + query, + reference_name, + query_name, + template_switch_count_memory, + ) + } + } +} + fn align_a_star_template_switch_distance_call< AlphabetType: Alphabet + Debug + Clone + Eq, SubsequenceType: GenomeSequence + ?Sized, @@ -215,6 +279,7 @@ fn align_a_star_template_switch_distance_call< TemplateSwitchMinLength: TemplateSwitchMinLengthStrategy, Chaining: ChainingStrategy, TemplateSwitchCount: TemplateSwitchCountStrategy, + TemplateSwitchTotalLength: TemplateSwitchTotalLengthStrategy, >( cli: Cli, reference: &SubsequenceType, @@ -249,6 +314,7 @@ fn align_a_star_template_switch_distance_call< NoShortcutStrategy, AllowPrimaryMatchStrategy, NoPrunePrimaryRangeStrategy, + TemplateSwitchTotalLength, >, _, >( @@ -260,6 +326,7 @@ fn align_a_star_template_switch_distance_call< costs, cli.cost_limit, cli.memory_limit, + cli.force_label_correcting, template_switch_count_memory, ); info!("Finished aligning");