diff --git a/Cargo.lock b/Cargo.lock index 679f1f00..4e6bf14e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -199,9 +199,9 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "compact-genome" -version = "12.3.0" +version = "12.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "503cda65a29118e687d71d64d7a3c1e4d21f235a3aa55b22cd2902a99d04d9ff" +checksum = "96e9ce285aba3fadbf01e81018d11cf97fd7c2399f15def73084e04e991e67e9" dependencies = [ "bitvec", "enum-iterator", @@ -1232,6 +1232,7 @@ dependencies = [ "simplelog", "toml", "traitsequence", + "utf8-chars", ] [[package]] @@ -1327,6 +1328,15 @@ dependencies = [ "xmlwriter", ] +[[package]] +name = "utf8-chars" +version = "3.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f598f797138b219a4560b4e9c53c255e872e267c9e3fdcc75aa59a2a90953bcd" +dependencies = [ + "arrayvec", +] + [[package]] name = "utf8parse" version = "0.2.2" diff --git a/Cargo.toml b/Cargo.toml index 2369dd06..2c1f08da 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ package.repository = "https://github.com/sebschmi/template-switch-aligner" [workspace.dependencies] serde = "1.0.219" -compact-genome = "12.3.0" +compact-genome = "12.5.0" traitsequence = "8.1.2" log = "0.4.27" num-traits = "0.2.19" diff --git a/lib_tsalign/src/a_star_aligner.rs b/lib_tsalign/src/a_star_aligner.rs index 545e5f89..7454b757 100644 --- a/lib_tsalign/src/a_star_aligner.rs +++ b/lib_tsalign/src/a_star_aligner.rs @@ -154,7 +154,7 @@ pub fn template_switch_distance_a_star_align< query: &SubsequenceType, reference_name: &str, query_name: &str, - range: Option, + range: AlignmentRange, config: &config::TemplateSwitchConfig< Strategies::Alphabet, ::Cost, @@ -167,6 +167,9 @@ pub fn template_switch_distance_a_star_align< where Strategies::Cost: From, { + debug!("Reference sequence: {}", reference.as_string()); + debug!("Query sequence: {}", query.as_string()); + let memory = Memory { template_switch_min_length: AlignmentResult> AlignmentResult { + /// Heuristically extend the alignment beyond the alignment range as long as this does not increase the alignment cost. + /// + /// Returns the amount of steps the alignment was extended. #[allow(clippy::too_many_lines)] - pub fn extend_beyond_range_with_equal_cost< + pub fn extend_beyond_range_without_increasing_cost< AlphabetType: Alphabet, SubsequenceType: GenomeSequence + ?Sized, >( &mut self, reference: &SubsequenceType, query: &SubsequenceType, - range: &mut Option, + range: &mut AlignmentRange, config: &TemplateSwitchConfig, - ) { - let Some(range) = range else { - trace!("No range given to extend beyond."); - return; - }; + ) -> usize { let Self::WithTarget { alignment, statistics, } = self else { trace!("There is no alignment, therefore we cannot postprocess it."); - return; + return 0; }; if config.left_flank_length > 0 || config.right_flank_length > 0 { warn!("Alignment extension does not support flanks"); - return; + return 0; } // Compute cost before extending. @@ -280,6 +279,7 @@ impl> "computed cost {current_cost} != alignment cost {alignment_cost}; {}", alignment.cigar() ); + let mut extension_steps = 0; // Extend left with equal cost. while let Some(new_range) = range.move_offsets_left() { @@ -312,8 +312,10 @@ impl> config, ); + trace!("Extending left to {new_range}. Cost {current_cost} -> {new_cost}"); + if new_cost > current_cost { - // If cost is not equal, revert alignment and break. + // If cost is increasing, revert alignment and break. alignment.inner_mut()[0].0 -= 1; if alignment.inner_mut()[0].0 == 0 { alignment.inner_mut().remove(0); @@ -322,6 +324,7 @@ impl> } current_cost = new_cost; *range = new_range; + extension_steps += 1; } // Extend right with equal cost. @@ -364,8 +367,10 @@ impl> config, ); + trace!("Extending right to {new_range}. Cost {current_cost} -> {new_cost}"); + if new_cost > current_cost { - // If cost is not equal, revert alignment and break. + // If cost is increasing, revert alignment and break. let Some((n, _)) = alignment.inner_mut().last_mut() else { unreachable!("As we have just added at least one element") }; @@ -373,14 +378,17 @@ impl> if *n == 0 { alignment.inner_mut().pop(); } + break; } current_cost = new_cost; *range = new_range; + extension_steps += 1; } // Update statistics from updated range statistics.reference_offset = range.reference_offset(); statistics.query_offset = range.query_offset(); + extension_steps } #[allow(clippy::too_many_lines)] @@ -391,7 +399,7 @@ impl> &mut self, reference: &SubsequenceType, query: &SubsequenceType, - range: &Option, + range: &AlignmentRange, config: &TemplateSwitchConfig, ) { let Self::WithTarget { @@ -407,13 +415,8 @@ impl> return; } - let reference_offset = range.as_ref().map_or( - 0, - super::alignment_geometry::AlignmentRange::reference_offset, - ); - let query_offset = range - .as_ref() - .map_or(0, super::alignment_geometry::AlignmentRange::query_offset); + let reference_offset = range.reference_offset(); + let query_offset = range.query_offset(); for i in 0..alignment.inner_mut().len() { let alignment_type = alignment.inner_mut()[i].1; diff --git a/lib_tsalign/src/a_star_aligner/configurable_a_star_align.rs b/lib_tsalign/src/a_star_aligner/configurable_a_star_align.rs index 02832c0c..06afa9cd 100644 --- a/lib_tsalign/src/a_star_aligner/configurable_a_star_align.rs +++ b/lib_tsalign/src/a_star_aligner/configurable_a_star_align.rs @@ -5,6 +5,7 @@ use compact_genome::{ interface::sequence::{GenomeSequence, OwnedGenomeSequence}, }; use generic_a_star::cost::U64Cost; +use traitsequence::interface::Sequence; use crate::{ a_star_aligner::{ @@ -281,7 +282,8 @@ impl Aligner { query.as_genome_subsequence(), data.reference_name, data.query_name, - data.ranges, + data.ranges + .unwrap_or_else(|| AlignmentRange::new_complete(reference.len(), query.len())), &self.costs, cost_limit, data.memory_limit, diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance/context.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance/context.rs index 8804d420..d0e05436 100644 --- a/lib_tsalign/src/a_star_aligner/template_switch_distance/context.rs +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance/context.rs @@ -80,7 +80,7 @@ impl< query: &'query SubsequenceType, reference_name: &str, query_name: &str, - range: Option, + range: AlignmentRange, config: TemplateSwitchConfig, memory: Memory, cost_limit: Option, @@ -92,8 +92,7 @@ impl< query, reference_name: reference_name.to_owned(), query_name: query_name.to_owned(), - range: range - .unwrap_or_else(|| AlignmentRange::new_complete(reference.len(), query.len())), + range, config, a_star_buffers: Default::default(), memory, diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch.rs index 23f38ff8..52cdbee0 100644 --- a/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch.rs +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch.rs @@ -10,9 +10,11 @@ use compact_genome::{ }; use generic_a_star::{AStar, AStarNode, AStarResult, cost::AStarCost}; use log::{debug, info, trace}; +use traitsequence::interface::Sequence; use crate::{ a_star_aligner::{ + alignment_geometry::AlignmentRange, alignment_result::IAlignmentType, template_switch_distance::{ Context, Identifier, Node, @@ -93,7 +95,7 @@ impl TemplateSwitchLowerBoundMatrix { genome.as_genome_subsequence(), "", "", - None, + AlignmentRange::new_complete(genome.len(), genome.len()), lower_bound_config.clone(), Memory { template_switch_min_length: (), diff --git a/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch_alignment.rs b/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch_alignment.rs index d8946f1b..ee1877f2 100644 --- a/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch_alignment.rs +++ b/lib_tsalign/src/a_star_aligner/template_switch_distance/lower_bounds/template_switch_alignment.rs @@ -11,9 +11,11 @@ use compact_genome::{ use generic_a_star::{AStar, AStarNode, AStarResult, cost::AStarCost}; use log::{debug, info, trace}; use ndarray::Array2; +use traitsequence::interface::Sequence; use crate::{ a_star_aligner::{ + alignment_geometry::AlignmentRange, alignment_result::IAlignmentType, template_switch_distance::{ Context, Identifier, @@ -95,7 +97,7 @@ impl TemplateSwitchAlignmentLowerBoundMatrix { genome.as_genome_subsequence(), "", "", - None, + AlignmentRange::new_complete(genome.len(), genome.len()), lower_bound_config.clone(), Memory { template_switch_min_length: (), diff --git a/test_files/twin_embedded.fa b/test_files/twin_embedded.fa new file mode 100644 index 00000000..ed76964d --- /dev/null +++ b/test_files/twin_embedded.fa @@ -0,0 +1,4 @@ +>REF +ACCA|GGAA|TTGG +>QRY +AGCA|GGAA|TAGG \ No newline at end of file diff --git a/tsalign-tests/tests/integration.rs b/tsalign-tests/tests/integration.rs index d990c9ca..6c697d13 100644 --- a/tsalign-tests/tests/integration.rs +++ b/tsalign-tests/tests/integration.rs @@ -22,3 +22,8 @@ fn test_align_with_cost_limit() -> Result<()> { fn test_align_with_memory_limit() -> Result<()> { run_in_repo_root("align -p test_files/twin_100_0.01.fa --memory-limit 1000") } + +#[test] +fn test_align_with_embedded_rq_ranges() -> Result<()> { + run_in_repo_root("align -p test_files/twin_embedded.fa --use-embedded-rq-ranges") +} diff --git a/tsalign/Cargo.toml b/tsalign/Cargo.toml index a024336e..a6a38e85 100644 --- a/tsalign/Cargo.toml +++ b/tsalign/Cargo.toml @@ -21,3 +21,4 @@ toml = "0.8.19" log.workspace = true simplelog = "0.12.2" anyhow = "1.0.97" +utf8-chars = "3.0.5" diff --git a/tsalign/src/align.rs b/tsalign/src/align.rs index 13da07be..eab1df86 100644 --- a/tsalign/src/align.rs +++ b/tsalign/src/align.rs @@ -5,7 +5,7 @@ use std::{ path::PathBuf, }; -use anyhow::Result; +use anyhow::{Result, anyhow, ensure}; use clap::{Args, Parser, ValueEnum}; use compact_genome::{ implementation::{ @@ -15,13 +15,20 @@ use compact_genome::{ rna_alphabet::RnaAlphabet, rna_alphabet_or_n::RnaAlphabetOrN, rna_iupac_nucleic_acid_alphabet::RnaIupacNucleicAcidAlphabet, }, + vec_sequence::VectorGenome, vec_sequence_store::VectorSequenceStore, }, - interface::{alphabet::Alphabet, sequence::GenomeSequence, sequence_store::SequenceStore}, - io::fasta::read_fasta_file, + interface::{ + alphabet::Alphabet, + sequence::{GenomeSequence, OwnedGenomeSequence}, + sequence_store::SequenceStore, + }, }; use lib_tsalign::{ - a_star_aligner::{gap_affine_edit_distance, gap_affine_edit_distance_a_star_align}, + a_star_aligner::{ + alignment_geometry::{AlignmentCoordinates, AlignmentRange}, + gap_affine_edit_distance, gap_affine_edit_distance_a_star_align, + }, alignment_configuration::AlignmentConfiguration, alignment_matrix::AlignmentMatrix, costs::U64Cost, @@ -34,6 +41,9 @@ use template_switch_distance_type_selectors::{ align_a_star_template_switch_distance, }; +use crate::align::fasta_parser::{parse_pair_fasta_file, parse_single_fasta_file}; + +mod fasta_parser; mod template_switch_distance_type_selectors; #[derive(Parser)] @@ -143,6 +153,15 @@ pub struct Cli { /// Template switch inners can still align to the full sequence. #[clap(long)] rq_ranges: Option, + + /// Use the ranges for alignment that are embedded in the fasta file(s). + /// + /// The start and end of the range must be delimited by '|' characters in the fasta file. + /// Both reference and query must contain exactly two delimiters each. + /// + /// Template switch inners can still align to the full sequence. + #[clap(long)] + use_embedded_rq_ranges: bool, } #[derive(Args)] @@ -202,100 +221,142 @@ pub fn cli(cli: Cli) -> Result<()> { if cli.alignment_method != AlignmentMethod::AStarTemplateSwitch && cli.alphabet != InputAlphabet::Dna { + // Only A*-TS algo supports alphabets other than DNA. panic!("Unsupported alphabet type: {:?}", cli.alphabet); } match cli.alphabet { - InputAlphabet::Dna => execute_with_alphabet::(cli), - InputAlphabet::DnaN => execute_with_alphabet::(cli), - InputAlphabet::Rna => execute_with_alphabet::(cli), - InputAlphabet::RnaN => execute_with_alphabet::(cli), - InputAlphabet::DnaIupac => execute_with_alphabet::(cli), - InputAlphabet::RnaIupac => execute_with_alphabet::(cli), + InputAlphabet::Dna => execute_with_alphabet::(cli)?, + InputAlphabet::DnaN => execute_with_alphabet::(cli)?, + InputAlphabet::Rna => execute_with_alphabet::(cli)?, + InputAlphabet::RnaN => execute_with_alphabet::(cli)?, + InputAlphabet::DnaIupac => execute_with_alphabet::(cli)?, + InputAlphabet::RnaIupac => execute_with_alphabet::(cli)?, } Ok(()) } -fn execute_with_alphabet(cli: Cli) { - let mut skip_characters = Vec::new(); - for character in cli.skip_characters.bytes().map(usize::from) { - if skip_characters.len() <= character { - skip_characters.resize(character + 1, false); - } - skip_characters[character] = true; - } - let skip_characters = skip_characters; - - let mut sequence_store = VectorSequenceStore::::new(); - let sequences = if let Some(CliPairInput { pair_fasta }) = &cli.input.pair_input { - info!("Loading pair file {pair_fasta:?}"); - let sequences = read_fasta_file( - pair_fasta, - &mut sequence_store, - false, - true, - &skip_characters, - ) - .unwrap_or_else(|error| panic!("Error loading pair file: {error}")); - - assert_eq!( - sequences.len(), - 2, - "Pair sequence file contains not exactly two records" - ); - - sequences - } else if let Some(CliSeparateInput { reference, query }) = &cli.input.separate_input { - info!("Loading reference file {reference:?}"); - let mut sequences = read_fasta_file( - reference, - &mut sequence_store, - false, - true, - &skip_characters, - ) - .unwrap(); - assert_eq!( - sequences.len(), - 1, - "Reference sequence file contains not exactly one record, but {}", - sequences.len() +fn execute_with_alphabet( + cli: Cli, +) -> Result<()> { + // Load input sequences. + let (mut reference_record, mut query_record) = + if let Some(CliPairInput { pair_fasta }) = &cli.input.pair_input { + info!("Loading pair file {pair_fasta:?}"); + parse_pair_fasta_file(pair_fasta)? + } else if let Some(CliSeparateInput { reference, query }) = &cli.input.separate_input { + info!("Loading reference file {reference:?}"); + let reference = parse_single_fasta_file(reference)?; + + info!("Loading query file {query:?}"); + let query = parse_single_fasta_file(query)?; + + (reference, query) + } else { + return Err(anyhow!("No fasta input file given")); + }; + + // Remove skip characters. + let skip_characters = cli.skip_characters.chars().collect::>(); + ensure!( + !cli.use_embedded_rq_ranges || !skip_characters.contains(&'|'), + "Using embedded RQ ranges, but '|' is part of the skip characters" + ); + reference_record + .sequence_handle + .retain(|c| !skip_characters.contains(&c)); + query_record + .sequence_handle + .retain(|c| !skip_characters.contains(&c)); + + // Parse RQ ranges. + let range = if cli.use_embedded_rq_ranges { + ensure!( + cli.rq_ranges.is_none() + && cli.reference_offset.is_none() + && cli.reference_limit.is_none() + && cli.query_offset.is_none() + && cli.query_limit.is_none(), + "Redundant specification of RQ ranges" ); - info!("Loading query file {query:?}"); - sequences.extend( - read_fasta_file(query, &mut sequence_store, false, true, &skip_characters).unwrap(), + let reference_offset = reference_record.sequence_handle.find('|').ok_or_else(|| { + anyhow!("Using embedded RQ ranges, but reference sequence contains no '|' character.") + })?; + let reference_limit = reference_offset + reference_record.sequence_handle[reference_offset+1..].find('|').ok_or_else(|| { + anyhow!("Using embedded RQ ranges, but reference sequence contains only one '|' character.") + })?; + ensure!( + reference_record.sequence_handle[reference_limit + 2..] + .find('|') + .is_none(), + "Using embedded RQ ranges, but reference sequence contains more than two '|' characters" ); - assert_eq!( - sequences.len(), - 2, - "Query sequence file contains not exactly one record, but {}", - sequences.len() - 1, + reference_record.sequence_handle = reference_record.sequence_handle.replace('|', ""); + + let query_offset = query_record.sequence_handle.find('|').ok_or_else(|| { + anyhow!("Using embedded RQ ranges, but query sequence contains no '|' character.") + })?; + let query_limit = query_offset + query_record.sequence_handle[query_offset+1..].find('|').ok_or_else(|| { + anyhow!("Using embedded RQ ranges, but query sequence contains only one '|' character.") + })?; + ensure!( + query_record.sequence_handle[query_limit + 2..] + .find('|') + .is_none(), + "Using embedded RQ ranges, but query sequence contains more than two '|' characters" ); + query_record.sequence_handle = query_record.sequence_handle.replace('|', ""); - sequences + AlignmentRange::new_offset_limit( + AlignmentCoordinates::new(reference_offset, query_offset), + AlignmentCoordinates::new(reference_limit, query_limit), + ) } else { - panic!("No fasta input file given") + parse_range( + &cli, + reference_record.sequence_handle.len(), + query_record.sequence_handle.len(), + ) }; - let reference = sequence_store.get(&sequences[0].sequence_handle); - let query = sequence_store.get(&sequences[1].sequence_handle); + // Move sequences into sequence store. + let mut sequence_store = VectorSequenceStore::::new(); + let reference_record = + reference_record.try_transform_handle::<_, anyhow::Error>(|sequence| { + Ok(sequence_store.add( + &VectorGenome::from_slice_u8(sequence.as_bytes()).map_err(|error| { + anyhow!("Reference contains non-alphabet character: {error}") + })?, + )) + })?; + let query_record = query_record.try_transform_handle::<_, anyhow::Error>(|sequence| { + Ok(sequence_store.add( + &VectorGenome::from_slice_u8(sequence.as_bytes()) + .map_err(|error| anyhow!("Query contains non-alphabet character: {error}"))?, + )) + })?; + let reference_sequence = sequence_store.get(&reference_record.sequence_handle); + let query_sequence = sequence_store.get(&query_record.sequence_handle); debug!("Choosing alignment method..."); match cli.alignment_method { - AlignmentMethod::Matrix => align_matrix(cli, reference, query), + AlignmentMethod::Matrix => align_matrix(cli, reference_sequence, query_sequence), AlignmentMethod::AStarGapAffine => { - align_a_star_gap_affine_edit_distance(cli, reference, query) + align_a_star_gap_affine_edit_distance(cli, reference_sequence, query_sequence) } AlignmentMethod::AStarTemplateSwitch => align_a_star_template_switch_distance( cli, - reference, - query, - &format!("{} {}", sequences[0].id, sequences[0].comment), - &format!("{} {}", sequences[1].id, sequences[1].comment), + reference_sequence, + query_sequence, + range, + &format!("{} {}", reference_record.id, reference_record.comment), + &format!("{} {}", query_record.id, query_record.comment), ), } + + Ok(()) } fn align_matrix< @@ -379,3 +440,88 @@ fn align_a_star_gap_affine_edit_distance< println!("{alignment}"); } + +fn parse_range(cli: &Cli, reference_length: usize, query_length: usize) -> AlignmentRange { + let complete_reference_range = 0..reference_length; + let complete_query_range = 0..query_length; + + let (reference_range, query_range) = if let Some(rq_ranges) = cli.rq_ranges.as_ref() { + let mut rq_ranges = rq_ranges.chars().peekable(); + + let mut reference_range = None; + let mut query_range = None; + + while rq_ranges.peek().is_some() { + let rq = rq_ranges.next().unwrap(); + + while let Some(c) = rq_ranges.peek() { + if c.is_whitespace() { + rq_ranges.next().unwrap(); + } else { + break; + } + } + + let mut offset = String::new(); + while let Some(c) = rq_ranges.peek() { + if c.is_numeric() { + offset.push(rq_ranges.next().unwrap()); + } else { + break; + } + } + + // Parse .. + assert_eq!(rq_ranges.next(), Some('.')); + assert_eq!(rq_ranges.next(), Some('.')); + + let mut limit = String::new(); + while let Some(c) = rq_ranges.peek() { + if c.is_numeric() { + limit.push(rq_ranges.next().unwrap()); + } else { + break; + } + } + + let offset = offset.parse().unwrap(); + let limit = limit.parse().unwrap(); + + match rq { + 'R' => { + assert!(reference_range.is_none()); + reference_range = Some(offset..limit) + } + 'Q' => { + assert!(query_range.is_none()); + query_range = Some(offset..limit) + } + _ => panic!(), + } + } + + assert!( + reference_range.is_none() + || (cli.reference_offset.is_none() && cli.reference_limit.is_none()) + ); + assert!(query_range.is_none() || (cli.query_offset.is_none() && cli.query_limit.is_none())); + + ( + reference_range.unwrap_or(complete_reference_range), + query_range.unwrap_or(complete_query_range), + ) + } else { + (complete_reference_range, complete_query_range) + }; + + AlignmentRange::new_offset_limit( + AlignmentCoordinates::new( + cli.reference_offset.unwrap_or(reference_range.start), + cli.query_offset.unwrap_or(query_range.start), + ), + AlignmentCoordinates::new( + cli.reference_limit.unwrap_or(reference_range.end), + cli.query_limit.unwrap_or(query_range.end), + ), + ) +} diff --git a/tsalign/src/align/fasta_parser.rs b/tsalign/src/align/fasta_parser.rs new file mode 100644 index 00000000..633b1cdc --- /dev/null +++ b/tsalign/src/align/fasta_parser.rs @@ -0,0 +1,212 @@ +use anyhow::{Result, anyhow}; +use compact_genome::io::fasta::FastaRecord; +use log::debug; +use std::{ + fs::File, + io::{BufReader, Read}, + path::Path, +}; +use utf8_chars::BufReadCharsExt; + +pub fn parse_pair_fasta_file( + path: impl AsRef, +) -> Result<(FastaRecord, FastaRecord)> { + let mut result = parse_fasta_file(path, true)?; + let second = result.remove(1); + let first = result.remove(0); + Ok((first, second)) +} + +pub fn parse_single_fasta_file(path: impl AsRef) -> Result> { + Ok(parse_fasta_file(path, false)?.remove(0)) +} + +fn parse_fasta_file(path: impl AsRef, pair: bool) -> Result>> { + let path = path.as_ref(); + debug!("Parsing fasta file {path:?}"); + + enum State { + FileStart, + ParseId, + ParseComment, + ParseSequence, + } + + let mut input = CharacterIterator::new(BufReader::new( + File::open(path).map_err(|error| anyhow!("Unable to open input file {path:?}: {error}"))?, + )) + .peekable(); + let mut state = State::FileStart; + let mut current_record = FastaRecord { + id: String::new(), + comment: String::new(), + sequence_handle: String::new(), + }; + let mut records = Vec::>::new(); + + 'parser: loop { + match state { + State::FileStart => { + let mut newline = true; + + 'find_first_record: loop { + match input.next() { + Some(result) => match result? { + Character::Newline => newline = true, + Character::RecordStart => { + if newline { + state = State::ParseId; + break 'find_first_record; + } else { + return Err(anyhow!( + "First fasta record is not preceded by a newline character" + )); + } + } + Character::Other(c) => { + newline = false; + if !c.is_whitespace() { + return Err(anyhow!( + "Found non-whitespace character before first fasta record: {c}" + )); + } + } + }, + None => { + return Err(anyhow!("Input file {path:?} contains no fasta record")); + } + } + } + } + State::ParseId => 'collect_id: loop { + match input.next() { + Some(result) => match result? { + Character::Newline => { + state = State::ParseSequence; + break 'collect_id; + } + Character::RecordStart => current_record.id.push('>'), + Character::Other(c) => { + if c.is_whitespace() { + state = State::ParseComment; + break 'collect_id; + } else { + current_record.id.push(c); + } + } + }, + None => { + records.push(current_record); + break 'parser; + } + } + }, + State::ParseComment => 'collect_comment: loop { + match input.next() { + Some(result) => match result? { + Character::Newline => { + state = State::ParseSequence; + break 'collect_comment; + } + Character::RecordStart => current_record.comment.push('>'), + Character::Other(c) => current_record.comment.push(c), + }, + None => { + records.push(current_record); + break 'parser; + } + } + }, + State::ParseSequence => { + let mut newline = true; + + 'collect_sequence: loop { + match input.next() { + Some(result) => match result? { + Character::Newline => newline = true, + Character::RecordStart => { + if newline { + records.push(current_record); + current_record = FastaRecord { + id: String::new(), + comment: String::new(), + sequence_handle: String::new(), + }; + state = State::ParseId; + break 'collect_sequence; + } else { + current_record.sequence_handle.push('>'); + newline = false; + } + } + Character::Other(c) => { + current_record.sequence_handle.push(c); + newline = false; + } + }, + None => { + records.push(current_record); + break 'parser; + } + } + } + } + } + } + + if pair { + if records.len() != 2 { + Err(anyhow!( + "Expected paired fasta file with two records, but found {} records", + records.len() + )) + } else { + Ok(records) + } + } else if records.len() != 1 { + Err(anyhow!( + "Expected single-record fasta file, but found {} records", + records.len() + )) + } else { + Ok(records) + } +} + +enum Character { + Newline, + RecordStart, + Other(char), +} + +struct CharacterIterator { + reader: BufReader, +} + +impl CharacterIterator { + fn new(reader: BufReader) -> Self { + Self { reader } + } +} + +impl Iterator for CharacterIterator { + type Item = Result; + + fn next(&mut self) -> Option { + self.reader + .read_char_raw() + .map(|result| { + result.map(|c| { + if c == '\n' || c == '\r' { + Character::Newline + } else if c == '>' { + Character::RecordStart + } else { + Character::Other(c) + } + }) + }) + .map_err(|error| anyhow!("Error reading character from fasta input file: {error}")) + .transpose() + } +} diff --git a/tsalign/src/align/template_switch_distance_type_selectors.rs b/tsalign/src/align/template_switch_distance_type_selectors.rs index 08ab39bc..cf103071 100644 --- a/tsalign/src/align/template_switch_distance_type_selectors.rs +++ b/tsalign/src/align/template_switch_distance_type_selectors.rs @@ -4,7 +4,7 @@ use clap::ValueEnum; use compact_genome::interface::{alphabet::Alphabet, sequence::GenomeSequence}; use lib_tsalign::{ a_star_aligner::{ - alignment_geometry::{AlignmentCoordinates, AlignmentRange}, + alignment_geometry::AlignmentRange, template_switch_distance::strategies::{ AlignmentStrategySelection, chaining::{ChainingStrategy, LowerBoundChainingStrategy, NoChainingStrategy}, @@ -67,6 +67,7 @@ pub fn align_a_star_template_switch_distance< cli: Cli, reference: &SubsequenceType, query: &SubsequenceType, + range: AlignmentRange, reference_name: &str, query_name: &str, ) { @@ -74,6 +75,7 @@ pub fn align_a_star_template_switch_distance< cli, reference, query, + range, reference_name, query_name, ); @@ -86,6 +88,7 @@ fn align_a_star_template_switch_distance_select_node_ord_strategy< cli: Cli, reference: &SubsequenceType, query: &SubsequenceType, + range: AlignmentRange, reference_name: &str, query_name: &str, ) { @@ -103,7 +106,7 @@ fn align_a_star_template_switch_distance_select_node_ord_strategy< _, _, AntiDiagonalNodeOrdStrategy, - >(cli, reference, query, reference_name, query_name) + >(cli, reference, query, range, reference_name, query_name) } } } @@ -116,6 +119,7 @@ fn align_a_star_template_switch_distance_select_template_switch_min_length_strat cli: Cli, reference: &SubsequenceType, query: &SubsequenceType, + range: AlignmentRange, reference_name: &str, query_name: &str, ) { @@ -126,7 +130,7 @@ fn align_a_star_template_switch_distance_select_template_switch_min_length_strat _, NodeOrd, NoTemplateSwitchMinLengthStrategy, - >(cli, reference, query, reference_name, query_name) + >(cli, reference, query, range, reference_name, query_name) } TemplateSwitchMinLengthStrategySelector::Lookahead => { align_a_star_template_switch_select_chaining_strategy::< @@ -134,7 +138,7 @@ fn align_a_star_template_switch_distance_select_template_switch_min_length_strat _, NodeOrd, LookaheadTemplateSwitchMinLengthStrategy, - >(cli, reference, query, reference_name, query_name) + >(cli, reference, query, range, reference_name, query_name) } } } @@ -148,6 +152,7 @@ fn align_a_star_template_switch_select_chaining_strategy< cli: Cli, reference: &SubsequenceType, query: &SubsequenceType, + range: AlignmentRange, reference_name: &str, query_name: &str, ) { @@ -159,7 +164,7 @@ fn align_a_star_template_switch_select_chaining_strategy< NodeOrd, TemplateSwitchMinLength, NoChainingStrategy, - >(cli, reference, query, reference_name, query_name) + >(cli, reference, query, range, reference_name, query_name) } /*TemplateSwitchChainingStrategySelector::PrecomputeOnly => { align_a_star_template_switch_select_no_ts_strategy::< @@ -178,7 +183,7 @@ fn align_a_star_template_switch_select_chaining_strategy< NodeOrd, TemplateSwitchMinLength, LowerBoundChainingStrategy, - >(cli, reference, query, reference_name, query_name) + >(cli, reference, query, range, reference_name, query_name) } } } @@ -193,6 +198,7 @@ fn align_a_star_template_switch_select_no_ts_strategy< cli: Cli, reference: &SubsequenceType, query: &SubsequenceType, + range: AlignmentRange, reference_name: &str, query_name: &str, ) { @@ -204,7 +210,7 @@ fn align_a_star_template_switch_select_no_ts_strategy< TemplateSwitchMinLength, Chaining, MaxTemplateSwitchCountStrategy, - >(cli, reference, query, reference_name, query_name, 0) + >(cli, reference, query, range, reference_name, query_name, 0) } else { align_a_star_template_switch_select_template_switch_total_length_strategy::< _, @@ -213,7 +219,7 @@ fn align_a_star_template_switch_select_no_ts_strategy< TemplateSwitchMinLength, Chaining, NoTemplateSwitchCountStrategy, - >(cli, reference, query, reference_name, query_name, ()) + >(cli, reference, query, range, reference_name, query_name, ()) } } @@ -228,6 +234,7 @@ fn align_a_star_template_switch_select_template_switch_total_length_strategy< cli: Cli, reference: &SubsequenceType, query: &SubsequenceType, + range: AlignmentRange, reference_name: &str, query_name: &str, template_switch_count_memory: ::Memory, @@ -246,6 +253,7 @@ fn align_a_star_template_switch_select_template_switch_total_length_strategy< cli, reference, query, + range, reference_name, query_name, template_switch_count_memory, @@ -264,6 +272,7 @@ fn align_a_star_template_switch_select_template_switch_total_length_strategy< cli, reference, query, + range, reference_name, query_name, template_switch_count_memory, @@ -284,6 +293,7 @@ fn align_a_star_template_switch_distance_call< cli: Cli, reference: &SubsequenceType, query: &SubsequenceType, + range: AlignmentRange, reference_name: &str, query_name: &str, template_switch_count_memory: ::Memory, @@ -299,9 +309,6 @@ fn align_a_star_template_switch_distance_call< let costs = TemplateSwitchConfig::read_plain(config_file) .unwrap_or_else(|error| panic!("Error parsing template switch config:\n{error}")); - let range = Some(parse_range(&cli, reference.len(), query.len())); - - info!("Calling aligner..."); let alignment = template_switch_distance_a_star_align::< AlignmentStrategySelection< AlphabetType, @@ -340,88 +347,3 @@ fn align_a_star_template_switch_distance_call< println!("{alignment}"); } - -fn parse_range(cli: &Cli, reference_length: usize, query_length: usize) -> AlignmentRange { - let complete_reference_range = 0..reference_length; - let complete_query_range = 0..query_length; - - let (reference_range, query_range) = if let Some(rq_ranges) = cli.rq_ranges.as_ref() { - let mut rq_ranges = rq_ranges.chars().peekable(); - - let mut reference_range = None; - let mut query_range = None; - - while rq_ranges.peek().is_some() { - let rq = rq_ranges.next().unwrap(); - - while let Some(c) = rq_ranges.peek() { - if c.is_whitespace() { - rq_ranges.next().unwrap(); - } else { - break; - } - } - - let mut offset = String::new(); - while let Some(c) = rq_ranges.peek() { - if c.is_numeric() { - offset.push(rq_ranges.next().unwrap()); - } else { - break; - } - } - - // Parse .. - assert_eq!(rq_ranges.next(), Some('.')); - assert_eq!(rq_ranges.next(), Some('.')); - - let mut limit = String::new(); - while let Some(c) = rq_ranges.peek() { - if c.is_numeric() { - limit.push(rq_ranges.next().unwrap()); - } else { - break; - } - } - - let offset = offset.parse().unwrap(); - let limit = limit.parse().unwrap(); - - match rq { - 'R' => { - assert!(reference_range.is_none()); - reference_range = Some(offset..limit) - } - 'Q' => { - assert!(query_range.is_none()); - query_range = Some(offset..limit) - } - _ => panic!(), - } - } - - assert!( - reference_range.is_none() - || (cli.reference_offset.is_none() && cli.reference_limit.is_none()) - ); - assert!(query_range.is_none() || (cli.query_offset.is_none() && cli.query_limit.is_none())); - - ( - reference_range.unwrap_or(complete_reference_range), - query_range.unwrap_or(complete_query_range), - ) - } else { - (complete_reference_range, complete_query_range) - }; - - AlignmentRange::new_offset_limit( - AlignmentCoordinates::new( - cli.reference_offset.unwrap_or(reference_range.start), - cli.query_offset.unwrap_or(query_range.start), - ), - AlignmentCoordinates::new( - cli.reference_limit.unwrap_or(reference_range.end), - cli.query_limit.unwrap_or(query_range.end), - ), - ) -}