diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1328ae1..ddad108 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,41 +26,44 @@ jobs: - name: Linting run: cargo clippy --verbose - example_read_write: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: run example - run: cargo run --release --example read_write - - example_parallel: + example_grep: runs-on: ubuntu-latest + strategy: + matrix: + ext: [bq, vbq, cbq] steps: - uses: actions/checkout@v3 - - name: run example - run: cargo run --release --example parallel_processing + - name: run example ${{ matrix.ext }} + run: cargo run --release --example grep -- ./data/subset.${{ matrix.ext }} "ACGTACGT" - example_example: + example_range: runs-on: ubuntu-latest + strategy: + matrix: + ext: [bq, vbq, cbq] steps: - uses: actions/checkout@v3 - - name: run example - run: cargo run --release --example example + - name: run example ${{ matrix.ext }} + run: cargo run --release --example parallel_range -- ./data/subset.${{ matrix.ext }} 4 30 200 - example_grep: + example_write: runs-on: ubuntu-latest + strategy: + matrix: + ext: [bq, vbq, cbq] steps: - uses: actions/checkout@v3 - - name: run example bq - run: cargo run --release --example grep ./data/subset.bq - - name: run example vbq - run: cargo run --release --example grep ./data/subset.vbq + - name: run example (single) ${{ matrix.ext }} + run: cargo run --release --example write -- ./data/subset_R1.fastq.gz -o ./output.${{ matrix.ext }} + - name: run example (paired) ${{ matrix.ext }} + run: cargo run --release --example write -- ./data/subset_R1.fastq.gz ./data/subset_R2.fastq.gz -o ./output.${{ matrix.ext }} - example_range: + example_read: runs-on: ubuntu-latest + strategy: + matrix: + ext: [bq, vbq, cbq] steps: - uses: actions/checkout@v3 - - name: run example (bq) - run: cargo run --release --example parallel_range -- ./data/subset.bq 4 30 200 - - name: run example (vbq) - run: cargo run 
--release --example parallel_range -- ./data/subset.vbq 4 30 200 + - name: run example ${{ matrix.ext }} + run: cargo run --release --example read -- ./data/subset.${{ matrix.ext }} diff --git a/Cargo.toml b/Cargo.toml index eb9a326..8828255 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "binseq" version = "0.8.3" -edition = "2021" +edition = "2024" description = "A high efficiency binary format for sequencing data" license = "MIT" authors = ["Noam Teyssier "] @@ -11,25 +11,32 @@ categories = ["science::bioinformatics", "encoding", "data-structures"] keywords = ["bioinformatics", "nucleotide", "sequencing", "genomics", "fastq"] [dependencies] -anyhow = "1.0.100" +anyhow = {version = "1.0.100", optional = true} auto_impl = "1.3.0" -bitnuc = "0.3.2" -bytemuck = "1.24.0" +bitnuc = "0.4.0" +bytemuck = { version = "1.24.0", features = ["derive", "extern_crate_alloc"] } byteorder = "1.5.0" -itoa = "1.0.15" +itoa = "1.0.17" +memchr = "2.7.6" memmap2 = "0.9.9" num_cpus = "1.17.0" +paraseq = { version = "0.4.8", optional = true } +parking_lot = {version = "0.12.5", optional = true } rand = { version = "0.9.2", features = ["small_rng"] } +sucds = "0.8.3" thiserror = "2.0.17" zstd = { version = "0.13.3", features = ["zstdmt"] } [dev-dependencies] -nucgen = "0.2.0" -niffler = "3.0.0" -seq_io = "0.3.4" +anyhow = "1.0.100" parking_lot = "0.12.5" -itoa = "1.0.15" -memchr = "2.7.6" +clap = { version = "4.5.54", features = ["derive"] } +paraseq = "0.4.8" + +[features] +default = ["paraseq", "anyhow"] +anyhow = ["dep:anyhow"] +paraseq = ["dep:paraseq", "dep:parking_lot"] [lints.clippy] pedantic = { level = "warn", priority = -1 } diff --git a/README.md b/README.md index 3d41da4..0774464 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,18 @@ BINSEQ is a binary file format family designed for efficient storage and processing of DNA sequences. They make use of two-bit encoding for nucleotides and are optimized for high-performance parallel processing. 
-BINSEQ currently has two flavors: +BINSEQ has three variants: 1. **BQ**: (`*.bq`) files are for _fixed-length_ records **without** quality scores. 2. **VBQ**: (`*.vbq`) files are for _variable-length_ records **with optional** quality scores and headers. +3. **CBQ**: (`*.cbq`) files are for _columnar variable-length_ records **with optional** quality scores and headers. -Both flavors support both single and paired sequences. +All variants support both single and paired sequences. + +**Note:** For most use cases, the newest variant _CBQ_ is recommended due to its flexibility, storage efficiency, and decoding speed. +It supersedes _VBQ_ in terms of performance and storage efficiency, at a small cost in encoding speed. +VBQ will still be supported but newer projects should consider using _CBQ_ instead. +For information on the structure of _CBQ_ files, see the [documentation](https://docs.rs/binseq/latest/binseq/cbq/). ## Getting Started @@ -24,4 +30,4 @@ This is a **library** for reading and writing BINSEQ files, for a **command-line To get started please refer to our [documentation](https://docs.rs/binseq/latest/binseq/). For example programs which make use of the library check out our [examples directory](https://github.com/arcinstitute/binseq/tree/main/examples). -For more information about the BINSEQ file family, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). +For more information about the BINSEQ file family, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v2). 
diff --git a/data/subset.cbq b/data/subset.cbq new file mode 100644 index 0000000..8214c79 Binary files /dev/null and b/data/subset.cbq differ diff --git a/data/subset.vbq b/data/subset.vbq index 7343b00..af0a307 100644 Binary files a/data/subset.vbq and b/data/subset.vbq differ diff --git a/examples/auto-write.rs b/examples/auto-write.rs new file mode 100644 index 0000000..1a57292 --- /dev/null +++ b/examples/auto-write.rs @@ -0,0 +1,122 @@ +use std::{fs::File, io::BufWriter}; + +use anyhow::Result; +use binseq::{BinseqWriterBuilder, write::Format}; +use bitnuc::BitSize; +use clap::Parser; + +type BoxedWriter = Box; + +#[derive(Parser)] +struct Args { + /// Input FASTX to encode into BINSEQ format + #[clap(required = true)] + input: String, + + /// Input FASTX to encode into BINSEQ format (R2) + #[clap(required = false)] + input2: Option, + + /// Output file path for BINSEQ format + #[clap(short = 'o', long)] + output: Option, + + /// Default prefix for writing BINSEQ: `.` + #[clap(short = 'p', long, default_value = "output")] + prefix: String, + + /// Format of the output BINSEQ file + /// + /// [bq: bq|BQ|b, vbq: vbq|VBQ|v, cbq: cbq|CBQ|c] + #[clap(short = 'f', long)] + format: Option, + + /// Exclude quality information in BINSEQ output + /// + /// (bq ignores quality always) + #[clap(short = 'Q', long)] + exclude_quality: bool, + + /// Exclude sequence headers in BINSEQ output + /// + /// (bq ignores headers always) + #[clap(short = 'H', long)] + exclude_headers: bool, + + /// Compression level for BINSEQ output (0: auto) + #[clap(long, default_value_t = 0)] + compression_level: i32, + + /// Default BITSIZE for BINSEQ output (2: 2bit, 4: 4bit) + #[clap(long, default_value_t = 2)] + bitsize: u8, + + /// Default BLOCKSIZE in KB for BINSEQ output (vbq,cbq) + #[clap(long, default_value_t = 128)] + blocksize: usize, + + /// Number of threads to use for parallel processing, 0: all available + #[clap(short = 'T', long, default_value = "0")] + threads: usize, +} +impl 
Args { + /// Determines the output format based on the file extension or the provided format + fn format(&self) -> Format { + if let Some(format) = self.format { + format + } else { + if let Some(output) = &self.output { + match output.split(".").last() { + Some("bq") => Format::Bq, + Some("vbq") => Format::Vbq, + Some("cbq") => Format::Cbq, + _ => Format::default(), + } + } else { + Format::default() + } + } + } + fn bitsize(&self) -> BitSize { + match self.bitsize { + 4 => BitSize::Four, + _ => BitSize::Two, + } + } + + /// Creates an output file handle + fn ohandle(&self) -> Result { + let path = if let Some(output) = &self.output { + output.to_string() + } else { + format!("{}{}", &self.prefix, self.format().extension()) + }; + let ofile = File::create(path).map(BufWriter::new)?; + Ok(Box::new(ofile)) + } + + fn is_paired(&self) -> bool { + self.input2.is_some() + } +} + +fn main() -> Result<()> { + let args = Args::parse(); + let handle = args.ohandle()?; + let builder = BinseqWriterBuilder::new(args.format()) + .bitsize(args.bitsize()) + .block_size(args.blocksize * 1024) + .headers(!args.exclude_headers) + .quality(!args.exclude_quality) + .compression_level(args.compression_level) + .encode_fastx(handle); + if args.is_paired() { + builder.input_paired(&args.input, args.input2.as_ref().unwrap()) + } else { + builder.input(&args.input) + } + .threads(args.threads) + .run()?; + + Ok(()) +} diff --git a/examples/grep.rs b/examples/grep.rs index e1f5f3e..fcbc5d7 100644 --- a/examples/grep.rs +++ b/examples/grep.rs @@ -1,14 +1,14 @@ use std::sync::Arc; use anyhow::Result; -use binseq::{context::SeqCtx, prelude::*}; +use binseq::prelude::*; +use clap::Parser; use memchr::memmem::Finder; use parking_lot::Mutex; #[derive(Clone)] pub struct GrepCounter { // (thread) local variables - ctx: SeqCtx, local_count: usize, // search pattern (using memchr::memmem::Finder for fast searching) @@ -21,7 +21,6 @@ impl GrepCounter { #[must_use] pub fn new(pattern: &[u8]) -> Self { 
Self { - ctx: SeqCtx::default(), pattern: Finder::new(pattern).into_owned(), local_count: 0, count: Arc::new(Mutex::new(0)), @@ -38,9 +37,7 @@ impl GrepCounter { } impl ParallelProcessor for GrepCounter { fn process_record(&mut self, record: R) -> binseq::Result<()> { - self.ctx.fill(&record)?; - - if self.match_sequence(&self.ctx.sbuf()) || self.match_sequence(&self.ctx.xbuf()) { + if self.match_sequence(&record.sseq()) || self.match_sequence(&record.xseq()) { self.local_count += 1; } @@ -54,21 +51,26 @@ impl ParallelProcessor for GrepCounter { } } -fn main() -> Result<()> { - let path = std::env::args() - .nth(1) - .unwrap_or("./data/subset.bq".to_string()); - let pattern = std::env::args() - .nth(2) - .unwrap_or("ACGT".to_string()) - .as_bytes() - .to_vec(); - let n_threads = std::env::args().nth(3).unwrap_or("1".to_string()).parse()?; +#[derive(Parser)] +struct Args { + /// Input BINSEQ path to grep + #[clap(required = true)] + input: String, - let reader = BinseqReader::new(&path)?; - let counter = GrepCounter::new(&pattern); - reader.process_parallel(counter.clone(), n_threads)?; - counter.pprint(); + /// Pattern to search for (either sseq or xseq) + #[clap(required = true)] + pattern: String, + /// Threads to use [0: auto] + #[clap(short = 'T', long, default_value_t = 0)] + threads: usize, +} + +fn main() -> Result<()> { + let args = Args::parse(); + let reader = BinseqReader::new(&args.input)?; + let counter = GrepCounter::new(args.pattern.as_bytes()); + reader.process_parallel(counter.clone(), args.threads)?; + counter.pprint(); Ok(()) } diff --git a/examples/network_streaming.rs b/examples/network_streaming.rs index 0cd8002..71ca2f9 100644 --- a/examples/network_streaming.rs +++ b/examples/network_streaming.rs @@ -2,10 +2,10 @@ use std::io::{BufReader, BufWriter}; use std::net::{TcpListener, TcpStream}; use std::thread; -use binseq::bq::{BinseqHeader, BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; +use binseq::bq::{FileHeader, FileHeaderBuilder, 
StreamReader, StreamWriterBuilder}; use binseq::{BinseqRecord, Policy, Result}; -fn server(header: BinseqHeader, sequence: &[u8]) -> Result<()> { +fn server(header: FileHeader, sequence: &[u8]) -> Result<()> { // Create a listener on localhost:3000 let listener = TcpListener::bind("127.0.0.1:3000").expect("Failed to bind to address"); println!("Server listening on 127.0.0.1:3000"); @@ -25,6 +25,7 @@ fn server(header: BinseqHeader, sequence: &[u8]) -> Result<()> { // Write sequences in a loop for i in 0..10 { + #[allow(deprecated)] writer.write_record(Some(i), sequence)?; println!("Server: Sent record {i}"); @@ -79,7 +80,7 @@ fn client() -> Result<()> { fn main() -> Result<()> { // Create a header for sequences of length 100 - let header = BinseqHeaderBuilder::new().slen(100).build()?; + let header = FileHeaderBuilder::new().slen(100).build()?; // Create some example sequence data let sequence = b"ACGT".repeat(25); // 100 nucleotides diff --git a/examples/parallel_processing.rs b/examples/parallel_processing.rs deleted file mode 100644 index b9649e0..0000000 --- a/examples/parallel_processing.rs +++ /dev/null @@ -1,160 +0,0 @@ -use std::{ - fs::File, - io::BufWriter, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }, -}; - -use anyhow::{bail, Result}; -use binseq::{ - bq::{self, BinseqHeaderBuilder}, - context::SeqCtx, - prelude::*, -}; -use nucgen::Sequence; - -#[derive(Clone, Default)] -pub struct MyProcessor { - local_counter: usize, - counter: Arc, - ctx: SeqCtx, -} -impl MyProcessor { - #[must_use] - pub fn counter(&self) -> usize { - self.counter.load(Ordering::Relaxed) - } -} -impl ParallelProcessor for MyProcessor { - fn process_record(&mut self, record: R) -> binseq::Result<()> { - self.ctx.fill_sequences(&record)?; - self.local_counter += 1; - Ok(()) - } - fn on_batch_complete(&mut self) -> binseq::Result<()> { - self.counter - .fetch_add(self.local_counter, Ordering::Relaxed); - self.local_counter = 0; - Ok(()) - } -} - -fn 
mmap_processing(binseq_path: &str, n_threads: usize) -> Result<()> { - let reader = BinseqReader::new(binseq_path)?; - let proc = MyProcessor::default(); - reader.process_parallel(proc.clone(), n_threads)?; - Ok(()) -} - -pub fn main() -> Result<()> { - let binseq_path_single = "./data/test.bq"; - let binseq_path_paired = "./data/test_paired.bq"; - let r1_size = 150; - let r2_size = 300; - let num_seq = 1_000_000; - - time_it( - || { - write_single(binseq_path_single, num_seq, r1_size)?; - Ok(()) - }, - "write_single", - ); - - time_it( - || { - write_paired(binseq_path_paired, num_seq, r1_size, r2_size)?; - Ok(()) - }, - "write_paired", - ); - - for n_threads in 1..=16 { - if n_threads > 1 && n_threads % 2 != 0 { - continue; - } - time_it( - || { - mmap_processing(binseq_path_single, n_threads)?; - Ok(()) - }, - &format!("single - mmap_parallel_processing ({n_threads})"), - ); - } - for n_threads in 1..=16 { - if n_threads > 1 && n_threads % 2 != 0 { - continue; - } - time_it( - || { - mmap_processing(binseq_path_paired, n_threads)?; - Ok(()) - }, - &format!("paired - mmap_parallel_processing ({n_threads})"), - ); - } - - Ok(()) -} - -fn time_it(f: F, name: &str) -where - F: Fn() -> Result<()>, -{ - let now = std::time::Instant::now(); - f().unwrap(); - let elapsed = now.elapsed(); - eprintln!("Elapsed time ({name}): {elapsed:?}"); -} - -fn write_single(binseq_path: &str, num_seq: usize, seq_size: usize) -> Result<()> { - // Open the output file - let header = BinseqHeaderBuilder::new().slen(seq_size as u32).build()?; - let out_handle = File::create(binseq_path).map(BufWriter::new)?; - let mut writer = bq::BinseqWriterBuilder::default() - .header(header) - .build(out_handle)?; - - // Write the binary sequence - let mut sequence = Sequence::new(); - let mut rng = rand::rng(); - for _ in 0..num_seq { - sequence.fill_buffer(&mut rng, seq_size); - if !writer.write_record(Some(0), sequence.bytes())? 
{ - bail!("Error writing nucleotides") - } - } - writer.flush()?; - eprintln!("Finished writing {num_seq} records to path: {binseq_path}"); - Ok(()) -} - -fn write_paired(binseq_path: &str, num_seq: usize, r1_size: usize, r2_size: usize) -> Result<()> { - // Open the output file - let header = bq::BinseqHeaderBuilder::new() - .slen(r1_size as u32) - .xlen(r2_size as u32) - .build()?; - let out_handle = File::create(binseq_path).map(BufWriter::new)?; - let mut writer = bq::BinseqWriterBuilder::default() - .header(header) - .build(out_handle)?; - - // Write the binary sequence - let mut r1 = Sequence::new(); - let mut r2 = Sequence::new(); - let mut rng = rand::rng(); - for _ in 0..num_seq { - r1.fill_buffer(&mut rng, r1_size); - r2.fill_buffer(&mut rng, r2_size); - - if !writer.write_paired_record(Some(0), r1.bytes(), r2.bytes())? { - bail!("Error writing nucleotides") - } - } - writer.flush()?; - eprintln!("Finished writing {num_seq} records to path: {binseq_path}"); - Ok(()) -} diff --git a/examples/parallel_range.rs b/examples/parallel_range.rs index e17a1d4..3da00b4 100644 --- a/examples/parallel_range.rs +++ b/examples/parallel_range.rs @@ -1,6 +1,6 @@ use binseq::{BinseqReader, BinseqRecord, ParallelProcessor, ParallelReader, Result}; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; #[derive(Clone)] struct RangeProcessor { diff --git a/examples/example.rs b/examples/read.rs similarity index 79% rename from examples/example.rs rename to examples/read.rs index 9406d9e..f060836 100644 --- a/examples/example.rs +++ b/examples/read.rs @@ -1,18 +1,16 @@ use std::fs::File; -use std::io::{stdout, BufWriter, Write}; +use std::io::{BufWriter, Write, stdout}; use std::sync::Arc; use anyhow::Result; use binseq::prelude::*; +use clap::Parser; use parking_lot::Mutex; /// A struct for decoding BINSEQ data back to FASTQ format. 
#[derive(Clone)] pub struct Decoder { - /// Reusable context - ctx: Ctx, - /// local output buffer local_writer: Vec, @@ -32,7 +30,6 @@ impl Decoder { let global_writer = Arc::new(Mutex::new(writer)); Decoder { local_writer: Vec::new(), - ctx: Ctx::default(), local_count: 0, global_writer, global_count: Arc::new(Mutex::new(0)), @@ -46,21 +43,21 @@ impl Decoder { } impl ParallelProcessor for Decoder { fn process_record(&mut self, record: R) -> binseq::Result<()> { - self.ctx.fill(&record)?; + // write primary fastq to local buffer write_fastq_parts( &mut self.local_writer, - self.ctx.sheader(), - self.ctx.sbuf(), - self.ctx.squal(), + record.sheader(), + record.sseq(), + record.squal(), )?; // write extended fastq to local buffer if record.is_paired() { write_fastq_parts( &mut self.local_writer, - self.ctx.xheader(), - &self.ctx.xbuf(), - self.ctx.xqual(), + record.xheader(), + record.xseq(), + record.xqual(), )?; } @@ -88,7 +85,6 @@ impl ParallelProcessor for Decoder { } } -#[allow(clippy::missing_errors_doc)] pub fn write_fastq_parts( writer: &mut W, index: &[u8], @@ -105,6 +101,7 @@ pub fn write_fastq_parts( Ok(()) } +/// Handle output file path generically (stdout / path) fn match_output(path: Option<&str>) -> Result> { if let Some(path) = path { let writer = File::create(path).map(BufWriter::new)?; @@ -115,18 +112,23 @@ fn match_output(path: Option<&str>) -> Result> { } } -fn main() -> Result<()> { - let file = std::env::args() - .nth(1) - .unwrap_or("./data/subset.bq".to_string()); - let n_threads = std::env::args().nth(2).unwrap_or("1".to_string()).parse()?; +#[derive(Parser)] +struct Args { + /// Input BINSEQ path to decode + #[clap(required = true)] + input: String, + + /// Number of threads to use for decoding [0: auto] + #[clap(short = 'T', long, default_value_t = 0)] + threads: usize, +} - let reader = BinseqReader::new(&file)?; +fn main() -> Result<()> { + let args = Args::parse(); + let reader = BinseqReader::new(&args.input)?; let writer = 
match_output(None)?; let proc = Decoder::new(writer); - - reader.process_parallel(proc.clone(), n_threads)?; + reader.process_parallel(proc.clone(), args.threads)?; eprintln!("Read {} records", proc.num_records()); - Ok(()) } diff --git a/examples/read_write.rs b/examples/read_write.rs deleted file mode 100644 index 4e9b7c4..0000000 --- a/examples/read_write.rs +++ /dev/null @@ -1,185 +0,0 @@ -use std::{ - fs::File, - io::{BufReader, BufWriter}, -}; - -use anyhow::{bail, Result}; -use binseq::{ - bq::{BinseqHeaderBuilder, BinseqWriterBuilder, MmapReader}, - BinseqRecord, -}; -use seq_io::fastq::{Reader, Record}; - -fn read_write_single(fastq_path: &str, binseq_path: &str, seq_size: usize) -> Result<()> { - // Open the input FASTQ file - let (in_handle, _comp) = niffler::from_path(fastq_path)?; - - // Open the output file - let header = BinseqHeaderBuilder::new().slen(seq_size as u32).build()?; - let out_handle = File::create(binseq_path).map(BufWriter::new)?; - let mut writer = BinseqWriterBuilder::default() - .header(header) - .build(out_handle)?; - - let mut all_sequences = Vec::new(); - - // Write the binary sequence - let mut reader = Reader::new(in_handle); - let mut num_records_write = 0; - let mut skipped_records = 0; - while let Some(record) = reader.next() { - let record = record?; - let seq = record.seq(); - if writer.write_record(Some(0), seq)? 
{ - num_records_write += 1; - all_sequences.push(seq.to_vec()); - } else { - skipped_records += 1; - } - } - writer.flush()?; - eprintln!("Finished writing {num_records_write} records to path: {binseq_path}"); - eprintln!("Skipped {skipped_records} records"); - - // Read the binary sequence - let reader = MmapReader::new(binseq_path)?; - let mut num_records_read = 0; - let mut sbuf = Vec::new(); - for idx in 0..reader.num_records() { - let record = reader.get(idx)?; - record.decode_s(&mut sbuf)?; - - // Check if the decoded sequence matches the original - let buf_str = std::str::from_utf8(&sbuf)?; - let seq_str = std::str::from_utf8(&all_sequences[num_records_read])?; - assert_eq!(buf_str, seq_str); - - num_records_read += 1; - sbuf.clear(); - } - eprintln!("Finished reading {num_records_read} records (mmap)"); - eprintln!( - "Difference in total records: {}", - num_records_write - num_records_read - ); - eprintln!("Number of records in vec: {}", all_sequences.len()); - - Ok(()) -} - -fn read_write_paired( - fastq_path_r1: &str, - fastq_path_r2: &str, - binseq_path: &str, - seq_size_r1: usize, - seq_size_r2: usize, -) -> Result<()> { - // Open the input FASTQ files - - let in_buf_r1 = File::open(fastq_path_r1).map(BufReader::new)?; - let in_buf_r2 = File::open(fastq_path_r2).map(BufReader::new)?; - - let (in_handle_r1, _comp) = niffler::get_reader(Box::new(in_buf_r1))?; - let (in_handle_r2, _comp) = niffler::get_reader(Box::new(in_buf_r2))?; - - // Create the header - let header = BinseqHeaderBuilder::new() - .slen(seq_size_r1 as u32) - .xlen(seq_size_r2 as u32) - .build()?; - - // Open the output handle - let out_handle = File::create(binseq_path).map(BufWriter::new)?; - - // Create the writer - let mut writer = BinseqWriterBuilder::default() - .header(header) - .build(out_handle)?; - - // Open the FASTQ readers - let mut reader_r1 = Reader::new(in_handle_r1); - let mut reader_r2 = Reader::new(in_handle_r2); - - // Write the binary sequence - let mut num_records = 
0; - let mut num_skipped = 0; - - let mut r1_storage = Vec::new(); - let mut r2_storage = Vec::new(); - - loop { - let (record_r1, record_r2) = match (reader_r1.next(), reader_r2.next()) { - (Some(r1), Some(r2)) => (r1?, r2?), - (None, None) => break, - _ => bail!("Mismatched number of records in R1 and R2"), - }; - - let seq_r1 = record_r1.seq(); - let seq_r2 = record_r2.seq(); - - if writer.write_paired_record(Some(0), seq_r1, seq_r2)? { - num_records += 1; - r1_storage.push(seq_r1.to_vec()); - r2_storage.push(seq_r2.to_vec()); - } else { - num_skipped += 1; - } - } - writer.flush()?; - eprintln!("Finished writing {num_records} records"); - eprintln!("Skipped {num_skipped} records"); - - // Read the binary sequence with mmap - let reader = MmapReader::new(binseq_path)?; - - let mut n_processed = 0; - let mut sbuf = Vec::new(); - let mut xbuf = Vec::new(); - - for idx in 0..reader.num_records() { - let record = reader.get(idx)?; - - record.decode_s(&mut sbuf)?; - record.decode_x(&mut xbuf)?; - - // Check if the decoded sequence matches the original - let s_str = std::str::from_utf8(&sbuf)?; - let x_str = std::str::from_utf8(&xbuf)?; - - let s_exp = std::str::from_utf8(&r1_storage[n_processed])?; - let x_exp = std::str::from_utf8(&r2_storage[n_processed])?; - - assert_eq!(s_str, s_exp); - assert_eq!(x_str, x_exp); - - n_processed += 1; - sbuf.clear(); - xbuf.clear(); - } - eprintln!("Finished reading {n_processed} records"); - - Ok(()) -} - -fn main() -> Result<()> { - // INPUT ARGUMENTS - let fastq_path_r1 = "./data/subset_R1.fastq.gz"; // exists - let fastq_path_r2 = "./data/subset_R2.fastq.gz"; // exists - let binseq_path_r1 = "./data/subset_R1.bq"; // created - let binseq_path_r2 = "./data/subset_R2.bq"; // created - let binseq_path = "./data/subset.bq"; // created - let seq_size_r1 = 28; // a priori known - let seq_size_r2 = 90; // a priori known - - read_write_single(fastq_path_r1, binseq_path_r1, seq_size_r1)?; - read_write_single(fastq_path_r2, 
binseq_path_r2, seq_size_r2)?; - read_write_paired( - fastq_path_r1, - fastq_path_r2, - binseq_path, - seq_size_r1, - seq_size_r2, - )?; - - Ok(()) -} diff --git a/examples/streaming.rs b/examples/streaming.rs index 4e8b5f1..001ad47 100644 --- a/examples/streaming.rs +++ b/examples/streaming.rs @@ -1,11 +1,11 @@ use std::io::{BufReader, Cursor}; -use binseq::bq::{BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; +use binseq::bq::{FileHeaderBuilder, StreamReader, StreamWriterBuilder}; use binseq::{BinseqRecord, Policy, Result}; fn main() -> Result<()> { // Create a header for sequences of length 100 - let header = BinseqHeaderBuilder::new().slen(100).build()?; + let header = FileHeaderBuilder::new().slen(100).build()?; // Create some example sequence data let sequence = b"ACGT".repeat(25); // 100 nucleotides @@ -18,9 +18,11 @@ fn main() -> Result<()> { .build(Cursor::new(Vec::new()))?; // Write the sequence with flag 0 + #[allow(deprecated)] writer.write_record(Some(0), &sequence)?; // Write the sequence with flag 1 + #[allow(deprecated)] writer.write_record(Some(1), &sequence)?; // Flush and get the buffer diff --git a/examples/write.rs b/examples/write.rs new file mode 100644 index 0000000..1bde767 --- /dev/null +++ b/examples/write.rs @@ -0,0 +1,270 @@ +use std::{ + io::{BufWriter, Read}, + sync::Arc, +}; + +use anyhow::{Result, bail}; +use binseq::{ + SequencingRecordBuilder, + write::{BinseqWriter, BinseqWriterBuilder, Format}, +}; +use bitnuc::BitSize; +use clap::Parser; +use paraseq::{ + Record, fastx, + prelude::{IntoProcessError, PairedParallelProcessor, ParallelProcessor, ParallelReader}, +}; +use parking_lot::Mutex; + +type BoxedWriter = Box; + +#[derive(Parser)] +struct Args { + /// Input FASTX to encode into BINSEQ format + #[clap(required = true)] + input: String, + + /// Input FASTX to encode into BINSEQ format (R2) + #[clap(required = false)] + input2: Option, + + /// Output file path for BINSEQ format + #[clap(short = 'o', long)] + output: 
Option, + + /// Default prefix for writing BINSEQ: `.` + #[clap(short = 'p', long, default_value = "output")] + prefix: String, + + /// Format of the output BINSEQ file + /// + /// [bq: bq|BQ|b, vbq: vbq|VBQ|v, cbq: cbq|CBQ|c] + #[clap(short = 'f', long)] + format: Option, + + /// Exclude quality information in BINSEQ output + /// + /// (bq ignores quality always) + #[clap(short = 'Q', long)] + exclude_quality: bool, + + /// Exclude sequence headers in BINSEQ output + /// + /// (bq ignores headers always) + #[clap(short = 'H', long)] + exclude_headers: bool, + + /// Compression level for BINSEQ output (0: auto) + #[clap(long, default_value_t = 0)] + compression_level: i32, + + /// Default BITSIZE for BINSEQ output (2: 2bit, 4: 4bit) + #[clap(long, default_value_t = 2)] + bitsize: u8, + + /// Default BLOCKSIZE in KB for BINSEQ output (vbq,cbq) + #[clap(long, default_value_t = 128)] + blocksize: usize, + + /// Number of threads to use for parallel processing, 0: all available + #[clap(short = 'T', long, default_value = "0")] + threads: usize, +} +impl Args { + /// Determines the output format based on the file extension or the provided format + fn format(&self) -> Format { + if let Some(format) = self.format { + format + } else { + if let Some(output) = &self.output { + match output.split(".").last() { + Some("bq") => Format::Bq, + Some("vbq") => Format::Vbq, + Some("cbq") => Format::Cbq, + _ => Format::default(), + } + } else { + Format::default() + } + } + } + fn bitsize(&self) -> BitSize { + match self.bitsize { + 4 => BitSize::Four, + _ => BitSize::Two, + } + } + + /// Creates an output file handle + fn ohandle(&self) -> Result { + let path = if let Some(output) = &self.output { + output.to_string() + } else { + format!("{}{}", &self.prefix, self.format().extension()) + }; + let ofile = std::fs::File::create(path).map(BufWriter::new)?; + Ok(Box::new(ofile)) + } + + fn is_paired(&self) -> bool { + self.input2.is_some() + } +} + +/// Calculates the sequence length 
of the first record in the reader +fn get_seq_len(reader: &mut fastx::Reader) -> Result { + let mut rset = reader.new_record_set(); + rset.fill(reader)?; + + let slen = if let Some(record) = rset.iter().next() { + let record = record?; + record.seq().len() + } else { + bail!("No records found in reader"); + }; + + reader.reload(&mut rset)?; + + Ok(slen) +} + +#[derive(Clone)] +struct Encoder { + /// global writer + writer: Arc>>, + thread_writer: BinseqWriter>, +} +impl Encoder { + pub fn new(writer: BinseqWriter) -> Result { + let thread_writer = writer.new_headless_buffer()?; + Ok(Self { + writer: Arc::new(Mutex::new(writer)), + thread_writer, + }) + } + pub fn finish(&mut self) -> Result<()> { + self.writer.lock().finish()?; + Ok(()) + } +} +impl ParallelProcessor for Encoder { + fn process_record(&mut self, record: Rf) -> paraseq::Result<()> { + let seq = record.seq(); + let seq_record = SequencingRecordBuilder::default() + .s_header(record.id()) + .s_seq(&seq) + .opt_s_qual(record.qual()) + .build() + .map_err(IntoProcessError::into_process_error)?; + self.thread_writer + .push(seq_record) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } + fn on_batch_complete(&mut self) -> paraseq::Result<()> { + self.writer + .lock() + .ingest(&mut self.thread_writer) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } +} + +impl PairedParallelProcessor for Encoder { + fn process_record_pair(&mut self, record1: Rf, record2: Rf) -> paraseq::Result<()> { + let sseq = record1.seq(); + let xseq = record2.seq(); + let seq_record = SequencingRecordBuilder::default() + .s_header(record1.id()) + .s_seq(&sseq) + .opt_s_qual(record1.qual()) + .x_header(record2.id()) + .x_seq(&xseq) + .opt_x_qual(record2.qual()) + .build() + .map_err(IntoProcessError::into_process_error)?; + + self.thread_writer + .push(seq_record) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } + + fn on_batch_complete(&mut self) -> paraseq::Result<()> { + self.writer + 
.lock() + .ingest(&mut self.thread_writer) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } +} + +fn encode_paired(args: &Args) -> Result<()> { + let mut r1 = fastx::Reader::from_path(&args.input)?; + let mut r2 = fastx::Reader::from_path(&args.input2.as_ref().expect("Missing input2"))?; + let ohandle = args.ohandle()?; + + // prepare writer + let writer = { + let format = args.format(); + let mut builder = BinseqWriterBuilder::new(format) + .headers(!args.exclude_headers) + .quality(!args.exclude_quality) + .compression_level(args.compression_level) + .bitsize(args.bitsize()) + .paired(true) + .block_size(args.blocksize * 1024); + + // BQ requires a fixed sequence length from init time + if matches!(format, Format::Bq) { + builder = builder.slen(get_seq_len(&mut r1)? as u32); + builder = builder.xlen(get_seq_len(&mut r2)? as u32); + } + + builder.build(ohandle)? + }; + + let mut encoder = Encoder::new(writer)?; + r1.process_parallel_paired(r2, &mut encoder, args.threads)?; + encoder.finish()?; + + Ok(()) +} + +fn encode_single(args: &Args) -> Result<()> { + let mut reader = fastx::Reader::from_path(&args.input)?; + let ohandle = args.ohandle()?; + + // prepare writer + let writer = { + let format = args.format(); + let mut builder = BinseqWriterBuilder::new(format) + .headers(!args.exclude_headers) + .quality(!args.exclude_quality) + .compression_level(args.compression_level) + .bitsize(args.bitsize()) + .block_size(args.blocksize * 1024); + + // BQ requires a fixed sequence length from init time + if matches!(format, Format::Bq) { + builder = builder.slen(get_seq_len(&mut reader)? as u32); + } + + builder.build(ohandle)? 
+ }; + + let mut encoder = Encoder::new(writer)?; + reader.process_parallel(&mut encoder, args.threads)?; + encoder.finish()?; + + Ok(()) +} + +fn main() -> Result<()> { + let args = Args::parse(); + if args.is_paired() { + encode_paired(&args) + } else { + encode_single(&args) + } +} diff --git a/src/bq/header.rs b/src/bq/header.rs index 4508ca0..42b845e 100644 --- a/src/bq/header.rs +++ b/src/bq/header.rs @@ -32,22 +32,22 @@ pub const SIZE_HEADER: usize = 32; pub const RESERVED: [u8; 17] = [42; 17]; #[derive(Debug, Clone, Copy)] -pub struct BinseqHeaderBuilder { +pub struct FileHeaderBuilder { slen: Option, xlen: Option, bitsize: Option, flags: Option, } -impl Default for BinseqHeaderBuilder { +impl Default for FileHeaderBuilder { fn default() -> Self { Self::new() } } -impl BinseqHeaderBuilder { +impl FileHeaderBuilder { #[must_use] pub fn new() -> Self { - BinseqHeaderBuilder { + FileHeaderBuilder { slen: None, xlen: None, bitsize: None, @@ -74,8 +74,8 @@ impl BinseqHeaderBuilder { self.flags = Some(flags); self } - pub fn build(self) -> Result { - Ok(BinseqHeader { + pub fn build(self) -> Result { + Ok(FileHeader { magic: MAGIC, format: FORMAT, slen: if let Some(slen) = self.slen { @@ -93,13 +93,13 @@ impl BinseqHeaderBuilder { /// Header structure for binary sequence files /// -/// The `BinseqHeader` contains metadata about the binary sequence data stored in a file, +/// The `FileHeader` contains metadata about the binary sequence data stored in a file, /// including format information, sequence lengths, and space for future extensions. /// /// The total size of this structure is 32 bytes, with a fixed layout to ensure /// consistent reading and writing across different platforms. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct BinseqHeader { +pub struct FileHeader { /// Magic number to identify the file format /// /// 4 bytes @@ -135,7 +135,7 @@ pub struct BinseqHeader { /// 17 bytes pub reserved: [u8; 17], } -impl BinseqHeader { +impl FileHeader { /// Creates a new header with the specified sequence length /// /// This constructor initializes a standard header with the given sequence length, @@ -150,7 +150,7 @@ impl BinseqHeader { /// /// # Returns /// - /// A new `BinseqHeader` instance + /// A new `FileHeader` instance #[must_use] pub fn new(bits: BitSize, slen: u32, flags: bool) -> Self { Self { @@ -178,7 +178,7 @@ impl BinseqHeader { /// /// # Returns /// - /// A new `BinseqHeader` instance with extended sequence information + /// A new `FileHeader` instance with extended sequence information #[must_use] pub fn new_extended(bits: BitSize, slen: u32, xlen: u32, flags: bool) -> Self { Self { @@ -214,7 +214,7 @@ impl BinseqHeader { /// /// # Returns /// - /// * `Ok(BinseqHeader)` - A valid header parsed from the buffer + /// * `Ok(FileHeader)` - A valid header parsed from the buffer /// * `Err(Error)` - If the buffer contains invalid header data /// /// # Errors @@ -266,7 +266,7 @@ impl BinseqHeader { /// /// # Returns /// - /// * `Ok(BinseqHeader)` - A valid header parsed from the buffer + /// * `Ok(FileHeader)` - A valid header parsed from the buffer /// * `Err(Error)` - If the buffer is too small or contains invalid header data /// /// # Errors @@ -324,7 +324,7 @@ impl BinseqHeader { /// /// # Returns /// - /// * `Ok(BinseqHeader)` - A valid header read from the reader + /// * `Ok(FileHeader)` - A valid header read from the reader /// * `Err(Error)` - If reading from the reader failed or the header data is invalid /// /// # Errors diff --git a/src/bq/mod.rs b/src/bq/mod.rs index 39b126a..fd194f6 100644 --- a/src/bq/mod.rs +++ b/src/bq/mod.rs @@ -2,9 +2,9 @@ //! //! 
*.bq files are BINSEQ variants for **fixed-length** records and **does not support quality scores**. //! -//! For variable-length records and optional quality scores use the [`vbq`](crate::vbq) module. +//! For variable-length records and optional quality scores use the [`cbq`](crate::cbq) or [`vbq`](crate::vbq) modules. //! -//! This module contains the utilities for reading, writing, and interacting with BINSEQ files. +//! This module contains the utilities for reading, writing, and interacting with BQ files. //! //! For detailed information on the file format, see our [paper](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). //! @@ -40,84 +40,81 @@ //! #### Writing unpaired sequences //! //! ```rust -//! use binseq::bq; -//! use std::fs::File; +//! use binseq::{bq, SequencingRecordBuilder}; +//! use std::io::Cursor; //! -//! // Define a path for the output file -//! let path = "./data/some_output.bq"; +//! // Create an in-memory buffer for output +//! let output_handle = Cursor::new(Vec::new()); //! -//! // Create the file handle -//! let output_handle = File::create(path).unwrap(); +//! // Initialize our BQ header (64 bp, only primary) +//! let header = bq::FileHeaderBuilder::new().slen(64).build().unwrap(); //! -//! // Initialize our BINSEQ header (64 bp, only primary) -//! let header = bq::BinseqHeaderBuilder::new().slen(64).build().unwrap(); -//! -//! // Initialize our BINSEQ writer -//! let mut writer = bq::BinseqWriterBuilder::default() +//! // Initialize our BQ writer +//! let mut writer = bq::WriterBuilder::default() //! .header(header) //! .build(output_handle) //! .unwrap(); //! //! // Generate a random sequence //! let seq = [b'A'; 64]; -//! let flag = 0; //! -//! // Write the sequence to the file -//! writer.write_record(Some(flag), &seq).unwrap(); +//! // Build a record and write it to the file +//! let record = SequencingRecordBuilder::default() +//! .s_seq(&seq) +//! .flag(0) +//! .build() +//! .unwrap(); +//! 
writer.push(record).unwrap(); //! -//! // Close the file +//! // Flush the writer //! writer.flush().unwrap(); -//! -//! // Remove the file created -//! std::fs::remove_file(path).unwrap(); //! ``` //! //! #### Writing paired sequences //! //! ```rust -//! use binseq::bq; -//! use std::fs::File; -//! -//! // Define a path for the output file -//! let path = "./data/some_output.bq"; +//! use binseq::{bq, SequencingRecordBuilder}; +//! use std::io::Cursor; //! -//! // Create the file handle -//! let output_handle = File::create(path).unwrap(); +//! // Create an in-memory buffer for output +//! let output_handle = Cursor::new(Vec::new()); //! -//! // Initialize our BINSEQ header (64 bp and 128bp) -//! let header = bq::BinseqHeaderBuilder::new().slen(64).xlen(128).build().unwrap(); +//! // Initialize our BQ header (64 bp and 128bp) +//! let header = bq::FileHeaderBuilder::new().slen(64).xlen(128).build().unwrap(); //! -//! // Initialize our BINSEQ writer -//! let mut writer = bq::BinseqWriterBuilder::default() +//! // Initialize our BQ writer +//! let mut writer = bq::WriterBuilder::default() //! .header(header) //! .build(output_handle) //! .unwrap(); //! -//! // Generate a random sequence +//! // Generate paired sequences //! let primary = [b'A'; 64]; //! let secondary = [b'C'; 128]; -//! let flag = 0; //! -//! // Write the sequence to the file -//! writer.write_paired_record(Some(flag), &primary, &secondary).unwrap(); +//! // Build a paired record and write it to the file +//! let record = SequencingRecordBuilder::default() +//! .s_seq(&primary) +//! .x_seq(&secondary) +//! .flag(0) +//! .build() +//! .unwrap(); +//! writer.push(record).unwrap(); //! -//! // Close the file +//! // Flush the writer //! writer.flush().unwrap(); -//! -//! // Remove the file created -//! std::fs::remove_file(path).unwrap(); //! ``` //! //! # Example: Streaming Access //! //! ``` -//! use binseq::{Policy, Result, BinseqRecord}; -//! 
use binseq::bq::{BinseqHeaderBuilder, StreamReader, StreamWriterBuilder}; +//! use binseq::{Policy, Result, BinseqRecord, SequencingRecordBuilder}; +//! use binseq::bq::{FileHeaderBuilder, StreamReader, StreamWriterBuilder}; //! use std::io::{BufReader, Cursor}; //! //! fn main() -> Result<()> { //! // Create a header for sequences of length 100 -//! let header = BinseqHeaderBuilder::new().slen(100).build()?; +//! let header = FileHeaderBuilder::new().slen(100).build()?; //! //! // Create a stream writer //! let mut writer = StreamWriterBuilder::default() @@ -127,7 +124,11 @@ //! //! // Write sequences //! let sequence = b"ACGT".repeat(25); // 100 nucleotides -//! writer.write_record(Some(0), &sequence)?; +//! let record = SequencingRecordBuilder::default() +//! .s_seq(&sequence) +//! .flag(0) +//! .build()?; +//! writer.push(record)?; //! //! // Get the inner buffer //! let buffer = writer.into_inner()?; @@ -149,7 +150,7 @@ //! //! ## BQ file format //! -//! A BINSEQ file consists of two sections: +//! A BQ file consists of two sections: //! //! 1. Fixed-size header (32 bytes) //! 2. 
Record data section @@ -240,6 +241,6 @@ mod header; mod reader; mod writer; -pub use header::{BinseqHeader, BinseqHeaderBuilder, SIZE_HEADER}; +pub use header::{FileHeader, FileHeaderBuilder, SIZE_HEADER}; pub use reader::{MmapReader, RefRecord, StreamReader}; -pub use writer::{BinseqWriter, BinseqWriterBuilder, Encoder, StreamWriter, StreamWriterBuilder}; +pub use writer::{Encoder, StreamWriter, StreamWriterBuilder, Writer, WriterBuilder}; diff --git a/src/bq/reader.rs b/src/bq/reader.rs index 93f36b0..fc28cf9 100644 --- a/src/bq/reader.rs +++ b/src/bq/reader.rs @@ -17,10 +17,10 @@ use bitnuc::BitSize; use bytemuck::cast_slice; use memmap2::Mmap; -use super::header::{BinseqHeader, SIZE_HEADER}; +use super::header::{FileHeader, SIZE_HEADER}; use crate::{ + BinseqRecord, DEFAULT_QUALITY_SCORE, Error, ParallelProcessor, ParallelReader, error::{ReadError, Result}, - BinseqRecord, Error, ParallelProcessor, ParallelReader, }; /// A reference to a binary sequence record in a memory-mapped file @@ -39,6 +39,8 @@ pub struct RefRecord<'a> { id: u64, /// The underlying u64 buffer representing the record's binary data buffer: &'a [u64], + /// Reusable default quality buffer + qbuf: &'a [u8], /// The configuration that defines the layout and size of record components config: RecordConfig, /// Cached index string for the sequence header @@ -59,11 +61,12 @@ impl<'a> RefRecord<'a> { /// /// Panics if the buffer length doesn't match the expected size from the config #[must_use] - pub fn new(id: u64, buffer: &'a [u64], config: RecordConfig) -> Self { + pub fn new(id: u64, buffer: &'a [u64], qbuf: &'a [u8], config: RecordConfig) -> Self { assert_eq!(buffer.len(), config.record_size_u64()); Self { id, buffer, + qbuf, config, header_buf: [0; 20], header_len: 0, @@ -127,6 +130,12 @@ impl BinseqRecord for RefRecord<'_> { &self.buffer[self.config.schunk as usize..] 
} } + fn squal(&self) -> &[u8] { + &self.qbuf[..self.config.slen as usize] + } + fn xqual(&self) -> &[u8] { + &self.qbuf[..self.config.xlen as usize] + } } /// A reference to a record in the map with a precomputed decoded buffer slice @@ -139,6 +148,8 @@ pub struct BatchRecord<'a> { id: u64, /// The configuration that defines the layout and size of record components config: RecordConfig, + /// A reusable pre-initialized quality score buffer + qbuf: &'a [u8], /// Cached index string for the sequence header header_buf: [u8; 20], /// Length of the header in bytes @@ -218,6 +229,12 @@ impl BinseqRecord for BatchRecord<'_> { } &self.dbuf[lbound..rbound] } + fn squal(&self) -> &[u8] { + &self.qbuf[..self.config.slen()] + } + fn xqual(&self) -> &[u8] { + &self.qbuf[..self.config.xlen()] + } } /// Configuration for binary sequence record layout @@ -281,12 +298,12 @@ impl RecordConfig { /// /// # Arguments /// - /// * `header` - A reference to a `BinseqHeader` containing sequence lengths + /// * `header` - A reference to a `FileHeader` containing sequence lengths /// /// # Returns /// /// A new `RecordConfig` instance with the sequence lengths from the header - pub fn from_header(header: &BinseqHeader) -> Self { + pub fn from_header(header: &FileHeader) -> Self { Self::new( header.slen as usize, header.xlen as usize, @@ -394,10 +411,16 @@ pub struct MmapReader { mmap: Arc, /// Binary sequence file header containing format information - header: BinseqHeader, + header: FileHeader, /// Configuration defining the layout of records in the file config: RecordConfig, + + /// Reusable buffer for quality scores + qbuf: Vec, + + /// Default quality score for records without quality scores + default_quality_score: u8, } impl MmapReader { @@ -433,7 +456,7 @@ impl MmapReader { let mmap = unsafe { Mmap::map(&file)? 
}; // Read header from mapped memory - let header = BinseqHeader::from_buffer(&mmap)?; + let header = FileHeader::from_buffer(&mmap)?; // Record configuraration let config = RecordConfig::from_header(&header); @@ -443,10 +466,15 @@ impl MmapReader { return Err(ReadError::FileTruncation(mmap.len()).into()); } + // preinitialize quality buffer + let qbuf = vec![DEFAULT_QUALITY_SCORE; header.slen.max(header.xlen) as usize]; + Ok(Self { mmap: Arc::new(mmap), header, config, + qbuf, + default_quality_score: DEFAULT_QUALITY_SCORE, }) } @@ -463,7 +491,7 @@ impl MmapReader { /// /// The header contains format information and sequence length specifications. #[must_use] - pub fn header(&self) -> BinseqHeader { + pub fn header(&self) -> FileHeader { self.header } @@ -473,6 +501,18 @@ impl MmapReader { self.header.is_paired() } + /// Sets the default quality score for records without quality information + pub fn set_default_quality_score(&mut self, score: u8) { + self.default_quality_score = score; + self.qbuf = self.build_qbuf(); + } + + /// Creates a new quality score buffer + #[must_use] + pub fn build_qbuf(&self) -> Vec { + vec![self.default_quality_score; self.header.slen.max(self.header.xlen) as usize] + } + /// Returns a reference to a specific record /// /// # Arguments @@ -489,14 +529,18 @@ impl MmapReader { /// Returns an error if the requested index is beyond the number of records in the file pub fn get(&self, idx: usize) -> Result> { if idx > self.num_records() { - return Err(ReadError::OutOfRange(idx, self.num_records()).into()); + return Err(ReadError::OutOfRange { + requested_index: idx, + max_index: self.num_records(), + } + .into()); } let rsize = self.config.record_size_bytes(); let lbound = SIZE_HEADER + (idx * rsize); let rbound = lbound + rsize; let bytes = &self.mmap[lbound..rbound]; let buffer = cast_slice(bytes); - Ok(RefRecord::new(idx as u64, buffer, self.config)) + Ok(RefRecord::new(idx as u64, buffer, &self.qbuf, self.config)) } /// Returns a slice 
of the buffer containing the underlying u64 for that range @@ -505,7 +549,11 @@ impl MmapReader { /// Note: range 10..40 will return all u64s in the mmap between the record index 10 and 40 pub fn get_buffer_slice(&self, range: Range) -> Result<&[u64]> { if range.end > self.num_records() { - return Err(ReadError::OutOfRange(range.end, self.num_records()).into()); + return Err(ReadError::OutOfRange { + requested_index: range.end, + max_index: self.num_records(), + } + .into()); } let rsize = self.config.record_size_bytes(); let total_records = range.end - range.start; @@ -532,7 +580,7 @@ pub struct StreamReader { reader: R, /// Binary sequence file header containing format information - header: Option, + header: Option, /// Configuration defining the layout of records in the file config: Option, @@ -540,6 +588,12 @@ pub struct StreamReader { /// Buffer for storing incoming data buffer: Vec, + /// Buffer for reusable quality scores + qbuf: Vec, + + /// Default quality score for records without quality information + default_quality_score: u8, + /// Current position in the buffer buffer_pos: usize, @@ -583,10 +637,19 @@ impl StreamReader { header: None, config: None, buffer: vec![0; capacity], + qbuf: vec![0; capacity], buffer_pos: 0, buffer_len: 0, - // buffer_capacity: capacity, + default_quality_score: DEFAULT_QUALITY_SCORE, + } + } + + /// Sets the default quality score for records without quality information + pub fn set_default_quality_score(&mut self, score: u8) { + if score != self.default_quality_score { + self.qbuf.clear(); } + self.default_quality_score = score; } /// Reads and validates the header from the underlying reader @@ -596,7 +659,7 @@ impl StreamReader { /// /// # Returns /// - /// * `Ok(&BinseqHeader)` - A reference to the validated header + /// * `Ok(&FileHeader)` - A reference to the validated header /// * `Err(Error)` - If reading or validating the header fails /// /// # Panics @@ -609,7 +672,7 @@ impl StreamReader { /// * There is an I/O error 
when reading from the source /// * The header data is invalid /// * End of stream is reached before the full header can be read - pub fn read_header(&mut self) -> Result<&BinseqHeader> { + pub fn read_header(&mut self) -> Result<&FileHeader> { if self.header.is_some() { return Ok(self .header @@ -624,7 +687,7 @@ impl StreamReader { // Parse header let header_slice = &self.buffer[self.buffer_pos..self.buffer_pos + SIZE_HEADER]; - let header = BinseqHeader::from_buffer(header_slice)?; + let header = FileHeader::from_buffer(header_slice)?; self.header = Some(header); self.config = Some(RecordConfig::from_header(&header)); @@ -692,10 +755,10 @@ impl StreamReader { /// * The data format is invalid pub fn next_record(&mut self) -> Option>> { // Ensure header is read - if self.header.is_none() { - if let Some(e) = self.read_header().err() { - return Some(Err(e)); - } + if self.header.is_none() + && let Some(e) = self.read_header().err() + { + return Some(Err(e)); } let config = self @@ -728,9 +791,20 @@ impl StreamReader { let record_bytes = &self.buffer[record_start..record_start + record_size]; let record_u64s = cast_slice(record_bytes); + // update quality score buffer if necessary + if self.qbuf.is_empty() { + let max_size = config.slen.max(config.xlen) as usize; + self.qbuf.resize(max_size, self.default_quality_score); + } + // Create record with incremental ID (based on read position) let id = (record_start - SIZE_HEADER) / record_size; - Some(Ok(RefRecord::new(id as u64, record_u64s, config))) + Some(Ok(RefRecord::new( + id as u64, + record_u64s, + &self.qbuf, + config, + ))) } /// Consumes the stream reader and returns the inner reader @@ -817,9 +891,7 @@ impl ParallelReader for MmapReader { // Validate range let num_records = self.num_records(); - if range.start >= num_records || range.end > num_records || range.start >= range.end { - return Ok(()); // Nothing to process or invalid range - } + self.validate_range(num_records, &range)?; // Calculate number of 
records for each thread within the range let range_size = range.end - range.start; @@ -849,6 +921,9 @@ impl ParallelReader for MmapReader { // initialize a decoding buffer let mut dbuf = Vec::new(); + // initialize a quality score buffer + let qbuf = reader.build_qbuf(); + // calculate the size of a record in the cast u64 slice let rsize_u64 = reader.config.record_size_bytes() / 8; @@ -895,6 +970,7 @@ impl ParallelReader for MmapReader { let record = BatchRecord { buffer: &ebuf[ebuf_start..(ebuf_start + rsize_u64)], dbuf: &dbuf[dbuf_start..(dbuf_start + dbuf_rsize)], + qbuf: &qbuf, id: idx as u64, config: reader.config, header_buf, @@ -925,3 +1001,312 @@ impl ParallelReader for MmapReader { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::BinseqRecord; + use bitnuc::BitSize; + + const TEST_BQ_FILE: &str = "./data/subset.bq"; + + // ==================== MmapReader Basic Tests ==================== + + #[test] + fn test_mmap_reader_new() { + let reader = MmapReader::new(TEST_BQ_FILE); + assert!(reader.is_ok(), "Failed to create reader"); + } + + #[test] + fn test_mmap_reader_num_records() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let num_records = reader.num_records(); + assert!(num_records > 0, "Expected non-zero records"); + } + + #[test] + fn test_mmap_reader_is_paired() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let is_paired = reader.is_paired(); + // Test that the method returns a boolean + assert!(is_paired || !is_paired); // Always true, tests the method works + } + + #[test] + fn test_mmap_reader_header_access() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let header = reader.header(); + assert!(header.slen > 0, "Expected non-zero sequence length"); + } + + #[test] + fn test_mmap_reader_config_access() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let header = reader.header(); + let config = RecordConfig::from_header(&header); + assert!( + config.slen > 0, + "Expected non-zero 
sequence length in config" + ); + } + + // ==================== Record Access Tests ==================== + + #[test] + fn test_get_record() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let num_records = reader.num_records(); + + if num_records > 0 { + let record = reader.get(0); + assert!(record.is_ok(), "Expected to get first record"); + + let record = record.unwrap(); + assert_eq!(record.index(), 0, "Expected record index to be 0"); + } + } + + #[test] + fn test_get_record_out_of_bounds() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let num_records = reader.num_records(); + + let record = reader.get(num_records + 100); + assert!(record.is_err(), "Expected error for out of bounds index"); + } + + #[test] + fn test_record_sequence_data() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + + if let Ok(record) = reader.get(0) { + let sbuf = record.sbuf(); + assert!(!sbuf.is_empty(), "Expected non-empty sequence buffer"); + + let slen = record.slen(); + assert!(slen > 0, "Expected non-zero sequence length"); + } + } + + #[test] + fn test_record_quality_data() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + + if let Ok(record) = reader.get(0) { + let squal = record.squal(); + let slen = record.slen() as usize; + assert_eq!( + squal.len(), + slen, + "Quality length should match sequence length" + ); + } + } + + // ==================== Default Quality Score Tests ==================== + + #[test] + fn test_set_default_quality_score() { + let mut reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let custom_score = 42u8; + + reader.set_default_quality_score(custom_score); + + if let Ok(record) = reader.get(0) { + let squal = record.squal(); + // All quality scores should be the custom score + assert!( + squal.iter().all(|&q| q == custom_score), + "All quality scores should be {}", + custom_score + ); + } + } + + // ==================== Parallel Processing Tests ==================== + + #[derive(Clone)] + struct 
CountingProcessor { + count: Arc>, + } + + impl ParallelProcessor for CountingProcessor { + fn process_record(&mut self, _record: R) -> Result<()> { + let mut count = self.count.lock().unwrap(); + *count += 1; + Ok(()) + } + } + + #[test] + fn test_parallel_processing() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let num_records = reader.num_records(); + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = CountingProcessor { + count: count.clone(), + }; + + reader.process_parallel(processor, 2).unwrap(); + + let final_count = *count.lock().unwrap(); + assert_eq!(final_count, num_records, "All records should be processed"); + } + + #[test] + fn test_parallel_processing_range() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let num_records = reader.num_records(); + + if num_records >= 100 { + let start = 10; + let end = 50; + let expected_count = end - start; + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = CountingProcessor { + count: count.clone(), + }; + + reader + .process_parallel_range(processor, 2, start..end) + .unwrap(); + + let final_count = *count.lock().unwrap(); + assert_eq!( + final_count, expected_count, + "Should process exactly {} records", + expected_count + ); + } + } + + // ==================== RecordConfig Tests ==================== + + #[test] + fn test_record_config_from_header() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let header = reader.header(); + let config = RecordConfig::from_header(&header); + + assert_eq!(config.slen, header.slen as u64, "Sequence length mismatch"); + assert_eq!(config.xlen, header.xlen as u64, "Extended length mismatch"); + assert_eq!(config.bitsize, header.bits, "Bit size mismatch"); + } + + #[test] + fn test_record_config_record_size() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let header = reader.header(); + let config = RecordConfig::from_header(&header); + + let size_u64 = config.record_size_u64(); + assert!(size_u64 
> 0, "Record size should be non-zero"); + + let size_bytes = config.record_size_bytes(); + assert_eq!(size_bytes, size_u64 * 8, "Byte size should be 8x u64 size"); + } + + // ==================== RefRecord Tests ==================== + + #[test] + fn test_ref_record_bitsize() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + + if let Ok(record) = reader.get(0) { + let bitsize = record.bitsize(); + assert!( + matches!(bitsize, BitSize::Two | BitSize::Four), + "Bitsize should be Two or Four" + ); + } + } + + #[test] + fn test_ref_record_flag() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + + if let Ok(record) = reader.get(0) { + let flag = record.flag(); + // Flag should be Some if header has flags enabled + assert!(flag.is_some() || flag.is_none()); // Tests method works + } + } + + #[test] + fn test_ref_record_paired_data() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + + if reader.is_paired() { + if let Ok(record) = reader.get(0) { + let xbuf = record.xbuf(); + let xlen = record.xlen(); + + if xlen > 0 { + assert!( + !xbuf.is_empty(), + "Extended buffer should not be empty for paired" + ); + } + } + } + } + + // ==================== Error Handling Tests ==================== + + #[test] + fn test_nonexistent_file() { + let result = MmapReader::new("./data/nonexistent.bq"); + assert!(result.is_err(), "Should fail on nonexistent file"); + } + + #[test] + fn test_invalid_file_format() { + // Try to open a non-BQ file as BQ (use Cargo.toml for example) + let result = MmapReader::new("./Cargo.toml"); + // This should either fail to open or fail validation + if let Ok(reader) = result { + // If it opens, try to access records (should fail or have issues) + let num_records = reader.num_records(); + // The number might be nonsensical for invalid data + let _ = num_records; // Just verify it doesn't panic + } + } + + // ==================== Multiple Records Tests ==================== + + #[test] + fn test_sequential_record_access() { + let 
reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let num_records = reader.num_records().min(10); + + for i in 0..num_records { + let record = reader.get(i); + assert!(record.is_ok(), "Should get record at index {}", i); + assert_eq!( + record.unwrap().index() as usize, + i, + "Record index mismatch at {}", + i + ); + } + } + + #[test] + fn test_random_record_access() { + let reader = MmapReader::new(TEST_BQ_FILE).unwrap(); + let num_records = reader.num_records(); + + if num_records > 10 { + let indices = [0, 5, num_records / 2, num_records - 1]; + + for &idx in &indices { + let record = reader.get(idx); + assert!(record.is_ok(), "Should get record at index {}", idx); + assert_eq!(record.unwrap().index() as usize, idx); + } + } + } +} diff --git a/src/bq/writer.rs b/src/bq/writer.rs index 96a978f..084d215 100644 --- a/src/bq/writer.rs +++ b/src/bq/writer.rs @@ -10,12 +10,12 @@ use std::io::{BufWriter, Write}; use byteorder::{LittleEndian, WriteBytesExt}; -use rand::{rngs::SmallRng, SeedableRng}; +use rand::{SeedableRng, rngs::SmallRng}; -use super::BinseqHeader; +use super::FileHeader; use crate::{ + Policy, RNG_SEED, SequencingRecord, error::{Result, WriteError}, - Policy, RNG_SEED, }; /// Writes a single flag value to a writer in little-endian format @@ -66,7 +66,7 @@ pub fn write_buffer(writer: &mut W, ebuf: &[u64]) -> Result<()> { #[derive(Clone)] pub struct Encoder { /// Header containing sequence length and format information - header: BinseqHeader, + header: FileHeader, /// Buffers for storing encoded nucleotides in 2-bit format /// Each u64 can store 32 nucleotides (64 bits / 2 bits per nucleotide) @@ -95,12 +95,12 @@ impl Encoder { /// # Examples /// /// ``` - /// # use binseq::bq::{BinseqHeaderBuilder, Encoder}; - /// let header = BinseqHeaderBuilder::new().slen(100).build().unwrap(); + /// # use binseq::bq::{FileHeaderBuilder, Encoder}; + /// let header = FileHeaderBuilder::new().slen(100).build().unwrap(); /// let encoder = Encoder::new(header); /// 
``` #[must_use] - pub fn new(header: BinseqHeader) -> Self { + pub fn new(header: FileHeader) -> Self { Self::with_policy(header, Policy::default()) } @@ -114,13 +114,13 @@ impl Encoder { /// # Examples /// /// ``` - /// # use binseq::bq::{BinseqHeaderBuilder, Encoder}; + /// # use binseq::bq::{FileHeaderBuilder, Encoder}; /// # use binseq::Policy; - /// let header = BinseqHeaderBuilder::new().slen(100).build().unwrap(); + /// let header = FileHeaderBuilder::new().slen(100).build().unwrap(); /// let encoder = Encoder::with_policy(header, Policy::SetToA); /// ``` #[must_use] - pub fn with_policy(header: BinseqHeader, policy: Policy) -> Self { + pub fn with_policy(header: FileHeader, policy: Policy) -> Self { Self { header, policy, @@ -132,6 +132,12 @@ impl Encoder { } } + /// Returns whether the header is paired-end. + #[must_use] + pub fn is_paired(&self) -> bool { + self.header.is_paired() + } + /// Encodes a single sequence as 2-bit. /// /// Will return `None` if the sequence is invalid and the policy does not allow correction. @@ -219,7 +225,7 @@ impl Encoder { } } -/// Builder for creating configured `BinseqWriter` instances +/// Builder for creating configured `Writer` instances /// /// This builder provides a flexible way to create writers with various /// configurations. 
It follows the builder pattern, allowing for optional @@ -229,10 +235,10 @@ impl Encoder { /// /// ``` /// # use binseq::{Policy, Result}; -/// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriterBuilder}; +/// # use binseq::bq::{FileHeaderBuilder, WriterBuilder}; /// # fn main() -> Result<()> { -/// let header = BinseqHeaderBuilder::new().slen(100).build()?; -/// let writer = BinseqWriterBuilder::default() +/// let header = FileHeaderBuilder::new().slen(100).build()?; +/// let writer = WriterBuilder::default() /// .header(header) /// .policy(Policy::SetToA) /// .headless(false) @@ -241,17 +247,17 @@ impl Encoder { /// # } /// ``` #[derive(Default)] -pub struct BinseqWriterBuilder { +pub struct WriterBuilder { /// Required header defining sequence lengths and format - header: Option, + header: Option, /// Optional policy for handling invalid nucleotides policy: Option, /// Optional headless mode for parallel writing scenarios headless: Option, } -impl BinseqWriterBuilder { +impl WriterBuilder { #[must_use] - pub fn header(mut self, header: BinseqHeader) -> Self { + pub fn header(mut self, header: FileHeader) -> Self { self.header = Some(header); self } @@ -268,11 +274,11 @@ impl BinseqWriterBuilder { self } - pub fn build(self, inner: W) -> Result> { + pub fn build(self, inner: W) -> Result> { let Some(header) = self.header else { return Err(WriteError::MissingHeader.into()); }; - BinseqWriter::new( + Writer::new( inner, header, self.policy.unwrap_or_default(), @@ -295,7 +301,7 @@ impl BinseqWriterBuilder { /// /// * `W` - The underlying writer type that implements `Write` #[derive(Clone)] -pub struct BinseqWriter { +pub struct Writer { /// The underlying writer for output inner: W, @@ -306,11 +312,11 @@ pub struct BinseqWriter { /// When true, the header is not written to the output headless: bool, } -impl BinseqWriter { - /// Creates a new `BinseqWriter` instance with specified configuration +impl Writer { + /// Creates a new `Writer` instance with specified 
configuration /// /// This is a low-level constructor. For a more convenient way to create a - /// `BinseqWriter`, use the `BinseqWriterBuilder` struct. + /// `Writer`, use the `WriterBuilder` struct. /// /// # Arguments /// @@ -321,17 +327,17 @@ impl BinseqWriter { /// /// # Returns /// - /// * `Ok(BinseqWriter)` - A new writer instance + /// * `Ok(Writer)` - A new writer instance /// * `Err(Error)` - If writing the header fails /// /// # Examples /// /// ``` - /// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriter}; + /// # use binseq::bq::{FileHeaderBuilder, Writer}; /// # use binseq::{Result, Policy}; /// # fn main() -> Result<()> { - /// let header = BinseqHeaderBuilder::new().slen(100).build()?; - /// let writer = BinseqWriter::new( + /// let header = FileHeaderBuilder::new().slen(100).build()?; + /// let writer = Writer::new( /// Vec::new(), /// header, /// Policy::default(), @@ -340,7 +346,7 @@ impl BinseqWriter { /// # Ok(()) /// # } /// ``` - pub fn new(mut inner: W, header: BinseqHeader, policy: Policy, headless: bool) -> Result { + pub fn new(mut inner: W, header: FileHeader, policy: Policy, headless: bool) -> Result { if !headless { header.write_bytes(&mut inner)?; } @@ -351,6 +357,21 @@ impl BinseqWriter { }) } + /// Returns whether the header is paired-end. + pub fn is_paired(&self) -> bool { + self.encoder.is_paired() + } + + /// Returns the header of the writer + pub fn header(&self) -> FileHeader { + self.encoder.header + } + + /// Returns the N-policy of the writer + pub fn policy(&self) -> Policy { + self.encoder.policy + } + /// Writes a single record to the output /// /// This method encodes and writes a primary sequence along with an associated flag. 
@@ -365,6 +386,7 @@ impl BinseqWriter { /// * `Ok(true)` if the record was written successfully /// * `Ok(false)` if the record was not written because it was empty /// * `Err(WriteError::FlagSet)` if the flag is set but no flag value is provided + #[deprecated] pub fn write_record(&mut self, flag: Option, primary: &[u8]) -> Result { let has_flag = self.encoder.header.flags; if let Some(sbuffer) = self.encoder.encode_single(primary)? { @@ -391,6 +413,7 @@ impl BinseqWriter { /// /// # Returns /// * `Result` - A result indicating whether the write was successful or not + #[deprecated] pub fn write_paired_record( &mut self, flag: Option, @@ -410,6 +433,78 @@ impl BinseqWriter { } } + /// Writes a record using the unified [`SequencingRecord`] API + /// + /// This method provides a consistent interface with VBQ and CBQ writers. + /// Note that BQ format does not support quality scores or headers - these + /// fields from the record will be ignored. + /// + /// # Arguments + /// + /// * `record` - A [`SequencingRecord`] containing the sequence data to write + /// + /// # Returns + /// + /// * `Ok(true)` if the record was written successfully + /// * `Ok(false)` if the record was skipped due to invalid nucleotides + /// * `Err(_)` if writing failed + /// + /// # Examples + /// + /// ``` + /// # use binseq::bq::{FileHeaderBuilder, WriterBuilder}; + /// # use binseq::{Result, SequencingRecordBuilder}; + /// # fn main() -> Result<()> { + /// let header = FileHeaderBuilder::new().slen(8).build()?; + /// let mut writer = WriterBuilder::default() + /// .header(header) + /// .build(Vec::new())?; + /// + /// let record = SequencingRecordBuilder::default() + /// .s_seq(b"ACGTACGT") + /// .flag(42) + /// .build()?; + /// + /// writer.push(record)?; + /// # Ok(()) + /// # } + /// ``` + pub fn push(&mut self, record: SequencingRecord) -> Result { + let has_flag = self.encoder.header.flags; + if has_flag { + write_flag(&mut self.inner, record.flag().unwrap_or(0))?; + } + + // Check 
paired status - writer can require paired (record must have R2), + // but if writer is single-end, we simply ignore any R2 data in the record. + if self.encoder.header.is_paired() && !record.is_paired() { + return Err(WriteError::ConfigurationMismatch { + attribute: "paired", + expected: self.encoder.header.is_paired(), + actual: record.is_paired(), + } + .into()); + } + + if self.encoder.header.is_paired() { + if let Some((sbuffer, xbuffer)) = self + .encoder + .encode_paired(record.s_seq, record.x_seq.unwrap_or_default())? + { + write_buffer(&mut self.inner, sbuffer)?; + write_buffer(&mut self.inner, xbuffer)?; + Ok(true) + } else { + Ok(false) + } + } else if let Some(buffer) = self.encoder.encode_single(record.s_seq)? { + write_buffer(&mut self.inner, buffer)?; + Ok(true) + } else { + Ok(false) + } + } + /// Consumes the writer and returns the underlying writer /// /// This is useful when you need to access the underlying writer after @@ -418,11 +513,11 @@ impl BinseqWriter { /// # Examples /// /// ``` - /// # use binseq::bq::{BinseqHeaderBuilder, BinseqWriterBuilder}; + /// # use binseq::bq::{FileHeaderBuilder, WriterBuilder}; /// # use binseq::Result; /// # fn main() -> Result<()> { - /// let header = BinseqHeaderBuilder::new().slen(100).build()?; - /// let writer = BinseqWriterBuilder::default() + /// let header = FileHeaderBuilder::new().slen(100).build()?; + /// let writer = WriterBuilder::default() /// .header(header) /// .build(Vec::new())?; /// @@ -438,7 +533,7 @@ impl BinseqWriter { /// Gets a mutable reference to the underlying writer /// /// This allows direct access to the underlying writer while retaining - /// ownership of the `BinseqWriter`. + /// ownership of the `Writer`. 
pub fn by_ref(&mut self) -> &mut W { &mut self.inner } @@ -496,7 +591,7 @@ impl BinseqWriter { /// /// * `Ok(())` - If the contents were successfully ingested /// * `Err(Error)` - If writing the contents failed - pub fn ingest(&mut self, other: &mut BinseqWriter>) -> Result<()> { + pub fn ingest(&mut self, other: &mut Writer>) -> Result<()> { let other_inner = other.by_ref(); self.inner.write_all(other_inner)?; other_inner.clear(); @@ -512,11 +607,11 @@ impl BinseqWriter { /// - Processing very large datasets /// - Pipeline processing /// -/// The `StreamWriter` is a specialized version of `BinseqWriter` that +/// The `StreamWriter` is a specialized version of `Writer` that /// adds internal buffering and is optimized for streaming scenarios. pub struct StreamWriter { /// The underlying writer for processing sequences - writer: BinseqWriter>, + writer: Writer>, } impl StreamWriter { @@ -536,7 +631,7 @@ impl StreamWriter { /// /// * `Ok(StreamWriter)` - A new streaming writer /// * `Err(Error)` - If initialization fails - pub fn new(inner: W, header: BinseqHeader, policy: Policy, headless: bool) -> Result { + pub fn new(inner: W, header: FileHeader, policy: Policy, headless: bool) -> Result { Self::with_capacity(inner, 8192, header, policy, headless) } @@ -560,29 +655,38 @@ impl StreamWriter { pub fn with_capacity( inner: W, capacity: usize, - header: BinseqHeader, + header: FileHeader, policy: Policy, headless: bool, ) -> Result { let buffered = BufWriter::with_capacity(capacity, inner); - let writer = BinseqWriter::new(buffered, header, policy, headless)?; + let writer = Writer::new(buffered, header, policy, headless)?; Ok(Self { writer }) } + #[deprecated(note = "use `push` method with SequencingRecord instead")] pub fn write_record(&mut self, flag: Option, primary: &[u8]) -> Result { + #[allow(deprecated)] self.writer.write_record(flag, primary) } + #[deprecated(note = "use `push` method with SequencingRecord instead")] pub fn write_paired_record( &mut self, 
flag: Option, primary: &[u8], extended: &[u8], ) -> Result { + #[allow(deprecated)] self.writer.write_paired_record(flag, primary, extended) } + /// Writes a record using the unified [`SequencingRecord`] API + pub fn push(&mut self, record: SequencingRecord) -> Result { + self.writer.push(record) + } + /// Flushes any buffered data to the underlying writer /// /// # Returns @@ -620,7 +724,7 @@ impl StreamWriter { #[derive(Default)] pub struct StreamWriterBuilder { /// Required header defining sequence lengths and format - header: Option, + header: Option, /// Optional policy for handling invalid nucleotides policy: Option, /// Optional headless mode for parallel writing scenarios @@ -632,7 +736,7 @@ pub struct StreamWriterBuilder { impl StreamWriterBuilder { /// Sets the header for the writer #[must_use] - pub fn header(mut self, header: BinseqHeader) -> Self { + pub fn header(mut self, header: FileHeader) -> Self { self.header = Some(header); self } @@ -690,13 +794,13 @@ mod testing { use std::{fs::File, io::BufWriter}; use super::*; - use crate::bq::{BinseqHeaderBuilder, SIZE_HEADER}; + use crate::bq::{FileHeaderBuilder, SIZE_HEADER}; #[test] fn test_headless() -> Result<()> { let inner = Vec::new(); - let mut writer = BinseqWriterBuilder::default() - .header(BinseqHeaderBuilder::new().slen(32).build()?) + let mut writer = WriterBuilder::default() + .header(FileHeaderBuilder::new().slen(32).build()?) .headless(true) .build(inner)?; assert!(writer.is_headless()); @@ -708,8 +812,8 @@ mod testing { #[test] fn test_not_headless() -> Result<()> { let inner = Vec::new(); - let mut writer = BinseqWriterBuilder::default() - .header(BinseqHeaderBuilder::new().slen(32).build()?) + let mut writer = WriterBuilder::default() + .header(FileHeaderBuilder::new().slen(32).build()?) 
.build(inner)?; assert!(!writer.is_headless()); let inner = writer.by_ref(); @@ -719,8 +823,8 @@ mod testing { #[test] fn test_stdout() -> Result<()> { - let writer = BinseqWriterBuilder::default() - .header(BinseqHeaderBuilder::new().slen(32).build()?) + let writer = WriterBuilder::default() + .header(FileHeaderBuilder::new().slen(32).build()?) .build(std::io::stdout())?; assert!(!writer.is_headless()); Ok(()) @@ -730,8 +834,8 @@ mod testing { fn test_to_path() -> Result<()> { let path = "test_to_path.file"; let inner = File::create(path).map(BufWriter::new)?; - let mut writer = BinseqWriterBuilder::default() - .header(BinseqHeaderBuilder::new().slen(32).build()?) + let mut writer = WriterBuilder::default() + .header(FileHeaderBuilder::new().slen(32).build()?) .build(inner)?; assert!(!writer.is_headless()); let inner = writer.by_ref(); @@ -747,7 +851,7 @@ mod testing { fn test_stream_writer() -> Result<()> { let inner = Vec::new(); let writer = StreamWriterBuilder::default() - .header(BinseqHeaderBuilder::new().slen(32).build()?) + .header(FileHeaderBuilder::new().slen(32).build()?) .buffer_capacity(16384) .build(inner)?; diff --git a/src/cbq/core/block.rs b/src/cbq/core/block.rs new file mode 100644 index 0000000..96b75b9 --- /dev/null +++ b/src/cbq/core/block.rs @@ -0,0 +1,960 @@ +use std::io; + +use bitnuc::BitSize; +use bytemuck::{cast_slice, cast_slice_mut}; +use sucds::Serializable; +use sucds::mii_sequences::{EliasFano, EliasFanoBuilder}; +use zstd::stream::copy_decode; +use zstd::zstd_safe; + +use crate::cbq::core::utils::sized_compress; +use crate::error::{CbqError, WriteError}; +use crate::{BinseqRecord, DEFAULT_QUALITY_SCORE, Result}; + +use super::utils::{Span, calculate_offsets, extension_read, resize_uninit, slice_and_increment}; +use super::{BlockHeader, BlockRange, FileHeader}; +use crate::SequencingRecord; + +/// A block of records where all data is stored in separate columns. 
+#[derive(Clone, Default)] +pub struct ColumnarBlock { + /// Separate columns for each data type + seq: Vec, + flags: Vec, + headers: Vec, + qual: Vec, + + /// Length of sequences for each record + pub(crate) l_seq: Vec, + /// Length of headers for each record + pub(crate) l_headers: Vec, + /// Position of all N's in the sequence + pub(crate) npos: Vec, + + /// Reusable buffer for encoding sequences + ebuf: Vec, + + /// An Elias-Fano encoding for the N-positions + pub(crate) ef: Option, + /// Reusable buffer for encoding Elias-Fano struct + pub(crate) ef_bytes: Vec, + /// Length of serialized Elias-Fano encoding in bytes + pub(crate) len_nef: usize, + + // Reusable zstd compression buffer for columnar data + pub(crate) z_seq_len: Vec, + pub(crate) z_header_len: Vec, + pub(crate) z_npos: Vec, + pub(crate) z_seq: Vec, + pub(crate) z_flags: Vec, + pub(crate) z_headers: Vec, + pub(crate) z_qual: Vec, + + // reusable offset buffers + l_seq_offsets: Vec, + l_header_offsets: Vec, + + /// Number of records in the block + /// + /// A record is a logical unit of data. + /// If the records are paired sequences this is the number of pairs. + pub(crate) num_records: usize, + + /// Number of sequences in the block + /// + /// This is the same as the number of records for unpaired sequences. + /// For paired sequences it will be twice the number of records. 
+ pub(crate) num_sequences: usize, + + /// Total nucleotides in this block + pub(crate) nuclen: usize, + /// Number of npos positions + pub(crate) num_npos: usize, + /// Current size of this block (virtual) + current_size: usize, + + /// Reusable buffer for missing quality scores + qbuf: Vec, + default_quality_score: u8, + + /// The file header (used for block configuration) + /// + /// Not to be confused with the `BlockHeader` + pub(crate) header: FileHeader, +} +impl ColumnarBlock { + /// Create a new columnar block with the given block size + #[must_use] + pub fn new(header: FileHeader) -> Self { + Self { + header, + default_quality_score: DEFAULT_QUALITY_SCORE, + ..Default::default() + } + } + + /// Update the default quality score for this block + pub fn set_default_quality_score(&mut self, score: u8) { + self.default_quality_score = score; + self.qbuf.clear(); + } + + fn is_empty(&self) -> bool { + self.current_size == 0 + } + + /// Clears the internal data structures + pub(crate) fn clear(&mut self) { + // clear index counters + { + self.nuclen = 0; + self.num_sequences = 0; + self.num_records = 0; + self.current_size = 0; + self.num_npos = 0; + } + + // clear spans + { + self.l_seq.clear(); + self.l_headers.clear(); + self.l_seq_offsets.clear(); + self.l_header_offsets.clear(); + } + + // clear vectors + { + self.seq.clear(); + self.flags.clear(); + self.headers.clear(); + self.qual.clear(); + self.npos.clear(); + self.ef = None; + } + + // clear encodings + { + self.ebuf.clear(); + self.z_seq_len.clear(); + self.z_header_len.clear(); + self.z_npos.clear(); + self.z_seq.clear(); + self.z_flags.clear(); + self.z_headers.clear(); + self.z_qual.clear(); + self.ef_bytes.clear(); + } + } + + fn add_sequence(&mut self, record: &SequencingRecord) -> Result<()> { + self.l_seq.push(record.s_seq.len() as u64); + self.seq.extend_from_slice(record.s_seq); + self.num_sequences += 1; + + if self.header.is_paired() { + let Some(x_seq) = record.x_seq else { + return 
Err(WriteError::ConfigurationMismatch { + attribute: "x_seq", + expected: true, + actual: false, + } + .into()); + }; + self.l_seq.push(x_seq.len() as u64); + self.seq.extend_from_slice(x_seq); + self.num_sequences += 1; + } + + // keep the sequence size up to date + self.nuclen = self.seq.len(); + Ok(()) + } + + fn add_flag(&mut self, record: &SequencingRecord) -> Result<()> { + if self.header.has_flags() { + let Some(flag) = record.flag else { + return Err(WriteError::ConfigurationMismatch { + attribute: "flag", + expected: true, + actual: false, + } + .into()); + }; + self.flags.push(flag); + } + Ok(()) + } + + fn add_headers(&mut self, record: &SequencingRecord) -> Result<()> { + if self.header.has_headers() { + let Some(sheader) = record.s_header else { + return Err(WriteError::ConfigurationMismatch { + attribute: "s_header", + expected: true, + actual: false, + } + .into()); + }; + self.l_headers.push(sheader.len() as u64); + self.headers.extend_from_slice(sheader); + + if self.header.is_paired() { + let Some(xheader) = record.x_header else { + return Err(WriteError::ConfigurationMismatch { + attribute: "x_header", + expected: true, + actual: false, + } + .into()); + }; + self.l_headers.push(xheader.len() as u64); + self.headers.extend_from_slice(xheader); + } + } + Ok(()) + } + + /// Note: this does not check if quality scores are different lengths from sequence + fn add_quality(&mut self, record: &SequencingRecord) -> Result<()> { + if self.header.has_qualities() { + let Some(squal) = record.s_qual() else { + return Err(WriteError::ConfigurationMismatch { + attribute: "s_qual", + expected: true, + actual: false, + } + .into()); + }; + self.qual.extend_from_slice(squal); + + if self.header.is_paired() { + let Some(xqual) = record.x_qual() else { + return Err(WriteError::ConfigurationMismatch { + attribute: "x_qual", + expected: true, + actual: false, + } + .into()); + }; + self.qual.extend_from_slice(xqual); + } + } + Ok(()) + } + + /// Calculate the usage 
of the block as a percentage + #[must_use] + pub fn usage(&self) -> f64 { + self.current_size as f64 / self.header.block_size as f64 + } + + pub(crate) fn can_fit(&self, record: &SequencingRecord<'_>) -> bool { + let configured_size = record.configured_size_cbq( + self.header.is_paired(), + self.header.has_flags(), + self.header.has_headers(), + self.header.has_qualities(), + ); + self.current_size + configured_size <= self.header.block_size as usize + } + + pub(crate) fn can_ingest(&self, other: &Self) -> bool { + self.current_size + other.current_size <= self.header.block_size as usize + } + + /// Ensure that the record can be pushed into the block + fn validate_record(&self, record: &SequencingRecord) -> Result<()> { + let configured_size = record.configured_size_cbq( + self.header.is_paired(), + self.header.has_flags(), + self.header.has_headers(), + self.header.has_qualities(), + ); + + if !self.can_fit(record) { + if configured_size > self.header.block_size as usize { + return Err(WriteError::RecordSizeExceedsMaximumBlockSize( + configured_size, + self.header.block_size as usize, + ) + .into()); + } + return Err(CbqError::BlockFull { + current_size: self.current_size, + record_size: configured_size, + block_size: self.header.block_size as usize, + } + .into()); + } + + // Check paired status - writer can require paired (record must have R2), + // but if writer is single-end, we simply ignore any R2 data in the record. + if self.header.is_paired() && !record.is_paired() { + return Err(WriteError::ConfigurationMismatch { + attribute: "paired", + expected: self.header.is_paired(), + actual: record.is_paired(), + } + .into()); + } + + // For flags, headers, and qualities: the writer can require them (record must have them), + // but if the writer doesn't need them, we simply ignore any extra data in the record. 
+ if self.header.has_flags() && !record.has_flags() { + return Err(WriteError::ConfigurationMismatch { + attribute: "flags", + expected: self.header.has_flags(), + actual: record.has_flags(), + } + .into()); + } + + if self.header.has_headers() && !record.has_headers() { + return Err(WriteError::ConfigurationMismatch { + attribute: "headers", + expected: self.header.has_headers(), + actual: record.has_headers(), + } + .into()); + } + + if self.header.has_qualities() && !record.has_qualities() { + return Err(WriteError::ConfigurationMismatch { + attribute: "qualities", + expected: self.header.has_qualities(), + actual: record.has_qualities(), + } + .into()); + } + Ok(()) + } + + pub fn push(&mut self, record: SequencingRecord) -> Result<()> { + self.validate_record(&record)?; + + let configured_size = record.configured_size_cbq( + self.header.is_paired(), + self.header.has_flags(), + self.header.has_headers(), + self.header.has_qualities(), + ); + + self.add_sequence(&record)?; + self.add_flag(&record)?; + self.add_headers(&record)?; + self.add_quality(&record)?; + self.current_size += configured_size; + self.num_records += 1; + + Ok(()) + } + + /// Returns the expected length of the encoded sequence buffer + /// + /// This is deterministically calculated based on the sequence length and the encoding scheme. 
+ fn ebuf_len(&self) -> usize { + self.nuclen.div_ceil(32) + } + + /// Encode the sequence into a compressed representation + fn encode_sequence(&mut self) -> Result<()> { + bitnuc::twobit::encode_with_invalid(&self.seq, &mut self.ebuf)?; + Ok(()) + } + + /// Find all positions of 'N' in the sequence + fn fill_npos(&mut self) -> Result<()> { + self.npos + .extend(memchr::memchr_iter(b'N', &self.seq).map(|i| i as u64)); + self.num_npos = self.npos.len(); + + // build Elias-Fano encoding for N positions + if self.npos.is_empty() { + self.ef = None; + Ok(()) + } else { + let mut ef_builder = EliasFanoBuilder::new(self.seq.len(), self.npos.len())?; + ef_builder.extend(self.npos.iter().map(|idx| *idx as usize))?; + let ef = ef_builder.build(); + + self.ef = Some(ef); + Ok(()) + } + } + + /// Convert all ambiguous bases back to N + fn backfill_npos(&mut self) { + if let Some(ef) = self.ef.as_ref() { + ef.iter(0).for_each(|idx| { + if let Some(base) = self.seq.get_mut(idx) { + *base = b'N'; + } + }); + } + } + + /// Compress all native columns into compressed representation + fn compress_columns(&mut self, cctx: &mut zstd_safe::CCtx) -> Result<()> { + // compress sequence lengths + + sized_compress(&mut self.z_seq_len, cast_slice(&self.l_seq), cctx)?; + + if !self.headers.is_empty() { + sized_compress(&mut self.z_header_len, cast_slice(&self.l_headers), cctx)?; + } + + // compress N-positions (Elias-Fano encoded) + if let Some(ef) = self.ef.as_ref() { + ef.serialize_into(&mut self.ef_bytes)?; + self.len_nef = self.ef_bytes.len(); + sized_compress(&mut self.z_npos, &self.ef_bytes, cctx)?; + } + + // compress sequence + sized_compress(&mut self.z_seq, cast_slice(&self.ebuf), cctx)?; + + // compress flags + if !self.flags.is_empty() { + sized_compress(&mut self.z_flags, cast_slice(&self.flags), cctx)?; + } + + // compress headers + if !self.headers.is_empty() { + sized_compress(&mut self.z_headers, cast_slice(&self.headers), cctx)?; + } + + // compress quality + if 
!self.qual.is_empty() { + sized_compress(&mut self.z_qual, cast_slice(&self.qual), cctx)?; + } + + Ok(()) + } + + /// Decompress all columns back to native representation + pub fn decompress_columns(&mut self) -> Result<()> { + // decompress sequence lengths + { + self.l_seq.resize(self.num_sequences, 0); + copy_decode(self.z_seq_len.as_slice(), cast_slice_mut(&mut self.l_seq))?; + } + + // decompress header lengths + if !self.z_header_len.is_empty() { + self.l_headers.resize(self.num_sequences, 0); + copy_decode( + self.z_header_len.as_slice(), + cast_slice_mut(&mut self.l_headers), + )?; + } + + // decompress npos + if !self.z_npos.is_empty() { + self.ef_bytes.resize(self.len_nef, 0); + copy_decode(self.z_npos.as_slice(), &mut self.ef_bytes)?; + + let ef = EliasFano::deserialize_from(self.ef_bytes.as_slice())?; + self.num_npos = ef.len(); + self.ef = Some(ef); + } + + // decompress sequence + { + self.ebuf.resize(self.ebuf_len(), 0); + copy_decode(self.z_seq.as_slice(), cast_slice_mut(&mut self.ebuf))?; + + bitnuc::twobit::decode(&self.ebuf, self.nuclen, &mut self.seq)?; + self.backfill_npos(); + } + + // decompress flags + if !self.z_flags.is_empty() { + self.flags.resize(self.num_records, 0); + copy_decode(self.z_flags.as_slice(), cast_slice_mut(&mut self.flags))?; + } + + // decompress headers + if !self.z_headers.is_empty() { + copy_decode(self.z_headers.as_slice(), &mut self.headers)?; + } + + // decompress quality scores + if !self.z_qual.is_empty() { + copy_decode(self.z_qual.as_slice(), &mut self.qual)?; + } + + // calculate offsets + { + calculate_offsets(&self.l_seq, &mut self.l_seq_offsets); + calculate_offsets(&self.l_headers, &mut self.l_header_offsets); + } + + Ok(()) + } + + fn write(&mut self, writer: &mut W) -> Result<()> { + writer.write_all(&self.z_seq_len)?; + writer.write_all(&self.z_header_len)?; + writer.write_all(&self.z_npos)?; + writer.write_all(&self.z_seq)?; + writer.write_all(&self.z_flags)?; + writer.write_all(&self.z_headers)?; + 
writer.write_all(&self.z_qual)?; + Ok(()) + } + + pub fn flush_to( + &mut self, + writer: &mut W, + cctx: &mut zstd_safe::CCtx, + ) -> Result> { + if self.is_empty() { + return Ok(None); + } + + // encode all sequences at once + self.encode_sequence()?; + + // fill npos + self.fill_npos()?; + + // compress each column + self.compress_columns(cctx)?; + + // build the block header + let header = BlockHeader::from_block(self); + // eprintln!("{header:?}"); + + // write the block header + header.write(writer)?; + + // write the internal state to the inner writer + self.write(writer)?; + + // clear the internal state + self.clear(); + + Ok(Some(header)) + } + + pub fn read_from(&mut self, reader: &mut R, header: BlockHeader) -> Result<()> { + // clears the internal state + self.clear(); + + // reload the internal state from the reader + self.nuclen = header.nuclen as usize; + self.num_records = header.num_records as usize; + self.num_sequences = header.num_sequences as usize; + self.len_nef = header.len_nef as usize; + + extension_read(reader, &mut self.z_seq_len, header.len_z_seq_len as usize)?; + extension_read( + reader, + &mut self.z_header_len, + header.len_z_header_len as usize, + )?; + extension_read(reader, &mut self.z_npos, header.len_z_npos as usize)?; + extension_read(reader, &mut self.z_seq, header.len_z_seq as usize)?; + extension_read(reader, &mut self.z_flags, header.len_z_flags as usize)?; + extension_read(reader, &mut self.z_headers, header.len_z_headers as usize)?; + extension_read(reader, &mut self.z_qual, header.len_z_qual as usize)?; + Ok(()) + } + + pub fn decompress_from_bytes( + &mut self, + bytes: &[u8], + header: BlockHeader, + dctx: &mut zstd_safe::DCtx, + ) -> Result<()> { + // clears the internal state + self.clear(); + + // reload the internal state from the header + self.nuclen = header.nuclen as usize; + self.num_records = header.num_records as usize; + self.num_sequences = header.num_sequences as usize; + self.len_nef = header.len_nef as 
usize; + + let mut byte_offset = 0; + + // decompress sequence lengths + { + resize_uninit(&mut self.l_seq, self.num_sequences); + dctx.decompress( + cast_slice_mut(&mut self.l_seq), + slice_and_increment(&mut byte_offset, header.len_z_seq_len, bytes), + ) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + + // update default quality score buffer size + self.l_seq.iter().for_each(|len| { + if *len as usize > self.qbuf.len() { + self.qbuf.resize(*len as usize, self.default_quality_score); + } + }); + } + + // decompress header lengths + if header.len_z_header_len > 0 { + resize_uninit(&mut self.l_headers, self.num_sequences); + dctx.decompress( + cast_slice_mut(&mut self.l_headers), + slice_and_increment(&mut byte_offset, header.len_z_header_len, bytes), + ) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + } + + // calculate offsets + { + calculate_offsets(&self.l_seq, &mut self.l_seq_offsets); + calculate_offsets(&self.l_headers, &mut self.l_header_offsets); + } + + // decompress npos + if header.len_z_npos > 0 { + resize_uninit(&mut self.ef_bytes, self.len_nef); + dctx.decompress( + &mut self.ef_bytes, + slice_and_increment(&mut byte_offset, header.len_z_npos, bytes), + ) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + + // reinitialize the EliasFano encoding + let ef = EliasFano::deserialize_from(self.ef_bytes.as_slice())?; + self.num_npos = ef.len(); + self.ef = Some(ef); + } + + // decompress sequence + { + let ebuf_len = self.ebuf_len(); + resize_uninit(&mut self.ebuf, ebuf_len); + dctx.decompress( + cast_slice_mut(&mut self.ebuf), + slice_and_increment(&mut byte_offset, header.len_z_seq, bytes), + ) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + + bitnuc::twobit::decode(&self.ebuf, self.nuclen, &mut self.seq)?; + self.backfill_npos(); + } + + // decompress flags + if header.len_z_flags > 0 { + resize_uninit(&mut self.flags, self.num_records); + dctx.decompress( + cast_slice_mut(&mut 
self.flags), + slice_and_increment(&mut byte_offset, header.len_z_flags, bytes), + ) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + } + + // decompress headers + if header.len_z_headers > 0 { + let headers_len = (self.l_header_offsets.last().copied().unwrap_or(0) + + self.l_headers.last().copied().unwrap_or(0)) + as usize; + resize_uninit(&mut self.headers, headers_len); + dctx.decompress( + &mut self.headers, + slice_and_increment(&mut byte_offset, header.len_z_headers, bytes), + ) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + } + + // decompress quality scores + if header.len_z_qual > 0 { + resize_uninit(&mut self.qual, self.nuclen); + dctx.decompress( + &mut self.qual, + slice_and_increment(&mut byte_offset, header.len_z_qual, bytes), + ) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + } + + Ok(()) + } + + pub(crate) fn take_incomplete(&mut self, other: &Self) -> Result<()> { + if !self.can_ingest(other) { + return Err(CbqError::CannotIngestBlock { + self_block_size: self.header.block_size as usize, + other_block_size: other.header.block_size as usize, + } + .into()); + } + + // increment attributes + { + self.nuclen += other.nuclen; + self.num_records += other.num_records; + self.num_sequences += other.num_sequences; + self.current_size += other.current_size; + } + + // extend data + { + self.seq.extend_from_slice(&other.seq); + self.flags.extend_from_slice(&other.flags); + self.headers.extend_from_slice(&other.headers); + self.qual.extend_from_slice(&other.qual); + self.l_seq.extend_from_slice(&other.l_seq); + self.l_headers.extend_from_slice(&other.l_headers); + } + + { + // Note: + // + // Remaining buffers and attributes are left untouched. + // These are not modified because they aren't used mid-writing + // and are populated during the flush step. 
+ } + + Ok(()) + } + + #[must_use] + pub fn iter_records(&self, range: BlockRange) -> RefRecordIter<'_> { + RefRecordIter { + block: self, + range, + qbuf: &self.qbuf, + index: 0, + is_paired: self.header.is_paired(), + has_headers: self.header.has_headers(), + header_buffer: itoa::Buffer::new(), + } + } +} + +/// A zero-copy iterator over [`RefRecord`](crate::cbq::RefRecord)s in a [`ColumnarBlock`](crate::cbq::ColumnarBlock) +pub struct RefRecordIter<'a> { + /// The block containing the records + block: &'a ColumnarBlock, + + /// The record range of this block + range: BlockRange, + + /// Record index within the block + index: usize, + + /// Convenience attribute if block is paired + is_paired: bool, + + /// Convenience attribute if block has headers + has_headers: bool, + + /// Preallocated buffer for quality scores + qbuf: &'a [u8], + + /// Preallocated itoa buffer for converting global record index to string + header_buffer: itoa::Buffer, +} +impl<'a> Iterator for RefRecordIter<'a> { + type Item = RefRecord<'a>; + + fn next(&mut self) -> Option { + if self.index >= self.block.num_records { + None + } else { + // Calculate the actual array index + let seq_idx = if self.is_paired { + self.index * 2 + } else { + self.index + }; + + let sseq_span = + Span::new_u64(self.block.l_seq_offsets[seq_idx], self.block.l_seq[seq_idx]); + let sheader_span = if self.has_headers { + Some(Span::new_u64( + self.block.l_header_offsets[seq_idx], + self.block.l_headers[seq_idx], + )) + } else { + None + }; + let xseq_span = if self.is_paired { + Some(Span::new_u64( + self.block.l_seq_offsets[seq_idx + 1], + self.block.l_seq[seq_idx + 1], + )) + } else { + None + }; + let xheader_span = if self.is_paired && self.has_headers { + Some(Span::new_u64( + self.block.l_header_offsets[seq_idx + 1], + self.block.l_headers[seq_idx + 1], + )) + } else { + None + }; + + let global_index = + self.range.cumulative_records as usize - self.block.num_records + self.index; + + let rr_index = 
RefRecordIndex::new(global_index, &mut self.header_buffer); + + let record = RefRecord { + block: self.block, + index: self.index, + qbuf: self.qbuf, + global_index, + sseq_span, + sheader_span, + xseq_span, + xheader_span, + rr_index, + }; + + self.index += 1; + Some(record) + } + } +} + +/// A convenience struct for creating global indices as `&[u8]` buffers +#[derive(Clone, Copy)] +struct RefRecordIndex { + index_buf: [u8; 20], + index_len: usize, +} +impl RefRecordIndex { + fn new(index: usize, itoa_buf: &mut itoa::Buffer) -> Self { + let mut index_buf = [0u8; 20]; + let header_str = itoa_buf.format(index); + let index_len = header_str.len(); + index_buf[..index_len].copy_from_slice(header_str.as_bytes()); + Self { + index_buf, + index_len, + } + } + + fn as_bytes(&self) -> &[u8] { + &self.index_buf[..self.index_len] + } +} + +/// A reference to a record in a [`ColumnarBlock`](crate::cbq::ColumnarBlock) that implements the [`BinseqRecord`](crate::BinseqRecord) trait +#[derive(Clone, Copy)] +pub struct RefRecord<'a> { + /// A reference to the block containing this record + block: &'a ColumnarBlock, + + /// Preallocated buffer for quality scores + qbuf: &'a [u8], + + /// Local index of this record within the block + index: usize, + + /// Global index of this record in the file + global_index: usize, + + /// Span of the primary sequence within the block + sseq_span: Span, + + /// Span of the extended sequence within the block + xseq_span: Option, + + /// Span of the primary header within the block + sheader_span: Option, + + /// Span of the extended header within the block + xheader_span: Option, + + /// A buffer to the name of this record when not storing headers + rr_index: RefRecordIndex, +} +impl BinseqRecord for RefRecord<'_> { + fn bitsize(&self) -> BitSize { + BitSize::Two + } + + fn index(&self) -> u64 { + self.global_index as u64 + } + + fn flag(&self) -> Option { + self.block.flags.get(self.index).copied() + } + + fn is_paired(&self) -> bool { + 
self.xseq_span.is_some() + } + + fn sheader(&self) -> &[u8] { + if let Some(span) = self.sheader_span { + &self.block.headers[span.range()] + } else { + self.rr_index.as_bytes() + } + } + + fn xheader(&self) -> &[u8] { + if let Some(span) = self.xheader_span { + &self.block.headers[span.range()] + } else { + self.rr_index.as_bytes() + } + } + + fn sbuf(&self) -> &[u64] { + unimplemented!("sbuf is not implemented for cbq") + } + + fn xbuf(&self) -> &[u64] { + unimplemented!("xbuf is not implemented for cbq") + } + + fn slen(&self) -> u64 { + self.sseq_span.len() as u64 + } + + fn xlen(&self) -> u64 { + self.xseq_span.map_or(0, |span| span.len() as u64) + } + + fn decode_s(&self, buf: &mut Vec) -> crate::Result<()> { + buf.extend_from_slice(self.sseq()); + Ok(()) + } + + fn decode_x(&self, buf: &mut Vec) -> crate::Result<()> { + buf.extend_from_slice(self.xseq()); + Ok(()) + } + + fn sseq(&self) -> &[u8] { + &self.block.seq[self.sseq_span.range()] + } + + fn xseq(&self) -> &[u8] { + self.xseq_span + .map_or(&[], |span| &self.block.seq[span.range()]) + } + + fn has_quality(&self) -> bool { + self.block.header.has_qualities() + } + + fn squal(&self) -> &[u8] { + if self.has_quality() { + &self.block.qual[self.sseq_span.range()] + } else { + &self.qbuf[..self.slen() as usize] + } + } + + fn xqual(&self) -> &[u8] { + if self.has_quality() + && let Some(span) = self.xseq_span + { + &self.block.qual[span.range()] + } else { + &self.qbuf[..self.xlen() as usize] + } + } +} diff --git a/src/cbq/core/block_header.rs b/src/cbq/core/block_header.rs new file mode 100644 index 0000000..a63e9e7 --- /dev/null +++ b/src/cbq/core/block_header.rs @@ -0,0 +1,92 @@ +use std::io; + +use bytemuck::{Pod, Zeroable}; + +use crate::{IntoBinseqError, Result, error::CbqError}; + +use super::{BLOCK_MAGIC, ColumnarBlock}; + +/// A block header for a [`ColumnarBlock`](crate::cbq::ColumnarBlock) +/// +/// This is stored identically in memory and on disk. 
+#[derive(Copy, Clone, Pod, Zeroable, Debug, PartialEq, Eq, Hash)] +#[repr(C)] +pub struct BlockHeader { + magic: [u8; 3], + version: u8, + padding: [u8; 4], + + // length of compressed columns + pub(crate) len_z_seq_len: u64, + pub(crate) len_z_header_len: u64, + pub(crate) len_z_npos: u64, + pub(crate) len_z_seq: u64, + pub(crate) len_z_flags: u64, + pub(crate) len_z_headers: u64, + pub(crate) len_z_qual: u64, + + // full decoded length of the sequence block + pub(crate) nuclen: u64, + + // length of uncompressed N-positions (Elias-Fano encoded) + pub(crate) len_nef: u64, + + /// number of records in the block + pub num_records: u64, + + /// Number of sequences in the block + pub num_sequences: u64, +} +impl BlockHeader { + #[must_use] + pub fn from_block(block: &ColumnarBlock) -> Self { + Self { + magic: *BLOCK_MAGIC, + version: 1, + padding: [42; 4], + len_z_seq_len: block.z_seq_len.len() as u64, + len_z_header_len: block.z_header_len.len() as u64, + len_z_npos: block.z_npos.len() as u64, + len_z_seq: block.z_seq.len() as u64, + len_z_flags: block.z_flags.len() as u64, + len_z_headers: block.z_headers.len() as u64, + len_z_qual: block.z_qual.len() as u64, + nuclen: block.nuclen as u64, + len_nef: block.len_nef as u64, + num_records: block.num_records as u64, + num_sequences: block.num_sequences as u64, + } + } + + /// Calculate the length of the block in bytes. 
+ #[allow(dead_code)] + #[must_use] + pub fn block_len(&self) -> usize { + (self.len_z_seq_len + + self.len_z_header_len + + self.len_z_npos + + self.len_z_seq + + self.len_z_flags + + self.len_z_headers + + self.len_z_qual) as usize + } + + #[must_use] + pub fn as_bytes(&self) -> &[u8] { + bytemuck::bytes_of(self) + } + + pub fn from_bytes(bytes: &[u8]) -> Result { + let header: Self = *bytemuck::from_bytes(bytes); + if header.magic != *BLOCK_MAGIC { + return Err(CbqError::InvalidBlockHeaderMagic.into()); + } + Ok(header) + } + + pub fn write(&self, writer: &mut W) -> Result<()> { + writer + .write_all(self.as_bytes()) + .map_err(IntoBinseqError::into_binseq_error) + } +} diff --git a/src/cbq/core/header.rs b/src/cbq/core/header.rs new file mode 100644 index 0000000..287d197 --- /dev/null +++ b/src/cbq/core/header.rs @@ -0,0 +1,224 @@ +use std::fmt::Display; + +use bytemuck::{Pod, Zeroable}; + +use crate::{Result, error::CbqError}; + +use super::{DEFAULT_BLOCK_SIZE, DEFAULT_COMPRESSION_LEVEL, FILE_MAGIC, FILE_VERSION}; + +/// Records are paired +pub const PRESENCE_PAIRED: u64 = 1 << 0; + +/// Records have quality scores +pub const PRESENCE_QUALITIES: u64 = 1 << 1; + +/// Records have headers +pub const PRESENCE_HEADERS: u64 = 1 << 2; + +/// Records have flags +pub const PRESENCE_FLAGS: u64 = 1 << 3; + +/// The file header for a CBQ file. +/// +/// This is stored identically in memory and on disk. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Zeroable, Pod)] +#[repr(C)] +pub struct FileHeader { + // File Type Metadata (8 bytes) + /// File magic number + magic: [u8; 7], + /// File version number + pub version: u8, + + // Data presence flags (8 bytes) + /// A bitfield indicating which data fields are present in the file + pub presence_flags: u64, + + // Configuration (16 bytes) + /// compression level + pub compression_level: u64, + /// block size in bytes + pub block_size: u64, + + /// Reserved for future use + reserved: [u8; 32], +} +impl Default for FileHeader { + fn default() -> Self { + let mut header = Self { + magic: *FILE_MAGIC, + version: FILE_VERSION, + presence_flags: 0, + compression_level: DEFAULT_COMPRESSION_LEVEL, + block_size: DEFAULT_BLOCK_SIZE, + reserved: [0; 32], + }; + header.set_headers(); + header.set_qualities(); + header + } +} + +/// Flag getters and setters +impl FileHeader { + pub fn set_paired(&mut self) { + self.presence_flags |= PRESENCE_PAIRED; + } + pub fn set_qualities(&mut self) { + self.presence_flags |= PRESENCE_QUALITIES; + } + pub fn set_headers(&mut self) { + self.presence_flags |= PRESENCE_HEADERS; + } + pub fn set_flags(&mut self) { + self.presence_flags |= PRESENCE_FLAGS; + } + + #[inline] + #[must_use] + pub fn is_paired(&self) -> bool { + self.presence_flags & PRESENCE_PAIRED != 0 + } + #[inline] + #[must_use] + pub fn has_qualities(&self) -> bool { + self.presence_flags & PRESENCE_QUALITIES != 0 + } + #[inline] + #[must_use] + pub fn has_headers(&self) -> bool { + self.presence_flags & PRESENCE_HEADERS != 0 + } + #[inline] + #[must_use] + pub fn has_flags(&self) -> bool { + self.presence_flags & PRESENCE_FLAGS != 0 + } +} + +impl Display for FileHeader { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "CBQ {{ version: {}, paired: {}, qualities: {}, headers: {}, flags: {}, block_size: {}, compression: {} }}", + self.version, + self.is_paired(), + self.has_qualities(), + 
self.has_headers(), + self.has_flags(), + self.block_size, + self.compression_level, + ) + } +} + +impl FileHeader { + #[must_use] + pub fn as_bytes(&self) -> &[u8] { + bytemuck::bytes_of(self) + } + + pub fn from_bytes(bytes: &[u8]) -> Result { + let header: Self = *bytemuck::from_bytes(bytes); + if header.magic != *FILE_MAGIC { + return Err(CbqError::InvalidFileHeaderMagic.into()); + } + Ok(header) + } +} + +/// A convenience struct for building a [`FileHeader`](crate::cbq::FileHeader) using a builder pattern. +#[derive(Default)] +pub struct FileHeaderBuilder { + compression_level: Option, + block_size: Option, + is_paired: Option, + with_headers: Option, + with_flags: Option, + with_qualities: Option, +} + +impl FileHeaderBuilder { + pub fn with_compression_level(&mut self, compression_level: usize) -> &mut Self { + self.compression_level = Some(compression_level); + self + } + + pub fn with_optional_compression_level( + &mut self, + compression_level: Option, + ) -> &mut Self { + self.compression_level = compression_level; + self + } + + pub fn with_block_size(&mut self, block_size: usize) -> &mut Self { + self.block_size = Some(block_size); + self + } + + pub fn with_optional_block_size(&mut self, block_size: Option) -> &mut Self { + self.block_size = block_size; + self + } + + pub fn is_paired(&mut self, is_paired: bool) -> &mut Self { + self.is_paired = Some(is_paired); + self + } + + pub fn with_flags(&mut self, with_flags: bool) -> &mut Self { + self.with_flags = Some(with_flags); + self + } + + pub fn with_headers(&mut self, with_headers: bool) -> &mut Self { + self.with_headers = Some(with_headers); + self + } + + pub fn with_qualities(&mut self, with_qualities: bool) -> &mut Self { + self.with_qualities = Some(with_qualities); + self + } + + #[must_use] + pub fn build(&self) -> FileHeader { + let mut header = FileHeader { + magic: *FILE_MAGIC, + version: FILE_VERSION, + compression_level: self + .compression_level + .map_or(DEFAULT_COMPRESSION_LEVEL, 
|level| level as u64), + block_size: self + .block_size + .map_or(DEFAULT_BLOCK_SIZE, |size| size as u64), + presence_flags: 0, + reserved: [0; 32], + }; + + // default to unpaired + if let Some(true) = self.is_paired { + header.set_paired(); + } + + // default to using headers + match self.with_headers { + Some(false) => {} + _ => header.set_headers(), + } + + // default to not using flags + if let Some(true) = self.with_flags { + header.set_flags(); + } + + // default to using qualities + match self.with_qualities { + Some(false) => {} + _ => header.set_qualities(), + } + + header + } +} diff --git a/src/cbq/core/index.rs b/src/cbq/core/index.rs new file mode 100644 index 0000000..b70dad7 --- /dev/null +++ b/src/cbq/core/index.rs @@ -0,0 +1,216 @@ +use bytemuck::{Pod, Zeroable}; +use zstd::stream::copy_encode; + +use crate::{Result, error::CbqError}; + +use super::{BlockHeader, FileHeader, INDEX_MAGIC}; + +/// The header for a compressed index. +/// +/// This is stored identically in memory and on disk. +#[derive(Debug, Clone, Copy, Zeroable, Pod)] +#[repr(C)] +pub struct IndexHeader { + /// Magic number identifying the index format + magic: [u8; 8], + + /// Number of bytes in the uncompressed index + pub(crate) u_bytes: u64, + + /// Number of bytes in the compressed index + pub(crate) z_bytes: u64, +} +impl IndexHeader { + /// Creates a new index header + #[must_use] + pub fn new(u_bytes: u64, z_bytes: u64) -> Self { + Self { + magic: *INDEX_MAGIC, + u_bytes, + z_bytes, + } + } + + #[must_use] + pub fn as_bytes(&self) -> &[u8] { + bytemuck::bytes_of(self) + } + + pub fn from_bytes(bytes: &[u8]) -> Result { + let header: Self = *bytemuck::from_bytes(bytes); + if header.magic != *INDEX_MAGIC { + return Err(CbqError::InvalidIndexHeaderMagic.into()); + } + Ok(header) + } +} + +/// The footer for a compressed index. +/// +/// This is stored identically in memory and on disk. 
+#[derive(Debug, Clone, Copy, Zeroable, Pod)] +#[repr(C)] +pub struct IndexFooter { + /// Number of bytes in the compressed index + pub(crate) bytes: u64, + + /// Magic number identifying the index format + magic: [u8; 8], +} + +impl IndexFooter { + /// Creates a new index footer + #[must_use] + pub fn new(bytes: u64) -> Self { + Self { + bytes, + magic: *INDEX_MAGIC, + } + } + #[must_use] + pub fn as_bytes(&self) -> &[u8] { + bytemuck::bytes_of(self) + } + pub fn from_bytes(bytes: &[u8]) -> Result { + let footer: Self = *bytemuck::from_bytes(bytes); + if footer.magic != *INDEX_MAGIC { + return Err(CbqError::InvalidIndexFooterMagic.into()); + } + Ok(footer) + } +} + +/// An index of block ranges for quick lookups +#[derive(Clone)] +pub struct Index { + ranges: Vec, +} +impl Index { + /// Builds the index from a list of block headers + #[must_use] + pub fn from_block_headers(block_headers: &[BlockHeader]) -> Self { + let mut offset = size_of::() as u64; + let mut cumulative_records = 0; + let mut ranges = Vec::default(); + for block_header in block_headers { + let range = BlockRange::new(offset, cumulative_records + block_header.num_records); + offset += (size_of::() + block_header.block_len()) as u64; + cumulative_records += block_header.num_records; + ranges.push(range); + } + Self { ranges } + } + + /// Returns the byte representation of the index + #[must_use] + pub fn as_bytes(&self) -> &[u8] { + bytemuck::cast_slice(&self.ranges) + } + + /// Builds the index from a byte slice + pub fn from_bytes(bytes: &[u8]) -> Result { + let ranges = match bytemuck::try_cast_slice(bytes) { + Ok(ranges) => ranges.to_vec(), + Err(_) => return Err(CbqError::IndexCastingError.into()), + }; + Ok(Self { ranges }) + } + + /// Returns the size of the index in bytes + #[must_use] + pub fn size(&self) -> u64 { + self.as_bytes().len() as u64 + } + + /// Encodes the index into a ZSTD-compressed byte array + pub fn encoded(&self) -> Result> { + let mut encoded = Vec::default(); + 
copy_encode(self.as_bytes(), &mut encoded, 0)?; + Ok(encoded) + } + + /// Returns the number of records in the index + #[must_use] + pub fn num_records(&self) -> usize { + self.ranges + .last() + .map_or(0, |range| range.cumulative_records as usize) + } + + /// Returns the number of blocks in the index + #[must_use] + pub fn num_blocks(&self) -> usize { + self.ranges.len() + } + + #[must_use] + pub fn iter_blocks(&self) -> BlockIter<'_> { + BlockIter { + index: self, + pos: 0, + } + } + + #[must_use] + pub fn average_block_size(&self) -> f64 { + let mut block_iter = self.iter_blocks(); + let Some(mut last_block) = block_iter.next() else { + return 0.0; + }; + let mut total_size = 0.0; + let mut count = 0; + for block in block_iter { + let last_block_size = block.offset - last_block.offset; + total_size += last_block_size as f64; + count += 1; + last_block = block; + } + total_size / f64::from(count) + } + + pub fn pprint(&self) { + for block in self.iter_blocks() { + println!("{block:?}"); + } + } +} + +pub struct BlockIter<'a> { + index: &'a Index, + pos: usize, +} +impl Iterator for BlockIter<'_> { + type Item = BlockRange; + + fn next(&mut self) -> Option { + if self.pos >= self.index.num_blocks() { + None + } else { + let block = self.index.ranges[self.pos]; + self.pos += 1; + Some(block) + } + } +} + +/// A struct representing a block range in a CBQ file and stored in the [`Index`](crate::cbq::Index) +/// +/// This is stored identically in memory and on disk. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Zeroable, Pod, Default)] +#[repr(C)] +pub struct BlockRange { + /// Byte offset of this block + pub(crate) offset: u64, + + /// Number of records up to and including this block + pub(crate) cumulative_records: u64, +} +impl BlockRange { + #[must_use] + pub fn new(offset: u64, cumulative_records: u64) -> Self { + Self { + offset, + cumulative_records, + } + } +} diff --git a/src/cbq/core/mod.rs b/src/cbq/core/mod.rs new file mode 100644 index 0000000..0a66ab8 --- /dev/null +++ b/src/cbq/core/mod.rs @@ -0,0 +1,15 @@ +mod block; +mod block_header; +mod header; +mod index; +pub(crate) mod utils; + +pub use block::{ColumnarBlock, RefRecord, RefRecordIter}; +pub use block_header::BlockHeader; +pub use header::{FileHeader, FileHeaderBuilder}; +pub use index::{BlockRange, Index, IndexFooter, IndexHeader}; + +use super::{ + BLOCK_MAGIC, DEFAULT_BLOCK_SIZE, DEFAULT_COMPRESSION_LEVEL, FILE_MAGIC, FILE_VERSION, + INDEX_MAGIC, +}; diff --git a/src/cbq/core/utils.rs b/src/cbq/core/utils.rs new file mode 100644 index 0000000..eb7503b --- /dev/null +++ b/src/cbq/core/utils.rs @@ -0,0 +1,104 @@ +use std::io; + +use zstd::zstd_safe; + +use crate::Result; + +pub(crate) fn sized_compress( + dst: &mut Vec, + src: &[u8], + cctx: &mut zstd_safe::CCtx, +) -> Result<()> { + // determine the maximum compressed size + let max_z_size = zstd_safe::compress_bound(src.len()); + + // resize the destination vector to the maximum compressed size + // + // Note: this uses uninitialized memory, but is safe because we immediately + // follow it with a call to `compress` which overwrites the buffer. 
+ resize_uninit(dst, max_z_size); + + // Compress the data using the provided compression context + let true_size = cctx + .compress2(dst, src) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + + // resize to the true size - clipping all remaining uninitialized memory + dst.truncate(true_size); + + Ok(()) +} + +pub(crate) fn extension_read( + reader: &mut R, + dst: &mut Vec, + size: usize, +) -> Result<()> { + dst.resize(size, 0); + reader.read_exact(dst)?; + Ok(()) +} + +pub(crate) fn slice_and_increment<'a>(offset: &mut usize, len: u64, bytes: &'a [u8]) -> &'a [u8] { + let slice = &bytes[*offset..*offset + len as usize]; + *offset += len as usize; + slice +} + +/// Resize a vector to the target length without initializing new elements. +/// +/// # Safety +/// The caller must ensure that all elements in the range [`old_len..new_len`] +/// are initialized before reading them. This is safe when immediately followed +/// by operations that write to the entire buffer (e.g., decompression). 
+#[inline] +#[allow(clippy::uninit_vec)] +pub(crate) fn resize_uninit(vec: &mut Vec, new_len: usize) { + match new_len.cmp(&vec.len()) { + std::cmp::Ordering::Greater => { + // Growing: reserve and set length (unsafe but fast) + vec.reserve(new_len - vec.len()); + unsafe { + vec.set_len(new_len); + } + } + std::cmp::Ordering::Less => { + // Shrinking: truncate (safe and fast) + vec.truncate(new_len); + } + std::cmp::Ordering::Equal => { + // Same size: do nothing + } + } +} + +pub(crate) fn calculate_offsets(values: &[u64], offsets: &mut Vec) { + offsets.clear(); + offsets.push(0); + for i in 1..values.len() { + offsets.push(offsets[i - 1] + values[i - 1]); + } +} + +#[derive(Clone, Copy, Debug)] +pub struct Span { + offset: usize, + length: usize, +} +impl Span { + pub fn new(offset: usize, length: usize) -> Self { + Span { offset, length } + } + + pub fn new_u64(offset: u64, length: u64) -> Self { + Span::new(offset as usize, length as usize) + } + + pub fn range(&self) -> std::ops::Range { + self.offset..self.offset + self.length + } + + pub fn len(&self) -> usize { + self.length + } +} diff --git a/src/cbq/mod.rs b/src/cbq/mod.rs new file mode 100644 index 0000000..f59b90a --- /dev/null +++ b/src/cbq/mod.rs @@ -0,0 +1,102 @@ +//! # CBQ Format +//! +//! CBQ is a high-performance binary format built around blocked columnar storage. +//! It optimizes for storage efficiency and parallel processing of records. +//! +//! ## Overview +//! +//! CBQ was built to solve the rough edges of VBQ. +//! It keeps the blocked structure of VBQ, but instead of interleaving the internal data of all records in the block, it stores each attribute in a separate column. +//! Each of these columns are then ZSTD compressed and optionally decoded when reading. +//! +//! It was built to be performant, efficient, and lossless by default. +//! +//! This has a few benefits and advantages over VBQ: +//! +//! 1. Better compression ratios for each individual attribute. +//! 2. 
Significantly faster throughput for reading (easier decompression + pay-per-use decompression). +//! 3. Simple record parsing and manipulation. +//! +//! Notably this format *only* performs two-bit encoding of sequences. +//! However, it tracks the positions of all ambiguous nucleotides (`N`) within the sequence. +//! When it is decoded and the two-bit encoded sequence is decoded back to nucleotides, the `N` positions are backfilled with `N`. +//! +//! To make use of the sparse-but-clustered nature of the `N`-positions, we make use of an Elias-Fano encoding of the `N`-positions. +//! This encoding is then used to efficiently store and retrieve the positions of `N`s within the sequence. +//! +//! ## File Structure +//! +//! A CBQ file consists of a [`FileHeader`](cbq::FileHeader), followed by record blocks and an embedded [`Index`](cbq::Index). +//! Each record block is composed of a [`BlockHeader`](cbq::BlockHeader) which provides metadata about the block, and a [`ColumnarBlock`](cbq::ColumnarBlock) containing the actual data. +//! +//! The [`IndexHeader`](cbq::IndexHeader) and [`IndexFooter`](cbq::IndexFooter) are used to locate and access the data within the file when reading as memory mapped. +//! +//! ```text +//! ┌───────────────────┐ +//! │ File Header │ 64 bytes +//! ├───────────────────┤ +//! │ Block Header │ 96 bytes +//! ├───────────────────┤ +//! │ │ +//! │ Block Records │ Variable size +//! │ │ +//! ├───────────────────┤ +//! │ ... │ More blocks +//! ├───────────────────┤ +//! │ Index Header │ 24 bytes +//! ├───────────────────┤ +//! │ Compressed Index │ Variable size +//! ├───────────────────┤ +//! │ Index Footer │ 16 bytes +//! └───────────────────┘ +//! ``` +//! +//! ## Block Format +//! +//! The blocks on-disk are stored as ZSTD compressed data. +//! Each column is ZSTD compressed and stored contiguously next to each other. +//! +//! 
The [BlockHeader](cbq::BlockHeader) contains the compressed sizes of each of the columns as well as the relevant information for their uncompressed sizes. +//! +//! ```text +//! [BlockHeader][col1][col2][col3]...[BlockHeader][col1][col2][col3]... +//! ``` +//! +//! The order of columns in the block is as follows: +//! +//! 1. `z_seq_len` - sequence lengths +//! 2. `z_header_len` - header lengths (optional) +//! 3. `z_npos` - Elias-Fano encoded positions of N's (optional) +//! 4. `z_seq` - sequence data (2-bit encoded) +//! 5. `z_flags` - flags (optional) +//! 6. `z_headers` - sequence headers (optional) +//! 7. `z_qual` - sequence quality scores (optional) + +mod core; +mod read; +mod write; + +pub use core::{ + BlockHeader, BlockRange, ColumnarBlock, FileHeader, FileHeaderBuilder, Index, IndexFooter, + IndexHeader, RefRecord, RefRecordIter, +}; +pub use read::{MmapReader, Reader}; +pub use write::ColumnarBlockWriter; + +/// The magic number for CBQ files. +pub const FILE_MAGIC: &[u8; 7] = b"CBQFILE"; + +/// The magic number for CBQ blocks. +pub const BLOCK_MAGIC: &[u8; 3] = b"BLK"; + +/// The magic number for CBQ index files. +pub const INDEX_MAGIC: &[u8; 8] = b"CBQINDEX"; + +/// The current file version. +pub const FILE_VERSION: u8 = 1; + +/// The default block size. +pub const DEFAULT_BLOCK_SIZE: u64 = 1024 * 1024; + +/// The default compression level. +pub const DEFAULT_COMPRESSION_LEVEL: u64 = 0; diff --git a/src/cbq/read.rs b/src/cbq/read.rs new file mode 100644 index 0000000..6bdaf9f --- /dev/null +++ b/src/cbq/read.rs @@ -0,0 +1,636 @@ +use std::{fs, io, path::Path, sync::Arc, thread}; + +use memmap2::Mmap; +use zstd::{stream::copy_decode, zstd_safe}; + +use crate::{ + BinseqRecord, ParallelProcessor, ParallelReader, Result, + cbq::core::{ + BlockHeader, BlockRange, ColumnarBlock, FileHeader, Index, IndexFooter, IndexHeader, + }, +}; + +/// A reader for CBQ files operating on generic readers (streaming). 
+pub struct Reader { + inner: R, + pub block: ColumnarBlock, + iheader: Option, +} +impl Reader { + pub fn new(mut inner: R) -> Result { + let mut header_buf = [0u8; size_of::()]; + inner.read_exact(&mut header_buf)?; + let header = FileHeader::from_bytes(&header_buf)?; + + Ok(Self { + inner, + block: ColumnarBlock::new(header), + iheader: None, + }) + } + + /// Update the default quality score for this reader + pub fn set_default_quality_score(&mut self, score: u8) { + self.block.set_default_quality_score(score); + } + + pub fn read_block(&mut self) -> Result> { + let mut iheader_buf = [0u8; size_of::()]; + let mut diff_buf = [0u8; size_of::() - size_of::()]; + let mut header_buf = [0u8; size_of::()]; + + // Attempt to read the index header + match self.inner.read_exact(&mut iheader_buf) { + Ok(()) => {} + Err(e) => { + if e.kind() == io::ErrorKind::UnexpectedEof { + // no more bytes, the stream is exhausted + return Ok(None); + } + return Err(e.into()); + } + } + + // The stream is exhausted, no more blocks to read + if let Ok(iheader) = IndexHeader::from_bytes(&iheader_buf) { + self.iheader = Some(iheader); + return Ok(None); + } + // attempt to read the rest of the block header + match self.inner.read_exact(&mut diff_buf) { + Ok(()) => {} + Err(e) => { + return Err(e.into()); + } + } + header_buf[..iheader_buf.len()].copy_from_slice(&iheader_buf); + header_buf[iheader_buf.len()..].copy_from_slice(&diff_buf); + + let header = BlockHeader::from_bytes(&header_buf)?; + self.block.read_from(&mut self.inner, header)?; + + Ok(Some(header)) + } + + pub fn read_index(&mut self) -> Result> { + let Some(header) = self.iheader else { + return Ok(None); + }; + let mut z_index_buf = Vec::new(); + let mut index_buf = Vec::new(); + let mut footer_buf = [0u8; size_of::()]; + + // Read the index data from the reader + z_index_buf.resize(header.z_bytes as usize, 0); + + // Reads the compressed index data + self.inner.read_exact(&mut z_index_buf)?; + 
copy_decode(z_index_buf.as_slice(), &mut index_buf)?; + let index = Index::from_bytes(&index_buf)?; + + // Read the footer data from the reader + self.inner.read_exact(&mut footer_buf)?; + let _footer = IndexFooter::from_bytes(&footer_buf)?; + + Ok(Some(index)) + } +} + +/// A memory-mapped reader for CBQ files. +pub struct MmapReader { + inner: Arc, + index: Arc, + + /// Reusable record block + block: ColumnarBlock, + + /// Reusable decompression context + dctx: zstd_safe::DCtx<'static>, +} +impl Clone for MmapReader { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + index: self.index.clone(), + block: self.block.clone(), + dctx: zstd_safe::DCtx::create(), + } + } +} +impl MmapReader { + pub fn new>(path: P) -> Result { + let file = fs::File::open(path)?; + + // Load the mmap + let inner = unsafe { Mmap::map(&file) }?; + + // Build the header + let header = FileHeader::from_bytes(&inner[..size_of::()])?; + + // build the index + let index = { + // Load the index footer + let footer_start = inner.len() - size_of::(); + let mut footer_buf = [0u8; size_of::()]; + footer_buf.copy_from_slice(&inner[footer_start..]); + let index_footer = IndexFooter::from_bytes(&footer_buf)?; + + // Find the coordinates of the compressed index + let z_index_start = footer_start - index_footer.bytes as usize; + let z_index_slice = &inner[z_index_start..footer_start]; + + // Decompress the index + let mut index_buf = Vec::default(); + copy_decode(z_index_slice, &mut index_buf)?; + + // Load the index + Index::from_bytes(&index_buf) + }?; + + Ok(Self { + inner: Arc::new(inner), + index: Arc::new(index), + block: ColumnarBlock::new(header), + dctx: zstd_safe::DCtx::create(), + }) + } + + /// Update the default quality score for this reader + pub fn set_default_quality_score(&mut self, score: u8) { + self.block.set_default_quality_score(score); + } + + #[must_use] + pub fn header(&self) -> FileHeader { + self.block.header + } + + #[must_use] + pub fn is_paired(&self) -> 
bool { + self.block.header.is_paired() + } + + #[must_use] + pub fn num_records(&self) -> usize { + self.index.num_records() + } + + #[must_use] + pub fn num_blocks(&self) -> usize { + self.index.num_blocks() + } + + #[must_use] + pub fn index(&self) -> &Index { + &self.index + } + + fn load_block(&mut self, range: BlockRange) -> Result<()> { + let header_start = range.offset as usize; + let header_end = size_of::() + header_start; + let block_header = { + let mut block_header_buf = [0u8; size_of::()]; + block_header_buf.copy_from_slice(&self.inner[header_start..header_end]); + BlockHeader::from_bytes(&block_header_buf) + }?; + + let data_end = header_end + block_header.block_len(); + let block_data_slice = &self.inner[header_end..data_end]; + self.block + .decompress_from_bytes(block_data_slice, block_header, &mut self.dctx)?; + Ok(()) + } + + /// Iterate over block headers in the CBQ file. + /// + /// Note: This requires reading slices from the file so it will be IO-bound. + pub fn iter_block_headers(&self) -> impl Iterator> { + self.index.iter_blocks().map(|range| { + let mut block_header_buf = [0u8; size_of::()]; + block_header_buf.copy_from_slice( + &self.inner + [range.offset as usize..range.offset as usize + size_of::()], + ); + BlockHeader::from_bytes(&block_header_buf) + }) + } +} +impl ParallelReader for MmapReader { + fn process_parallel( + self, + processor: P, + num_threads: usize, + ) -> crate::Result<()> { + let num_records = self.num_records(); + self.process_parallel_range(processor, num_threads, 0..num_records) + } + + fn process_parallel_range( + self, + processor: P, + num_threads: usize, + range: std::ops::Range, + ) -> crate::Result<()> { + let num_threads = if num_threads == 0 { + num_cpus::get() + } else { + num_threads.min(num_cpus::get()) + }; + + // validate range + let total_records = self.num_records(); + self.validate_range(total_records, &range)?; + + let mut iv_start = 0; + let relevant_blocks = self + .index + .iter_blocks() + 
.filter(|block| { + let iv_end = block.cumulative_records as usize; + let relevant = iv_start <= range.end && iv_end > range.start; + iv_start = iv_end; + relevant + }) + .collect::>(); + let num_blocks = relevant_blocks.len(); + + if relevant_blocks.is_empty() { + return Ok(()); // nothing to do + } + + // Distribute blocks evenly across threads, giving extra blocks to first threads + let base_blocks_per_thread = num_blocks / num_threads; + let extra_blocks = num_blocks % num_threads; + + let mut handles = Vec::new(); + for thread_id in 0..num_threads { + // Threads 0..extra_blocks get one extra block + let blocks_for_this_thread = if thread_id < extra_blocks { + base_blocks_per_thread + 1 + } else { + base_blocks_per_thread + }; + + // Calculate cumulative start position + let start_block_idx = if thread_id < extra_blocks { + thread_id * (base_blocks_per_thread + 1) + } else { + extra_blocks * (base_blocks_per_thread + 1) + + (thread_id - extra_blocks) * base_blocks_per_thread + }; + let end_block_idx = start_block_idx + blocks_for_this_thread; + + // Skip threads with no work (happens when num_threads > num_blocks) + if blocks_for_this_thread == 0 { + continue; + } + + let mut t_reader = self.clone(); + let mut t_proc = processor.clone(); + + // pull all block ranges for this thread + let t_block_ranges = relevant_blocks + .iter() + .skip(start_block_idx) + .take(end_block_idx - start_block_idx) + .copied() + .collect::>(); + + // eprintln!( + // "Thread {} block range: {}-{}. First block Cumulative Records: {}. 
Last block Cumulative Records: {}", + // thread_id, + // start_block_idx, + // end_block_idx, + // t_block_ranges[0].cumulative_records, + // t_block_ranges.last().unwrap().cumulative_records + // ); + + let thread_handle = thread::spawn(move || -> crate::Result<()> { + for b_range in t_block_ranges { + t_reader.load_block(b_range)?; + for record in t_reader.block.iter_records(b_range) { + let global_record_idx = record.index() as usize; + + // Only process records within our specified range + if global_record_idx >= range.start && global_record_idx < range.end { + t_proc.process_record(record)?; + } + } + t_proc.on_batch_complete()?; + } + Ok(()) + }); + handles.push(thread_handle); + } + + for handle in handles { + handle.join().unwrap()?; + } + Ok(()) + } +} +#[cfg(test)] +mod tests { + use super::*; + use crate::BinseqRecord; + + const TEST_CBQ_FILE: &str = "./data/subset.cbq"; + + // ==================== MmapReader Basic Tests ==================== + + #[test] + fn test_mmap_reader_new() { + let reader = MmapReader::new(TEST_CBQ_FILE); + assert!(reader.is_ok(), "Failed to create CBQ reader"); + } + + #[test] + fn test_mmap_reader_num_records() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let num_records = reader.num_records(); + assert!(num_records > 0, "Expected non-zero records"); + } + + #[test] + fn test_mmap_reader_is_paired() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let is_paired = reader.is_paired(); + // Test that the method returns a boolean + assert!(is_paired || !is_paired); + } + + #[test] + fn test_mmap_reader_header_access() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let header = reader.header(); + assert!(header.block_size > 0, "Expected non-zero block size"); + } + + #[test] + fn test_mmap_reader_index_access() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let index = reader.index(); + assert!(index.num_records() > 0, "Index should have records"); + } + + #[test] + fn 
test_mmap_reader_num_blocks() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let num_blocks = reader.num_blocks(); + assert!(num_blocks > 0, "Should have at least one block"); + } + + // ==================== Default Quality Score Tests ==================== + + #[test] + fn test_set_default_quality_score() { + let mut reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let custom_score = 42u8; + + reader.set_default_quality_score(custom_score); + // Just verify it doesn't panic + } + + // ==================== Parallel Processing Tests ==================== + + #[derive(Clone)] + struct CbqCountingProcessor { + count: Arc>, + } + + impl ParallelProcessor for CbqCountingProcessor { + fn process_record(&mut self, _record: R) -> Result<()> { + let mut count = self.count.lock().unwrap(); + *count += 1; + Ok(()) + } + } + + #[test] + fn test_parallel_processing() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let num_records = reader.num_records(); + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = CbqCountingProcessor { + count: count.clone(), + }; + + reader.process_parallel(processor, 2).unwrap(); + + let final_count = *count.lock().unwrap(); + assert_eq!(final_count, num_records, "All records should be processed"); + } + + #[test] + fn test_parallel_processing_range() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let num_records = reader.num_records(); + + if num_records >= 100 { + let start = 10; + let end = 50; + let expected_count = end - start; + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = CbqCountingProcessor { + count: count.clone(), + }; + + reader + .process_parallel_range(processor, 2, start..end) + .unwrap(); + + let final_count = *count.lock().unwrap(); + assert_eq!( + final_count, expected_count, + "Should process exactly {} records", + expected_count + ); + } + } + + #[test] + fn test_parallel_processing_with_record_data() { + #[derive(Clone)] + struct RecordValidator { + 
valid_count: Arc>, + } + + impl ParallelProcessor for RecordValidator { + fn process_record(&mut self, record: R) -> Result<()> { + // Validate record has non-zero length + assert!(record.slen() > 0, "Record should have non-zero length"); + + let mut count = self.valid_count.lock().unwrap(); + *count += 1; + Ok(()) + } + } + + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let num_records = reader.num_records(); + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = RecordValidator { + valid_count: count.clone(), + }; + + reader.process_parallel(processor, 2).unwrap(); + + let final_count = *count.lock().unwrap(); + assert_eq!(final_count, num_records); + } + + // ==================== Index Tests ==================== + + #[test] + fn test_index_num_records() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + + let index_records = reader.index().num_records(); + let reader_records = reader.num_records(); + + assert_eq!( + index_records, reader_records, + "Index and reader should report same number of records" + ); + } + + #[test] + fn test_index_num_blocks() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + + let num_blocks = reader.index().num_blocks(); + assert!(num_blocks > 0, "Should have at least one block"); + } + + #[test] + fn test_index_iter_blocks() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + + let blocks: Vec<_> = reader.index().iter_blocks().collect(); + assert!(!blocks.is_empty(), "Should have at least one block"); + + let num_blocks = reader.num_blocks(); + assert_eq!(blocks.len(), num_blocks, "Block count should match"); + } + + // ==================== Error Handling Tests ==================== + + #[test] + fn test_nonexistent_file() { + let result = MmapReader::new("./data/nonexistent.cbq"); + assert!(result.is_err(), "Should fail on nonexistent file"); + } + + #[test] + fn test_invalid_file_format() { + // Try to open a non-CBQ file as CBQ + let result = MmapReader::new("./Cargo.toml"); + // 
This should fail during header validation + assert!(result.is_err(), "Should fail on invalid file format"); + } + + // ==================== Block Header Iterator Tests ==================== + + #[test] + fn test_iter_block_headers() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + + let headers: Vec<_> = reader + .iter_block_headers() + .take(5) + .collect::>>() + .unwrap(); + + assert!(!headers.is_empty(), "Should have at least one block header"); + + for header in headers { + assert!(header.num_records > 0, "Block should have records"); + } + } + + #[test] + fn test_iter_block_headers_count() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + + let header_count = reader + .iter_block_headers() + .collect::>>() + .unwrap() + .len(); + + let num_blocks = reader.num_blocks(); + assert_eq!(header_count, num_blocks, "Should iterate all block headers"); + } + + // ==================== Empty Range Tests ==================== + + #[test] + fn test_parallel_processing_empty_range() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = CbqCountingProcessor { + count: count.clone(), + }; + + // Process empty range + reader.process_parallel_range(processor, 2, 0..0).unwrap(); + + let final_count = *count.lock().unwrap(); + assert_eq!(final_count, 0, "Empty range should process no records"); + } + + #[test] + fn test_parallel_processing_invalid_range() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let num_records = reader.num_records(); + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = CbqCountingProcessor { + count: count.clone(), + }; + + // Process out of bounds range (should error) + let result = + reader.process_parallel_range(processor, 2, num_records + 100..num_records + 200); + + assert!(result.is_err(), "Should handle out of bounds as error"); + } + + // ==================== Thread Count Tests ==================== + + #[test] + fn 
test_parallel_processing_single_thread() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let num_records = reader.num_records(); + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = CbqCountingProcessor { + count: count.clone(), + }; + + reader.process_parallel(processor, 1).unwrap(); + + let final_count = *count.lock().unwrap(); + assert_eq!(final_count, num_records); + } + + #[test] + fn test_parallel_processing_many_threads() { + let reader = MmapReader::new(TEST_CBQ_FILE).unwrap(); + let num_records = reader.num_records(); + + let count = Arc::new(std::sync::Mutex::new(0)); + let processor = CbqCountingProcessor { + count: count.clone(), + }; + + reader.process_parallel(processor, 8).unwrap(); + + let final_count = *count.lock().unwrap(); + assert_eq!(final_count, num_records); + } +} diff --git a/src/cbq/write.rs b/src/cbq/write.rs new file mode 100644 index 0000000..5415e6c --- /dev/null +++ b/src/cbq/write.rs @@ -0,0 +1,181 @@ +use std::io; + +use zstd::zstd_safe; + +use crate::{ + Result, SequencingRecord, + cbq::core::{BlockHeader, ColumnarBlock, FileHeader, Index, IndexFooter, IndexHeader}, +}; + +/// Writer for CBQ files operating on generic writers (streaming). 
+pub struct ColumnarBlockWriter { + /// Internal writer for the block + inner: W, + + /// A reusable block for this writer + block: ColumnarBlock, + + /// All block headers written by this writer + headers: Vec, + + /// Compression context for the thread + cctx: zstd_safe::CCtx<'static>, +} +impl Clone for ColumnarBlockWriter { + fn clone(&self) -> Self { + let mut writer = Self { + inner: self.inner.clone(), + block: self.block.clone(), + headers: self.headers.clone(), + cctx: zstd_safe::CCtx::create(), + }; + writer + .init_compressor() + .expect("Failed to set compression level in writer clone"); + writer + } +} +impl ColumnarBlockWriter { + /// Creates a new writer with the header written to the inner writer + pub fn new(inner: W, header: FileHeader) -> Result { + // Build the writer + let mut writer = Self::new_headless(inner, header)?; + + // Ensure the header is written to the file + writer.inner.write_all(header.as_bytes())?; + + Ok(writer) + } + + /// Creates a new writer without writing the header to the inner writer + pub fn new_headless(inner: W, header: FileHeader) -> Result { + let mut writer = Self { + inner, + block: ColumnarBlock::new(header), + headers: Vec::default(), + cctx: zstd_safe::CCtx::create(), + }; + + // Set the compression level for this writer + writer.init_compressor()?; + + Ok(writer) + } + + /// Sets the compression level for Writer + /// + /// Note: only used on init, shouldn't be set by the user + fn init_compressor(&mut self) -> Result<()> { + // Initialize the compressor with the compression level + self.cctx + .set_parameter(zstd_safe::CParameter::CompressionLevel( + self.block.header.compression_level as i32, + )) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + + // Set long distance matching + self.cctx + .set_parameter(zstd_safe::CParameter::EnableLongDistanceMatching(true)) + .map_err(|e| io::Error::other(zstd_safe::get_error_name(e)))?; + Ok(()) + } + + pub fn header(&self) -> FileHeader { + 
self.block.header + } + + /// Calculate the usage of the block as a percentage + pub fn usage(&self) -> f64 { + self.block.usage() + } + + /// Push a record to the writer + /// + /// Returns `Ok(true)` if the record was written successfully. + /// CBQ handles N's explicitly in its encoding, so records are never skipped. + pub fn push(&mut self, record: SequencingRecord) -> Result { + if !self.block.can_fit(&record) { + self.flush()?; + } + self.block.push(record)?; + Ok(true) + } + + pub fn flush(&mut self) -> Result<()> { + if let Some(header) = self.block.flush_to(&mut self.inner, &mut self.cctx)? { + self.headers.push(header); + } + Ok(()) + } + + pub fn finish(&mut self) -> Result<()> { + self.flush()?; + self.write_index()?; + Ok(()) + } + + fn write_index(&mut self) -> Result<()> { + let index = Index::from_block_headers(&self.headers); + let z_index = index.encoded()?; + let header = IndexHeader::new(index.size(), z_index.len() as u64); + let footer = IndexFooter::new(z_index.len() as u64); + + // Write the index to the inner writer + { + self.inner.write_all(header.as_bytes())?; + self.inner.write_all(&z_index)?; + self.inner.write_all(footer.as_bytes())?; + } + Ok(()) + } + + pub fn ingest(&mut self, other: &mut ColumnarBlockWriter>) -> Result<()> { + // Write all completed blocks from the other + self.inner.write_all(other.inner_data())?; + // eprintln!( + // "Wrote {} bytes from completed blocks", + // other.inner_data().len() + // ); + + // Take all headers from the other + self.headers.extend_from_slice(&other.headers); + + // Attempt to ingest the incomplete block from the other + if self.block.can_ingest(&other.block) { + // eprintln!("Can ingest incomplete block"); + self.block.take_incomplete(&other.block)?; + + // Make space by flushing the current block + // Then ingest the incomplete block from the other + } else { + // eprintln!("Cannot ingest incomplete block"); + self.flush()?; + self.block.take_incomplete(&other.block)?; + } + + // Clear the 
other's inner data and offsets + other.clear_inner_data(); + + Ok(()) + } +} + +/// Specialized implementation when using a local `Vec` as the inner data structure +impl ColumnarBlockWriter> { + #[must_use] + pub fn inner_data(&self) -> &[u8] { + &self.inner + } + + pub fn clear_inner_data(&mut self) { + self.inner.clear(); + self.headers.clear(); + self.block.clear(); + } + + /// Returns the number of bytes written to the inner data structure + #[must_use] + pub fn bytes_written(&self) -> usize { + self.inner.len() + } +} diff --git a/src/context/mod.rs b/src/context/mod.rs deleted file mode 100644 index f79444a..0000000 --- a/src/context/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -/// Instances of common contexts -mod structs; - -/// Traits for different context behaviors -mod traits; - -pub use structs::{Ctx, SeqCtx, SeqHeaderCtx, SeqQualCtx}; -pub use traits::{Context, HeaderContext, QualityContext, SequenceContext}; diff --git a/src/context/structs.rs b/src/context/structs.rs deleted file mode 100644 index 7c2c637..0000000 --- a/src/context/structs.rs +++ /dev/null @@ -1,221 +0,0 @@ -use super::traits::{Context, HeaderContext, QualityContext, SequenceContext}; -use crate::{BinseqRecord, Result}; - -/// Context for sequence data -/// -/// Has all the necessary fields for storing sequence data. 
-#[derive(Clone, Default)] -pub struct Ctx { - sbuf: Vec, - xbuf: Vec, - sheader: Vec, - xheader: Vec, - squal: Vec, - xqual: Vec, -} -impl SequenceContext for Ctx { - #[inline] - fn sbuf(&self) -> &[u8] { - &self.sbuf - } - #[inline] - fn xbuf(&self) -> &[u8] { - &self.xbuf - } - #[inline] - fn sbuf_mut(&mut self) -> &mut Vec { - &mut self.sbuf - } - #[inline] - fn xbuf_mut(&mut self) -> &mut Vec { - &mut self.xbuf - } -} -impl QualityContext for Ctx { - #[inline] - fn squal(&self) -> &[u8] { - &self.squal - } - #[inline] - fn xqual(&self) -> &[u8] { - &self.xqual - } - #[inline] - fn squal_mut(&mut self) -> &mut Vec { - &mut self.squal - } - #[inline] - fn xqual_mut(&mut self) -> &mut Vec { - &mut self.xqual - } -} -impl HeaderContext for Ctx { - #[inline] - fn sheader(&self) -> &[u8] { - &self.sheader - } - #[inline] - fn xheader(&self) -> &[u8] { - &self.xheader - } - #[inline] - fn sheader_mut(&mut self) -> &mut Vec { - &mut self.sheader - } - #[inline] - fn xheader_mut(&mut self) -> &mut Vec { - &mut self.xheader - } -} -impl Context for Ctx { - #[inline] - fn fill(&mut self, record: &R) -> Result<()> { - self.fill_sequences(record)?; - self.fill_qualities(record)?; - self.fill_headers(record); - Ok(()) - } -} - -/// Context for just sequence data -/// -/// Only stores nucleotide sequence data. -#[derive(Clone, Default)] -pub struct SeqCtx { - sbuf: Vec, - xbuf: Vec, -} -impl SequenceContext for SeqCtx { - #[inline] - fn sbuf(&self) -> &[u8] { - &self.sbuf - } - #[inline] - fn xbuf(&self) -> &[u8] { - &self.xbuf - } - #[inline] - fn sbuf_mut(&mut self) -> &mut Vec { - &mut self.sbuf - } - #[inline] - fn xbuf_mut(&mut self) -> &mut Vec { - &mut self.xbuf - } -} -impl Context for SeqCtx { - #[inline] - fn fill(&mut self, record: &R) -> Result<()> { - self.fill_sequences(record) - } -} - -/// Context for sequence data and headers -/// -/// Does not store quality data. 
-#[derive(Clone, Default)] -pub struct SeqHeaderCtx { - sbuf: Vec, - xbuf: Vec, - sheader: Vec, - xheader: Vec, -} -impl SequenceContext for SeqHeaderCtx { - #[inline] - fn sbuf(&self) -> &[u8] { - &self.sbuf - } - #[inline] - fn xbuf(&self) -> &[u8] { - &self.xbuf - } - #[inline] - fn sbuf_mut(&mut self) -> &mut Vec { - &mut self.sbuf - } - #[inline] - fn xbuf_mut(&mut self) -> &mut Vec { - &mut self.xbuf - } -} -impl HeaderContext for SeqHeaderCtx { - #[inline] - fn sheader(&self) -> &[u8] { - &self.sheader - } - #[inline] - fn xheader(&self) -> &[u8] { - &self.xheader - } - #[inline] - fn sheader_mut(&mut self) -> &mut Vec { - &mut self.sheader - } - #[inline] - fn xheader_mut(&mut self) -> &mut Vec { - &mut self.xheader - } -} -impl Context for SeqHeaderCtx { - #[inline] - fn fill(&mut self, record: &R) -> Result<()> { - self.fill_sequences(record)?; - self.fill_headers(record); - Ok(()) - } -} - -/// Context for sequence data and quality data -/// -/// Does not store header data. -#[derive(Clone, Default)] -pub struct SeqQualCtx { - sbuf: Vec, - xbuf: Vec, - squal: Vec, - xqual: Vec, -} -impl SequenceContext for SeqQualCtx { - #[inline] - fn sbuf(&self) -> &[u8] { - &self.sbuf - } - #[inline] - fn xbuf(&self) -> &[u8] { - &self.xbuf - } - #[inline] - fn sbuf_mut(&mut self) -> &mut Vec { - &mut self.sbuf - } - #[inline] - fn xbuf_mut(&mut self) -> &mut Vec { - &mut self.xbuf - } -} -impl QualityContext for SeqQualCtx { - #[inline] - fn squal(&self) -> &[u8] { - &self.squal - } - #[inline] - fn xqual(&self) -> &[u8] { - &self.xqual - } - #[inline] - fn squal_mut(&mut self) -> &mut Vec { - &mut self.squal - } - #[inline] - fn xqual_mut(&mut self) -> &mut Vec { - &mut self.xqual - } -} -impl Context for SeqQualCtx { - #[inline] - fn fill(&mut self, record: &R) -> Result<()> { - self.fill_sequences(record)?; - self.fill_qualities(record)?; - Ok(()) - } -} diff --git a/src/context/traits.rs b/src/context/traits.rs deleted file mode 100644 index 9712d9d..0000000 --- 
a/src/context/traits.rs +++ /dev/null @@ -1,109 +0,0 @@ -use crate::{BinseqRecord, Result}; - -pub const DEFAULT_QUALITY: u8 = b'?'; - -/// Trait for handling reusable buffers in decoding BINSEQ records. -pub trait Context: Clone + Default { - /// Replaces the contents of the context with the contents of the given record. - /// - /// This will clear all existing data and fill the context with the contents of the record. - fn fill(&mut self, record: &R) -> Result<()>; -} - -/// Trait for handling reusable buffers in decoding BINSEQ records focused on nucleotide sequences. -pub trait SequenceContext { - fn sbuf(&self) -> &[u8]; - fn xbuf(&self) -> &[u8]; - fn sbuf_mut(&mut self) -> &mut Vec; - fn xbuf_mut(&mut self) -> &mut Vec; - #[inline] - fn clear_sequences(&mut self) { - self.sbuf_mut().clear(); - self.xbuf_mut().clear(); - } - #[inline] - #[allow(deprecated)] - fn fill_sequences(&mut self, record: &R) -> Result<()> { - self.clear_sequences(); - record.decode_s(self.sbuf_mut())?; - if record.is_paired() { - record.decode_x(self.xbuf_mut())?; - } - Ok(()) - } -} - -/// Trait for handling reusable buffers in decoding BINSEQ records focused on quality data. 
-pub trait QualityContext { - fn squal(&self) -> &[u8]; - fn xqual(&self) -> &[u8]; - fn squal_mut(&mut self) -> &mut Vec; - fn xqual_mut(&mut self) -> &mut Vec; - #[inline] - fn clear_qualities(&mut self) { - self.squal_mut().clear(); - self.xqual_mut().clear(); - } - #[inline] - fn fill_qualities(&mut self, record: &R) -> Result<()> { - if record.has_quality() { - let slen = record.slen() as usize; - let squal = self.squal_mut(); - if squal.len() != slen { - squal.resize(slen, DEFAULT_QUALITY); - } - squal.copy_from_slice(record.squal()); - - if record.is_paired() { - let xlen = record.xlen() as usize; - let xqual = self.xqual_mut(); - if xqual.len() != xlen { - xqual.resize(xlen, DEFAULT_QUALITY); - } - xqual.copy_from_slice(record.xqual()); - } - } else { - self.ensure_quality_capacity(record); - } - Ok(()) - } - #[inline] - fn ensure_quality_capacity(&mut self, record: &R) { - let slen = record.slen() as usize; - let xlen = record.xlen() as usize; - - // only resize if its not the right size - let squal = self.squal_mut(); - if squal.len() != slen { - squal.resize(slen, DEFAULT_QUALITY); - } - - // Only resize if there's an extended sequence and it's not already the right size - let xqual = self.xqual_mut(); - if xqual.len() != xlen { - xqual.resize(xlen, DEFAULT_QUALITY); - } - } -} - -/// Trait for handling reusable buffers in decoding BINSEQ records focused on header data. 
-pub trait HeaderContext { - fn sheader(&self) -> &[u8]; - fn sheader_mut(&mut self) -> &mut Vec; - fn xheader(&self) -> &[u8]; - fn xheader_mut(&mut self) -> &mut Vec; - #[inline] - fn clear_headers(&mut self) { - self.sheader_mut().clear(); - self.xheader_mut().clear(); - } - - #[inline] - fn fill_headers(&mut self, record: &R) { - self.clear_headers(); - self.sheader_mut().extend_from_slice(record.sheader()); - if record.is_paired() { - self.xheader_mut().extend_from_slice(record.xheader()); - } - } -} diff --git a/src/error.rs b/src/error.rs index 0c06762..9354801 100644 --- a/src/error.rs +++ b/src/error.rs @@ -12,6 +12,10 @@ pub enum Error { #[error("Error processing header: {0}")] HeaderError(#[from] HeaderError), + /// Errors related to the CBQ format + #[error("Error processing CBQ: {0}")] + CbqError(#[from] CbqError), + /// Errors that occur during write operations #[error("Error writing file: {0}")] WriteError(#[from] WriteError), @@ -44,30 +48,17 @@ pub enum Error { BitnucError(#[from] bitnuc::Error), /// Conversion errors from anyhow errors + #[cfg(feature = "anyhow")] #[error("Generic error: {0}")] AnyhowError(#[from] anyhow::Error), /// Generic errors for other unexpected situations #[error("Generic error: {0}")] GenericError(#[from] Box), -} -impl Error { - /// Checks if the error is an index mismatch error - /// - /// This is useful for determining if a file's index is out of sync with its content, - /// which might require rebuilding the index. 
- /// - /// # Returns - /// - /// * `true` if the error is an `IndexError::ByteSizeMismatch` - /// * `false` for all other error types - #[must_use] - pub fn is_index_mismatch(&self) -> bool { - match self { - Self::IndexError(err) => err.is_mismatch(), - _ => false, - } - } + + #[cfg(feature = "paraseq")] + #[error("Fastx encoding error: {0}")] + FastxEncodingError(#[from] FastxEncodingError), } /// Errors specific to processing and validating binary sequence headers @@ -125,8 +116,14 @@ pub enum ReadError { /// # Arguments /// * First `usize` - The requested record index /// * Second `usize` - The maximum available record index - #[error("Requested record index ({0}) is out of record range ({1})")] - OutOfRange(usize, usize), + #[error("Requested record index ({requested_index}) is out of record range ({max_index})")] + OutOfRange { + requested_index: usize, + max_index: usize, + }, + + #[error("Invalid range specified: start ({start}) is greater than end ({end})")] + InvalidRange { start: usize, end: usize }, /// End of stream was reached while reading #[error("End of stream reached")] @@ -151,7 +148,7 @@ pub enum ReadError { #[error("Unable to find an expected full block at position {0}")] UnexpectedEndOfFile(usize), - /// When the file metadata doesn't match the expected VBINSEQ format + /// When the file metadata doesn't match the expected VBQ format #[error("Unexpected file metadata")] InvalidFileType, @@ -169,6 +166,29 @@ pub enum BuilderError { /// Errors that can occur while writing binary sequence data #[derive(thiserror::Error, Debug)] pub enum WriteError { + /// Error between configuration of writer and incoming sequencing record + #[error( + "Cannot push record ({attribute}: {actual}) with writer configuration ({attribute}: {expected})" + )] + ConfigurationMismatch { + attribute: &'static str, + expected: bool, + actual: bool, + }, + + #[error("Cannot ingest writer with incompatible formats")] + FormatMismatch, + + #[error( + "Missing required 
sequence length, expected (primary: {exp_primary}, extended: {exp_extended}), got (primary: {obs_primary}, extended: {obs_extended})" + )] + MissingSequenceLength { + exp_primary: bool, + exp_extended: bool, + obs_primary: bool, + obs_extended: bool, + }, + /// The length of the sequence being written does not match what was specified in the header /// /// # Fields @@ -211,9 +231,10 @@ pub enum WriteError { /// When a record is too large to fit in a block of the configured size /// /// The first parameter is the record size, the second is the maximum block size - #[error("Encountered a record with embedded size {0} but the maximum block size is {1}. Rerun with increased block size.")] + #[error( + "Encountered a record with embedded size {0} but the maximum block size is {1}. Rerun with increased block size." + )] RecordSizeExceedsMaximumBlockSize(usize, usize), - /// When trying to ingest blocks with different sizes than expected /// /// The first parameter is the expected size, the second is the found size @@ -225,13 +246,17 @@ pub enum WriteError { /// When trying to ingest data with an incompatible header /// /// The first parameter is the expected header, the second is the found header - #[error("Incompatible headers found in VBinseqWriter::ingest. Found ({1:?}) Expected ({0:?})")] - IncompatibleHeaders(crate::vbq::VBinseqHeader, crate::vbq::VBinseqHeader), + #[error("Incompatible headers found in vbq::Writer::ingest. 
Found ({1:?}) Expected ({0:?})")] + IncompatibleHeaders(crate::vbq::FileHeader, crate::vbq::FileHeader), + + /// When building a `SequencingRecord` without a primary sequence + #[error("SequencingRecordBuilder requires a primary sequence (s_seq)")] + MissingSequence, } -/// Errors related to VBINSEQ file indexing +/// Errors related to VBQ file indexing /// -/// These errors occur when there are issues with the index of a VBINSEQ file, +/// These errors occur when there are issues with the index of a VBQ file, /// such as corruption or mismatches with the underlying file. #[derive(thiserror::Error, Debug)] pub enum IndexError { @@ -241,35 +266,64 @@ pub enum IndexError { #[error("Invalid magic number: {0}")] InvalidMagicNumber(u64), - /// When the index references a file that doesn't exist - /// - /// The parameter is the missing file path - #[error("Index missing upstream file path: {0}")] - MissingUpstreamFile(String), - - /// When the size of the file doesn't match what the index expects - /// - /// The first parameter is the actual file size, the second is the expected size - #[error("Mismatch in size between upstream size: {0} and expected index size {1}")] - ByteSizeMismatch(u64, u64), - /// Invalid reserved bytes in the index header #[error("Invalid reserved bytes in index header")] InvalidReservedBytes, } -impl IndexError { - /// Checks if this error indicates a mismatch between the index and file - /// - /// This is useful to determine if the index needs to be rebuilt. 
- /// - /// # Returns - /// - /// * `true` for `ByteSizeMismatch` errors - /// * `true` for any other error type (this behavior is likely a bug and should be fixed) - #[must_use] - pub fn is_mismatch(&self) -> bool { - matches!(self, Self::ByteSizeMismatch(_, _) | _) // Note: this appears to always return true regardless of error type - } + +#[derive(thiserror::Error, Debug)] +pub enum CbqError { + #[error( + "Record size ({record_size}) exceeds maximum block size ({max_block_size}) - Try increasing block size." + )] + ExceedsMaximumBlockSize { + max_block_size: usize, + record_size: usize, + }, + + #[error("Cannot ingest block of size {other_block_size} into block of size {self_block_size}")] + CannotIngestBlock { + self_block_size: usize, + other_block_size: usize, + }, + + /// Attempting to write a record into a full block + #[error( + "Block(size: {block_size}) will be exceeded by record size {record_size}. Current size: {current_size}" + )] + BlockFull { + current_size: usize, + record_size: usize, + block_size: usize, + }, + + #[error("Invalid block header MAGIC found")] + InvalidBlockHeaderMagic, + + #[error("Invalid file header MAGIC found")] + InvalidFileHeaderMagic, + + #[error("Invalid index header MAGIC found")] + InvalidIndexHeaderMagic, + + #[error("Invalid index footer MAGIC found")] + InvalidIndexFooterMagic, + + #[error("Unable to cast bytes to Index - likely an alignment error")] + IndexCastingError, + + #[error("SequenceRecordBuilder failed on build due to missing primary sequence (`s_seq`)")] + MissingSequenceOnSequencingRecord, +} + +#[cfg(feature = "paraseq")] +#[derive(thiserror::Error, Debug)] +pub enum FastxEncodingError { + #[error("Empty FASTX file")] + EmptyFastxFile, + + #[error("Builder not provided with any input")] + MissingInput, } #[derive(thiserror::Error, Debug)] @@ -312,4 +366,249 @@ mod testing { let binseq_error = my_error.into_binseq_error(); assert!(matches!(binseq_error, Error::GenericError(_))); } + + // 
==================== HeaderError Tests ==================== + + #[test] + fn test_header_error_invalid_magic_number() { + let error = HeaderError::InvalidMagicNumber(0xDEADBEEF); + let error_str = format!("{}", error); + assert!(error_str.contains("0xdeadbeef") || error_str.contains("3735928559")); + } + + #[test] + fn test_header_error_invalid_format_version() { + let error = HeaderError::InvalidFormatVersion(99); + let error_str = format!("{}", error); + assert!(error_str.contains("99")); + } + + #[test] + fn test_header_error_invalid_bit_size() { + let error = HeaderError::InvalidBitSize(8); + let error_str = format!("{}", error); + assert!(error_str.contains("8")); + assert!(error_str.contains("[2,4]")); + } + + #[test] + fn test_header_error_invalid_size() { + let error = HeaderError::InvalidSize(100, 200); + let error_str = format!("{}", error); + assert!(error_str.contains("100")); + assert!(error_str.contains("200")); + } + + // ==================== ReadError Tests ==================== + + #[test] + fn test_read_error_out_of_range() { + let error = ReadError::OutOfRange { + requested_index: 150, + max_index: 100, + }; + let error_str = format!("{}", error); + assert!(error_str.contains("150")); + assert!(error_str.contains("100")); + } + + #[test] + fn test_read_error_file_truncation() { + let error = ReadError::FileTruncation(12345); + let error_str = format!("{}", error); + assert!(error_str.contains("12345")); + } + + #[test] + fn test_read_error_partial_record() { + let error = ReadError::PartialRecord(42); + let error_str = format!("{}", error); + assert!(error_str.contains("42")); + } + + #[test] + fn test_read_error_invalid_block_magic_number() { + let error = ReadError::InvalidBlockMagicNumber(0xBADC0DE, 1000); + let error_str = format!("{}", error); + assert!(error_str.contains("1000")); + } + + // ==================== WriteError Tests ==================== + + #[test] + fn test_write_error_configuration_mismatch() { + let error = 
WriteError::ConfigurationMismatch { + attribute: "paired", + expected: true, + actual: false, + }; + let error_str = format!("{}", error); + assert!(error_str.contains("paired")); + assert!(error_str.contains("true")); + assert!(error_str.contains("false")); + } + + #[test] + fn test_write_error_unexpected_sequence_length() { + let error = WriteError::UnexpectedSequenceLength { + expected: 100, + got: 150, + }; + let error_str = format!("{}", error); + assert!(error_str.contains("100")); + assert!(error_str.contains("150")); + } + + #[test] + fn test_write_error_invalid_nucleotide_sequence() { + let error = WriteError::InvalidNucleotideSequence("ACGTNX".to_string()); + let error_str = format!("{}", error); + assert!(error_str.contains("ACGTNX")); + } + + #[test] + fn test_write_error_record_size_exceeds_max() { + let error = WriteError::RecordSizeExceedsMaximumBlockSize(2000, 1024); + let error_str = format!("{}", error); + assert!(error_str.contains("2000")); + assert!(error_str.contains("1024")); + } + + #[test] + fn test_write_error_missing_sequence_length() { + let error = WriteError::MissingSequenceLength { + exp_primary: true, + exp_extended: false, + obs_primary: false, + obs_extended: false, + }; + let error_str = format!("{}", error); + assert!(error_str.contains("Missing required sequence length")); + } + + // ==================== CbqError Tests ==================== + + #[test] + fn test_cbq_error_exceeds_maximum_block_size() { + let error = CbqError::ExceedsMaximumBlockSize { + max_block_size: 1024, + record_size: 2048, + }; + let error_str = format!("{}", error); + assert!(error_str.contains("1024")); + assert!(error_str.contains("2048")); + } + + #[test] + fn test_cbq_error_block_full() { + let error = CbqError::BlockFull { + current_size: 900, + record_size: 200, + block_size: 1024, + }; + let error_str = format!("{}", error); + assert!(error_str.contains("900")); + assert!(error_str.contains("200")); + assert!(error_str.contains("1024")); + } + + 
#[test] + fn test_cbq_error_cannot_ingest_block() { + let error = CbqError::CannotIngestBlock { + self_block_size: 1024, + other_block_size: 2048, + }; + let error_str = format!("{}", error); + assert!(error_str.contains("1024")); + assert!(error_str.contains("2048")); + } + + // ==================== BuilderError Tests ==================== + + #[test] + fn test_builder_error_missing_slen() { + let error = BuilderError::MissingSlen; + let error_str = format!("{}", error); + assert!(error_str.contains("Missing sequence length")); + } + + // ==================== ExtensionError Tests ==================== + + #[test] + fn test_extension_error_unsupported() { + let error = ExtensionError::UnsupportedExtension("test.xyz".to_string()); + let error_str = format!("{}", error); + assert!(error_str.contains("test.xyz")); + } + + // ==================== Error Conversion Tests ==================== + + #[test] + fn test_error_from_header_error() { + let header_error = HeaderError::InvalidMagicNumber(0x1234); + let error: Error = header_error.into(); + assert!(matches!(error, Error::HeaderError(_))); + } + + #[test] + fn test_error_from_write_error() { + let write_error = WriteError::MissingHeader; + let error: Error = write_error.into(); + assert!(matches!(error, Error::WriteError(_))); + } + + #[test] + fn test_error_from_read_error() { + let read_error = ReadError::EndOfStream; + let error: Error = read_error.into(); + assert!(matches!(error, Error::ReadError(_))); + } + + #[test] + fn test_error_from_index_error() { + let index_error = IndexError::InvalidMagicNumber(0x5678); + let error: Error = index_error.into(); + assert!(matches!(error, Error::IndexError(_))); + } + + #[test] + fn test_error_from_cbq_error() { + let cbq_error = CbqError::InvalidBlockHeaderMagic; + let error: Error = cbq_error.into(); + assert!(matches!(error, Error::CbqError(_))); + } + + #[test] + fn test_error_from_builder_error() { + let builder_error = BuilderError::MissingSlen; + let error: Error = 
builder_error.into(); + assert!(matches!(error, Error::BuilderError(_))); + } + + #[test] + fn test_error_debug_output() { + let error = Error::WriteError(WriteError::MissingHeader); + let debug_str = format!("{:?}", error); + assert!(debug_str.contains("WriteError")); + } + + // ==================== Fastx Error Tests (conditional) ==================== + + #[cfg(feature = "paraseq")] + #[test] + fn test_fastx_error_empty_file() { + use super::FastxEncodingError; + let error = FastxEncodingError::EmptyFastxFile; + let error_str = format!("{}", error); + assert!(error_str.contains("Empty FASTX file")); + } + + #[cfg(feature = "paraseq")] + #[test] + fn test_fastx_error_missing_input() { + use super::FastxEncodingError; + let error = FastxEncodingError::MissingInput; + let error_str = format!("{}", error); + assert!(error_str.contains("not provided with any input")); + } } diff --git a/src/lib.rs b/src/lib.rs index 4877b5d..00a71e9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,38 +2,41 @@ //! //! # BINSEQ //! -//! The `binseq` library provides efficient APIs for working with the [BINSEQ](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1) file format family. +//! The `binseq` library provides efficient APIs for working with the [BINSEQ](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v2) file format family. //! //! It offers methods to read and write BINSEQ files, providing: //! //! - Compact multi-bit encoding and decoding of nucleotide sequences through [`bitnuc`](https://docs.rs/bitnuc/latest/bitnuc/) -//! - Memory-mapped file access for efficient reading ([`bq::MmapReader`] and [`vbq::MmapReader`]) -//! - Parallel processing capabilities for arbitrary tasks through the [`ParallelProcessor`] trait. -//! - Configurable [`Policy`] for handling invalid nucleotides //! - Support for both single and paired-end sequences -//! - Optional sequence headers/identifiers (VBQ format) -//! 
- Abstract [`BinseqRecord`] trait for representing records from both `.bq` and `.vbq` files. -//! - Abstract [`BinseqReader`] enum for processing records from both `.bq` and `.vbq` files. +//! - Abstract [`BinseqRecord`] trait for representing records from all variants +//! - Abstract [`BinseqReader`] enum for processing records from all variants +//! - Abstract [`BinseqWriter`] enum for writing records to all variants +//! - Parallel processing capabilities for arbitrary tasks through the [`ParallelProcessor`] trait. +//! - Configurable [`Policy`] for handling invalid nucleotides (BQ/VBQ, CBQ natively supports `N` nucleotides) +//! +//! ## Recent additions (v0.9.0): +//! +//! ### New variant: CBQ +//! **[`cbq`]** is a new variant of BINSEQ that solves many of the pain points around VBQ. +//! The CBQ format is a columnar-block-based format that offers improved compression and faster processing speeds compared to VBQ. +//! It natively supports `N` nucleotides and avoids the need for additional 4-bit encoding. +//! +//! ### Improved interface for writing records +//! **[`BinseqWriter`]** provides a unified interface for writing records generically to BINSEQ files. +//! This makes use of the new [`SequencingRecord`] which provides a cleaner builder API for writing records to BINSEQ files. //! //! ## Recent VBQ Format Changes (v0.7.0+) //! //! The VBQ format has undergone significant improvements: //! //! - **Embedded Index**: VBQ files now contain their index data embedded at the end of the file, -//! eliminating separate `.vqi` index files and improving portability. +//! improving portability. //! - **Headers Support**: Optional sequence identifiers/headers can be stored with each record. //! - **Extended Capacity**: u64 indexing supports files with more than 4 billion records. //! - **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings. //! //! Legacy VBQ files are automatically migrated to the new format when accessed. //! -//! 
## Crate Organization -//! -//! This library is split into 3 major parts. -//! -//! There are the [`bq`] and [`vbq`] modules, which provide tools for reading and writing `BQ` and `VBQ` files respectively. -//! Then there are traits and utilities that are ubiquitous across the library which are available at the top-level of the crate. -//! //! # Example: Memory-mapped Access //! //! ``` @@ -87,22 +90,32 @@ mod parallel; /// Invalid nucleotide policy mod policy; -/// Record trait shared between BINSEQ variants +/// Record types and traits shared between BINSEQ variants mod record; /// VBQ - Variable length records, optional quality scores, compressed blocks pub mod vbq; +/// CBQ - Columnar variable length records, optional quality scores and headers +pub mod cbq; + /// Prelude - Commonly used types and traits pub mod prelude; -/// Context - Reusable state for parallel processing -pub mod context; +/// Write operations generic over the BINSEQ variant +pub mod write; + +/// Utilities for working with BINSEQ files +pub mod utils; pub use error::{Error, IntoBinseqError, Result}; pub use parallel::{BinseqReader, ParallelProcessor, ParallelReader}; pub use policy::{Policy, RNG_SEED}; -pub use record::BinseqRecord; +pub use record::{BinseqRecord, SequencingRecord, SequencingRecordBuilder}; +pub use write::{BinseqWriter, BinseqWriterBuilder}; /// Re-export `bitnuc::BitSize` pub use bitnuc::BitSize; + +/// Default quality score for BINSEQ readers without quality scores +pub(crate) const DEFAULT_QUALITY_SCORE: u8 = b'?'; diff --git a/src/parallel.rs b/src/parallel.rs index b46e49f..3414f31 100644 --- a/src/parallel.rs +++ b/src/parallel.rs @@ -1,7 +1,11 @@ use std::ops::Range; use std::path::Path; -use crate::{bq, error::ExtensionError, vbq, BinseqRecord, Result}; +use crate::{ + BinseqRecord, Result, bq, cbq, + error::{ExtensionError, ReadError}, + vbq, +}; /// An enum abstraction for BINSEQ readers that can process records in parallel /// @@ -12,6 +16,7 @@ use crate::{bq, 
error::ExtensionError, vbq, BinseqRecord, Result}; pub enum BinseqReader { Bq(bq::MmapReader), Vbq(vbq::MmapReader), + Cbq(cbq::MmapReader), } impl BinseqReader { pub fn new(path: &str) -> Result { @@ -20,6 +25,7 @@ impl BinseqReader { Some(ext) => match ext.to_str() { Some("bq") => Ok(Self::Bq(bq::MmapReader::new(path)?)), Some("vbq") => Ok(Self::Vbq(vbq::MmapReader::new(path)?)), + Some("cbq") => Ok(Self::Cbq(cbq::MmapReader::new(path)?)), _ => Err(ExtensionError::UnsupportedExtension(path.to_string()).into()), }, None => Err(ExtensionError::UnsupportedExtension(path.to_string()).into()), @@ -31,18 +37,27 @@ impl BinseqReader { /// Note: This setting applies to VBQ readers only. pub fn set_decode_block(&mut self, decode_block: bool) { match self { - Self::Bq(_) => { + Self::Bq(_) | Self::Cbq(_) => { // no-op } Self::Vbq(reader) => reader.set_decode_block(decode_block), } } + pub fn set_default_quality_score(&mut self, score: u8) { + match self { + Self::Bq(reader) => reader.set_default_quality_score(score), + Self::Vbq(reader) => reader.set_default_quality_score(score), + Self::Cbq(reader) => reader.set_default_quality_score(score), + } + } + #[must_use] pub fn is_paired(&self) -> bool { match self { Self::Bq(reader) => reader.is_paired(), Self::Vbq(reader) => reader.is_paired(), + Self::Cbq(reader) => reader.is_paired(), } } @@ -50,6 +65,7 @@ impl BinseqReader { match self { Self::Bq(reader) => Ok(reader.num_records()), Self::Vbq(reader) => reader.num_records(), + Self::Cbq(reader) => Ok(reader.num_records()), } } @@ -79,6 +95,7 @@ impl BinseqReader { match self { Self::Bq(reader) => reader.process_parallel_range(processor, num_threads, range), Self::Vbq(reader) => reader.process_parallel_range(processor, num_threads, range), + Self::Cbq(reader) => reader.process_parallel_range(processor, num_threads, range), } } } @@ -101,6 +118,7 @@ impl ParallelReader for BinseqReader { match self { Self::Bq(reader) => reader.process_parallel_range(processor, num_threads, 
range), Self::Vbq(reader) => reader.process_parallel_range(processor, num_threads, range), + Self::Cbq(reader) => reader.process_parallel_range(processor, num_threads, range), } } } @@ -138,6 +156,45 @@ pub trait ParallelReader { num_threads: usize, range: Range, ) -> Result<()>; + + /// Validate the specified range for the file. + /// + /// This method checks if the provided range is valid for the file, ensuring that + /// the start index is less than the end index and both indices are within the + /// bounds of the file. + /// + /// # Arguments + /// + /// * `total_records` - The total number of records in the file + /// * `range` - The range of record indices to validate + /// + /// # Returns + /// + /// * `Ok(())` - If the range is valid + /// * `Err(Error)` - If the range is invalid + fn validate_range(&self, total_records: usize, range: &Range) -> Result<()> { + if range.start >= total_records { + Err(ReadError::OutOfRange { + requested_index: range.start, + max_index: total_records, + } + .into()) + } else if range.end > total_records { + Err(ReadError::OutOfRange { + requested_index: range.end, + max_index: total_records, + } + .into()) + } else if range.start > range.end { + Err(ReadError::InvalidRange { + start: range.start, + end: range.end, + } + .into()) + } else { + Ok(()) + } + } } /// Trait for types that can process records in parallel. 
@@ -168,3 +225,117 @@ pub trait ParallelProcessor: Send + Clone { None } } + +#[cfg(test)] +mod testing { + use std::sync::Arc; + + use parking_lot::Mutex; + + use super::*; + + #[derive(Clone, Default)] + struct TestProcessor { + pub n_records: Arc>, + } + impl ParallelProcessor for TestProcessor { + fn process_record(&mut self, _record: R) -> Result<()> { + *self.n_records.lock() += 1; + Ok(()) + } + } + + #[test] + fn test_parallel_processor() { + for ext in ["bq", "vbq", "cbq"] { + eprintln!("Testing {}", ext); + let reader = BinseqReader::new(&format!("./data/subset.{}", ext)).unwrap(); + let num_records = reader.num_records().unwrap(); + let processor = TestProcessor::default(); + assert!(reader.process_parallel(processor.clone(), 0).is_ok()); + assert_eq!(*processor.n_records.lock(), num_records); + } + } + + #[test] + fn test_parallel_processor_range() { + for ext in ["bq", "vbq", "cbq"] { + eprintln!("Testing {}", ext); + let reader = BinseqReader::new(&format!("./data/subset.{}", ext)).unwrap(); + let processor = TestProcessor::default(); + assert!( + reader + .process_parallel_range(processor.clone(), 0, 0..10) + .is_ok() + ); + assert_eq!(*processor.n_records.lock(), 10); + } + } + + #[test] + fn test_parallel_processor_out_of_range_start() { + for ext in ["bq", "vbq", "cbq"] { + eprintln!("Testing {}", ext); + let reader = BinseqReader::new(&format!("./data/subset.{}", ext)).unwrap(); + let processor = TestProcessor::default(); + assert!( + reader + .process_parallel_range(processor, 0, 1_000_000..1_000_001) + .is_err() + ); + } + } + + #[test] + fn test_parallel_processor_out_of_range_end() { + for ext in ["bq", "vbq", "cbq"] { + eprintln!("Testing {}", ext); + let reader = BinseqReader::new(&format!("./data/subset.{}", ext)).unwrap(); + let processor = TestProcessor::default(); + assert!( + reader + .process_parallel_range(processor, 0, 0..1_000_000) + .is_err() + ); + } + } + + #[test] + fn test_parallel_processor_backwards_range() { + for ext in 
["bq", "vbq", "cbq"] { + eprintln!("Testing {}", ext); + let reader = BinseqReader::new(&format!("./data/subset.{}", ext)).unwrap(); + let processor = TestProcessor::default(); + assert!(reader.process_parallel_range(processor, 0, 100..0).is_err()); + } + } + + #[test] + fn test_set_decode_block() { + for ext in ["bq", "vbq", "cbq"] { + for opt in [true, false] { + eprintln!("Testing {} - decode {}", ext, opt); + let mut reader = BinseqReader::new(&format!("./data/subset.{}", ext)).unwrap(); + reader.set_decode_block(opt); + let num_records = reader.num_records().unwrap(); + let processor = TestProcessor::default(); + assert!(reader.process_parallel(processor.clone(), 0).is_ok()); + assert_eq!(*processor.n_records.lock(), num_records); + } + } + } + + #[test] + fn test_set_default_quality_score() { + for ext in ["bq", "vbq", "cbq"] { + let default_score = b'#'; + eprintln!("Testing {} - default score: {}", ext, default_score); + let mut reader = BinseqReader::new(&format!("./data/subset.{}", ext)).unwrap(); + reader.set_default_quality_score(default_score); + let num_records = reader.num_records().unwrap(); + let processor = TestProcessor::default(); + assert!(reader.process_parallel(processor.clone(), 0).is_ok()); + assert_eq!(*processor.n_records.lock(), num_records); + } + } +} diff --git a/src/policy.rs b/src/policy.rs index ac172bc..6f8f39e 100644 --- a/src/policy.rs +++ b/src/policy.rs @@ -169,3 +169,282 @@ impl Policy { } } } + +#[cfg(test)] +mod tests { + use super::*; + use rand::SeedableRng; + use rand::rngs::StdRng; + + // ==================== Basic Policy Tests ==================== + + #[test] + fn test_default_policy() { + let policy = Policy::default(); + assert!(matches!(policy, Policy::IgnoreSequence)); + } + + #[test] + fn test_ignore_sequence_policy() { + let policy = Policy::IgnoreSequence; + let sequence = b"ACGTNX"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, 
&mut output, &mut rng).unwrap(); + + assert!(!should_process); // Should return false to skip this sequence + assert!(output.is_empty()); // Output buffer should be empty + } + + #[test] + fn test_break_on_invalid_policy() { + let policy = Policy::BreakOnInvalid; + let sequence = b"ACGTNX"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let result = policy.handle(sequence, &mut output, &mut rng); + + assert!(result.is_err()); + assert!(matches!( + result.unwrap_err(), + crate::error::Error::WriteError(WriteError::InvalidNucleotideSequence(_)) + )); + } + + #[test] + fn test_break_on_invalid_with_valid_sequence() { + let policy = Policy::BreakOnInvalid; + let sequence = b"ACGT"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let result = policy.handle(sequence, &mut output, &mut rng); + + // Valid sequences should error because handle() doesn't validate for BreakOnInvalid + // It only returns an error immediately + assert!(result.is_err()); + } + + // ==================== Set-to-Specific-Nucleotide Tests ==================== + + #[test] + fn test_set_to_a_policy() { + let policy = Policy::SetToA; + let sequence = b"ACGTNX"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert!(should_process); // Should return true to process this sequence + assert_eq!(output, b"ACGTAA"); // N and X should be replaced with A + } + + #[test] + fn test_set_to_c_policy() { + let policy = Policy::SetToC; + let sequence = b"ACGTNX"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert!(should_process); + assert_eq!(output, b"ACGTCC"); // N and X should be replaced with C + } + + #[test] + fn test_set_to_g_policy() { + let policy = Policy::SetToG; + let sequence = b"ACGTNX"; + let 
mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert!(should_process); + assert_eq!(output, b"ACGTGG"); // N and X should be replaced with G + } + + #[test] + fn test_set_to_t_policy() { + let policy = Policy::SetToT; + let sequence = b"ACGTNX"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert!(should_process); + assert_eq!(output, b"ACGTTT"); // N and X should be replaced with T + } + + #[test] + fn test_all_valid_nucleotides_unchanged() { + let policy = Policy::SetToA; + let sequence = b"ACGTACGT"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert!(should_process); + assert_eq!(output, b"ACGTACGT"); // All valid, should remain unchanged + } + + // ==================== Random Draw Tests ==================== + + #[test] + fn test_random_draw_policy() { + let policy = Policy::RandomDraw; + let sequence = b"ACGTNX"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert!(should_process); + assert_eq!(output.len(), 6); // Same length as input + // First 4 nucleotides should be unchanged + assert_eq!(&output[0..4], b"ACGT"); + // Last 2 should be valid nucleotides (A, C, G, or T) + assert!(matches!(output[4], b'A' | b'C' | b'G' | b'T')); + assert!(matches!(output[5], b'A' | b'C' | b'G' | b'T')); + } + + #[test] + fn test_random_draw_deterministic_with_seed() { + let policy = Policy::RandomDraw; + let sequence = b"NNNN"; + let mut output1 = Vec::new(); + let mut output2 = Vec::new(); + let mut rng1 = StdRng::seed_from_u64(RNG_SEED); + let mut rng2 = StdRng::seed_from_u64(RNG_SEED); + + 
policy.handle(sequence, &mut output1, &mut rng1).unwrap(); + policy.handle(sequence, &mut output2, &mut rng2).unwrap(); + + // Same seed should produce same output + assert_eq!(output1, output2); + } + + // ==================== Buffer Clearing Tests ==================== + + #[test] + fn test_buffer_cleared_before_processing() { + let policy = Policy::SetToA; + let sequence = b"ACGT"; + let mut output = vec![b'X', b'Y', b'Z']; // Pre-fill buffer + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + policy.handle(sequence, &mut output, &mut rng).unwrap(); + + // Buffer should be cleared and only contain new data + assert_eq!(output, b"ACGT"); + } + + #[test] + fn test_multiple_calls_clear_buffer() { + let policy = Policy::SetToA; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + policy.handle(b"ACGT", &mut output, &mut rng).unwrap(); + assert_eq!(output, b"ACGT"); + + policy.handle(b"TT", &mut output, &mut rng).unwrap(); + assert_eq!(output, b"TT"); // Should only contain second sequence + } + + // ==================== Edge Case Tests ==================== + + #[test] + fn test_empty_sequence() { + let policy = Policy::SetToA; + let sequence = b""; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert!(should_process); + assert!(output.is_empty()); + } + + #[test] + fn test_all_invalid_nucleotides() { + let policy = Policy::SetToG; + let sequence = b"NNNXXX"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + let should_process = policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert!(should_process); + assert_eq!(output, b"GGGGGG"); // All should be replaced with G + } + + #[test] + fn test_policy_clone() { + let policy1 = Policy::SetToA; + let policy2 = policy1; + + // Should be able to use both (tests Copy trait) + let mut output = Vec::new(); + let mut rng = 
StdRng::seed_from_u64(RNG_SEED); + + policy1.handle(b"NT", &mut output, &mut rng).unwrap(); + assert_eq!(output, b"AT"); + + policy2.handle(b"NT", &mut output, &mut rng).unwrap(); + assert_eq!(output, b"AT"); + } + + #[test] + fn test_policy_debug() { + let policy = Policy::SetToA; + let debug_str = format!("{:?}", policy); + assert!(debug_str.contains("SetToA")); + } + + // ==================== Various Invalid Character Tests ==================== + + #[test] + fn test_lowercase_nucleotides_treated_as_invalid() { + let policy = Policy::SetToA; + let sequence = b"acgt"; // lowercase + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + policy.handle(sequence, &mut output, &mut rng).unwrap(); + + // Lowercase nucleotides should be treated as invalid + assert_eq!(output, b"AAAA"); + } + + #[test] + fn test_mixed_case_nucleotides() { + let policy = Policy::SetToC; + let sequence = b"AcGt"; + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert_eq!(output, b"ACGC"); // Only uppercase are valid + } + + #[test] + fn test_ambiguous_nucleotide_codes() { + let policy = Policy::SetToT; + let sequence = b"RYWSMK"; // R, Y, W, S, M, K are ambiguous codes + let mut output = Vec::new(); + let mut rng = StdRng::seed_from_u64(RNG_SEED); + + policy.handle(sequence, &mut output, &mut rng).unwrap(); + + assert_eq!(output, b"TTTTTT"); // All ambiguous codes replaced with T + } +} diff --git a/src/prelude.rs b/src/prelude.rs index 01d30b9..5693f9f 100644 --- a/src/prelude.rs +++ b/src/prelude.rs @@ -1,5 +1,4 @@ -pub use super::{BinseqReader, BinseqRecord, ParallelProcessor, ParallelReader}; - -pub use crate::context::{ - Context, Ctx, HeaderContext, QualityContext, SeqCtx, SeqHeaderCtx, SeqQualCtx, SequenceContext, +pub use super::{ + BinseqReader, BinseqRecord, ParallelProcessor, ParallelReader, SequencingRecord, + SequencingRecordBuilder, }; diff --git 
a/src/record.rs b/src/record/binseq_record.rs similarity index 99% rename from src/record.rs rename to src/record/binseq_record.rs index 93c1570..1fb0370 100644 --- a/src/record.rs +++ b/src/record/binseq_record.rs @@ -1,7 +1,7 @@ use auto_impl::auto_impl; use bitnuc::BitSize; -use super::Result; +use crate::Result; /// Record trait shared between BINSEQ variants. /// diff --git a/src/record/mod.rs b/src/record/mod.rs new file mode 100644 index 0000000..dad17f9 --- /dev/null +++ b/src/record/mod.rs @@ -0,0 +1,5 @@ +mod binseq_record; +mod sequencing_record; + +pub use binseq_record::BinseqRecord; +pub use sequencing_record::{SequencingRecord, SequencingRecordBuilder}; diff --git a/src/record/sequencing_record.rs b/src/record/sequencing_record.rs new file mode 100644 index 0000000..95b6fd0 --- /dev/null +++ b/src/record/sequencing_record.rs @@ -0,0 +1,403 @@ +use crate::{BitSize, Result, error::WriteError}; + +/// A zero-copy record used to write sequences to binary sequence files. +/// +/// This struct provides a unified API for writing records to all binseq formats +/// (BQ, VBQ, and CBQ). It uses borrowed references for zero-copy efficiency. 
+/// +/// # Example +/// +/// ``` +/// use binseq::SequencingRecordBuilder; +/// +/// let record = SequencingRecordBuilder::default() +/// .s_seq(b"ACGTACGT") +/// .s_qual(b"IIIIFFFF") +/// .s_header(b"seq_001") +/// .flag(42) +/// .build() +/// .unwrap(); +/// ``` +#[derive(Clone, Copy, Default)] +pub struct SequencingRecord<'a> { + pub(crate) s_seq: &'a [u8], + pub(crate) s_qual: Option<&'a [u8]>, + pub(crate) s_header: Option<&'a [u8]>, + pub(crate) x_seq: Option<&'a [u8]>, + pub(crate) x_qual: Option<&'a [u8]>, + pub(crate) x_header: Option<&'a [u8]>, + pub(crate) flag: Option, +} + +impl<'a> SequencingRecord<'a> { + #[inline] + #[must_use] + pub fn new( + s_seq: &'a [u8], + s_qual: Option<&'a [u8]>, + s_header: Option<&'a [u8]>, + x_seq: Option<&'a [u8]>, + x_qual: Option<&'a [u8]>, + x_header: Option<&'a [u8]>, + flag: Option, + ) -> Self { + Self { + s_seq, + s_qual, + s_header, + x_seq, + x_qual, + x_header, + flag, + } + } + + /// Returns the primary sequence + #[inline] + #[must_use] + pub fn s_seq(&self) -> &'a [u8] { + self.s_seq + } + + /// Returns the primary quality scores if present + #[inline] + #[must_use] + pub fn s_qual(&self) -> Option<&'a [u8]> { + self.s_qual + } + + /// Returns the primary header if present + #[inline] + #[must_use] + pub fn s_header(&self) -> Option<&'a [u8]> { + self.s_header + } + + /// Returns the extended/paired sequence if present + #[inline] + #[must_use] + pub fn x_seq(&self) -> Option<&'a [u8]> { + self.x_seq + } + + /// Returns the extended quality scores if present + #[inline] + #[must_use] + pub fn x_qual(&self) -> Option<&'a [u8]> { + self.x_qual + } + + /// Returns the extended header if present + #[inline] + #[must_use] + pub fn x_header(&self) -> Option<&'a [u8]> { + self.x_header + } + + /// Returns the flag if present + #[inline] + #[must_use] + pub fn flag(&self) -> Option { + self.flag + } + + /// Returns the configured size of this record for CBQ format. 
+ /// + /// CBQ uses columnar storage so there are no per-record length prefixes. + /// This calculates the size based on writer configuration, ignoring any + /// extra data in the record that the writer won't use. + #[inline] + #[must_use] + pub fn configured_size_cbq( + &self, + is_paired: bool, + has_flags: bool, + has_headers: bool, + has_qualities: bool, + ) -> usize { + // CBQ uses 2-bit encoding: 4 nucleotides per byte, 32 per u64 word + const NUCS_PER_WORD: usize = 32; + + let mut size = 0; + + // Sequence size (encoded into u64 words) + let s_chunks = self.s_seq.len().div_ceil(NUCS_PER_WORD); + size += s_chunks * 8; + + // Extended sequence (only if writer is configured for paired) + if is_paired { + let x_chunks = self.x_seq.map_or(0, |x| x.len().div_ceil(NUCS_PER_WORD)); + size += x_chunks * 8; + } + + // Flag size (only if writer is configured for flags) + if has_flags { + size += 8; // u64 + } + + // Header size (only if writer is configured for headers) + if has_headers { + size += self.s_header.map_or(0, <[u8]>::len); + if is_paired { + size += self.x_header.map_or(0, <[u8]>::len); + } + } + + // Quality size (only if writer is configured for qualities) + if has_qualities { + size += self.s_qual.map_or(0, <[u8]>::len); + if is_paired { + size += self.x_qual.map_or(0, <[u8]>::len); + } + } + + size + } + + /// Returns the configured size of this record for VBQ format. + /// + /// VBQ uses a row-based format with length prefixes for each field. + /// This calculates the size based on writer configuration, ignoring any + /// extra data in the record that the writer won't use. 
+ /// + /// The VBQ record layout is: + /// - Flag (8 bytes, if `has_flags`) + /// - `s_len` (8 bytes) + /// - `x_len` (8 bytes) + /// - `s_seq` (encoded, rounded up to 8-byte words) + /// - `s_qual` (raw bytes, if `has_qualities`) + /// - `s_header_len` + `s_header` (8 + len bytes, if `has_headers` and `s_header` present) + /// - `x_seq` (encoded, rounded up to 8-byte words, if paired) + /// - `x_qual` (raw bytes, if `has_qualities` and paired) + /// - `x_header_len` + `x_header` (8 + len bytes, if `has_headers` and `x_header` present) + #[inline] + #[must_use] + pub fn configured_size_vbq( + &self, + is_paired: bool, + has_flags: bool, + has_headers: bool, + has_qualities: bool, + bitsize: BitSize, + ) -> usize { + // Calculate how many nucleotides fit per byte for the given bitsize + let nucs_per_byte = if matches!(bitsize, BitSize::Two) { + 4 + } else { + 2 + }; + // VBQ packs sequences into u64 words + let nucs_per_word = nucs_per_byte * 8; + + let mut size = 0; + + // Length prefixes: s_len and x_len (always present) + size += 16; // 2 * u64 + + // Flag (8 bytes, if has_flags) + if has_flags { + size += 8; + } + + // Primary sequence (encoded into u64 words) + let s_chunks = self.s_seq.len().div_ceil(nucs_per_word); + size += s_chunks * 8; + + // Extended sequence (only if writer is configured for paired) + if is_paired { + let x_chunks = self.x_seq.map_or(0, |x| x.len().div_ceil(nucs_per_word)); + size += x_chunks * 8; + } + + // Quality scores (raw bytes, only if writer configured for qualities) + if has_qualities { + size += self.s_qual.map_or(0, <[u8]>::len); + if is_paired { + size += self.x_qual.map_or(0, <[u8]>::len); + } + } + + // Headers (length prefix + raw bytes, only if writer configured for headers) + if has_headers { + if let Some(h) = self.s_header { + size += 8 + h.len(); // length prefix + header bytes + } + if is_paired && let Some(h) = self.x_header { + size += 8 + h.len(); // length prefix + header bytes + } + } + + size + } + + #[inline] 
+ #[must_use] + pub fn is_paired(&self) -> bool { + self.x_seq.is_some() + } + + #[inline] + #[must_use] + pub fn has_flags(&self) -> bool { + self.flag.is_some() + } + + #[inline] + #[must_use] + pub fn has_headers(&self) -> bool { + self.s_header.is_some() || self.x_header.is_some() + } + + #[inline] + #[must_use] + pub fn has_qualities(&self) -> bool { + self.s_qual.is_some() || self.x_qual.is_some() + } +} + +/// A convenience builder struct for creating a [`SequencingRecord`] +/// +/// # Example +/// +/// ``` +/// use binseq::SequencingRecordBuilder; +/// +/// // Build a simple unpaired record +/// let record = SequencingRecordBuilder::default() +/// .s_seq(b"ACGTACGT") +/// .build() +/// .unwrap(); +/// +/// // Build a paired record with quality scores +/// let paired = SequencingRecordBuilder::default() +/// .s_seq(b"ACGTACGT") +/// .s_qual(b"IIIIFFFF") +/// .x_seq(b"TGCATGCA") +/// .x_qual(b"FFFFHHHH") +/// .flag(1) +/// .build() +/// .unwrap(); +/// ``` +#[derive(Default)] +pub struct SequencingRecordBuilder<'a> { + s_seq: Option<&'a [u8]>, + s_qual: Option<&'a [u8]>, + s_header: Option<&'a [u8]>, + x_seq: Option<&'a [u8]>, + x_qual: Option<&'a [u8]>, + x_header: Option<&'a [u8]>, + flag: Option, +} + +impl<'a> SequencingRecordBuilder<'a> { + /// Sets the primary sequence (required) + #[must_use] + pub fn s_seq(mut self, s_seq: &'a [u8]) -> Self { + self.s_seq = Some(s_seq); + self + } + + /// Sets the primary quality scores + #[must_use] + pub fn s_qual(mut self, s_qual: &'a [u8]) -> Self { + self.s_qual = Some(s_qual); + self + } + + /// Sets the primary quality scores from an Option + #[must_use] + pub fn opt_s_qual(mut self, s_qual: Option<&'a [u8]>) -> Self { + self.s_qual = s_qual; + self + } + + /// Sets the primary header + #[must_use] + pub fn s_header(mut self, s_header: &'a [u8]) -> Self { + self.s_header = Some(s_header); + self + } + + /// Sets the primary header from an Option + #[must_use] + pub fn opt_s_header(mut self, s_header: Option<&'a 
[u8]>) -> Self { + self.s_header = s_header; + self + } + + /// Sets the extended/paired sequence + #[must_use] + pub fn x_seq(mut self, x_seq: &'a [u8]) -> Self { + self.x_seq = Some(x_seq); + self + } + + /// Sets the extended/paired sequence from an Option + #[must_use] + pub fn opt_x_seq(mut self, x_seq: Option<&'a [u8]>) -> Self { + self.x_seq = x_seq; + self + } + + /// Sets the extended quality scores + #[must_use] + pub fn x_qual(mut self, x_qual: &'a [u8]) -> Self { + self.x_qual = Some(x_qual); + self + } + + /// Sets the extended quality scores from an Option + #[must_use] + pub fn opt_x_qual(mut self, x_qual: Option<&'a [u8]>) -> Self { + self.x_qual = x_qual; + self + } + + /// Sets the extended header + #[must_use] + pub fn x_header(mut self, x_header: &'a [u8]) -> Self { + self.x_header = Some(x_header); + self + } + + /// Sets the extended header from an Option + #[must_use] + pub fn opt_x_header(mut self, x_header: Option<&'a [u8]>) -> Self { + self.x_header = x_header; + self + } + + /// Sets the flag value + #[must_use] + pub fn flag(mut self, flag: u64) -> Self { + self.flag = Some(flag); + self + } + + /// Sets the flag value from an Option + #[must_use] + pub fn opt_flag(mut self, flag: Option) -> Self { + self.flag = flag; + self + } + + /// Builds the `SequencingRecord` + /// + /// # Errors + /// + /// Returns an error if the primary sequence (`s_seq`) is not set. + pub fn build(self) -> Result> { + let Some(s_seq) = self.s_seq else { + return Err(WriteError::MissingSequence.into()); + }; + Ok(SequencingRecord { + s_seq, + s_qual: self.s_qual, + s_header: self.s_header, + x_seq: self.x_seq, + x_qual: self.x_qual, + x_header: self.x_header, + flag: self.flag, + }) + } +} diff --git a/src/utils/fastx.rs b/src/utils/fastx.rs new file mode 100644 index 0000000..46ab7e4 --- /dev/null +++ b/src/utils/fastx.rs @@ -0,0 +1,462 @@ +//! FASTX encoding utilities for converting FASTX files to BINSEQ formats +//! +//! 
This module provides utilities for encoding FASTX (FASTA/FASTQ) files into +//! BINSEQ formats using parallel processing via the `paraseq` crate. + +use std::{ + io::{Read, Write}, + path::{Path, PathBuf}, + sync::Arc, +}; + +use paraseq::{ + Record, fastx, + prelude::{IntoProcessError, PairedParallelProcessor, ParallelProcessor, ParallelReader}, +}; +use parking_lot::Mutex; + +use crate::{ + BinseqWriter, BinseqWriterBuilder, IntoBinseqError, Result, SequencingRecordBuilder, + error::FastxEncodingError, +}; + +type BoxedRead = Box; +type BoxedWrite = Box; + +/// Input source for FASTX encoding +#[derive(Debug, Clone)] +enum FastxInput { + /// Read from stdin + Stdin, + /// Read from a single file + Single(PathBuf), + /// Read from paired files (R1, R2) + Paired(PathBuf, PathBuf), +} + +/// Builder for encoding FASTX files to BINSEQ format +/// +/// This builder is created by calling [`BinseqWriterBuilder::encode_fastx`] and +/// provides a fluent interface for configuring the input source and threading options. 
+/// +/// # Example +/// +/// ```rust,no_run +/// use binseq::write::{BinseqWriterBuilder, Format}; +/// use std::fs::File; +/// +/// // Encode from stdin to VBQ +/// let writer = BinseqWriterBuilder::new(Format::Vbq) +/// .quality(true) +/// .headers(true) +/// .encode_fastx(Box::new(File::create("output.vbq")?)) +/// .input_stdin() +/// .threads(8) +/// .run()?; +/// # Ok::<(), binseq::Error>(()) +/// ``` +pub struct FastxEncoderBuilder { + builder: BinseqWriterBuilder, + output: BoxedWrite, + input: Option, + threads: usize, +} + +impl FastxEncoderBuilder { + /// Create a new encoder builder + pub(crate) fn new(builder: BinseqWriterBuilder, output: BoxedWrite) -> Self { + Self { + builder, + output, + input: None, + threads: 0, // 0 means use all available cores + } + } + + /// Read from a single FASTX file + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input("input.fastq") + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + pub fn input>(mut self, path: P) -> Self { + self.input = Some(FastxInput::Single(path.as_ref().to_path_buf())); + self + } + + /// Read from stdin + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input_stdin() + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + #[must_use] + pub fn input_stdin(mut self) -> Self { + self.input = Some(FastxInput::Stdin); + self + } + + /// Read from paired FASTX files (R1, R2) + /// + /// This automatically sets the writer to paired mode. 
+ /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input_paired("R1.fastq", "R2.fastq") + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + pub fn input_paired>(mut self, r1: P, r2: P) -> Self { + self.input = Some(FastxInput::Paired( + r1.as_ref().to_path_buf(), + r2.as_ref().to_path_buf(), + )); + // Automatically set paired mode + self.builder = self.builder.paired(true); + self + } + + /// Set the number of threads for parallel processing + /// + /// If not set or set to 0, uses all available CPU cores. + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input("input.fastq") + /// .threads(8) + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + #[must_use] + pub fn threads(mut self, n: usize) -> Self { + self.threads = n; + self + } + + /// Execute the FASTX encoding + /// + /// This consumes the builder and returns a `BinseqWriter` that has been + /// populated with all records from the input FASTX file(s). 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The input files cannot be read + /// - The FASTX format is invalid + /// - The writer configuration is incompatible with the input + /// - For BQ format with stdin input (cannot detect sequence length) + /// + /// # Example + /// + /// ```rust,no_run + /// # use binseq::write::{BinseqWriterBuilder, Format}; + /// # use std::fs::File; + /// let writer = BinseqWriterBuilder::new(Format::Vbq) + /// .encode_fastx(Box::new(File::create("output.vbq")?)) + /// .input("input.fastq") + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + pub fn run(mut self) -> Result<()> { + let (r1, r2) = match self.input { + Some(FastxInput::Single(path)) => { + // build interleaved reader + let mut reader = + fastx::Reader::from_path(path).map_err(IntoBinseqError::into_binseq_error)?; + let (slen, xlen) = detect_seq_len(&mut reader, true)?; + self.builder = self.builder.slen(slen as u32).xlen(xlen as u32); + (reader, None) + } + Some(FastxInput::Stdin) => { + let mut reader = + fastx::Reader::from_stdin().map_err(IntoBinseqError::into_binseq_error)?; + let (slen, xlen) = detect_seq_len(&mut reader, true)?; + self.builder = self.builder.slen(slen as u32).xlen(xlen as u32); + (reader, None) + } + Some(FastxInput::Paired(path1, path2)) => { + // build interleaved reader + let mut reader1 = + fastx::Reader::from_path(path1).map_err(IntoBinseqError::into_binseq_error)?; + let mut reader2 = + fastx::Reader::from_path(path2).map_err(IntoBinseqError::into_binseq_error)?; + let (slen, _) = detect_seq_len(&mut reader1, false)?; + let (xlen, _) = detect_seq_len(&mut reader2, false)?; + self.builder = self.builder.slen(slen as u32).xlen(xlen as u32); + (reader1, Some(reader2)) + } + None => return Err(FastxEncodingError::MissingInput.into()), + }; + + let writer = self.builder.build(self.output)?; + if writer.is_paired() { + if let Some(r2) = r2 { + encode_paired(writer, r1, r2, self.threads)?; + } else { + 
encode_interleaved(writer, r1, self.threads)?; + } + } else { + encode_single_file(writer, r1, self.threads)?; + } + + Ok(()) + } +} + +/// Encode single-end reads from a file +fn encode_single_file( + writer: BinseqWriter, + reader: fastx::Reader, + threads: usize, +) -> Result<()> { + let mut encoder = Encoder::new(writer)?; + reader + .process_parallel(&mut encoder, threads) + .map_err(IntoBinseqError::into_binseq_error)?; + encoder.finish()?; + Ok(()) +} + +/// Encode paired-end reads from interleaved file +fn encode_interleaved( + writer: BinseqWriter, + reader: fastx::Reader, + threads: usize, +) -> Result<()> { + let mut encoder = Encoder::new(writer)?; + reader + .process_parallel_interleaved(&mut encoder, threads) + .map_err(IntoBinseqError::into_binseq_error)?; + encoder.finish()?; + Ok(()) +} + +/// Encode paired-end reads from files +fn encode_paired( + writer: BinseqWriter, + r1: fastx::Reader, + r2: fastx::Reader, + threads: usize, +) -> Result<()> { + let mut encoder = Encoder::new(writer)?; + r1.process_parallel_paired(r2, &mut encoder, threads) + .map_err(IntoBinseqError::into_binseq_error)?; + encoder.finish()?; + Ok(()) +} + +fn detect_seq_len( + reader: &mut fastx::Reader, + interleaved: bool, +) -> Result<(usize, usize)> { + // Initialize the record set + let mut rset = reader.new_record_set(); + rset.fill(reader) + .map_err(IntoBinseqError::into_binseq_error)?; + + let (slen, xlen) = if interleaved { + let mut rset_iter = rset.iter(); + let Some(Ok(slen)) = rset_iter.next().map(|r| -> Result { + let rec = r.map_err(IntoBinseqError::into_binseq_error)?; + Ok(rec.seq().len()) + }) else { + return Err(FastxEncodingError::EmptyFastxFile.into()); + }; + let Some(Ok(xlen)) = rset_iter.next().map(|r| -> Result { + let rec = r.map_err(IntoBinseqError::into_binseq_error)?; + Ok(rec.seq().len()) + }) else { + return Err(FastxEncodingError::EmptyFastxFile.into()); + }; + (slen, xlen) + } else { + let mut rset_iter = rset.iter(); + let Some(Ok(slen)) =
rset_iter.next().map(|r| -> Result { + let rec = r.map_err(IntoBinseqError::into_binseq_error)?; + Ok(rec.seq().len()) + }) else { + return Err(FastxEncodingError::EmptyFastxFile.into()); + }; + (slen, 0) + }; + reader + .reload(&mut rset) + .map_err(IntoBinseqError::into_binseq_error)?; + Ok((slen, xlen)) +} + +/// Parallel encoder for FASTX records to BINSEQ format +/// +/// This struct implements the `ParallelProcessor` and `PairedParallelProcessor` +/// traits from `paraseq` to enable efficient parallel encoding of FASTX files. +#[derive(Clone)] +struct Encoder { + /// Global writer (shared across threads) + writer: Arc>>>, + /// Thread-local writer buffer + thread_writer: BinseqWriter>, +} + +impl Encoder { + /// Create a new encoder with a global writer + pub fn new(writer: BinseqWriter>) -> Result { + let thread_writer = writer.new_headless_buffer()?; + Ok(Self { + writer: Arc::new(Mutex::new(writer)), + thread_writer, + }) + } + /// Finish the stream on the global writer + pub fn finish(&mut self) -> Result<()> { + self.writer.lock().finish()?; + Ok(()) + } +} + +impl ParallelProcessor for Encoder { + fn process_record(&mut self, record: Rf) -> paraseq::Result<()> { + let seq = record.seq(); + let seq_record = SequencingRecordBuilder::default() + .s_header(record.id()) + .s_seq(&seq) + .opt_s_qual(record.qual()) + .build() + .map_err(IntoProcessError::into_process_error)?; + self.thread_writer + .push(seq_record) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } + + fn on_batch_complete(&mut self) -> paraseq::Result<()> { + self.writer + .lock() + .ingest(&mut self.thread_writer) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } +} + +impl PairedParallelProcessor for Encoder { + fn process_record_pair(&mut self, record1: Rf, record2: Rf) -> paraseq::Result<()> { + let sseq = record1.seq(); + let xseq = record2.seq(); + let seq_record = SequencingRecordBuilder::default() + .s_header(record1.id()) + .s_seq(&sseq) + 
.opt_s_qual(record1.qual()) + .x_header(record2.id()) + .x_seq(&xseq) + .opt_x_qual(record2.qual()) + .build() + .map_err(IntoProcessError::into_process_error)?; + + self.thread_writer + .push(seq_record) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } + + fn on_batch_complete(&mut self) -> paraseq::Result<()> { + self.writer + .lock() + .ingest(&mut self.thread_writer) + .map_err(IntoProcessError::into_process_error)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::write::Format; + use std::io::Cursor; + + const FASTQ_R1_PATH: &str = "./data/subset_R1.fastq.gz"; + const FASTQ_R2_PATH: &str = "./data/subset_R2.fastq.gz"; + + #[test] + fn test_encoder_builder_construction() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = FastxEncoderBuilder::new(builder, handle); + + assert!(encoder_builder.input.is_none()); + assert_eq!(encoder_builder.threads, 0); + } + + #[test] + fn test_encoder_builder_input_methods() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = FastxEncoderBuilder::new(builder, handle) + .input("test.fastq") + .threads(4); + + assert!(matches!(encoder_builder.input, Some(FastxInput::Single(_)))); + assert_eq!(encoder_builder.threads, 4); + } + + #[test] + fn test_encoder_builder_stdin() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = FastxEncoderBuilder::new(builder, handle).input_stdin(); + + assert!(matches!(encoder_builder.input, Some(FastxInput::Stdin))); + } + + #[test] + fn test_encoder_builder_single() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = FastxEncoderBuilder::new(builder, handle).input(FASTQ_R1_PATH); + + assert!(matches!(encoder_builder.input, 
Some(FastxInput::Single(_)))); + + // Run the encoder builder and assert that it is successful + assert!(encoder_builder.run().is_ok()); + } + + #[test] + fn test_encoder_builder_paired() { + let builder = BinseqWriterBuilder::new(Format::Vbq); + let handle = Box::new(Cursor::new(Vec::new())); + let encoder_builder = + FastxEncoderBuilder::new(builder, handle).input_paired(FASTQ_R1_PATH, FASTQ_R2_PATH); + + assert!(matches!( + encoder_builder.input, + Some(FastxInput::Paired(_, _)) + )); + // Should automatically set paired mode + assert!(encoder_builder.builder.paired); + + // Run the encoder builder and assert that it is successful + assert!(encoder_builder.run().is_ok()); + } +} diff --git a/src/utils/mod.rs b/src/utils/mod.rs new file mode 100644 index 0000000..031c492 --- /dev/null +++ b/src/utils/mod.rs @@ -0,0 +1,7 @@ +//! Utility modules for working with BINSEQ files + +#[cfg(feature = "paraseq")] +pub mod fastx; + +#[cfg(feature = "paraseq")] +pub use fastx::FastxEncoderBuilder; diff --git a/src/vbq/header.rs b/src/vbq/header.rs index fdca1c1..16476bd 100644 --- a/src/vbq/header.rs +++ b/src/vbq/header.rs @@ -1,10 +1,10 @@ //! # File and Block Header Definitions //! -//! This module defines the header structures used in the VBINSEQ file format. +//! This module defines the header structures used in the VBQ file format. //! -//! The VBINSEQ format consists of two primary header types: +//! The VBQ format consists of two primary header types: //! -//! 1. `VBinseqHeader` - The file header that appears at the beginning of a VBINSEQ file, +//! 1. `FileHeader` - The file header that appears at the beginning of a VBQ file, //! containing information about the overall file format and configuration. //! //! 2. 
`BlockHeader` - Headers that appear before each block of records, containing @@ -21,7 +21,7 @@ use crate::error::{HeaderError, ReadError, Result}; /// Magic number for file identification: "VSEQ" in ASCII (0x51455356) /// -/// This constant is used in the file header to identify VBINSEQ formatted files. +/// This constant is used in the file header to identify VBQ formatted files. #[allow(clippy::unreadable_literal)] const MAGIC: u32 = 0x51455356; @@ -63,7 +63,7 @@ pub const RESERVED_BYTES: [u8; 13] = [42; 13]; pub const RESERVED_BYTES_BLOCK: [u8; 12] = [42; 12]; #[derive(Default, Debug, Clone, Copy)] -pub struct VBinseqHeaderBuilder { +pub struct FileHeaderBuilder { qual: Option, block: Option, compressed: Option, @@ -72,7 +72,7 @@ pub struct VBinseqHeaderBuilder { headers: Option, flags: Option, } -impl VBinseqHeaderBuilder { +impl FileHeaderBuilder { #[must_use] pub fn new() -> Self { Self::default() @@ -113,8 +113,8 @@ impl VBinseqHeaderBuilder { self } #[must_use] - pub fn build(self) -> VBinseqHeader { - VBinseqHeader::with_capacity( + pub fn build(self) -> FileHeader { + FileHeader::with_capacity( self.block.unwrap_or(BLOCK_SIZE), self.qual.unwrap_or(false), self.compressed.unwrap_or(false), @@ -126,10 +126,10 @@ impl VBinseqHeaderBuilder { } } -/// File header for VBINSEQ files +/// File header for VBQ files /// /// This structure represents the 32-byte header that appears at the beginning of every -/// VBINSEQ file. It contains configuration information about the file format, including +/// VBQ file. It contains configuration information about the file format, including /// whether quality scores are included, whether blocks are compressed, and whether /// records contain paired sequences. 
/// @@ -143,7 +143,7 @@ impl VBinseqHeaderBuilder { /// * `paired` - Whether records contain paired sequences (1 byte boolean) /// * `reserved` - Reserved bytes for future extensions (16 bytes) #[derive(Clone, Copy, Debug, PartialEq)] -pub struct VBinseqHeader { +pub struct FileHeader { /// Magic number to identify the file format ("VSEQ") /// /// Always set to 0x51455356 (4 bytes) @@ -198,7 +198,7 @@ pub struct VBinseqHeader { /// Currently filled with placeholder values (13 bytes) pub reserved: [u8; 13], } -impl Default for VBinseqHeader { +impl Default for FileHeader { /// Creates a default header with default block size and all features disabled /// /// The default header: @@ -220,8 +220,8 @@ impl Default for VBinseqHeader { ) } } -impl VBinseqHeader { - /// Creates a new VBINSEQ header with the default block size +impl FileHeader { + /// Creates a new VBQ header with the default block size /// /// # Parameters /// @@ -234,10 +234,10 @@ impl VBinseqHeader { /// # Example /// /// ```rust - /// use binseq::vbq::VBinseqHeaderBuilder; + /// use binseq::vbq::FileHeaderBuilder; /// /// // Create header with quality scores and compression, without paired sequences - /// let header = VBinseqHeaderBuilder::new() + /// let header = FileHeaderBuilder::new() /// .qual(true) /// .compressed(true) /// .build(); @@ -256,7 +256,7 @@ impl VBinseqHeader { ) } - /// Creates a new VBINSEQ header with a custom block size + /// Creates a new VBQ header with a custom block size /// /// # Parameters /// @@ -268,10 +268,10 @@ impl VBinseqHeader { /// # Example /// /// ```rust - /// use binseq::vbq::VBinseqHeaderBuilder; + /// use binseq::vbq::FileHeaderBuilder; /// /// // Create header with a 256KB block size, with quality scores and compression - /// let header = VBinseqHeaderBuilder::new() + /// let header = FileHeaderBuilder::new() /// .block(256 * 1024) /// .qual(true) /// .compressed(true) @@ -308,7 +308,7 @@ impl VBinseqHeader { /// Creates a header from a 32-byte buffer /// - 
/// This function parses a raw byte buffer into a `VBinseqHeader` structure, + /// This function parses a raw byte buffer into a `FileHeader` structure, /// validating the magic number and format version. /// /// # Parameters @@ -399,7 +399,7 @@ impl VBinseqHeader { /// Reads a header from a reader /// /// This function reads 32 bytes from the provided reader and parses them into - /// a `VBinseqHeader` structure. + /// a `FileHeader` structure. /// /// # Parameters /// @@ -425,9 +425,9 @@ impl VBinseqHeader { } } -/// Block header for VBINSEQ block data +/// Block header for VBQ block data /// -/// Each block in a VBINSEQ file is preceded by a 32-byte block header that contains +/// Each block in a VBQ file is preceded by a 32-byte block header that contains /// information about the block including its size and the number of records it contains. /// /// # Fields diff --git a/src/vbq/index.rs b/src/vbq/index.rs index 757c620..3f9e880 100644 --- a/src/vbq/index.rs +++ b/src/vbq/index.rs @@ -4,9 +4,8 @@ //! //! ## Format Changes (v0.7.0+) //! -//! **BREAKING CHANGE**: The VBQ index is now embedded at the end of VBQ files instead of -//! being stored in separate `.vqi` files. This improves portability and eliminates the -//! need to manage auxiliary files. +//! **BREAKING CHANGE**: The VBQ index is now embedded at the end of VBQ files, +//! improving portability and eliminating the need to manage auxiliary files. //! //! ## Embedded Index Structure //! @@ -29,13 +28,13 @@ //! //! ## Key Changes from v0.6.x //! -//! - Index moved from separate `.vqi` files into VBQ files +//! - Index is now embedded in VBQ files //! - Cumulative record counts changed from `u32` to `u64` //! 
- Support for files with more than 4 billion records use std::{ fs::File, - io::{BufReader, BufWriter, Cursor, Read, Write}, + io::{Cursor, Read, Write}, path::Path, }; @@ -43,8 +42,8 @@ use byteorder::{ByteOrder, LittleEndian}; use zstd::{Decoder, Encoder}; use super::{ + BlockHeader, FileHeader, header::{SIZE_BLOCK_HEADER, SIZE_HEADER}, - BlockHeader, VBinseqHeader, }; use crate::error::{IndexError, Result}; @@ -61,13 +60,13 @@ pub const INDEX_END_MAGIC: u64 = 0x444E455845444E49; /// Index Block Reservation pub const INDEX_RESERVATION: [u8; 4] = [42; 4]; -/// Descriptor of the dimensions of a block in a VBINSEQ file +/// Descriptor of the dimensions of a block in a VBQ file /// -/// A `BlockRange` contains metadata about a single block within a VBINSEQ file, +/// A `BlockRange` contains metadata about a single block within a VBQ file, /// including its position, size, and record count. This information enables /// efficient random access to blocks without scanning the entire file. /// -/// Block ranges are stored in a `BlockIndex` to form a complete index of a VBINSEQ file. +/// Block ranges are stored in a `BlockIndex` to form a complete index of a VBQ file. /// Each range is serialized to a fixed-size 32-byte structure when stored in the embedded index. /// /// ## Format Changes (v0.7.0+) @@ -249,22 +248,22 @@ impl BlockRange { } } -/// Header for a VBINSEQ index file +/// Header for a VBQ index file /// /// The `IndexHeader` contains metadata about an index file, including a magic number /// for validation and the size of the indexed file. This allows verifying that an index -/// file matches its corresponding VBINSEQ file. +/// file matches its corresponding VBQ file. /// /// The header has a fixed size of 32 bytes to ensure compatibility across versions. #[derive(Debug, Clone, Copy)] pub struct IndexHeader { /// Magic number to designate the index file ("VBQINDEX" in ASCII) /// - /// This is used to verify that a file is indeed a VBINSEQ index file. 
+ /// This is used to verify that a file is indeed a VBQ index file. /// (8 bytes in serialized form) magic: u64, - /// Total size of the indexed VBINSEQ file in bytes + /// Total size of the indexed VBQ file in bytes /// /// This is used to verify that the index matches the file it references. /// (8 bytes in serialized form) @@ -276,11 +275,11 @@ pub struct IndexHeader { reserved: [u8; INDEX_HEADER_SIZE - 16], } impl IndexHeader { - /// Creates a new index header for a VBINSEQ file of the specified size + /// Creates a new index header for a VBQ file of the specified size /// /// # Parameters /// - /// * `bytes` - The total size of the VBINSEQ file being indexed, in bytes + /// * `bytes` - The total size of the VBQ file being indexed, in bytes /// /// # Returns /// @@ -296,7 +295,7 @@ impl IndexHeader { /// /// This method reads 32 bytes from the provided reader and deserializes them /// into an `IndexHeader`. It validates the magic number to ensure that the file - /// is indeed a VBINSEQ index file. + /// is indeed a VBQ index file. /// /// # Parameters /// @@ -367,16 +366,17 @@ impl IndexHeader { } } -/// Complete index for a VBINSEQ file +/// Complete index for a VBQ file /// -/// A `BlockIndex` contains metadata about a VBINSEQ file and all of its blocks, +/// A `BlockIndex` contains metadata about a VBQ file and all of its blocks, /// enabling efficient random access and parallel processing. It consists of an /// `IndexHeader` and a collection of `BlockRange` entries, one for each block in /// the file. /// -/// The index can be created by scanning a VBINSEQ file or loaded from a previously -/// created index file. Once loaded, it provides information about block locations, -/// sizes, and record counts. +/// The index is embedded at the end of VBQ files and can be loaded using +/// `MmapReader::load_index()` or created by scanning a VBQ file using +/// `BlockIndex::from_vbq()`. 
Once loaded, it provides information about block +/// locations, sizes, and record counts. /// /// # Examples /// @@ -384,14 +384,10 @@ impl IndexHeader { /// use binseq::vbq::{BlockIndex, MmapReader}; /// use std::path::Path; /// -/// // Create an index from a VBINSEQ file +/// // Create an index from a VBQ file /// let vbq_path = Path::new("example.vbq"); /// let index = BlockIndex::from_vbq(vbq_path).unwrap(); /// -/// // Save the index for future use -/// let index_path = Path::new("example.vbq.vqi"); -/// index.save_to_path(index_path).unwrap(); -/// /// // Use the index with a reader for parallel processing /// let reader = MmapReader::new(vbq_path).unwrap(); /// println!("File contains {} blocks", index.n_blocks()); @@ -425,15 +421,16 @@ impl BlockIndex { /// /// # Returns /// - /// The number of blocks in the VBINSEQ file described by this index + /// The number of blocks in the VBQ file described by this index /// /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::BlockIndex; + /// use binseq::vbq::{BlockIndex, MmapReader}; /// use std::path::Path; /// - /// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap(); + /// let reader = MmapReader::new(Path::new("example.vbq")).unwrap(); + /// let index = reader.load_index().unwrap(); /// println!("The file contains {} blocks", index.n_blocks()); /// ``` #[must_use] @@ -441,43 +438,6 @@ impl BlockIndex { self.ranges.len() } - /// Writes the collection of `BlockRange` to a file - /// Saves the index to a file - /// - /// This writes the index header and all block ranges to a file, which can be loaded - /// later to avoid rescanning the VBINSEQ file. The index is compressed to reduce - /// storage space. 
- /// - /// # Parameters - /// - /// * `path` - The path where the index file should be saved - /// - /// # Returns - /// - /// * `Ok(())` - If the index was successfully saved - /// * `Err(_)` - If an error occurred during saving - /// - /// # Examples - /// - /// ```rust,no_run - /// use binseq::vbq::BlockIndex; - /// use std::path::Path; - /// - /// // Create an index from a VBINSEQ file - /// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap(); - /// - /// // Save it for future use - /// index.save_to_path(Path::new("example.vbq.vqi")).unwrap(); - /// ``` - pub fn save_to_path>(&self, path: P) -> Result<()> { - let mut writer = File::create(path).map(BufWriter::new)?; - self.header.write_bytes(&mut writer)?; - let mut writer = Encoder::new(writer, 3)?.auto_finish(); - self.write_range(&mut writer)?; - writer.flush()?; - Ok(()) - } - /// Write the index to an output buffer pub fn write_bytes(&self, writer: &mut W) -> Result<()> { self.header.write_bytes(writer)?; @@ -490,9 +450,8 @@ impl BlockIndex { /// Write the collection of `BlockRange` to an output handle /// Writes all block ranges to the provided writer /// - /// This method is used internally by `save_to_path` to write the block ranges - /// to an index file. It can also be used to serialize an index to any destination - /// that implements `Write`. + /// This method is used internally to write the block ranges to the embedded index. + /// It can also be used to serialize an index to any destination that implements `Write`. /// /// # Parameters /// @@ -521,15 +480,15 @@ impl BlockIndex { self.ranges.push(range); } - /// Creates a new index by scanning a VBINSEQ file + /// Creates a new index by scanning a VBQ file /// - /// This method memory-maps the specified VBINSEQ file and scans it block by block - /// to create an index. The index can then be saved to a file for future use, enabling - /// efficient random access without rescanning the file. 
+ /// This method memory-maps the specified VBQ file and scans it block by block + /// to create an index. This is primarily used internally when embedding the index + /// into VBQ files during the write process. /// /// # Parameters /// - /// * `path` - Path to the VBINSEQ file to index + /// * `path` - Path to the VBQ file to index /// /// # Returns /// @@ -542,12 +501,9 @@ impl BlockIndex { /// use binseq::vbq::BlockIndex; /// use std::path::Path; /// - /// // Create an index from a VBINSEQ file + /// // Create an index from a VBQ file /// let index = BlockIndex::from_vbq(Path::new("example.vbq")).unwrap(); /// - /// // Save the index for future use - /// index.save_to_path(Path::new("example.vbq.vqi")).unwrap(); - /// /// // Get statistics about the file /// println!("File contains {} blocks", index.n_blocks()); /// @@ -572,7 +528,7 @@ impl BlockIndex { let _header = { let mut header_bytes = [0u8; SIZE_HEADER]; header_bytes.copy_from_slice(&mmap[..SIZE_HEADER]); - VBinseqHeader::from_bytes(&header_bytes)? + FileHeader::from_bytes(&header_bytes)? }; // Initialize position after the header @@ -603,45 +559,6 @@ impl BlockIndex { Ok(index) } - /// Reads an index from a path - /// - /// # Panics - /// Panics if the path is not a valid UTF-8 string. - pub fn from_path>(path: P) -> Result { - let Some(upstream_file) = path.as_ref().to_str().unwrap().strip_suffix(".vqi") else { - return Err(IndexError::MissingUpstreamFile( - path.as_ref().to_string_lossy().to_string(), - ) - .into()); - }; - let upstream_handle = File::open(upstream_file)?; - let mmap = unsafe { memmap2::Mmap::map(&upstream_handle)? 
}; - let file_size = mmap.len() as u64; - - let mut file_handle = File::open(path).map(BufReader::new)?; - let index_header = IndexHeader::from_reader(&mut file_handle)?; - if index_header.bytes != file_size { - return Err(IndexError::ByteSizeMismatch(file_size, index_header.bytes).into()); - } - let buffer = { - let mut buffer = Vec::new(); - let mut decoder = Decoder::new(file_handle)?; - decoder.read_to_end(&mut buffer)?; - buffer - }; - - let mut ranges = Self::new(index_header); - let mut pos = 0; - while pos < buffer.len() { - let bound = pos + SIZE_BLOCK_RANGE; - let range = BlockRange::from_bytes(&buffer[pos..bound]); - ranges.add_range(range); - pos += SIZE_BLOCK_RANGE; - } - - Ok(ranges) - } - pub fn from_bytes(bytes: &[u8]) -> Result { let index_header = IndexHeader::from_bytes(bytes)?; let buffer = { @@ -676,10 +593,11 @@ impl BlockIndex { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::BlockIndex; + /// use binseq::vbq::MmapReader; /// use std::path::Path; /// - /// let index = BlockIndex::from_path(Path::new("example.vbq.vqi")).unwrap(); + /// let reader = MmapReader::new(Path::new("example.vbq")).unwrap(); + /// let index = reader.load_index().unwrap(); /// /// // Examine the ranges to determine which blocks to process /// for (i, range) in index.ranges().iter().enumerate() { diff --git a/src/vbq/mod.rs b/src/vbq/mod.rs index f46fb7a..230cca0 100644 --- a/src/vbq/mod.rs +++ b/src/vbq/mod.rs @@ -1,15 +1,15 @@ -//! # VBINSEQ Format +//! # VBQ Format //! -//! VBINSEQ is a high-performance binary format for variable-length nucleotide sequences +//! VBQ is a high-performance binary format for variable-length nucleotide sequences //! that optimizes both storage efficiency and parallel processing capabilities. //! //! For more information on the format, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1). //! //! ## Overview //! -//! VBINSEQ extends the core principles of BINSEQ to accommodate: +//! 
VBQ extends the core principles of BINSEQ to accommodate: //! -//! * **Variable-length sequences**: Unlike BINSEQ which requires fixed-length reads, VBINSEQ can store +//! * **Variable-length sequences**: Unlike BINSEQ which requires fixed-length reads, VBQ can store //! sequences of any length, making it suitable for technologies like PacBio and Oxford Nanopore. //! //! * **Quality scores**: Optional storage of quality scores alongside nucleotide data when needed. @@ -30,7 +30,7 @@ //! //! ## File Structure //! -//! A VBINSEQ file consists of a 32-byte header followed by record blocks and an embedded index: +//! A VBQ file consists of a 32-byte header followed by record blocks and an embedded index: //! //! ```text //! ┌───────────────────┐ @@ -71,14 +71,14 @@ //! ## Recent Format Changes (v0.7.0+) //! //! * **Embedded Index**: Index data is now stored within the VBQ file itself, eliminating -//! separate `.vqi` files and improving portability. +//! the need for separate index files and improving portability. //! * **Headers Support**: Optional sequence identifiers can be stored with each record. //! * **Extended Capacity**: u64 indexing supports files with more than 4 billion records. //! * **Multi-bit Encoding**: Support for both 2-bit and 4-bit nucleotide encodings. //! //! ## Performance Characteristics //! -//! VBINSEQ is designed for high-throughput parallel processing: +//! VBQ is designed for high-throughput parallel processing: //! //! * Independent blocks enable true parallel processing without synchronization //! * Memory-mapped access provides efficient I/O @@ -91,15 +91,15 @@ //! ``` //! use std::fs::File; //! use std::io::BufWriter; -//! use binseq::vbq::{VBinseqHeaderBuilder, VBinseqWriterBuilder, MmapReader}; -//! use binseq::BinseqRecord; +//! use binseq::vbq::{FileHeaderBuilder, WriterBuilder, MmapReader}; +//! use binseq::{BinseqRecord, SequencingRecordBuilder}; //! //! /* //! WRITING //! */ //! //! // Create a header for sequences with quality scores and headers -//!
let header = VBinseqHeaderBuilder::new() +//! let header = FileHeaderBuilder::new() //! .qual(true) //! .compressed(true) //! .headers(true) @@ -107,16 +107,19 @@ //! //! // Create a writer //! let file = File::create("example.vbq").unwrap(); -//! let mut writer = VBinseqWriterBuilder::default() +//! let mut writer = WriterBuilder::default() //! .header(header) //! .build(BufWriter::new(file)) //! .unwrap(); //! //! // Write a sequence with quality scores and header -//! let sequence = b"ACGTACGT"; -//! let quality = b"IIIIFFFF"; -//! let header_str = b"sequence_001"; -//! writer.write_record(None, Some(header_str), sequence, Some(quality)).unwrap(); +//! let record = SequencingRecordBuilder::default() +//! .s_seq(b"ACGTACGT") +//! .s_qual(b"IIIIFFFF") +//! .s_header(b"sequence_001") +//! .build() +//! .unwrap(); +//! writer.push(record).unwrap(); //! writer.finish().unwrap(); //! //! /* @@ -147,7 +150,7 @@ mod index; mod reader; mod writer; -pub use header::{BlockHeader, VBinseqHeader, VBinseqHeaderBuilder}; +pub use header::{BlockHeader, FileHeader, FileHeaderBuilder}; pub use index::{BlockIndex, BlockRange}; pub use reader::{MmapReader, RecordBlock, RecordBlockIter, RefRecord}; -pub use writer::{VBinseqWriter, VBinseqWriterBuilder}; +pub use writer::{Writer, WriterBuilder}; diff --git a/src/vbq/reader.rs b/src/vbq/reader.rs index 2973457..f9ae0cd 100644 --- a/src/vbq/reader.rs +++ b/src/vbq/reader.rs @@ -1,11 +1,11 @@ -//! Reader implementation for VBINSEQ files +//! Reader implementation for VBQ files //! -//! This module provides functionality for reading sequence data from VBINSEQ files, +//! This module provides functionality for reading sequence data from VBQ files, //! including support for compressed blocks, quality scores, paired-end reads, and sequence headers. //! //! ## Format Changes (v0.7.0+) //! -//! - **Embedded Index**: Readers now load the index from within VBQ files instead of separate `.vqi` files +//! 
- **Embedded Index**: Readers now load the index from within VBQ files //! - **Headers Support**: Optional sequence headers/identifiers can be read from each record //! - **Multi-bit Encoding**: Support for reading 2-bit and 4-bit nucleotide encodings //! - **Extended Capacity**: u64 indexing supports files with more than 4 billion records @@ -51,7 +51,7 @@ use std::fs::File; use std::ops::Range; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::sync::Arc; use bitnuc::BitSize; @@ -60,13 +60,14 @@ use memmap2::Mmap; use zstd::zstd_safe; use super::{ + BlockHeader, BlockIndex, BlockRange, FileHeader, header::{SIZE_BLOCK_HEADER, SIZE_HEADER}, - BlockHeader, BlockIndex, BlockRange, VBinseqHeader, }; -use crate::vbq::index::{IndexHeader, INDEX_END_MAGIC, INDEX_HEADER_SIZE}; +use crate::DEFAULT_QUALITY_SCORE; +use crate::vbq::index::{INDEX_END_MAGIC, INDEX_HEADER_SIZE, IndexHeader}; use crate::{ - error::{ReadError, Result}, BinseqRecord, ParallelProcessor, ParallelReader, + error::{ReadError, Result}, }; /// Calculates the number of 64-bit words needed to store a nucleotide sequence of the given length @@ -126,11 +127,14 @@ struct RecordMetadata { x_seq_span: Span, // Encoded sequence words (u64s) (into `.sequences` buffer) x_qual_span: Span, // Quality bytes x_header_span: Span, // Header bytes + + /// Indicates whether the record has quality scores + has_quality: bool, } -/// A container for a block of VBINSEQ records +/// A container for a block of VBQ records /// -/// The `RecordBlock` struct represents a single block of records read from a VBINSEQ file. +/// The `RecordBlock` struct represents a single block of records read from a VBQ file. /// It stores the raw data for multiple records in vectors, allowing efficient iteration /// over the records without copying memory for each record. 
/// @@ -179,11 +183,17 @@ pub struct RecordBlock { /// Reusable decoding buffer for the block dbuf: Vec, + + /// Reusable buffer for quality scores for the block + qbuf: Vec, + + /// Default quality score for the block + default_quality_score: u8, } impl RecordBlock { /// Creates a new empty `RecordBlock` with the specified block size /// - /// The block size should match the one specified in the VBINSEQ file header + /// The block size should match the one specified in the VBQ file header /// for proper operation. This is typically handled automatically when using /// `MmapReader::new_block()`. /// @@ -206,9 +216,21 @@ impl RecordBlock { rbuf: Vec::default(), dbuf: Vec::default(), dctx: zstd_safe::DCtx::create(), + qbuf: Vec::default(), + default_quality_score: DEFAULT_QUALITY_SCORE, } } + /// Sets the default quality score for the block + /// + /// # Parameters + /// + /// * `score` - Default quality score for the block + pub fn set_default_quality_score(&mut self, score: u8) { + self.default_quality_score = score; + self.qbuf.clear(); + } + /// Returns the number of records in this block /// /// # Returns @@ -272,6 +294,7 @@ impl RecordBlock { self.sequences.clear(); self.dbuf.clear(); // Note: We keep rbuf allocated for reuse + // Note: We keep qbuf allocated for reuse } /// Ingest the bytes from a block into the record block @@ -425,6 +448,14 @@ impl RecordBlock { Span::new(0, 0) }; + // Update qbuf size + if !has_quality { + let max_size = slen.max(xlen) as usize; + if self.qbuf.len() < max_size { + self.qbuf.resize(max_size, self.default_quality_score); + } + } + // Store the record metadata - all spans! 
self.records.push(RecordMetadata { flag, @@ -436,6 +467,7 @@ impl RecordBlock { x_seq_span, x_qual_span, x_header_span, + has_quality, }); } } @@ -511,6 +543,7 @@ pub struct RecordBlockIter<'a> { block: &'a RecordBlock, pos: usize, header_buffer: itoa::Buffer, + qbuf: &'a [u8], } impl<'a> RecordBlockIter<'a> { #[must_use] @@ -519,6 +552,7 @@ impl<'a> RecordBlockIter<'a> { block, pos: 0, header_buffer: itoa::Buffer::new(), + qbuf: &block.qbuf, } } } @@ -542,6 +576,20 @@ impl<'a> Iterator for RecordBlockIter<'a> { header_buf[..header_len].copy_from_slice(header_str.as_bytes()); } + let (squal, xqual) = if meta.has_quality { + // Record has quality scores, slice into rbuf using span + ( + meta.s_qual_span.slice(&self.block.rbuf), + meta.x_qual_span.slice(&self.block.rbuf), + ) + } else { + // Record does not have quality scores, use preallocated buffer for default scores + ( + &self.qbuf[..meta.slen as usize], + &self.qbuf[..meta.xlen as usize], + ) + }; + // increment position { self.pos += 1; @@ -558,9 +606,10 @@ impl<'a> Iterator for RecordBlockIter<'a> { // Slice into sequences Vec using span sbuf: meta.s_seq_span.slice_u64(&self.block.sequences), xbuf: meta.x_seq_span.slice_u64(&self.block.sequences), + // Pass quality score buffers + squal, + xqual, // Slice into rbuf using span - squal: meta.s_qual_span.slice(&self.block.rbuf), - xqual: meta.x_qual_span.slice(&self.block.rbuf), sheader: meta.s_header_span.slice(&self.block.rbuf), xheader: meta.x_header_span.slice(&self.block.rbuf), header_buf, @@ -678,9 +727,9 @@ impl BinseqRecord for RefRecord<'_> { } } -/// Memory-mapped reader for VBINSEQ files +/// Memory-mapped reader for VBQ files /// -/// [`MmapReader`] provides efficient, memory-mapped access to VBINSEQ files. It allows +/// [`MmapReader`] provides efficient, memory-mapped access to VBQ files. It allows /// sequential reading of record blocks and supports parallel processing of records. 
/// /// ## Format Support (v0.7.0+) @@ -694,7 +743,7 @@ impl BinseqRecord for RefRecord<'_> { /// which can be more efficient than standard file I/O, especially for large files. /// /// The [`MmapReader`] is designed to be used in a multi-threaded environment, and it -/// is built around [`RecordBlock`]s which are the units of data in a VBINSEQ file. +/// is built around [`RecordBlock`]s which are the units of data in a VBQ file. /// Each one would be held by a separate thread and would load data from the shared /// [`MmapReader`] through the [`MmapReader::read_block_into`] method. However, they can /// also be used in a single-threaded environment for sequential processing. @@ -743,14 +792,11 @@ impl BinseqRecord for RefRecord<'_> { /// } /// ``` pub struct MmapReader { - /// Path to the VBINSEQ file - path: PathBuf, - /// Memory-mapped file contents for efficient access mmap: Arc, /// Parsed header information from the file - header: VBinseqHeader, + header: FileHeader, /// Current cursor position in the file (in bytes) pos: usize, @@ -760,23 +806,24 @@ pub struct MmapReader { /// Whether to decode sequences at once in each block decode_block: bool, + + /// Default quality score for this reader + default_quality_score: u8, } impl MmapReader { - /// Creates a new `MmapReader` for a VBINSEQ file + /// Creates a new `MmapReader` for a VBQ file /// /// This method opens the specified file, memory-maps its contents, reads the - /// VBINSEQ header information, and loads the embedded index. The reader is positioned + /// VBQ header information, and loads the embedded index. The reader is positioned /// at the beginning of the first record block after the header. /// /// ## Index Loading (v0.7.0+) /// - /// The embedded index is automatically loaded from the end of the file. For legacy - /// files with separate `.vqi` index files, the index is automatically migrated to - /// the embedded format. + /// The embedded index is automatically loaded from the end of the file. 
/// /// # Parameters /// - /// * `path` - Path to the VBINSEQ file to open + /// * `path` - Path to the VBQ file to open /// /// # Returns /// @@ -786,7 +833,7 @@ impl MmapReader { /// /// * `ReadError::InvalidFileType` if the path doesn't point to a regular file /// * I/O errors if the file can't be opened or memory-mapped - /// * Header validation errors if the file doesn't contain a valid VBINSEQ header + /// * Header validation errors if the file doesn't contain a valid VBQ header /// /// # Examples /// @@ -809,19 +856,23 @@ impl MmapReader { let header = { let mut header_bytes = [0u8; SIZE_HEADER]; header_bytes.copy_from_slice(&mmap[..SIZE_HEADER]); - VBinseqHeader::from_bytes(&header_bytes)? + FileHeader::from_bytes(&header_bytes)? }; Ok(Self { - path: PathBuf::from(path.as_ref()), mmap: Arc::new(mmap), header, pos: SIZE_HEADER, total: 0, decode_block: true, + default_quality_score: DEFAULT_QUALITY_SCORE, }) } + pub fn set_default_quality_score(&mut self, score: u8) { + self.default_quality_score = score; + } + /// Creates a new empty record block with the appropriate size for this file /// /// This creates a `RecordBlock` with a block size matching the one specified in the @@ -841,7 +892,9 @@ impl MmapReader { /// ``` #[must_use] pub fn new_block(&self) -> RecordBlock { - RecordBlock::new(self.header.bits, self.header.block as usize) + let mut block = RecordBlock::new(self.header.bits, self.header.block as usize); + block.set_default_quality_score(self.default_quality_score); + block } /// Sets whether to decode sequences at once in each block @@ -862,36 +915,6 @@ impl MmapReader { self.decode_block = decode_block; } - /// Returns the path where the index file would be located - /// - /// The index file is used for random access to blocks and has the same path as - /// the VBINSEQ file with the ".vqi" extension appended. 
- /// - /// # Returns - /// - /// The path where the index file would be located - /// - /// # Examples - /// - /// ``` - /// use binseq::vbq::MmapReader; - /// use binseq::Result; - /// - /// fn main() -> Result<()> { - /// let path = "./data/subset.vbq"; - /// let reader = MmapReader::new(path)?; - /// let index_path = reader.index_path(); - /// assert_eq!(index_path.to_str(), Some("./data/subset.vbq.vqi")); - /// Ok(()) - /// } - /// ``` - #[must_use] - pub fn index_path(&self) -> PathBuf { - let mut p = self.path.as_os_str().to_owned(); - p.push(".vqi"); - p.into() - } - /// Returns a copy of the file's header information /// /// The header contains information about the file format, including whether @@ -900,9 +923,9 @@ impl MmapReader { /// /// # Returns /// - /// A copy of the file's `VBinseqHeader` + /// A copy of the file's `FileHeader` #[must_use] - pub fn header(&self) -> VBinseqHeader { + pub fn header(&self) -> FileHeader { self.header } @@ -917,7 +940,7 @@ impl MmapReader { /// This method reads the next block of records from the current position in the file /// and populates the provided `RecordBlock` with the data. The block is cleared and reused /// to avoid unnecessary memory allocations. This is the primary method for sequential - /// reading of VBINSEQ files. + /// reading of VBQ files. /// /// The method automatically handles decompression if the file was written with /// compression enabled and updates the total record count as it progresses through the file. @@ -1021,23 +1044,20 @@ impl MmapReader { Ok(true) } - /// Loads or creates the block index for this VBINSEQ file + /// Loads the embedded block index from this VBQ file /// /// The block index provides metadata about each block in the file, enabling - /// random access to blocks and parallel processing. This method first attempts to - /// load an existing index file. 
If the index doesn't exist or doesn't match the - /// current file, it automatically generates a new index from the VBINSEQ file - /// and saves it for future use. + /// random access to blocks and parallel processing. This method reads the + /// embedded index from the end of the VBQ file. /// /// # Returns /// - /// The loaded or newly created `BlockIndex` if successful + /// The loaded `BlockIndex` if successful /// /// # Errors /// - /// * File I/O errors when reading or creating the index - /// * Parsing errors if the VBINSEQ file has invalid format - /// * Other index-related errors that cannot be resolved by creating a new index + /// * File I/O errors when reading the index + /// * Parsing errors if the VBQ file has invalid format or missing index /// /// # Examples /// @@ -1046,18 +1066,12 @@ impl MmapReader { /// /// let reader = MmapReader::new("example.vbq").unwrap(); /// - /// // Load the index file (or create if it doesn't exist) + /// // Load the embedded index /// let index = reader.load_index().unwrap(); /// /// // Use the index to get information about the file /// println!("Number of blocks: {}", index.n_blocks()); /// ``` - /// - /// # Notes - /// - /// The index file is stored with the same path as the VBINSEQ file but with a ".vqi" - /// extension appended. This allows for reusing the index across multiple runs, - /// which can significantly improve startup performance for large files. pub fn load_index(&self) -> Result { let start_pos_magic = self.mmap.len() - 8; let start_pos_index_size = start_pos_magic - 8; @@ -1090,7 +1104,7 @@ impl MmapReader { impl ParallelReader for MmapReader { /// Processes all records in the file in parallel using multiple threads /// - /// This method provides efficient parallel processing of VBINSEQ files by distributing + /// This method provides efficient parallel processing of VBQ files by distributing /// blocks across multiple worker threads. 
The file's block structure is leveraged to divide /// the work evenly without requiring thread synchronization during processing, which leads /// to near-linear scaling with the number of threads. @@ -1167,7 +1181,7 @@ impl ParallelReader for MmapReader { /// } /// } /// - /// // Use the processor with a VBINSEQ file + /// // Use the processor with a VBQ file /// let reader = MmapReader::new("example.vbq").unwrap(); /// let counter = RecordCounter::new(); /// @@ -1233,9 +1247,7 @@ impl ParallelReader for MmapReader { // Validate range let total_records = index.num_records(); - if range.start >= total_records || range.end > total_records || range.start >= range.end { - return Ok(()); // Nothing to process or invalid range - } + self.validate_range(total_records, &range)?; // Find blocks that contain records in the specified range let relevant_blocks = index @@ -1347,3 +1359,535 @@ impl ParallelReader for MmapReader { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::BinseqRecord; + + const TEST_VBQ_FILE: &str = "./data/subset.vbq"; + + // ==================== MmapReader Basic Tests ==================== + + #[test] + fn test_mmap_reader_new() { + let reader = MmapReader::new(TEST_VBQ_FILE); + assert!(reader.is_ok(), "Failed to create VBQ reader"); + } + + #[test] + fn test_mmap_reader_num_records() { + let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let num_records = reader.num_records(); + assert!(num_records.is_ok(), "Failed to get num_records"); + assert!(num_records.unwrap() > 0, "Expected non-zero records"); + } + + #[test] + fn test_mmap_reader_is_paired() { + let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let is_paired = reader.is_paired(); + // Test that the method returns a boolean + assert!(is_paired || !is_paired); + } + + #[test] + fn test_mmap_reader_header_access() { + let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let header = &reader.header; + assert!(header.block > 0, "Expected non-zero block size"); + 
assert_eq!(header.magic, 0x51455356, "Expected VSEQ magic number"); + } + + // ==================== RecordBlock Tests ==================== + + #[test] + fn test_new_block() { + let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let block = reader.new_block(); + + assert_eq!(block.bitsize, reader.header.bits); + assert!(block.n_records() == 0, "New block should be empty"); + } + + #[test] + fn test_record_block_creation() { + let block = RecordBlock::new(BitSize::Two, 1024); + + assert_eq!(block.bitsize, BitSize::Two); + assert_eq!(block.n_records(), 0); + } + + #[test] + fn test_record_block_clear() { + let mut block = RecordBlock::new(BitSize::Two, 1024); + + // Block starts empty + assert_eq!(block.n_records(), 0); + + // Clear should not panic on empty block + block.clear(); + assert_eq!(block.n_records(), 0); + } + + #[test] + fn test_record_block_set_default_quality() { + let mut block = RecordBlock::new(BitSize::Two, 1024); + let custom_score = 42u8; + + block.set_default_quality_score(custom_score); + assert_eq!(block.default_quality_score, custom_score); + } + + // ==================== Block Reading Tests ==================== + + #[test] + fn test_read_block_into() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + let result = reader.read_block_into(&mut block); + assert!(result.is_ok(), "Failed to read block"); + + if result.unwrap() { + assert!(block.n_records() > 0, "Block should contain records"); + } + } + + #[test] + fn test_read_multiple_blocks() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + let mut blocks_read = 0; + let max_blocks = 5; + + while reader.read_block_into(&mut block).unwrap() && blocks_read < max_blocks { + assert!(block.n_records() > 0, "Each block should have records"); + blocks_read += 1; + } + + assert!(blocks_read > 0, "Should read at least one block"); + } + + #[test] + fn test_block_iteration() { + let mut reader 
= MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + if reader.read_block_into(&mut block).unwrap() { + let num_records = block.n_records(); + let mut count = 0; + + for record in block.iter() { + assert!(record.slen() > 0, "Record should have non-zero length"); + count += 1; + } + + assert_eq!(count, num_records, "Iterator should yield all records"); + } + } + + // ==================== Record Access Tests ==================== + + #[test] + fn test_record_sequence_data() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + if reader.read_block_into(&mut block).unwrap() { + // Decode all sequences in the block + block.decode_all().unwrap(); + + if let Some(record) = block.iter().next() { + let sseq = record.sseq(); + assert!(!sseq.is_empty(), "Sequence should not be empty"); + + let slen = record.slen(); + assert_eq!(sseq.len(), slen as usize, "Sequence length mismatch"); + } + } + } + + #[test] + fn test_record_header_data() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + if reader.read_block_into(&mut block).unwrap() { + for record in block.iter() { + let sheader = record.sheader(); + // Header may be empty if not included in file + if !sheader.is_empty() { + // Should be valid UTF-8 if present + let _ = std::str::from_utf8(sheader); + } + } + } + } + + #[test] + fn test_record_quality_data() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + if reader.read_block_into(&mut block).unwrap() { + for record in block.iter() { + let squal = record.squal(); + let slen = record.slen() as usize; + + if !squal.is_empty() { + assert_eq!( + squal.len(), + slen, + "Quality length should match sequence length" + ); + } + } + } + } + + #[test] + fn test_record_bitsize() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + if 
reader.read_block_into(&mut block).unwrap() { + for record in block.iter() { + let bitsize = record.bitsize(); + assert!( + matches!(bitsize, BitSize::Two | BitSize::Four), + "Bitsize should be Two or Four" + ); + } + } + } + + // ==================== Default Quality Score Tests ==================== + + #[test] + fn test_set_default_quality_score() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let custom_score = 42u8; + + reader.set_default_quality_score(custom_score); + assert_eq!(reader.default_quality_score, custom_score); + + let block = reader.new_block(); + assert_eq!(block.default_quality_score, custom_score); + } + + // ==================== Decode Block Feature Tests ==================== + + #[test] + fn test_set_decode_block() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + + reader.set_decode_block(true); + // Just verify it doesn't panic - actual behavior depends on reading + + reader.set_decode_block(false); + // Verify we can toggle it + } + + #[test] + fn test_decode_block_affects_reading() { + let mut reader1 = MmapReader::new(TEST_VBQ_FILE).unwrap(); + reader1.set_decode_block(true); + let mut block1 = reader1.new_block(); + + let mut reader2 = MmapReader::new(TEST_VBQ_FILE).unwrap(); + reader2.set_decode_block(false); + let mut block2 = reader2.new_block(); + + // Both should read successfully + let result1 = reader1.read_block_into(&mut block1); + let result2 = reader2.read_block_into(&mut block2); + + assert!(result1.is_ok() && result2.is_ok()); + } + + // ==================== Parallel Processing Tests ==================== + + #[derive(Clone, Default)] + struct VbqCountingProcessor { + count: Arc>, + } + + impl ParallelProcessor for VbqCountingProcessor { + fn process_record(&mut self, _record: R) -> Result<()> { + *self.count.lock().unwrap() += 1; + Ok(()) + } + } + + #[test] + fn test_parallel_processing() { + let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let num_records_result = reader.num_records(); 
+ + // Skip test if we can't determine record count + if num_records_result.is_err() { + return; + } + + let num_records = num_records_result.unwrap(); + + let processor = VbqCountingProcessor::default(); + + let result = reader.process_parallel(processor.clone(), 2); + + // Parallel processing might not be supported for all VBQ files + if result.is_ok() { + let final_count = *processor.count.lock().unwrap(); + assert_eq!(final_count, num_records,); + } + } + + #[test] + fn test_parallel_processing_range() { + let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let num_records_result = reader.num_records(); + + // Skip test if we can't determine record count + if num_records_result.is_err() { + return; + } + + let num_records = num_records_result.unwrap(); + + if num_records >= 100 { + let start = 10; + let end = 50; + let expected_count = end - start; + + let processor = VbqCountingProcessor::default(); + + let result = reader.process_parallel_range(processor.clone(), 2, start..end); + + // Parallel processing might not be supported for all VBQ files + if result.is_ok() { + let final_count = *processor.count.lock().unwrap(); + // The count should be reasonable + assert_eq!( + final_count, expected_count, + "Processed count should match expected range" + ); + } + } + } + + // ==================== Span Tests ==================== + + #[test] + fn test_span_creation() { + let span = Span::new(10, 20); + assert_eq!(span.offset, 10); + assert_eq!(span.len, 20); + } + + #[test] + fn test_span_default() { + let span = Span::default(); + assert_eq!(span.offset, 0); + assert_eq!(span.len, 0); + } + + // ==================== Error Handling Tests ==================== + + #[test] + fn test_nonexistent_file() { + let result = MmapReader::new("./data/nonexistent.vbq"); + assert!(result.is_err(), "Should fail on nonexistent file"); + } + + #[test] + fn test_invalid_file_format() { + // Try to open a non-VBQ file as VBQ + let result = MmapReader::new("./Cargo.toml"); + // This 
should fail during header validation + assert!(result.is_err(), "Should fail on invalid file format"); + } + + // ==================== Index Loading Tests ==================== + + #[test] + fn test_load_index() { + let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let index_result = reader.load_index(); + + assert!(index_result.is_ok(), "Should be able to load index"); + + let index = index_result.unwrap(); + assert!(index.num_records() > 0, "Index should have records"); + } + + #[test] + fn test_index_consistency() { + let reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let num_records_from_reader = reader.num_records().unwrap(); + + let index = reader.load_index().unwrap(); + let num_records_from_index = index.num_records(); + + assert_eq!( + num_records_from_reader, num_records_from_index, + "Reader and index should report same number of records" + ); + } + + // ==================== RecordBlock Decoded Access Tests ==================== + + #[test] + fn test_get_decoded_s() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + reader.set_decode_block(true); + let mut block = reader.new_block(); + + if reader.read_block_into(&mut block).unwrap() && block.n_records() > 0 { + let decoded = block.get_decoded_s(0); + if let Some(seq) = decoded { + assert!(!seq.is_empty(), "Decoded sequence should not be empty"); + } + } + } + + #[test] + fn test_get_decoded_x() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + reader.set_decode_block(true); + let mut block = reader.new_block(); + + if reader.read_block_into(&mut block).unwrap() && block.n_records() > 0 { + // Extended sequence may be empty for non-paired reads + let decoded = block.get_decoded_x(0); + // Just verify it doesn't panic + let _ = decoded; + } + } + + #[test] + fn test_get_decoded_out_of_bounds() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + if reader.read_block_into(&mut block).unwrap() { + let num_records = 
block.n_records(); + + // Try to access beyond bounds + let decoded = block.get_decoded_s(num_records + 100); + assert!(decoded.is_none(), "Should return None for out of bounds"); + } + } + + // ==================== Helper Function Tests ==================== + + #[test] + fn test_encoded_sequence_len_two_bit() { + // 2-bit encoding: 32 nucleotides per u64 + assert_eq!(encoded_sequence_len(32, BitSize::Two), 1); + assert_eq!(encoded_sequence_len(64, BitSize::Two), 2); + assert_eq!(encoded_sequence_len(33, BitSize::Two), 2); // Rounds up + assert_eq!(encoded_sequence_len(1, BitSize::Two), 1); + } + + #[test] + fn test_encoded_sequence_len_four_bit() { + // 4-bit encoding: 16 nucleotides per u64 + assert_eq!(encoded_sequence_len(16, BitSize::Four), 1); + assert_eq!(encoded_sequence_len(32, BitSize::Four), 2); + assert_eq!(encoded_sequence_len(17, BitSize::Four), 2); // Rounds up + assert_eq!(encoded_sequence_len(1, BitSize::Four), 1); + } + + // ==================== Record Iterator Tests ==================== + + #[test] + fn test_record_block_iter_creation() { + let block = RecordBlock::new(BitSize::Two, 1024); + let iter = RecordBlockIter::new(&block); + + // Iterator on empty block should yield nothing + assert_eq!(iter.count(), 0); + } + + #[test] + fn test_record_iteration_multiple_times() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + if reader.read_block_into(&mut block).unwrap() && block.n_records() > 0 { + let num_records = block.n_records(); + + // First iteration + let count1 = block.iter().count(); + assert_eq!(count1, num_records); + + // Second iteration should yield same count + let count2 = block.iter().count(); + assert_eq!(count2, num_records); + } + } + + // ==================== Paired Read Tests ==================== + + #[test] + fn test_paired_record_data() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + + if reader.is_paired() { + let mut block = reader.new_block(); + + if 
reader.read_block_into(&mut block).unwrap() { + // Decode all sequences in the block + block.decode_all().unwrap(); + + for record in block.iter() { + let xlen = record.xlen(); + + if xlen > 0 { + let xseq = record.xseq(); + assert_eq!( + xseq.len(), + xlen as usize, + "Extended sequence length should match xlen" + ); + } + } + } + } + } + + // ==================== Edge Cases ==================== + + #[test] + fn test_empty_block_iteration() { + let block = RecordBlock::new(BitSize::Two, 1024); + + let mut count = 0; + for _ in block.iter() { + count += 1; + } + + assert_eq!(count, 0, "Empty block should yield no records"); + } + + #[test] + fn test_reader_reset_by_new_block() { + let mut reader = MmapReader::new(TEST_VBQ_FILE).unwrap(); + let mut block = reader.new_block(); + + // Read first block + if reader.read_block_into(&mut block).unwrap() { + let first_count = block.n_records(); + + // Read second block (overwrites first) + if reader.read_block_into(&mut block).unwrap() { + let second_count = block.n_records(); + + // Counts may differ, but both should be > 0 + assert!(first_count > 0 && second_count > 0); + } + } + } +} diff --git a/src/vbq/writer.rs b/src/vbq/writer.rs index b7039f0..8aead27 100644 --- a/src/vbq/writer.rs +++ b/src/vbq/writer.rs @@ -1,9 +1,9 @@ -//! Writer implementation for VBINSEQ files +//! Writer implementation for VBQ files //! -//! This module provides functionality for writing sequence data to VBINSEQ files, +//! This module provides functionality for writing sequence data to VBQ files, //! including support for compression, quality scores, paired-end reads, and sequence headers. //! -//! The VBINSEQ writer implements a block-based approach where records are packed +//! The VBQ writer implements a block-based approach where records are packed //! into fixed-size blocks. Each block has a header containing metadata about the //! records it contains. Blocks may be optionally compressed using zstd compression. //! 
@@ -29,12 +29,13 @@ //! # Example //! //! ```rust,no_run -//! use binseq::vbq::{VBinseqWriterBuilder, VBinseqHeaderBuilder}; +//! use binseq::vbq::{WriterBuilder, FileHeaderBuilder}; +//! use binseq::SequencingRecordBuilder; //! use std::fs::File; //! -//! // Create a VBINSEQ file writer with headers and compression +//! // Create a VBQ file writer with headers and compression //! let file = File::create("example.vbq").unwrap(); -//! let header = VBinseqHeaderBuilder::new() +//! let header = FileHeaderBuilder::new() //! .block(128 * 1024) //! .qual(true) //! .compressed(true) @@ -42,16 +43,20 @@ //! .flags(true) //! .build(); //! -//! let mut writer = VBinseqWriterBuilder::default() +//! let mut writer = WriterBuilder::default() //! .header(header) //! .build(file) //! .unwrap(); //! //! // Write a nucleotide sequence with quality scores and header -//! let sequence = b"ACGTACGTACGT"; -//! let quality = b"IIIIIIIIIIII"; -//! let header_str = b"sequence_001"; -//! writer.write_record(Some(0), Some(header_str), sequence, Some(quality)).unwrap(); +//! let record = SequencingRecordBuilder::default() +//! .s_seq(b"ACGTACGTACGT") +//! .s_qual(b"IIIIIIIIIIII") +//! .s_header(b"sequence_001") +//! .flag(0) +//! .build() +//! .unwrap(); +//! writer.push(record).unwrap(); //! //! // Must call finish() to write the embedded index //! 
writer.finish().unwrap(); @@ -61,72 +66,35 @@ use std::io::Write; use bitnuc::BitSize; use byteorder::{LittleEndian, WriteBytesExt}; -use rand::rngs::SmallRng; use rand::SeedableRng; +use rand::rngs::SmallRng; use zstd::stream::copy_encode; -use super::header::{BlockHeader, VBinseqHeader}; +use super::header::{BlockHeader, FileHeader}; +use crate::SequencingRecord; use crate::error::{Result, WriteError}; use crate::policy::{Policy, RNG_SEED}; use crate::vbq::header::{SIZE_BLOCK_HEADER, SIZE_HEADER}; -use crate::vbq::index::{IndexHeader, INDEX_END_MAGIC}; +use crate::vbq::index::{INDEX_END_MAGIC, IndexHeader}; use crate::vbq::{BlockIndex, BlockRange}; -/// Calculates the storage size in bytes required for a record without quality scores -/// -/// This function calculates the total size needed to store a record in the VBINSEQ format, -/// including the flag, sequence lengths, and the encoded sequence data. The formula -/// used is: `S = w(Cs + Cx + 3)` where: -/// -/// - `w`: Word size (8 bytes) -/// - `Cs`: Chunk size of the primary sequence in 64-bit words -/// - `Cx`: Chunk size of the extended sequence in 64-bit words (for paired-end reads) -/// - `3`: Additional words for flag, primary length, and extended length -/// -/// # Parameters -/// -/// * `schunk` - Number of 64-bit words needed for the primary sequence -/// * `xchunk` - Number of 64-bit words needed for the extended sequence (0 for single-end) -/// -/// # Returns -/// -/// The total size in bytes needed to store the record -pub fn record_byte_size(schunk: usize, xchunk: usize, has_flags: bool) -> usize { - 8 * (schunk + xchunk + if has_flags { 3 } else { 2 }) -} - -fn record_byte_size_quality_header( - schunk: usize, - xchunk: usize, - squal: usize, - xqual: usize, - sheader: usize, - xheader: usize, - has_flags: bool, -) -> usize { - // counting the header length bytes (u64) - let bytes_sheader = if sheader > 0 { sheader + 8 } else { 0 }; - let bytes_xheader = if xheader > 0 { xheader + 8 } else { 0 
}; - record_byte_size(schunk, xchunk, has_flags) + squal + xqual + bytes_sheader + bytes_xheader -} - -/// A builder for creating configured `VBinseqWriter` instances +/// A builder for creating configured `Writer` instances /// /// This builder provides a fluent interface for configuring and creating a -/// `VBinseqWriter` with customized settings. It allows specifying the file header, +/// `Writer` with customized settings. It allows specifying the file header, /// encoding policy, and whether to operate in headless mode. /// /// # Examples /// /// ```rust,no_run -/// use binseq::vbq::{VBinseqWriterBuilder, VBinseqHeaderBuilder}; +/// use binseq::vbq::{WriterBuilder, FileHeaderBuilder}; /// use binseq::Policy; /// use std::fs::File; /// /// // Create a writer with custom settings /// let file = File::create("example.vbq").unwrap(); -/// let mut writer = VBinseqWriterBuilder::default() -/// .header(VBinseqHeaderBuilder::new() +/// let mut writer = WriterBuilder::default() +/// .header(FileHeaderBuilder::new() /// .block(65536) /// .qual(true) /// .compressed(true) @@ -138,23 +106,23 @@ fn record_byte_size_quality_header( /// // Use the writer... /// ``` #[derive(Default)] -pub struct VBinseqWriterBuilder { +pub struct WriterBuilder { /// Header of the file - header: Option, + header: Option, /// Optional policy for encoding policy: Option, /// Optional headless mode (used in parallel writing) headless: Option, } -impl VBinseqWriterBuilder { - /// Sets the header for the VBINSEQ file +impl WriterBuilder { + /// Sets the header for the VBQ file /// /// The header defines the file format parameters such as block size, whether /// the file contains quality scores, paired-end reads, and compression settings. 
/// /// # Parameters /// - /// * `header` - The `VBinseqHeader` to use for the file + /// * `header` - The `FileHeader` to use for the file /// /// # Returns /// @@ -163,20 +131,20 @@ impl VBinseqWriterBuilder { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::{VBinseqWriterBuilder, VBinseqHeaderBuilder}; + /// use binseq::vbq::{WriterBuilder, FileHeaderBuilder}; /// /// // Create a header with 64KB blocks and quality scores - /// let header = VBinseqHeaderBuilder::new() + /// let header = FileHeaderBuilder::new() /// .block(65536) /// .qual(true) /// .paired(true) /// .compressed(true) /// .build(); /// - /// let builder = VBinseqWriterBuilder::default().header(header); + /// let builder = WriterBuilder::default().header(header); /// ``` #[must_use] - pub fn header(mut self, header: VBinseqHeader) -> Self { + pub fn header(mut self, header: FileHeader) -> Self { self.header = Some(header); self } @@ -198,10 +166,10 @@ impl VBinseqWriterBuilder { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::{VBinseqWriterBuilder}; + /// use binseq::vbq::{WriterBuilder}; /// use binseq::Policy; /// - /// let builder = VBinseqWriterBuilder::default().policy(Policy::IgnoreSequence); + /// let builder = WriterBuilder::default().policy(Policy::IgnoreSequence); /// ``` #[must_use] pub fn policy(mut self, policy: Policy) -> Self { @@ -226,10 +194,10 @@ impl VBinseqWriterBuilder { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::VBinseqWriterBuilder; + /// use binseq::vbq::WriterBuilder; /// /// // Create a headless writer for parallel writing - /// let builder = VBinseqWriterBuilder::default().headless(true); + /// let builder = WriterBuilder::default().headless(true); /// ``` #[must_use] pub fn headless(mut self, headless: bool) -> Self { @@ -237,9 +205,9 @@ impl VBinseqWriterBuilder { self } - /// Builds a `VBinseqWriter` with the configured settings + /// Builds a `Writer` with the configured settings /// - /// This finalizes the builder and 
creates a new `VBinseqWriter` instance using + /// This finalizes the builder and creates a new `Writer` instance using /// the provided writer and the configured settings. If any settings were not /// explicitly set, default values will be used. /// @@ -249,22 +217,22 @@ impl VBinseqWriterBuilder { /// /// # Returns /// - /// * `Ok(VBinseqWriter)` - A configured `VBinseqWriter` ready for use + /// * `Ok(Writer)` - A configured `Writer` ready for use /// * `Err(_)` - If an error occurred while initializing the writer /// /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::VBinseqWriterBuilder; + /// use binseq::vbq::WriterBuilder; /// use std::fs::File; /// /// let file = File::create("example.vbq").unwrap(); - /// let mut writer = VBinseqWriterBuilder::default() + /// let mut writer = WriterBuilder::default() /// .build(file) /// .unwrap(); /// ``` - pub fn build(self, inner: W) -> Result> { - VBinseqWriter::new( + pub fn build(self, inner: W) -> Result> { + Writer::new( inner, self.header.unwrap_or_default(), self.policy.unwrap_or_default(), @@ -273,15 +241,15 @@ impl VBinseqWriterBuilder { } } -/// Writer for VBINSEQ format files +/// Writer for VBQ format files /// -/// The `VBinseqWriter` handles writing nucleotide sequence data to VBINSEQ files in a +/// The `Writer` handles writing nucleotide sequence data to VBQ files in a /// block-based format. It manages the file structure, compression settings, and ensures /// data is properly encoded and organized. /// /// ## File Structure /// -/// A VBINSEQ file consists of: +/// A VBQ file consists of: /// 1. A file header that defines parameters like block size and compression settings /// 2. 
A series of blocks, each with: /// - A block header with metadata (e.g., record count) @@ -297,33 +265,37 @@ impl VBinseqWriterBuilder { /// - Single-end sequences with or without quality scores /// - Paired-end sequences with or without quality scores /// -/// It's recommended to use the `VBinseqWriterBuilder` to create and configure a writer +/// It's recommended to use the `WriterBuilder` to create and configure a writer /// instance with the appropriate settings. /// /// ```rust,no_run -/// use binseq::vbq::{VBinseqWriterBuilder, VBinseqHeader}; +/// use binseq::vbq::{WriterBuilder, FileHeader}; +/// use binseq::SequencingRecordBuilder; /// use std::fs::File; /// /// // Create a writer for single-end reads /// let file = File::create("example.vbq").unwrap(); -/// let mut writer = VBinseqWriterBuilder::default() -/// .header(VBinseqHeader::default()) +/// let mut writer = WriterBuilder::default() +/// .header(FileHeader::default()) /// .build(file) /// .unwrap(); /// /// // Write a sequence -/// let sequence = b"ACGTACGTACGT"; -/// writer.write_record(None, None, sequence, None).unwrap(); +/// let record = SequencingRecordBuilder::default() +/// .s_seq(b"ACGTACGTACGT") +/// .build() +/// .unwrap(); +/// writer.push(record).unwrap(); /// /// // Writer automatically flushes when dropped /// ``` #[derive(Clone)] -pub struct VBinseqWriter { +pub struct Writer { /// Inner Writer inner: W, /// Header of the file - header: VBinseqHeader, + header: FileHeader, /// Encoder for nucleotide sequences encoder: Encoder, @@ -343,13 +315,19 @@ pub struct VBinseqWriter { /// Determines if index is already written index_written: bool, } -impl VBinseqWriter { - pub fn new(inner: W, header: VBinseqHeader, policy: Policy, headless: bool) -> Result { +impl Writer { + pub fn new(inner: W, header: FileHeader, policy: Policy, headless: bool) -> Result { let mut wtr = Self { inner, header, encoder: Encoder::with_policy(header.bits, policy), - cblock: BlockWriter::new(header.block as 
usize, header.compressed, header.flags), + cblock: BlockWriter::new( + header.block as usize, + header.compressed, + header.flags, + header.qual, + header.headers, + ), ranges: Vec::new(), bytes_written: 0, records_written: 0, @@ -364,7 +342,7 @@ impl VBinseqWriter { /// Initializes the writer by writing the file header /// /// This method is called automatically during creation unless headless mode is enabled. - /// It writes the `VBinseqHeader` to the underlying writer. + /// It writes the `FileHeader` to the underlying writer. /// /// # Returns /// @@ -389,15 +367,15 @@ impl VBinseqWriter { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::{VBinseqWriterBuilder, VBinseqHeader}; + /// use binseq::vbq::{WriterBuilder, FileHeader}; /// use std::fs::File; /// /// // Create a header for paired-end reads - /// let mut header = VBinseqHeader::default(); + /// let mut header = FileHeader::default(); /// header.paired = true; /// /// let file = File::create("paired_reads.vbq").unwrap(); - /// let writer = VBinseqWriterBuilder::default() + /// let writer = WriterBuilder::default() /// .header(header) /// .build(file) /// .unwrap(); @@ -408,6 +386,16 @@ impl VBinseqWriter { self.header.paired } + /// Returns the header of the writer + pub fn header(&self) -> FileHeader { + self.header + } + + /// Returns the N-policy of the writer + pub fn policy(&self) -> Policy { + self.encoder.policy + } + /// Checks if the writer is configured for quality scores /// /// This method returns whether the writer expects quality scores based on the @@ -422,15 +410,15 @@ impl VBinseqWriter { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::{VBinseqWriterBuilder, VBinseqHeader}; + /// use binseq::vbq::{WriterBuilder, FileHeader}; /// use std::fs::File; /// /// // Create a header for sequences with quality scores - /// let mut header = VBinseqHeader::default(); + /// let mut header = FileHeader::default(); /// header.qual = true; /// /// let file = 
File::create("reads_with_quality.vbq").unwrap(); - /// let writer = VBinseqWriterBuilder::default() + /// let writer = WriterBuilder::default() /// .header(header) /// .build(file) /// .unwrap(); @@ -445,6 +433,7 @@ impl VBinseqWriter { self.header.headers } + #[deprecated(note = "use `push` method with SequencingRecord instead")] pub fn write_record( &mut self, flag: Option, @@ -452,70 +441,11 @@ impl VBinseqWriter { sequence: &[u8], quality: Option<&[u8]>, ) -> Result { - if self.is_paired() { - return Err(WriteError::PairedFlagSet.into()); - } - - // ignore the header if not set - let header = if header.is_none() && self.header.headers { - return Err(WriteError::HeaderFlagSet.into()); - } else if header.is_some() && !self.header.headers { - None - } else { - header - }; - - // ignore the quality if not set - let quality = if quality.is_none() && self.header.qual { - return Err(WriteError::QualityFlagSet.into()); - } else if quality.is_some() && !self.header.qual { - None - } else { - quality - }; - - // encode the sequence - if let Some(sbuffer) = self.encoder.encode_single(sequence)? { - let record_size = record_byte_size_quality_header( - sbuffer.len(), - 0, - quality.map_or(0, <[u8]>::len), - 0, - header.map_or(0, <[u8]>::len), - 0, - self.header.flags, - ); - if self.cblock.exceeds_block_size(record_size)? 
{ - impl_flush_block( - &mut self.inner, - &mut self.cblock, - &mut self.ranges, - &mut self.bytes_written, - &mut self.records_written, - )?; - } - - // Write the flag, length, and sequence to the block - self.cblock.write_record( - flag, - sequence.len() as u64, - 0, - sbuffer, - quality, - header, - None, - None, - None, - )?; - - // Return true if the sequence was successfully written - Ok(true) - } else { - // Silently ignore sequences that fail encoding - Ok(false) - } + let record = SequencingRecord::new(sequence, quality, header, None, None, None, flag); + self.push(record) } + #[deprecated(note = "use `push` method with SequencingRecord instead")] pub fn write_paired_record( &mut self, flag: Option, @@ -526,81 +456,140 @@ impl VBinseqWriter { x_sequence: &[u8], x_qual: Option<&[u8]>, ) -> Result { - if !self.is_paired() { - return Err(WriteError::PairedFlagNotSet.into()); - } - - let s_header = if s_header.is_none() && self.header.headers { - return Err(WriteError::HeaderFlagSet.into()); - } else if s_header.is_some() && !self.header.headers { - None - } else { - s_header - }; - let x_header = if x_header.is_none() && self.header.headers { - return Err(WriteError::HeaderFlagSet.into()); - } else if x_header.is_some() && !self.header.headers { - None - } else { - x_header - }; - - let s_qual = if s_qual.is_none() && self.header.qual { - return Err(WriteError::QualityFlagSet.into()); - } else if s_qual.is_some() && !self.header.qual { - None - } else { - s_qual - }; + let record = SequencingRecord::new( + s_sequence, + s_qual, + s_header, + Some(x_sequence), + x_qual, + x_header, + flag, + ); + self.push(record) + } - let x_qual = if x_qual.is_none() && self.header.qual { - return Err(WriteError::QualityFlagSet.into()); - } else if x_qual.is_some() && !self.header.qual { - None - } else { - x_qual - }; + /// Writes a record using the unified [`SequencingRecord`] API + /// + /// This method provides a consistent interface with BQ and CBQ writers. 
+ /// It automatically routes to either `write_record` or `write_paired_record` + /// based on whether the record contains paired data. + /// + /// # Arguments + /// + /// * `record` - A [`SequencingRecord`] containing the sequence data to write + /// + /// # Returns + /// + /// * `Ok(true)` if the record was written successfully + /// * `Ok(false)` if the record was skipped due to invalid nucleotides + /// * `Err(_)` if writing failed + /// + /// # Examples + /// + /// ```rust,no_run + /// use binseq::vbq::{WriterBuilder, FileHeaderBuilder}; + /// use binseq::SequencingRecordBuilder; + /// use std::fs::File; + /// + /// let header = FileHeaderBuilder::new() + /// .qual(true) + /// .headers(true) + /// .build(); + /// + /// let mut writer = WriterBuilder::default() + /// .header(header) + /// .build(File::create("example.vbq").unwrap()) + /// .unwrap(); + /// + /// let record = SequencingRecordBuilder::default() + /// .s_seq(b"ACGTACGT") + /// .s_qual(b"IIIIFFFF") + /// .s_header(b"seq_001") + /// .flag(42) + /// .build() + /// .unwrap(); + /// + /// writer.push(record).unwrap(); + /// writer.finish().unwrap(); + /// ``` + pub fn push(&mut self, record: SequencingRecord) -> Result { + // Check paired status - writer can require paired (record must have R2), + // but if writer is single-end, we simply ignore any R2 data in the record. + if self.header.paired && !record.is_paired() { + return Err(WriteError::ConfigurationMismatch { + attribute: "paired", + expected: self.header.paired, + actual: record.is_paired(), + } + .into()); + } - // encode the sequences - if let Some((sbuffer, xbuffer)) = self.encoder.encode_paired(s_sequence, x_sequence)? 
{ - // Check if the current block can handle the next record - let record_size = record_byte_size_quality_header( - sbuffer.len(), - xbuffer.len(), - s_qual.map_or(0, <[u8]>::len), - x_qual.map_or(0, <[u8]>::len), - s_header.map_or(0, <[u8]>::len), - x_header.map_or(0, <[u8]>::len), - self.header.flags, - ); - if self.cblock.exceeds_block_size(record_size)? { - impl_flush_block( - &mut self.inner, - &mut self.cblock, - &mut self.ranges, - &mut self.bytes_written, - &mut self.records_written, - )?; + // For qualities and headers: the writer can require them (record must have them), + // but if the writer doesn't need them, we simply ignore any extra data in the record. + if self.header.qual && !record.has_qualities() { + return Err(WriteError::ConfigurationMismatch { + attribute: "qual", + expected: self.header.qual, + actual: record.has_qualities(), } + .into()); + } + if self.header.headers && !record.has_headers() { + return Err(WriteError::ConfigurationMismatch { + attribute: "headers", + expected: self.header.headers, + actual: record.has_headers(), + } + .into()); + } - // Write the flag, length, sequence, and quality scores to the block - self.cblock.write_record( - flag, - s_sequence.len() as u64, - x_sequence.len() as u64, - sbuffer, - s_qual, - s_header, - Some(xbuffer), - x_qual, - x_header, - )?; - - // Return true if the record was successfully written - Ok(true) + let record_size = record.configured_size_vbq( + self.header.paired, + self.header.flags, + self.header.headers, + self.header.qual, + self.header.bits, + ); + + if self.header.is_paired() { + // encode the sequences + if let Some((sbuffer, xbuffer)) = self + .encoder + .encode_paired(record.s_seq, record.x_seq.unwrap_or_default())? + { + if self.cblock.exceeds_block_size(record_size)? 
{ + impl_flush_block( + &mut self.inner, + &mut self.cblock, + &mut self.ranges, + &mut self.bytes_written, + &mut self.records_written, + )?; + } + + self.cblock.write_record(&record, sbuffer, Some(xbuffer))?; + Ok(true) + } else { + Ok(false) + } } else { - // Return false if the record was not successfully written - Ok(false) + // encode the sequence + if let Some(sbuffer) = self.encoder.encode_single(record.s_seq)? { + if self.cblock.exceeds_block_size(record_size)? { + impl_flush_block( + &mut self.inner, + &mut self.cblock, + &mut self.ranges, + &mut self.bytes_written, + &mut self.records_written, + )?; + } + + self.cblock.write_record(&record, sbuffer, None)?; + Ok(true) + } else { + Ok(false) + } } } @@ -619,17 +608,21 @@ impl VBinseqWriter { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::{VBinseqWriterBuilder, VBinseqHeader}; + /// use binseq::vbq::{WriterBuilder, FileHeader}; + /// use binseq::SequencingRecordBuilder; /// use std::fs::File; /// /// let file = File::create("example.vbq").unwrap(); - /// let mut writer = VBinseqWriterBuilder::default() + /// let mut writer = WriterBuilder::default() /// .build(file) /// .unwrap(); /// /// // Write some sequences... 
- /// let sequence = b"ACGTACGTACGT"; - /// writer.write_record(None, None, sequence, None).unwrap(); + /// let record = SequencingRecordBuilder::default() + /// .s_seq(b"ACGTACGTACGT") + /// .build() + /// .unwrap(); + /// writer.push(record).unwrap(); /// /// // Manually finish and check for errors /// if let Err(e) = writer.finish() { @@ -637,6 +630,7 @@ impl VBinseqWriter { /// } /// ``` pub fn finish(&mut self) -> Result<()> { + // Flush any remaining data in the current block impl_flush_block( &mut self.inner, &mut self.cblock, @@ -646,6 +640,8 @@ impl VBinseqWriter { )?; self.inner.flush()?; + // Always write the index - this is critical for VBQ file validity + // The index_written flag prevents double-writing on subsequent finish() calls if !self.index_written { self.write_index()?; self.index_written = true; @@ -663,7 +659,7 @@ impl VBinseqWriter { &mut self.cblock } - /// Ingests data from another `VBinseqWriter` that uses a `Vec` as its inner writer + /// Ingests data from another `Writer` that uses a `Vec` as its inner writer /// /// This method is particularly useful for parallel processing, where multiple writers /// might be writing to memory buffers and need to be combined into a single file. 
It @@ -673,7 +669,7 @@ impl VBinseqWriter { /// /// # Parameters /// - /// * `other` - Another `VBinseqWriter` whose inner writer is a `Vec` + /// * `other` - Another `Writer` whose inner writer is a `Vec` /// /// # Returns /// @@ -689,27 +685,32 @@ impl VBinseqWriter { /// # Examples /// /// ```rust,no_run - /// use binseq::vbq::{VBinseqWriterBuilder, VBinseqHeader}; + /// use binseq::vbq::{WriterBuilder, FileHeader}; + /// use binseq::SequencingRecordBuilder; /// use std::fs::File; /// /// // Create a file writer /// let file = File::create("combined.vbq").unwrap(); - /// let mut file_writer = VBinseqWriterBuilder::default() + /// let mut file_writer = WriterBuilder::default() /// .build(file) /// .unwrap(); /// /// // Create a memory writer - /// let mut mem_writer = VBinseqWriterBuilder::default() + /// let mut mem_writer = WriterBuilder::default() /// .build(Vec::new()) /// .unwrap(); /// /// // Write some data to the memory writer - /// mem_writer.write_record(None, None, b"ACGTACGT", None).unwrap(); + /// let record = SequencingRecordBuilder::default() + /// .s_seq(b"ACGTACGT") + /// .build() + /// .unwrap(); + /// mem_writer.push(record).unwrap(); /// /// // Ingest data from memory writer into file writer /// file_writer.ingest(&mut mem_writer).unwrap(); /// ``` - pub fn ingest(&mut self, other: &mut VBinseqWriter>) -> Result<()> { + pub fn ingest(&mut self, other: &mut Writer>) -> Result<()> { if self.header != other.header { return Err(WriteError::IncompatibleHeaders(self.header, other.header).into()); } @@ -810,10 +811,9 @@ fn impl_flush_block( Ok(()) } -impl Drop for VBinseqWriter { +impl Drop for Writer { fn drop(&mut self) { - self.finish() - .expect("VBinseqWriter: Failed to finish writing"); + self.finish().expect("Writer: Failed to finish writing"); } } @@ -838,9 +838,19 @@ struct BlockWriter { compress: bool, /// Has flags has_flags: bool, + /// Has quality scores + has_qualities: bool, + /// Has headers + has_headers: bool, } impl BlockWriter { - 
fn new(block_size: usize, compress: bool, has_flags: bool) -> Self { + fn new( + block_size: usize, + compress: bool, + has_flags: bool, + has_qualities: bool, + has_headers: bool, + ) -> Self { Self { pos: 0, starts: Vec::default(), @@ -851,6 +861,8 @@ impl BlockWriter { padding: vec![0; block_size], compress, has_flags, + has_qualities, + has_headers, } } @@ -865,49 +877,58 @@ impl BlockWriter { Ok(self.pos + record_size > self.block_size) } - #[allow(clippy::too_many_arguments)] fn write_record( &mut self, - flag: Option, - slen: u64, - xlen: u64, + record: &SequencingRecord, sbuf: &[u64], - squal: Option<&[u8]>, - sheader: Option<&[u8]>, xbuf: Option<&[u64]>, - xqual: Option<&[u8]>, - xheader: Option<&[u8]>, ) -> Result<()> { // Tracks the record start position self.starts.push(self.pos); - // Write the flag + // Write the flag (only if configured) if self.has_flags { - self.write_flag(flag.unwrap_or(0))?; + self.write_flag(record.flag.unwrap_or(0))?; } // Write the lengths - self.write_length(slen)?; - self.write_length(xlen)?; + self.write_length(record.s_seq.len() as u64)?; + self.write_length(record.x_seq.map_or(0, <[u8]>::len) as u64)?; - // Write the primary sequence and optional quality + // Write the primary sequence self.write_buffer(sbuf)?; - if let Some(qual) = squal { + + // Write primary quality (only if configured) + if self.has_qualities + && let Some(qual) = record.s_qual + { self.write_u8buf(qual)?; } - if let Some(sheader) = sheader { + + // Write primary header (only if configured) + if self.has_headers + && let Some(sheader) = record.s_header + { self.write_length(sheader.len() as u64)?; self.write_u8buf(sheader)?; } - // Write the optional extended sequence and optional quality + // Write the optional extended sequence if let Some(xbuf) = xbuf { self.write_buffer(xbuf)?; } - if let Some(qual) = xqual { + + // Write extended quality (only if configured) + if self.has_qualities + && let Some(qual) = record.x_qual + { self.write_u8buf(qual)?; 
} - if let Some(xheader) = xheader { + + // Write extended header (only if configured) + if self.has_headers + && let Some(xheader) = record.x_header + { self.write_length(xheader.len() as u64)?; self.write_u8buf(xheader)?; } @@ -1185,18 +1206,15 @@ impl Encoder { #[cfg(test)] mod tests { use super::*; - use crate::vbq::{header::SIZE_HEADER, VBinseqHeaderBuilder}; + use crate::SequencingRecordBuilder; + use crate::vbq::{FileHeaderBuilder, header::SIZE_HEADER}; #[test] fn test_headless_writer() -> super::Result<()> { - let writer = VBinseqWriterBuilder::default() - .headless(true) - .build(Vec::new())?; + let writer = WriterBuilder::default().headless(true).build(Vec::new())?; assert_eq!(writer.inner.len(), 0); - let writer = VBinseqWriterBuilder::default() - .headless(false) - .build(Vec::new())?; + let writer = WriterBuilder::default().headless(false).build(Vec::new())?; assert_eq!(writer.inner.len(), SIZE_HEADER); Ok(()) @@ -1205,16 +1223,16 @@ mod tests { #[test] fn test_ingest_empty_writer() -> super::Result<()> { // Test ingesting from an empty writer - let header = VBinseqHeaderBuilder::new().build(); + let header = FileHeaderBuilder::new().build(); // Create a source writer that's empty - let mut source = VBinseqWriterBuilder::default() + let mut source = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; // Create a destination writer - let mut dest = VBinseqWriterBuilder::default() + let mut dest = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; @@ -1235,28 +1253,26 @@ mod tests { #[test] fn test_ingest_single_record() -> super::Result<()> { // Test ingesting a single record - let header = VBinseqHeaderBuilder::new().build(); + let header = FileHeaderBuilder::new().build(); // Create a source writer with a single record - let mut source = VBinseqWriterBuilder::default() + let mut source = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; // Write a single sequence - let 
seq = b"ACGTACGTACGT"; - source.write_record( - Some(1), // flag - None, // header - seq, // sequence - None, // quality - )?; + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .flag(1) + .build()?; + source.push(record)?; // We have not crossed a boundary assert!(source.by_ref().is_empty()); // Create a destination writer - let mut dest = VBinseqWriterBuilder::default() + let mut dest = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; @@ -1286,24 +1302,27 @@ mod tests { #[test] fn test_ingest_multi_record() -> super::Result<()> { // Test ingesting a single record - let header = VBinseqHeaderBuilder::new().build(); + let header = FileHeaderBuilder::new().build(); // Create a source writer with a single record - let mut source = VBinseqWriterBuilder::default() + let mut source = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; // Write multiple sequences for _ in 0..30 { - let seq = b"ACGTACGTACGT"; - source.write_record(Some(1), None, seq, None)?; + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .flag(1) + .build()?; + source.push(record)?; } // We have not crossed a boundary assert!(source.by_ref().is_empty()); // Create a destination writer - let mut dest = VBinseqWriterBuilder::default() + let mut dest = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; @@ -1333,25 +1352,28 @@ mod tests { #[test] fn test_ingest_block_boundary() -> super::Result<()> { // Test ingesting a single record - let header = VBinseqHeaderBuilder::new().build(); + let header = FileHeaderBuilder::new().build(); // Create a source writer with a single record - let mut source = VBinseqWriterBuilder::default() + let mut source = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; // Write multiple sequences (will cross boundary) for _ in 0..30000 { - let seq = b"ACGTACGTACGT"; - source.write_record(Some(1), None, seq, None)?; + 
let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .flag(1) + .build()?; + source.push(record)?; } // We have crossed a boundary assert!(!source.by_ref().is_empty()); // Create a destination writer - let mut dest = VBinseqWriterBuilder::default() + let mut dest = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; @@ -1381,25 +1403,29 @@ mod tests { #[test] fn test_ingest_with_quality_scores() -> super::Result<()> { // Test ingesting records with quality scores - let source_header = VBinseqHeaderBuilder::new().qual(true).build(); - let dest_header = VBinseqHeaderBuilder::new().qual(true).build(); + let source_header = FileHeaderBuilder::new().qual(true).build(); + let dest_header = FileHeaderBuilder::new().qual(true).build(); // Create a source writer with quality scores - let mut source = VBinseqWriterBuilder::default() + let mut source = WriterBuilder::default() .header(source_header) .headless(true) .build(Vec::new())?; // Write sequences with quality scores + let seq = b"ACGTACGTACGT"; + let qual = vec![40u8; seq.len()]; for i in 0..5 { - let seq = b"ACGTACGTACGT"; - // Simple quality scores (all the same for this test) - let qual = vec![40; seq.len()]; - source.write_record(Some(i), None, seq, Some(&qual))?; + let record = SequencingRecordBuilder::default() + .s_seq(seq) + .s_qual(&qual) + .flag(i) + .build()?; + source.push(record)?; } // Create a destination writer - let mut dest = VBinseqWriterBuilder::default() + let mut dest = WriterBuilder::default() .header(dest_header) .headless(true) .build(Vec::new())?; @@ -1421,22 +1447,25 @@ mod tests { #[test] fn test_ingest_with_compression() -> super::Result<()> { // Test ingesting a single record - let header = VBinseqHeaderBuilder::new().compressed(true).build(); + let header = FileHeaderBuilder::new().compressed(true).build(); // Create a source writer with a single record - let mut source = VBinseqWriterBuilder::default() + let mut source = 
WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; // Write multiple sequences (will cross boundary) for _ in 0..30000 { - let seq = b"ACGTACGTACGT"; - source.write_record(Some(1), None, seq, None)?; + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .flag(1) + .build()?; + source.push(record)?; } // Create a destination writer - let mut dest = VBinseqWriterBuilder::default() + let mut dest = WriterBuilder::default() .header(header) .headless(true) .build(Vec::new())?; @@ -1465,17 +1494,17 @@ mod tests { #[test] fn test_ingest_incompatible_headers() -> super::Result<()> { - let source_header = VBinseqHeaderBuilder::new().build(); - let dest_header = VBinseqHeaderBuilder::new().qual(true).build(); + let source_header = FileHeaderBuilder::new().build(); + let dest_header = FileHeaderBuilder::new().qual(true).build(); // Create a source writer with quality scores - let mut source = VBinseqWriterBuilder::default() + let mut source = WriterBuilder::default() .header(source_header) .headless(true) .build(Vec::new())?; // Create a destination writer - let mut dest = VBinseqWriterBuilder::default() + let mut dest = WriterBuilder::default() .header(dest_header) .headless(true) .build(Vec::new())?; @@ -1487,12 +1516,91 @@ mod tests { } #[test] - #[allow(clippy::identity_op)] - fn test_record_byte_size() { - let size = record_byte_size(2, 0, true); - assert_eq!(size, 8 * (2 + 0 + 3)); // 40 bytes + fn test_index_always_written_on_finish() -> super::Result<()> { + use crate::vbq::index::INDEX_END_MAGIC; + use byteorder::{ByteOrder, LittleEndian}; + + // Create a writer with some records + let header = FileHeaderBuilder::new().build(); + let mut writer = WriterBuilder::default().header(header).build(Vec::new())?; + + // Write some records + for i in 0..10 { + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .flag(i) + .build()?; + writer.push(record)?; + } - let size = record_byte_size(4, 8, true); 
- assert_eq!(size, 8 * (4 + 8 + 3)); // 128 bytes + // Finish the writer + writer.finish()?; + + // Get the written bytes + let bytes = &writer.inner; + + // Verify the file ends with the index magic number + assert!(bytes.len() >= 8, "File is too short to contain index"); + let magic_offset = bytes.len() - 8; + let magic = LittleEndian::read_u64(&bytes[magic_offset..]); + assert_eq!( + magic, INDEX_END_MAGIC, + "Index magic number not found at end of file" + ); + + // Verify we can read the index size + assert!(bytes.len() >= 16, "File is too short to contain index size"); + let size_offset = bytes.len() - 16; + let index_size = LittleEndian::read_u64(&bytes[size_offset..size_offset + 8]); + assert!(index_size > 0, "Index size should be greater than 0"); + + // Verify the index size makes sense (should be less than total file size) + assert!( + index_size < bytes.len() as u64, + "Index size is larger than file" + ); + + Ok(()) + } + + #[test] + fn test_finish_idempotent() -> super::Result<()> { + use crate::vbq::index::INDEX_END_MAGIC; + use byteorder::{ByteOrder, LittleEndian}; + + // Create a writer + let header = FileHeaderBuilder::new().build(); + let mut writer = WriterBuilder::default().header(header).build(Vec::new())?; + + // Write some records + for i in 0..10 { + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .flag(i) + .build()?; + writer.push(record)?; + } + + // Call finish() multiple times + writer.finish()?; + let size_after_first_finish = writer.inner.len(); + + writer.finish()?; + let size_after_second_finish = writer.inner.len(); + + writer.finish()?; + let size_after_third_finish = writer.inner.len(); + + // All sizes should be the same - index should only be written once + assert_eq!(size_after_first_finish, size_after_second_finish); + assert_eq!(size_after_second_finish, size_after_third_finish); + + // Verify only one index magic number at the end + let bytes = &writer.inner; + let magic_offset = bytes.len() - 8; 
+ let magic = LittleEndian::read_u64(&bytes[magic_offset..]); + assert_eq!(magic, INDEX_END_MAGIC); + + Ok(()) } } diff --git a/src/write.rs b/src/write.rs new file mode 100644 index 0000000..d500b2e --- /dev/null +++ b/src/write.rs @@ -0,0 +1,1570 @@ +//! Unified writer interface for BINSEQ formats +//! +//! This module provides a unified `BinseqWriter` enum that abstracts over the three +//! BINSEQ format writers (BQ, VBQ, CBQ), allowing format-agnostic writing of sequence data. +//! +//! # Example +//! +//! ```rust +//! use binseq::{write::{BinseqWriter, BinseqWriterBuilder, Format}, SequencingRecordBuilder}; +//! use std::io::Cursor; +//! +//! // Create a VBQ writer with quality scores and headers +//! let mut writer = BinseqWriterBuilder::new(Format::Vbq) +//! .paired(false) +//! .quality(true) +//! .headers(true) +//! .build(Cursor::new(Vec::new())) +//! .unwrap(); +//! +//! // Write a record +//! let record = SequencingRecordBuilder::default() +//! .s_seq(b"ACGTACGT") +//! .s_qual(b"IIIIIIII") +//! .s_header(b"seq1") +//! .build() +//! .unwrap(); +//! +//! writer.push(record).unwrap(); +//! writer.finish().unwrap(); +//! ``` +//! +//! # Parallel Writing +//! +//! For parallel writing scenarios, use `headless(true)` for thread-local writers +//! and `ingest()` to merge them into a global writer: +//! +//! ```rust,no_run +//! use binseq::{write::{BinseqWriter, BinseqWriterBuilder, Format}, SequencingRecordBuilder}; +//! use std::fs::File; +//! +//! // Global writer (writes header) +//! let mut global = BinseqWriterBuilder::new(Format::Vbq) +//! .paired(false) +//! .build(File::create("output.vbq").unwrap()) +//! .unwrap(); +//! +//! // Thread-local writer (headless, Vec buffer) +//! let mut local = global.new_headless_buffer().unwrap(); +//! +//! // Write to local buffer +//! let record = SequencingRecordBuilder::default() +//! .s_seq(b"ACGTACGT") +//! .build() +//! .unwrap(); +//! local.push(record).unwrap(); +//! +//! // Merge into global writer +//! 
global.ingest(&mut local).unwrap(); +//! global.finish().unwrap(); +//! ``` + +use std::{io::Write, str::FromStr}; + +use crate::{BitSize, Policy, Result, SequencingRecord, bq, cbq, error::WriteError, vbq}; + +/// Output format for BINSEQ files +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum Format { + /// BQ format - fixed length records, no quality scores + Bq, + /// VBQ format - variable length records, optional quality scores + Vbq, + /// CBQ format - columnar variable length records, optional quality scores + #[default] + Cbq, +} +impl FromStr for Format { + type Err = String; + fn from_str(s: &str) -> std::result::Result { + match s { + "bq" | "BQ" | "b" => Ok(Self::Bq), + "vbq" | "VBQ" | "v" => Ok(Self::Vbq), + "cbq" | "CBQ" | "c" => Ok(Self::Cbq), + _ => Err(format!("Unknown format: {s}")), + } + } +} + +impl Format { + /// Returns the file extension for this format (including the dot) + #[must_use] + pub fn extension(&self) -> &'static str { + match self { + Self::Bq => ".bq", + Self::Vbq => ".vbq", + Self::Cbq => ".cbq", + } + } +} + +/// Builder for creating [`BinseqWriter`] instances +/// +/// This builder provides a unified interface for configuring writers across all +/// BINSEQ formats. Settings that don't apply to a particular format are silently +/// ignored. 
+/// +/// # Format-specific behavior +/// +/// | Setting | BQ | VBQ | CBQ | +/// |---------|:--:|:---:|:---:| +/// | `quality(true)` | ignored | applied | applied | +/// | `headers(true)` | ignored | applied | applied | +/// | `compression(true)` | ignored | applied | applied | +/// | `compression_level(n)` | ignored | ignored | applied | +/// | `block_size(n)` | ignored | applied | applied | +/// | `bitsize(b)` | applied | applied | ignored | +/// | `slen(n)` | **required** | ignored | ignored | +/// | `xlen(n)` | required if paired | ignored | ignored | +/// | `policy(p)` | applied | applied | ignored | +/// | `headless(true)` | applied | applied | applied | +#[derive(Debug, Clone)] +pub struct BinseqWriterBuilder { + pub(crate) format: Format, + pub(crate) paired: bool, + quality: bool, + headers: bool, + flags: bool, + compression: bool, + compression_level: Option, + block_size: Option, + policy: Option, + headless: bool, + bitsize: Option, + pub(crate) slen: Option, + pub(crate) xlen: Option, +} + +impl BinseqWriterBuilder { + /// Create a new builder for the specified format + #[must_use] + pub fn new(format: Format) -> Self { + Self { + format, + paired: false, + quality: false, + headers: false, + flags: false, + compression: true, + compression_level: None, + block_size: None, + policy: None, + headless: false, + bitsize: None, + slen: None, + xlen: None, + } + } + + /// Set whether records are paired-end + #[must_use] + pub fn paired(mut self, paired: bool) -> Self { + self.paired = paired; + self + } + + /// Set whether to store quality scores (ignored for BQ) + #[must_use] + pub fn quality(mut self, quality: bool) -> Self { + self.quality = quality; + self + } + + /// Set whether to store sequence headers (ignored for BQ) + #[must_use] + pub fn headers(mut self, headers: bool) -> Self { + self.headers = headers; + self + } + + /// Set whether to store flags + #[must_use] + pub fn flags(mut self, flags: bool) -> Self { + self.flags = flags; + self + } + 
+ /// Set whether to compress data (ignored for BQ) + #[must_use] + pub fn compression(mut self, compression: bool) -> Self { + self.compression = compression; + self + } + + /// Set the compression level (only applies to CBQ) + #[must_use] + pub fn compression_level(mut self, level: i32) -> Self { + self.compression_level = Some(level); + self + } + + /// Set the block size in bytes (ignored for BQ) + #[must_use] + pub fn block_size(mut self, size: usize) -> Self { + self.block_size = Some(size); + self + } + + /// Set the policy for handling invalid nucleotides (ignored for CBQ) + #[must_use] + pub fn policy(mut self, policy: Policy) -> Self { + self.policy = Some(policy); + self + } + + /// Set whether to operate in headless mode (for parallel writing) + #[must_use] + pub fn headless(mut self, headless: bool) -> Self { + self.headless = headless; + self + } + + /// Set the bit size for nucleotide encoding (ignored for CBQ) + #[must_use] + pub fn bitsize(mut self, bitsize: BitSize) -> Self { + self.bitsize = Some(bitsize); + self + } + + /// Set the primary sequence length (required for BQ, ignored for VBQ/CBQ) + #[must_use] + pub fn slen(mut self, len: u32) -> Self { + self.slen = Some(len); + self + } + + /// Set the extended sequence length (required for paired BQ, ignored for VBQ/CBQ) + #[must_use] + pub fn xlen(mut self, len: u32) -> Self { + self.xlen = Some(len); + self + } + + /// Sets the corresponding values for this builder given an existing BQ header + #[must_use] + pub fn from_bq_header(header: bq::FileHeader) -> Self { + Self { + format: Format::Bq, + slen: Some(header.slen), + xlen: (header.xlen > 0).then_some(header.xlen), + bitsize: Some(header.bits), + paired: header.is_paired(), + flags: header.flags, + compression: false, + headers: false, + quality: false, + compression_level: None, + block_size: None, + headless: false, + policy: None, + } + } + + /// Sets the corresponding values for this builder given an existing VBQ header + #[must_use] + 
pub fn from_vbq_header(header: vbq::FileHeader) -> Self { + Self { + format: Format::Vbq, + slen: None, + xlen: None, + flags: header.flags, + quality: header.qual, + paired: header.paired, + bitsize: Some(header.bits), + headers: header.headers, + compression: header.compressed, + block_size: Some(header.block as usize), + policy: None, + compression_level: None, + headless: false, + } + } + + /// Sets the corresponding values for this builder given an existing CBQ header + #[must_use] + pub fn from_cbq_header(header: cbq::FileHeader) -> Self { + Self { + format: Format::Cbq, + flags: header.has_flags(), + quality: header.has_qualities(), + headers: header.has_headers(), + paired: header.is_paired(), + block_size: Some(header.block_size as usize), + compression_level: Some(header.compression_level as i32), + compression: false, + slen: None, + xlen: None, + bitsize: None, + policy: None, + headless: false, + } + } + + /// Encode FASTX file(s) to BINSEQ format + /// + /// This method returns a [`FastxEncoderBuilder`] that allows you to configure + /// the input source and threading options before executing the encoding. + /// + /// This is an alternative to [`build`](Self::build) that directly processes + /// FASTX files using parallel processing. + /// + /// # Availability + /// + /// This method is only available when the `paraseq` feature is enabled. + /// + /// # Example + /// + /// ```rust,no_run + /// use binseq::write::{BinseqWriterBuilder, Format}; + /// use std::fs::File; + /// + /// // Encode from stdin to VBQ + /// let writer = BinseqWriterBuilder::new(Format::Vbq) + /// .quality(true) + /// .headers(true) + /// .encode_fastx(File::create("output.vbq")?) + /// .input_stdin() + /// .threads(8) + /// .run()?; + /// + /// // Encode paired-end reads + /// let writer = BinseqWriterBuilder::new(Format::Vbq) + /// .quality(true) + /// .encode_fastx(File::create("output.vbq")?) 
+ /// .input_paired("R1.fastq", "R2.fastq") + /// .run()?; + /// # Ok::<(), binseq::Error>(()) + /// ``` + #[cfg(feature = "paraseq")] + #[must_use] + pub fn encode_fastx( + self, + output: W, + ) -> crate::utils::FastxEncoderBuilder { + crate::utils::FastxEncoderBuilder::new(self, Box::new(output)) + } + + /// Build the writer + /// + /// # Errors + /// + /// Returns an error if: + /// - Format is BQ and `slen` is not set + /// - Format is BQ, `paired` is true, but `xlen` is not set + pub fn build(self, writer: W) -> Result> { + match self.format { + Format::Bq => self.build_bq(writer), + Format::Vbq => self.build_vbq(writer), + Format::Cbq => self.build_cbq(writer), + } + } + + fn build_bq(self, writer: W) -> Result> { + let slen = self.slen.ok_or(WriteError::MissingSequenceLength { + exp_primary: true, + exp_extended: self.paired, + obs_primary: self.slen.is_some(), + obs_extended: self.xlen.is_some(), + })?; + let xlen = if self.paired || self.xlen.is_some_and(|x| x > 0) { + self.xlen.ok_or(WriteError::MissingSequenceLength { + exp_primary: true, + exp_extended: true, + obs_primary: self.slen.is_some(), + obs_extended: self.xlen.is_some(), + })? 
+ } else { + 0 + }; + + let mut header_builder = bq::FileHeaderBuilder::new().slen(slen).xlen(xlen); + + if let Some(bitsize) = self.bitsize { + header_builder = header_builder.bitsize(bitsize); + } + + header_builder = header_builder.flags(self.flags); + + let header = header_builder.build()?; + + let inner = bq::WriterBuilder::default() + .header(header) + .policy(self.policy.unwrap_or_default()) + .headless(self.headless) + .build(writer)?; + + Ok(BinseqWriter::Bq(inner)) + } + + fn build_vbq(self, writer: W) -> Result> { + let mut header_builder = vbq::FileHeaderBuilder::new() + .paired(self.paired) + .qual(self.quality) + .headers(self.headers) + .flags(self.flags) + .compressed(self.compression); + + if let Some(block_size) = self.block_size { + header_builder = header_builder.block(block_size as u64); + } + + if let Some(bitsize) = self.bitsize { + header_builder = header_builder.bitsize(bitsize); + } + + let header = header_builder.build(); + + let inner = vbq::WriterBuilder::default() + .header(header) + .policy(self.policy.unwrap_or_default()) + .headless(self.headless) + .build(writer)?; + + Ok(BinseqWriter::Vbq(inner)) + } + + fn build_cbq(self, writer: W) -> Result> { + let header = cbq::FileHeaderBuilder::default() + .is_paired(self.paired) + .with_qualities(self.quality) + .with_headers(self.headers) + .with_flags(self.flags) + .with_optional_block_size(self.block_size) + .with_optional_compression_level(self.compression_level.map(|level| level as usize)) + .build(); + + let inner = if self.headless { + cbq::ColumnarBlockWriter::new_headless(writer, header)? + } else { + cbq::ColumnarBlockWriter::new(writer, header)? + }; + + Ok(BinseqWriter::Cbq(inner)) + } +} + +/// Unified writer for BINSEQ formats +/// +/// This enum wraps the three format-specific writers (BQ, VBQ, CBQ) and provides +/// a unified interface for writing sequence data. 
+pub enum BinseqWriter { + /// BQ format writer + Bq(bq::Writer), + /// VBQ format writer + Vbq(vbq::Writer), + /// CBQ format writer + Cbq(cbq::ColumnarBlockWriter), +} + +impl BinseqWriter { + /// Push a record to the writer + /// + /// Returns `Ok(true)` if the record was written successfully, or `Ok(false)` + /// if the record was skipped due to invalid nucleotides (based on the configured + /// policy). CBQ always returns `Ok(true)` as it handles N's explicitly. + /// + /// # Errors + /// + /// Returns an error if there's an I/O error or if the record doesn't match + /// the writer's configuration (e.g., paired record to unpaired writer). + pub fn push(&mut self, record: SequencingRecord) -> Result { + match self { + Self::Bq(w) => w.push(record), + Self::Vbq(w) => w.push(record), + Self::Cbq(w) => w.push(record), + } + } + + /// Finish writing and flush any remaining data + /// + /// For VBQ and CBQ formats, this writes the embedded index. For BQ, this + /// is equivalent to `flush()`. + /// + /// # Errors + /// + /// Returns an error if there's an I/O error writing the final data. + pub fn finish(&mut self) -> Result<()> { + match self { + Self::Bq(w) => w.flush(), + Self::Vbq(w) => w.finish(), + Self::Cbq(w) => w.finish(), + } + } + + /// Returns the format of this writer + #[must_use] + pub fn format(&self) -> Format { + match self { + Self::Bq(_) => Format::Bq, + Self::Vbq(_) => Format::Vbq, + Self::Cbq(_) => Format::Cbq, + } + } + + /// Returns whether this writer is configured for paired-end records + #[must_use] + pub fn is_paired(&self) -> bool { + match self { + Self::Bq(w) => w.is_paired(), + Self::Vbq(w) => w.is_paired(), + Self::Cbq(w) => w.header().is_paired(), + } + } + + /// Returns whether this writer stores quality scores + /// + /// Always returns `false` for BQ format. 
+ #[must_use] + pub fn has_quality(&self) -> bool { + match self { + Self::Bq(_) => false, + Self::Vbq(w) => w.has_quality(), + Self::Cbq(w) => w.header().has_qualities(), + } + } + + /// Returns whether this writer stores sequence headers + /// + /// Always returns `false` for BQ format. + #[must_use] + pub fn has_headers(&self) -> bool { + match self { + Self::Bq(_) => false, + Self::Vbq(w) => w.has_headers(), + Self::Cbq(w) => w.header().has_headers(), + } + } +} + +impl Clone for BinseqWriter { + fn clone(&self) -> Self { + match self { + Self::Bq(w) => Self::Bq(w.clone()), + Self::Vbq(w) => Self::Vbq(w.clone()), + Self::Cbq(w) => Self::Cbq(w.clone()), + } + } +} + +impl BinseqWriter { + /// Ingest records from a headless `Vec` writer into this writer + /// + /// This is used in parallel writing scenarios where thread-local writers + /// buffer to `Vec` and then get merged into a global writer. + /// + /// # Errors + /// + /// Returns an error if: + /// - The source and destination writers have different formats + /// - The source and destination writers have incompatible headers + /// - There's an I/O error during ingestion + pub fn ingest(&mut self, other: &mut BinseqWriter>) -> Result<()> { + match (self, other) { + (Self::Bq(dst), BinseqWriter::Bq(src)) => dst.ingest(src), + (Self::Vbq(dst), BinseqWriter::Vbq(src)) => dst.ingest(src), + (Self::Cbq(dst), BinseqWriter::Cbq(src)) => dst.ingest(src), + _ => Err(WriteError::FormatMismatch.into()), + } + } +} + +impl BinseqWriter { + /// Create a new headless writer with the same configuration, using a `Vec` buffer + /// + /// This is useful for parallel writing scenarios where each thread has its own + /// buffer that gets merged into a global writer via `ingest()`. + /// + /// # Errors + /// + /// Returns an error if the writer cannot be created. 
+ pub fn new_headless_buffer(&self) -> Result>> { + match self { + Self::Bq(w) => { + let inner = bq::WriterBuilder::default() + .header(w.header()) + .policy(w.policy()) + .headless(true) + .build(Vec::new())?; + Ok(BinseqWriter::Bq(inner)) + } + Self::Vbq(w) => { + let inner = vbq::WriterBuilder::default() + .header(w.header()) + .policy(w.policy()) + .headless(true) + .build(Vec::new())?; + Ok(BinseqWriter::Vbq(inner)) + } + Self::Cbq(w) => { + let inner = cbq::ColumnarBlockWriter::new_headless(Vec::new(), w.header())?; + Ok(BinseqWriter::Cbq(inner)) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::SequencingRecordBuilder; + use std::io::Cursor; + + #[test] + fn test_format_extension() { + assert_eq!(Format::Bq.extension(), ".bq"); + assert_eq!(Format::Vbq.extension(), ".vbq"); + assert_eq!(Format::Cbq.extension(), ".cbq"); + } + + #[test] + fn test_build_bq_writer() -> Result<()> { + let writer = BinseqWriterBuilder::new(Format::Bq) + .slen(100) + .paired(false) + .build(Cursor::new(Vec::new()))?; + + assert_eq!(writer.format(), Format::Bq); + assert!(!writer.is_paired()); + assert!(!writer.has_quality()); + assert!(!writer.has_headers()); + Ok(()) + } + + #[test] + fn test_build_bq_writer_paired() -> Result<()> { + let writer = BinseqWriterBuilder::new(Format::Bq) + .slen(100) + .xlen(150) + .paired(true) + .build(Cursor::new(Vec::new()))?; + + assert_eq!(writer.format(), Format::Bq); + assert!(writer.is_paired()); + Ok(()) + } + + #[test] + fn test_build_bq_missing_slen() { + let result = BinseqWriterBuilder::new(Format::Bq) + .paired(false) + .build(Cursor::new(Vec::new())); + + assert!(result.is_err()); + } + + #[test] + fn test_build_bq_paired_missing_xlen() { + let result = BinseqWriterBuilder::new(Format::Bq) + .slen(100) + .paired(true) + .build(Cursor::new(Vec::new())); + + assert!(result.is_err()); + } + + #[test] + fn test_build_vbq_writer() -> Result<()> { + let writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(true) + 
.quality(true) + .headers(true) + .build(Cursor::new(Vec::new()))?; + + assert_eq!(writer.format(), Format::Vbq); + assert!(writer.is_paired()); + assert!(writer.has_quality()); + assert!(writer.has_headers()); + Ok(()) + } + + #[test] + fn test_build_cbq_writer() -> Result<()> { + let writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(true) + .headers(true) + .compression_level(3) + .build(Cursor::new(Vec::new()))?; + + assert_eq!(writer.format(), Format::Cbq); + assert!(!writer.is_paired()); + assert!(writer.has_quality()); + assert!(writer.has_headers()); + Ok(()) + } + + #[test] + fn test_push_and_finish_vbq() -> Result<()> { + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .build(Cursor::new(Vec::new()))?; + + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .build()?; + + let written = writer.push(record)?; + assert!(written); + + writer.finish()?; + Ok(()) + } + + #[test] + fn test_push_and_finish_cbq() -> Result<()> { + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .build(Cursor::new(Vec::new()))?; + + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .build()?; + + let written = writer.push(record)?; + assert!(written); + + writer.finish()?; + Ok(()) + } + + #[test] + fn test_push_and_finish_bq() -> Result<()> { + let mut writer = BinseqWriterBuilder::new(Format::Bq) + .slen(12) + .paired(false) + .build(Cursor::new(Vec::new()))?; + + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .build()?; + + let written = writer.push(record)?; + assert!(written); + + writer.finish()?; + Ok(()) + } + + #[test] + fn test_new_headless_buffer_vbq() -> Result<()> { + let global = BinseqWriterBuilder::new(Format::Vbq) + .paired(true) + .quality(true) + .headers(true) + .build(Cursor::new(Vec::new()))?; + + let local = 
global.new_headless_buffer()?; + + assert_eq!(local.format(), Format::Vbq); + assert!(local.is_paired()); + assert!(local.has_quality()); + assert!(local.has_headers()); + Ok(()) + } + + #[test] + fn test_new_headless_buffer_cbq() -> Result<()> { + let global = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(true) + .build(Cursor::new(Vec::new()))?; + + let local = global.new_headless_buffer()?; + + assert_eq!(local.format(), Format::Cbq); + assert!(!local.is_paired()); + assert!(local.has_quality()); + Ok(()) + } + + #[test] + fn test_new_headless_buffer_bq() -> Result<()> { + let global = BinseqWriterBuilder::new(Format::Bq) + .slen(100) + .xlen(150) + .paired(true) + .build(Cursor::new(Vec::new()))?; + + let local = global.new_headless_buffer()?; + + assert_eq!(local.format(), Format::Bq); + assert!(local.is_paired()); + Ok(()) + } + + #[test] + fn test_ingest_vbq() -> Result<()> { + let mut global = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .build(Cursor::new(Vec::new()))?; + + let mut local = global.new_headless_buffer()?; + + // Write to local + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .build()?; + local.push(record)?; + + // Ingest into global + global.ingest(&mut local)?; + global.finish()?; + + Ok(()) + } + + #[test] + fn test_ingest_cbq() -> Result<()> { + let mut global = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .build(Cursor::new(Vec::new()))?; + + let mut local = global.new_headless_buffer()?; + + // Write to local + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .build()?; + local.push(record)?; + + // Ingest into global + global.ingest(&mut local)?; + global.finish()?; + + Ok(()) + } + + #[test] + fn test_ingest_bq() -> Result<()> { + let mut global = BinseqWriterBuilder::new(Format::Bq) + .slen(12) + .paired(false) + .build(Cursor::new(Vec::new()))?; + + let mut local 
= global.new_headless_buffer()?; + + // Write to local + let record = SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGT") + .build()?; + local.push(record)?; + + // Ingest into global + global.ingest(&mut local)?; + global.finish()?; + + Ok(()) + } + + #[test] + fn test_ingest_format_mismatch() -> Result<()> { + let mut global = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .build(Cursor::new(Vec::new()))?; + + let mut local = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .headless(true) + .build(Vec::new())?; + + let result = global.ingest(&mut local); + assert!(result.is_err()); + + Ok(()) + } + + // ==================== Record Specification Tests ==================== + // + // These tests verify that writers correctly handle records with different + // levels of specification relative to the writer's configuration: + // - Under-specified: record is missing data the writer needs (should error) + // - Over-specified: record has extra data the writer ignores (should succeed) + // - Correctly-specified: record matches writer config exactly (should succeed) + + /// Helper to create a minimal single-end record (sequence only) + fn minimal_single_record() -> SequencingRecord<'static> { + SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGTACGTACGTACGTACGTACGT") + .build() + .unwrap() + } + + /// Helper to create a minimal paired record (sequences only) + fn minimal_paired_record() -> SequencingRecord<'static> { + SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGTACGTACGTACGTACGTACGT") + .x_seq(b"TGCATGCATGCATGCATGCATGCATGCATGCA") + .build() + .unwrap() + } + + /// Helper to create a fully-specified single-end record + fn full_single_record() -> SequencingRecord<'static> { + SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGTACGTACGTACGTACGTACGT") + .s_qual(b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII") + .s_header(b"read1") + .flag(42u64) + .build() + .unwrap() + } + + /// Helper to create a fully-specified paired record + fn 
full_paired_record() -> SequencingRecord<'static> { + SequencingRecordBuilder::default() + .s_seq(b"ACGTACGTACGTACGTACGTACGTACGTACGT") + .s_qual(b"IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII") + .s_header(b"read1") + .x_seq(b"TGCATGCATGCATGCATGCATGCATGCATGCA") + .x_qual(b"JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ") + .x_header(b"read2") + .flag(42u64) + .build() + .unwrap() + } + + // ==================== VBQ Tests ==================== + + #[test] + fn test_vbq_single_minimal_writer_minimal_record() -> Result<()> { + // Writer: single-end, no quality, no headers, no flags + // Record: single-end, no quality, no headers, no flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_single_minimal_writer_full_record() -> Result<()> { + // Writer: single-end, no quality, no headers, no flags + // Record: single-end, with quality, headers, flags + // Expected: success (over-specified - extra data ignored) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_single_full_writer_minimal_record() -> Result<()> { + // Writer: single-end, with quality, headers, flags + // Record: single-end, no quality, no headers, no flags + // Expected: error (under-specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn 
test_vbq_single_full_writer_full_record() -> Result<()> { + // Writer: single-end, with quality, headers, flags + // Record: single-end, with quality, headers, flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_paired_writer_single_record() -> Result<()> { + // Writer: paired + // Record: single-end + // Expected: error (under-specified - missing R2) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(true) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn test_vbq_single_writer_paired_record() -> Result<()> { + // Writer: single-end + // Record: paired + // Expected: success (over-specified - R2 ignored) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_paired_minimal_writer_paired_full_record() -> Result<()> { + // Writer: paired, no quality, no headers, no flags + // Record: paired, with quality, headers, flags + // Expected: success (over-specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(true) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_vbq_paired_full_writer_paired_full_record() -> Result<()> { + // Writer: paired, with quality, headers, flags + // 
Record: paired, with quality, headers, flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Vbq) + .paired(true) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + // ==================== CBQ Tests ==================== + + #[test] + fn test_cbq_single_minimal_writer_minimal_record() -> Result<()> { + // Writer: single-end, no quality, no headers, no flags + // Record: single-end, no quality, no headers, no flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_single_minimal_writer_full_record() -> Result<()> { + // Writer: single-end, no quality, no headers, no flags + // Record: single-end, with quality, headers, flags + // Expected: success (over-specified - extra data ignored) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_single_full_writer_minimal_record() -> Result<()> { + // Writer: single-end, with quality, headers, flags + // Record: single-end, no quality, no headers, no flags + // Expected: error (under-specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn 
test_cbq_single_full_writer_full_record() -> Result<()> { + // Writer: single-end, with quality, headers, flags + // Record: single-end, with quality, headers, flags + // Expected: success (correctly specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(true) + .headers(true) + .flags(true) + .build(Cursor::new(Vec::new()))?; + + let record = full_single_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_paired_writer_single_record() -> Result<()> { + // Writer: paired + // Record: single-end + // Expected: error (under-specified - missing R2) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(true) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_single_record(); + let result = writer.push(record); + assert!(result.is_err()); + Ok(()) + } + + #[test] + fn test_cbq_single_writer_paired_record() -> Result<()> { + // Writer: single-end + // Record: paired + // Expected: success (over-specified - R2 ignored) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(false) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = minimal_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_paired_minimal_writer_paired_full_record() -> Result<()> { + // Writer: paired, no quality, no headers, no flags + // Record: paired, with quality, headers, flags + // Expected: success (over-specified) + let mut writer = BinseqWriterBuilder::new(Format::Cbq) + .paired(true) + .quality(false) + .headers(false) + .flags(false) + .build(Cursor::new(Vec::new()))?; + + let record = full_paired_record(); + assert!(writer.push(record)?); + writer.finish()?; + Ok(()) + } + + #[test] + fn test_cbq_paired_full_writer_paired_full_record() -> Result<()> { + // Writer: paired, with quality, headers, flags + // 
// Record: paired, with quality, headers, flags
        // Expected: success (correctly specified)
        let mut writer = BinseqWriterBuilder::new(Format::Cbq)
            .paired(true)
            .quality(true)
            .headers(true)
            .flags(true)
            .build(Cursor::new(Vec::new()))?;

        let record = full_paired_record();
        assert!(writer.push(record)?);
        writer.finish()?;
        Ok(())
    }

    // ==================== BQ Tests ====================
    // Note: BQ format has fixed-length sequences and doesn't support headers

    #[test]
    fn test_bq_single_minimal_writer_minimal_record() -> Result<()> {
        // Writer: single-end, no quality, no flags
        // Record: single-end, no quality, no flags
        // Expected: success (correctly specified)
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32) // BQ is fixed-length: sequence length must be declared up front
            .paired(false)
            .quality(false)
            .flags(false)
            .build(Cursor::new(Vec::new()))?;

        let record = minimal_single_record();
        assert!(writer.push(record)?);
        writer.finish()?;
        Ok(())
    }

    #[test]
    fn test_bq_single_minimal_writer_full_record() -> Result<()> {
        // Writer: single-end, no quality, no flags
        // Record: single-end, with quality, headers, flags
        // Expected: success (over-specified - extra data ignored)
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32)
            .paired(false)
            .quality(false)
            .flags(false)
            .build(Cursor::new(Vec::new()))?;

        let record = full_single_record();
        assert!(writer.push(record)?);
        writer.finish()?;
        Ok(())
    }

    #[test]
    fn test_bq_single_with_quality_writer_minimal_record() -> Result<()> {
        // Writer: single-end, with quality (note: BQ ignores quality setting)
        // Record: single-end, no quality
        // Expected: success (BQ format doesn't support quality scores, setting is ignored)
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32)
            .paired(false)
            .quality(true) // This is ignored for BQ format
            .build(Cursor::new(Vec::new()))?;

        // BQ always reports has_quality as false
        assert!(!writer.has_quality());

        let record = minimal_single_record();
        assert!(writer.push(record)?);
        writer.finish()?;
        Ok(())
    }

    #[test]
    fn test_bq_single_with_quality_writer_full_record() -> Result<()> {
        // Writer: single-end, with quality
        // Record: single-end, with quality
        // Expected: success (correctly specified)
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32)
            .paired(false)
            .quality(true)
            .build(Cursor::new(Vec::new()))?;

        let record = full_single_record();
        assert!(writer.push(record)?);
        writer.finish()?;
        Ok(())
    }

    #[test]
    fn test_bq_paired_writer_single_record() -> Result<()> {
        // Writer: paired
        // Record: single-end
        // Expected: error (under-specified - missing R2)
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32)
            .xlen(32) // expected R2 length for the paired writer
            .paired(true)
            .quality(false)
            .build(Cursor::new(Vec::new()))?;

        let record = minimal_single_record();
        // Under-specification is the one direction that must fail loudly.
        let result = writer.push(record);
        assert!(result.is_err());
        Ok(())
    }

    #[test]
    fn test_bq_single_writer_paired_record() -> Result<()> {
        // Writer: single-end
        // Record: paired
        // Expected: success (over-specified - R2 ignored)
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32)
            .paired(false)
            .quality(false)
            .build(Cursor::new(Vec::new()))?;

        let record = minimal_paired_record();
        assert!(writer.push(record)?);
        writer.finish()?;
        Ok(())
    }

    #[test]
    fn test_bq_paired_minimal_writer_paired_full_record() -> Result<()> {
        // Writer: paired, no quality, no flags
        // Record: paired, with quality, headers, flags
        // Expected: success (over-specified)
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32)
            .xlen(32)
            .paired(true)
            .quality(false)
            .flags(false)
            .build(Cursor::new(Vec::new()))?;

        let record = full_paired_record();
        assert!(writer.push(record)?);
        writer.finish()?;
        Ok(())
    }

    #[test]
    fn test_bq_paired_full_writer_paired_full_record() -> Result<()> {
        // Writer: paired, with quality, flags
        // Record: paired, with quality, headers, flags
        // Expected: success (correctly specified, headers ignored for BQ)
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32)
            .xlen(32)
            .paired(true)
            .quality(true)
            .flags(true)
            .build(Cursor::new(Vec::new()))?;

        let record = full_paired_record();
        assert!(writer.push(record)?);
        writer.finish()?;
        Ok(())
    }

    // ==================== Configured Size Calculation Tests ====================
    // NOTE(review): the boolean argument order to configured_size_cbq/vbq appears
    // to be (paired, flags, headers, quality), inferred from the *_with_flags
    // tests below — confirm against the record trait's signature.

    #[test]
    fn test_configured_size_cbq_single_minimal() {
        let record = minimal_single_record();
        // 32 nucleotides = 1 u64 word = 8 bytes
        let size = record.configured_size_cbq(false, false, false, false);
        assert_eq!(size, 8);
    }

    #[test]
    fn test_configured_size_cbq_single_with_flags() {
        let record = full_single_record();
        // 32 nucleotides = 8 bytes + 8 bytes flag
        let size = record.configured_size_cbq(false, true, false, false);
        assert_eq!(size, 16);
    }

    #[test]
    fn test_configured_size_cbq_single_with_all() {
        let record = full_single_record();
        // 32 nucleotides = 8 bytes
        // + 8 bytes flag
        // + 5 bytes header ("read1")
        // + 32 bytes quality
        let size = record.configured_size_cbq(false, true, true, true);
        assert_eq!(size, 8 + 8 + 5 + 32);
    }

    #[test]
    fn test_configured_size_cbq_paired_minimal() {
        let record = full_paired_record();
        // s_seq: 32 nucleotides = 8 bytes
        // x_seq: 32 nucleotides = 8 bytes
        let size = record.configured_size_cbq(true, false, false, false);
        assert_eq!(size, 16);
    }

    #[test]
    fn test_configured_size_cbq_paired_with_all() {
        let record = full_paired_record();
        // s_seq: 32 nucleotides = 8 bytes
        // x_seq: 32 nucleotides = 8 bytes
        // flag: 8 bytes
        // s_header: 5 bytes ("read1")
        // x_header: 5 bytes ("read2")
        // s_qual: 32 bytes
        // x_qual: 32 bytes
        let size = record.configured_size_cbq(true, true, true, true);
        assert_eq!(size, 8 + 8 + 8 + 5 + 5 + 32 + 32);
    }

    #[test]
    fn test_configured_size_cbq_paired_record_single_writer() {
        // A paired record being written to a single-end writer
        // should only count R1 data
        let record = full_paired_record();
        let size = record.configured_size_cbq(false, true, true, true);
        // Only s_seq (8) + flag (8) + s_header (5) + s_qual (32)
        assert_eq!(size, 8 + 8 + 5 + 32);
    }

    #[test]
    fn test_configured_size_vbq_single_minimal() {
        use bitnuc::BitSize;
        let record = minimal_single_record();
        // s_len (8) + x_len (8) + s_seq (32 nucs = 1 word = 8 bytes)
        let size = record.configured_size_vbq(false, false, false, false, BitSize::Two);
        assert_eq!(size, 16 + 8);
    }

    #[test]
    fn test_configured_size_vbq_single_with_flags() {
        use bitnuc::BitSize;
        let record = full_single_record();
        // s_len (8) + x_len (8) + flag (8) + s_seq (8)
        let size = record.configured_size_vbq(false, true, false, false, BitSize::Two);
        assert_eq!(size, 16 + 8 + 8);
    }

    #[test]
    fn test_configured_size_vbq_single_with_all() {
        use bitnuc::BitSize;
        let record = full_single_record();
        // s_len (8) + x_len (8) + flag (8) + s_seq (8) + s_qual (32) + s_header_len (8) + s_header (5)
        let size = record.configured_size_vbq(false, true, true, true, BitSize::Two);
        assert_eq!(size, 16 + 8 + 8 + 32 + 8 + 5);
    }

    #[test]
    fn test_configured_size_vbq_paired_minimal() {
        use bitnuc::BitSize;
        let record = full_paired_record();
        // s_len (8) + x_len (8) + s_seq (8) + x_seq (8)
        let size = record.configured_size_vbq(true, false, false, false, BitSize::Two);
        assert_eq!(size, 16 + 8 + 8);
    }

    #[test]
    fn test_configured_size_vbq_paired_with_all() {
        use bitnuc::BitSize;
        let record = full_paired_record();
        // s_len (8) + x_len (8) + flag (8) + s_seq (8) + x_seq (8)
        // + s_qual (32) + x_qual (32)
        // + s_header_len (8) + s_header (5) + x_header_len (8) + x_header (5)
        let size = record.configured_size_vbq(true, true, true, true, BitSize::Two);
        assert_eq!(size, 16 + 8 + 8 + 8 + 32 + 32 + 8 + 5 + 8 + 5);
    }

    #[test]
    fn test_configured_size_vbq_paired_record_single_writer() {
        use bitnuc::BitSize;
        // A paired record being written to a single-end writer
        // should only count R1 data
        let record = full_paired_record();
        let size = record.configured_size_vbq(false, true, true, true, BitSize::Two);
        // s_len (8) + x_len (8) + flag (8) + s_seq (8) + s_qual (32) + s_header_len (8) + s_header (5)
        assert_eq!(size, 16 + 8 + 8 + 32 + 8 + 5);
    }

    #[test]
    fn test_configured_size_vbq_four_bit_encoding() {
        use bitnuc::BitSize;
        let record = minimal_single_record();
        // With 4-bit encoding: 2 nucleotides per byte, 16 per word
        // 32 nucleotides = 2 words = 16 bytes
        // s_len (8) + x_len (8) + s_seq (16)
        let size = record.configured_size_vbq(false, false, false, false, BitSize::Four);
        assert_eq!(size, 16 + 16);
    }

    // ==================== Multiple Records Tests ====================
    // Interleaving differently-specified records into one writer: the writer's
    // configuration, not the record's, decides what is serialized.

    #[test]
    fn test_vbq_multiple_records_mixed_specification() -> Result<()> {
        // Writer configured minimally, records over-specified
        let mut writer = BinseqWriterBuilder::new(Format::Vbq)
            .paired(false)
            .quality(false)
            .headers(false)
            .flags(false)
            .build(Cursor::new(Vec::new()))?;

        // Push minimal record
        assert!(writer.push(minimal_single_record())?);
        // Push full record (over-specified, should work)
        assert!(writer.push(full_single_record())?);
        // Push paired record (over-specified, R2 ignored)
        assert!(writer.push(full_paired_record())?);

        writer.finish()?;
        Ok(())
    }

    #[test]
    fn test_cbq_multiple_records_mixed_specification() -> Result<()> {
        // Writer configured minimally, records over-specified
        let mut writer = BinseqWriterBuilder::new(Format::Cbq)
            .paired(false)
            .quality(false)
            .headers(false)
            .flags(false)
            .build(Cursor::new(Vec::new()))?;

        // Push minimal record
        assert!(writer.push(minimal_single_record())?);
        // Push full record (over-specified, should work)
        assert!(writer.push(full_single_record())?);
        // Push paired record (over-specified, R2 ignored)
        assert!(writer.push(full_paired_record())?);

        writer.finish()?;
        Ok(())
    }

    #[test]
    fn test_bq_multiple_records_mixed_specification() -> Result<()> {
        // Writer configured minimally, records over-specified
        let mut writer = BinseqWriterBuilder::new(Format::Bq)
            .slen(32)
            .paired(false)
            .quality(false)
            .flags(false)
            .build(Cursor::new(Vec::new()))?;

        // Push minimal record
        assert!(writer.push(minimal_single_record())?);
        // Push full record (over-specified, should work)
        assert!(writer.push(full_single_record())?);
        // Push paired record (over-specified, R2 ignored)
        assert!(writer.push(full_paired_record())?);

        writer.finish()?;
        Ok(())
    }
}