Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
120 commits
Select commit Hold shift + click to select a range
cb32965
initial commit
noamteyssier Dec 16, 2025
7e576e2
working implementation of writer
noamteyssier Dec 16, 2025
36ef124
working implementation with packed sequences
noamteyssier Dec 16, 2025
097f9f8
store less redundant data
noamteyssier Dec 16, 2025
777bcef
working implementation with npos tracking
noamteyssier Dec 17, 2025
e7d7148
wip: reader
noamteyssier Dec 17, 2025
ad03e26
refactor: rename block writer
noamteyssier Dec 17, 2025
03d41b4
refactor: rework block into separate data structure
noamteyssier Dec 17, 2025
10bde64
feat: working decoder
noamteyssier Dec 17, 2025
ee92403
chore: remove ebuf_len since its deterministically sized from the nuclen
noamteyssier Dec 17, 2025
c050874
feat: added final index for cbq
noamteyssier Dec 17, 2025
5f5f39c
feat: added a global file header
noamteyssier Dec 17, 2025
d0c7dd2
fix: ensure ranges account for the size of the block header
noamteyssier Dec 17, 2025
065ef3f
feat: working implementation of parallel mmap reader
noamteyssier Dec 17, 2025
453be85
refactor: decompress direct from mmap
noamteyssier Dec 17, 2025
ee9c9ce
refactor: reuse offset calculations instead of allocating
noamteyssier Dec 17, 2025
9880bba
refactor: reuse a dctx for all decoders
noamteyssier Dec 17, 2025
71edfea
fix: ensure that the footer is aligned before transmuting
noamteyssier Dec 17, 2025
5fc2506
refactor: improve throughput and reduce function call overhead
noamteyssier Dec 17, 2025
4891e15
refactor: return a block header for the stream reader
noamteyssier Dec 17, 2025
2976440
refactor: move into separate submodules for cleaner organization
noamteyssier Dec 18, 2025
f39cc54
feat: added a sequencing record builder
noamteyssier Dec 18, 2025
12e65b1
feat: working ingest feature
noamteyssier Dec 18, 2025
9fecd51
feat: examples of reading and writing cbq
noamteyssier Dec 18, 2025
4841590
feat: more complete examples
noamteyssier Dec 18, 2025
fbdb2fb
fix: remove secondary overwrite
noamteyssier Dec 18, 2025
33016e7
fix: finish paired impl
noamteyssier Dec 18, 2025
ffb5093
refactor: simplify impl
noamteyssier Dec 18, 2025
c967fd2
chore: print number of records to stderr
noamteyssier Dec 18, 2025
0aa51dc
feat: improve encoding throughput with shared cctx
noamteyssier Dec 18, 2025
9cc4aa5
refactor: rename offsets to headers for clarity
noamteyssier Dec 18, 2025
4ae9599
refactor: implement parallel range
noamteyssier Dec 18, 2025
3839209
feat: added a header builder api
noamteyssier Dec 18, 2025
09339ad
refactor: enforce configuration on record push
noamteyssier Dec 18, 2025
f47dfad
style(clippy): fix
noamteyssier Dec 18, 2025
8b8a9d9
style(clippy): fix
noamteyssier Dec 18, 2025
06bf98e
Merge cbq repository history as subtree
noamteyssier Dec 18, 2025
3e66adf
feat: added in cbq to core library
noamteyssier Dec 18, 2025
987efef
feat: integrated cbq into binseq reader
noamteyssier Dec 18, 2025
ea85b6c
feat: working implementation of cbq integration
noamteyssier Dec 18, 2025
22a42a6
feat: reduce record size constraints based to reduced sequenced size
noamteyssier Dec 19, 2025
9684676
refactor: use zstd compress2 and ensure compression level
noamteyssier Dec 19, 2025
c13159f
feat: set options for the compressor
noamteyssier Dec 19, 2025
32a8fef
feat: use EF encoding for n-positions instead of just storing u64 ind…
noamteyssier Dec 19, 2025
d71ba7a
dep(bitnuc): update
noamteyssier Dec 19, 2025
0b64b49
chore: remove old cbq impl
noamteyssier Dec 19, 2025
c65ccf1
refactor: remove all anyhow errors and use crate errors
noamteyssier Dec 19, 2025
c73a8fa
style(fmt): run
noamteyssier Dec 19, 2025
72c45f0
style(clippy): fix
noamteyssier Dec 19, 2025
06b93a4
refactor: clean up implementation of ref record iter with more correc…
noamteyssier Dec 19, 2025
ed684db
refactor: clean up internal api and track number of records and numbe…
noamteyssier Dec 19, 2025
25b1638
feat: added convenience functions for accessing index metadata
noamteyssier Dec 19, 2025
ddc5f00
style(clippy): fix
noamteyssier Dec 19, 2025
1655908
fix: ensure size is div by 4 not by 32
noamteyssier Dec 19, 2025
3219c18
feat: added an iterator over block headers
noamteyssier Dec 19, 2025
714b4a0
docs: added mention of cbq
noamteyssier Dec 19, 2025
ed380d0
docs: added documentation about cbq
noamteyssier Dec 19, 2025
8aa75be
docs: added documentation about cbq
noamteyssier Dec 19, 2025
1147407
docs: added documentation about cbq
noamteyssier Dec 19, 2025
b00db99
style(clippy): fix
noamteyssier Dec 19, 2025
afa596b
chore: remove debug code
noamteyssier Jan 6, 2026
b3cb076
dep: remove redundant dependencies
noamteyssier Jan 15, 2026
002d706
Merge pull request #79 from ArcInstitute/integrate-cbq
noamteyssier Jan 15, 2026
c79ca1f
refactor: remove context from lib
noamteyssier Jan 21, 2026
2617982
feat: handle default quality scores with all readers
noamteyssier Jan 21, 2026
0ef0337
refactor: update write api to use sequencing record struct
noamteyssier Jan 21, 2026
6e035ed
docs+refactor: update all examples in docs to use new write/push api
noamteyssier Jan 21, 2026
c5ba2be
fix: flaky write test with in-memory write
noamteyssier Jan 21, 2026
833d608
feat: introduced a generic writer over binseq files
noamteyssier Jan 21, 2026
a59f6e4
feat: make format from str
noamteyssier Jan 21, 2026
af3c309
dep: added dev dependencies
noamteyssier Jan 21, 2026
81d5f23
feat: added example on best way to write with a parallel process
noamteyssier Jan 21, 2026
8fa247a
refactor: improve write example
noamteyssier Jan 21, 2026
416f1fc
refactor: rename decode example to read
noamteyssier Jan 21, 2026
2995d86
refactor: improve grep example
noamteyssier Jan 21, 2026
9ff0771
refactor: allow deprecated on streaming examples
noamteyssier Jan 21, 2026
fb1a6c5
docs: improve documentation
noamteyssier Jan 22, 2026
92948b8
ci: update workflow
noamteyssier Jan 22, 2026
a134bc1
ci: fix workflow substitution
noamteyssier Jan 22, 2026
c41cd7e
ci: fix grep workflow include pattern
noamteyssier Jan 22, 2026
d74f310
style(clippy): fix
noamteyssier Jan 22, 2026
e4f56e9
chore: remove unused deps
noamteyssier Jan 22, 2026
c2c83a4
Merge pull request #81 from ArcInstitute/80-beter-record-api-bq-and-v…
noamteyssier Jan 22, 2026
9dc0501
feat: added functionality to auto-encode a given fastx file or a pair…
noamteyssier Jan 22, 2026
3bf3391
refactor: don't error when introducing extra information into a recor…
noamteyssier Jan 22, 2026
28405d8
refactor: calculate size based on configured writer instead
noamteyssier Jan 22, 2026
62f2f82
chore: remove unused code
noamteyssier Jan 22, 2026
d6f5621
tests: added testing on sequence record writing
noamteyssier Jan 22, 2026
2a73c42
style(clippy): fix
noamteyssier Jan 22, 2026
8847fec
Merge pull request #83 from ArcInstitute/auto-convert-fastx-with-paraseq
noamteyssier Jan 22, 2026
4c38de3
tests: added testing on reader
noamteyssier Jan 23, 2026
3033683
tests: added testing on error
noamteyssier Jan 23, 2026
4b4c88a
tests: added testing on policy
noamteyssier Jan 23, 2026
1cdb1bc
tests: added testing on vbq reader
noamteyssier Jan 23, 2026
a2096e7
tests: added testing on cbq reader
noamteyssier Jan 23, 2026
1aa19f7
tests: added testing on readers with ranges and handle out of bounds …
noamteyssier Jan 23, 2026
4bae75f
fix: incorrect binary file
noamteyssier Jan 23, 2026
4151738
tests: added testing on additional methods
noamteyssier Jan 23, 2026
d4ab5af
tests: added testing on fastx encoding
noamteyssier Jan 23, 2026
33d48ec
tests: update testing expectation on vbq
noamteyssier Jan 23, 2026
b881730
Merge pull request #84 from ArcInstitute/improve-testing-coverage
noamteyssier Jan 23, 2026
bf6ec85
docs: added note on using cbq over vbq
noamteyssier Jan 23, 2026
ce1c57a
docs: update docs
noamteyssier Jan 23, 2026
eca8b08
docs: update docs
noamteyssier Jan 23, 2026
5c75260
docs: update docs
noamteyssier Jan 23, 2026
ce6d260
chore: update preprint link
noamteyssier Jan 23, 2026
d98688c
fix: small hotfixes on write
noamteyssier Jan 23, 2026
d2a521e
feat: quality of life builders from headers
noamteyssier Jan 23, 2026
5aad45f
refactor: rename BinseqHeader to bq::FileHeader
noamteyssier Jan 23, 2026
36d9655
refactor: rename bq specific writer
noamteyssier Jan 23, 2026
f1a7156
refactor: rename vbq file header
noamteyssier Jan 23, 2026
b1b1fea
refactor: rename vbq specific headers and structs
noamteyssier Jan 23, 2026
7a0d5da
chore: update examples with new names
noamteyssier Jan 23, 2026
969c892
chore: remove unnecessary export
noamteyssier Jan 23, 2026
06d8a77
chore: replace all VBINSEQ mentions with VBQ
noamteyssier Jan 23, 2026
d91762b
chore: replace all BINSEQ variants as BQ
noamteyssier Jan 23, 2026
8564733
Merge pull request #86 from ArcInstitute/improve-namespace-of-codebase
noamteyssier Jan 23, 2026
304f6d5
chore: remove the unnecessary code around the external index
noamteyssier Jan 23, 2026
3c55918
tests: added validity checking on vbq index being written
noamteyssier Jan 23, 2026
cdc5b7a
Merge pull request #87 from ArcInstitute/remove-vbq-external-index-code
noamteyssier Jan 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 26 additions & 23 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,41 +26,44 @@ jobs:
- name: Linting
run: cargo clippy --verbose

example_read_write:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: run example
run: cargo run --release --example read_write

example_parallel:
example_grep:
runs-on: ubuntu-latest
strategy:
matrix:
ext: [bq, vbq, cbq]
steps:
- uses: actions/checkout@v3
- name: run example
run: cargo run --release --example parallel_processing
- name: run example ${{ matrix.ext }}
run: cargo run --release --example grep -- ./data/subset.${{ matrix.ext }} "ACGTACGT"

example_example:
example_range:
runs-on: ubuntu-latest
strategy:
matrix:
ext: [bq, vbq, cbq]
steps:
- uses: actions/checkout@v3
- name: run example
run: cargo run --release --example example
- name: run example ${{ matrix.ext }}
run: cargo run --release --example parallel_range -- ./data/subset.${{ matrix.ext }} 4 30 200

example_grep:
example_write:
runs-on: ubuntu-latest
strategy:
matrix:
ext: [bq, vbq, cbq]
steps:
- uses: actions/checkout@v3
- name: run example bq
run: cargo run --release --example grep ./data/subset.bq
- name: run example vbq
run: cargo run --release --example grep ./data/subset.vbq
- name: run example (single) ${{ matrix.ext }}
run: cargo run --release --example write -- ./data/subset_R1.fastq.gz -o ./output.${{ matrix.ext }}
- name: run example (paired) ${{ matrix.ext }}
run: cargo run --release --example write -- ./data/subset_R1.fastq.gz ./data/subset_R2.fastq.gz -o ./output.${{ matrix.ext }}

example_range:
example_read:
runs-on: ubuntu-latest
strategy:
matrix:
ext: [bq, vbq, cbq]
steps:
- uses: actions/checkout@v3
- name: run example (bq)
run: cargo run --release --example parallel_range -- ./data/subset.bq 4 30 200
- name: run example (vbq)
run: cargo run --release --example parallel_range -- ./data/subset.vbq 4 30 200
- name: run example ${{ matrix.ext }}
run: cargo run --release --example read -- ./data/subset.${{ matrix.ext }}
27 changes: 17 additions & 10 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "binseq"
version = "0.8.3"
edition = "2021"
edition = "2024"
description = "A high efficiency binary format for sequencing data"
license = "MIT"
authors = ["Noam Teyssier <noam.teyssier@arcinstitute.org>"]
Expand All @@ -11,25 +11,32 @@ categories = ["science::bioinformatics", "encoding", "data-structures"]
keywords = ["bioinformatics", "nucleotide", "sequencing", "genomics", "fastq"]

[dependencies]
anyhow = "1.0.100"
anyhow = {version = "1.0.100", optional = true}
auto_impl = "1.3.0"
bitnuc = "0.3.2"
bytemuck = "1.24.0"
bitnuc = "0.4.0"
bytemuck = { version = "1.24.0", features = ["derive", "extern_crate_alloc"] }
byteorder = "1.5.0"
itoa = "1.0.15"
itoa = "1.0.17"
memchr = "2.7.6"
memmap2 = "0.9.9"
num_cpus = "1.17.0"
paraseq = { version = "0.4.8", optional = true }
parking_lot = {version = "0.12.5", optional = true }
rand = { version = "0.9.2", features = ["small_rng"] }
sucds = "0.8.3"
thiserror = "2.0.17"
zstd = { version = "0.13.3", features = ["zstdmt"] }

[dev-dependencies]
nucgen = "0.2.0"
niffler = "3.0.0"
seq_io = "0.3.4"
anyhow = "1.0.100"
parking_lot = "0.12.5"
itoa = "1.0.15"
memchr = "2.7.6"
clap = { version = "4.5.54", features = ["derive"] }
paraseq = "0.4.8"

[features]
default = ["paraseq", "anyhow"]
anyhow = ["dep:anyhow"]
paraseq = ["dep:paraseq", "dep:parking_lot"]

[lints.clippy]
pedantic = { level = "warn", priority = -1 }
Expand Down
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,18 @@
BINSEQ is a family of binary file formats designed for efficient storage and processing of DNA sequences.
These formats use two-bit encoding for nucleotides and are optimized for high-performance parallel processing.

BINSEQ currently has two flavors:
BINSEQ has three variants:

1. **BQ**: (`*.bq`) files are for _fixed-length_ records **without** quality scores.
2. **VBQ**: (`*.vbq`) files are for _variable-length_ records **with optional** quality scores and headers.
3. **CBQ**: (`*.cbq`) files are for _columnar variable-length_ records **with optional** quality scores and headers.

Both flavors support both single and paired sequences.
All variants support both single and paired sequences.

**Note:** For most use cases, the newest variant _CBQ_ is recommended due to its flexibility, storage efficiency, and decoding speed.
It supersedes _VBQ_ in terms of performance and storage efficiency, at a small cost in encoding speed.
VBQ will still be supported but newer projects should consider using _CBQ_ instead.
For information on the structure of _CBQ_ files, see the [documentation](https://docs.rs/binseq/latest/binseq/cbq/).

## Getting Started

Expand All @@ -24,4 +30,4 @@ This is a **library** for reading and writing BINSEQ files, for a **command-line
To get started please refer to our [documentation](https://docs.rs/binseq/latest/binseq/).
For example programs which make use of the library check out our [examples directory](https://github.com/arcinstitute/binseq/tree/main/examples).

For more information about the BINSEQ file family, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v1).
For more information about the BINSEQ file family, please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2025.04.08.647863v2).
Binary file added data/subset.cbq
Binary file not shown.
Binary file modified data/subset.vbq
Binary file not shown.
122 changes: 122 additions & 0 deletions examples/auto-write.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
use std::{fs::File, io::BufWriter};

use anyhow::Result;
use binseq::{BinseqWriterBuilder, write::Format};
use bitnuc::BitSize;
use clap::Parser;

type BoxedWriter = Box<dyn std::io::Write + Send>;

// Command-line arguments for encoding one FASTX file (or an R1/R2 pair) into
// a BINSEQ file.
//
// NOTE: the `///` comments on the fields are consumed by clap and shown as
// `--help` text, so maintainer notes in this struct use `//` to stay out of
// the user-facing output. Field order also fixes the positional argument
// order (`input` first, then `input2`).
#[derive(Parser)]
struct Args {
    /// Input FASTX to encode into BINSEQ format
    #[clap(required = true)]
    input: String,

    /// Input FASTX to encode into BINSEQ format (R2)
    // Supplying this second positional switches the encoder to paired mode.
    #[clap(required = false)]
    input2: Option<String>,

    /// Output file path for BINSEQ format
    // When set, its extension is also used to pick the format if `--format`
    // is not given.
    #[clap(short = 'o', long)]
    output: Option<String>,

    /// Default prefix for writing BINSEQ: `<prefix>.<ext>`
    // Only consulted when `--output` is absent.
    #[clap(short = 'p', long, default_value = "output")]
    prefix: String,

    /// Format of the output BINSEQ file
    ///
    /// [bq: bq|BQ|b, vbq: vbq|VBQ|v, cbq: cbq|CBQ|c]
    // Takes precedence over the extension of `--output`.
    #[clap(short = 'f', long)]
    format: Option<Format>,

    /// Exclude quality information in BINSEQ output
    ///
    /// (bq ignores quality always)
    #[clap(short = 'Q', long)]
    exclude_quality: bool,

    /// Exclude sequence headers in BINSEQ output
    ///
    /// (bq ignores headers always)
    #[clap(short = 'H', long)]
    exclude_headers: bool,

    /// Compression level for BINSEQ output (0: auto)
    #[clap(long, default_value_t = 0)]
    compression_level: i32,

    /// Default BITSIZE for BINSEQ output (2: 2bit, 4: 4bit)
    // Any value other than 4 is mapped to two-bit by `Args::bitsize`.
    #[clap(long, default_value_t = 2)]
    bitsize: u8,

    /// Default BLOCKSIZE in KB for BINSEQ output (vbq,cbq)
    // Multiplied by 1024 in `main` before being handed to the builder.
    #[clap(long, default_value_t = 128)]
    blocksize: usize,

    /// Number of threads to use for parallel processing, 0: all available
    // Typed default, matching the other numeric options in this struct.
    #[clap(short = 'T', long, default_value_t = 0)]
    threads: usize,
}
impl Args {
    /// Determines the output format.
    ///
    /// Precedence: an explicit `--format`, then the text after the last `.`
    /// of `--output` (`bq`, `vbq` or `cbq`), then `Format::default()`.
    fn format(&self) -> Format {
        if let Some(format) = self.format {
            return format;
        }
        // `rsplit('.').next()` yields the same segment as `split(".").last()`
        // without walking the whole string; an output path with no `.` yields
        // the full string, exactly as before.
        match self
            .output
            .as_deref()
            .and_then(|path| path.rsplit('.').next())
        {
            Some("bq") => Format::Bq,
            Some("vbq") => Format::Vbq,
            Some("cbq") => Format::Cbq,
            _ => Format::default(),
        }
    }

    /// Maps the numeric `--bitsize` option onto a [`BitSize`].
    ///
    /// Only `4` selects four-bit encoding; every other value falls back to
    /// two-bit rather than being rejected.
    fn bitsize(&self) -> BitSize {
        match self.bitsize {
            4 => BitSize::Four,
            _ => BitSize::Two,
        }
    }

    /// Creates a buffered output file handle.
    ///
    /// Uses `--output` when given, otherwise builds a path from `--prefix`
    /// and the extension of the selected format.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the file cannot be created.
    fn ohandle(&self) -> Result<BoxedWriter> {
        let file = match &self.output {
            // Borrow the user-supplied path directly; no copy is needed.
            Some(output) => File::create(output)?,
            // NOTE(review): prefix and extension are joined without a
            // separator, so this presumably relies on `Format::extension()`
            // including the leading dot - confirm against its definition.
            None => File::create(format!("{}{}", &self.prefix, self.format().extension()))?,
        };
        Ok(Box::new(BufWriter::new(file)))
    }
}

/// Parses the command line, configures a BINSEQ writer, and encodes the
/// single or paired FASTX input into the output file.
fn main() -> Result<()> {
    let args = Args::parse();
    // Create the output file first so an unwritable path fails before any
    // encoding work is set up.
    let handle = args.ohandle()?;
    let builder = BinseqWriterBuilder::new(args.format())
        .bitsize(args.bitsize())
        .block_size(args.blocksize * 1024) // CLI value is in KB
        .headers(!args.exclude_headers)
        .quality(!args.exclude_quality)
        .compression_level(args.compression_level)
        .encode_fastx(handle);

    // Paired mode is selected by the presence of the second input; matching
    // on the option removes the need for an `unwrap`.
    let encoder = match &args.input2 {
        Some(input2) => builder.input_paired(&args.input, input2),
        None => builder.input(&args.input),
    };
    encoder.threads(args.threads).run()?;

    Ok(())
}
42 changes: 22 additions & 20 deletions examples/grep.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
use std::sync::Arc;

use anyhow::Result;
use binseq::{context::SeqCtx, prelude::*};
use binseq::prelude::*;
use clap::Parser;
use memchr::memmem::Finder;
use parking_lot::Mutex;

#[derive(Clone)]
pub struct GrepCounter {
// (thread) local variables
ctx: SeqCtx,
local_count: usize,

// search pattern (using memchr::memmem::Finder for fast searching)
Expand All @@ -21,7 +21,6 @@ impl GrepCounter {
#[must_use]
pub fn new(pattern: &[u8]) -> Self {
Self {
ctx: SeqCtx::default(),
pattern: Finder::new(pattern).into_owned(),
local_count: 0,
count: Arc::new(Mutex::new(0)),
Expand All @@ -38,9 +37,7 @@ impl GrepCounter {
}
impl ParallelProcessor for GrepCounter {
fn process_record<R: binseq::BinseqRecord>(&mut self, record: R) -> binseq::Result<()> {
self.ctx.fill(&record)?;

if self.match_sequence(&self.ctx.sbuf()) || self.match_sequence(&self.ctx.xbuf()) {
if self.match_sequence(&record.sseq()) || self.match_sequence(&record.xseq()) {
self.local_count += 1;
}

Expand All @@ -54,21 +51,26 @@ impl ParallelProcessor for GrepCounter {
}
}

fn main() -> Result<()> {
let path = std::env::args()
.nth(1)
.unwrap_or("./data/subset.bq".to_string());
let pattern = std::env::args()
.nth(2)
.unwrap_or("ACGT".to_string())
.as_bytes()
.to_vec();
let n_threads = std::env::args().nth(3).unwrap_or("1".to_string()).parse()?;
#[derive(Parser)]
struct Args {
/// Input BINSEQ path to grep
#[clap(required = true)]
input: String,

let reader = BinseqReader::new(&path)?;
let counter = GrepCounter::new(&pattern);
reader.process_parallel(counter.clone(), n_threads)?;
counter.pprint();
/// Pattern to search for (either sseq or xseq)
#[clap(required = true)]
pattern: String,

/// Threads to use [0: auto]
#[clap(short = 'T', long, default_value_t = 0)]
threads: usize,
}

fn main() -> Result<()> {
let args = Args::parse();
let reader = BinseqReader::new(&args.input)?;
let counter = GrepCounter::new(args.pattern.as_bytes());
reader.process_parallel(counter.clone(), args.threads)?;
counter.pprint();
Ok(())
}
7 changes: 4 additions & 3 deletions examples/network_streaming.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ use std::io::{BufReader, BufWriter};
use std::net::{TcpListener, TcpStream};
use std::thread;

use binseq::bq::{BinseqHeader, BinseqHeaderBuilder, StreamReader, StreamWriterBuilder};
use binseq::bq::{FileHeader, FileHeaderBuilder, StreamReader, StreamWriterBuilder};
use binseq::{BinseqRecord, Policy, Result};

fn server(header: BinseqHeader, sequence: &[u8]) -> Result<()> {
fn server(header: FileHeader, sequence: &[u8]) -> Result<()> {
// Create a listener on localhost:3000
let listener = TcpListener::bind("127.0.0.1:3000").expect("Failed to bind to address");
println!("Server listening on 127.0.0.1:3000");
Expand All @@ -25,6 +25,7 @@ fn server(header: BinseqHeader, sequence: &[u8]) -> Result<()> {

// Write sequences in a loop
for i in 0..10 {
#[allow(deprecated)]
writer.write_record(Some(i), sequence)?;
println!("Server: Sent record {i}");

Expand Down Expand Up @@ -79,7 +80,7 @@ fn client() -> Result<()> {

fn main() -> Result<()> {
// Create a header for sequences of length 100
let header = BinseqHeaderBuilder::new().slen(100).build()?;
let header = FileHeaderBuilder::new().slen(100).build()?;

// Create some example sequence data
let sequence = b"ACGT".repeat(25); // 100 nucleotides
Expand Down
Loading