diff --git a/cli/src/commands/synthetic.rs b/cli/src/commands/synthetic.rs index e52254f..b821bc2 100644 --- a/cli/src/commands/synthetic.rs +++ b/cli/src/commands/synthetic.rs @@ -1,7 +1,7 @@ use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{BufRead, BufReader, BufWriter, Write}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::Arc; use anyhow::{anyhow, bail, Context, Result}; @@ -13,7 +13,7 @@ use rayon::ThreadPoolBuilder; use serde::Deserialize; use crate::download::ensure_reference_db; -use crate::stats::{ReferenceVariant, StatsStore}; +use crate::stats::{IlluminaProbeRecord, ReferenceVariant, StatsStore}; use crate::{SyntheticArgs, SyntheticFormat}; const HEADER_TEXT: &str = r#"# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 @@ -89,6 +89,7 @@ pub fn run_synthetic(args: SyntheticArgs) -> Result<()> { let overlays = load_overlay_specs(&args)? .map(Arc::new) .unwrap_or_else(|| Arc::new(Vec::new())); + let illumina_manifest = load_illumina_probe_manifest(&args, &store)?.map(Arc::new); let biallelic_choices = if args.biallelic { Some(Arc::new(build_biallelic_choices( references.as_ref(), @@ -122,6 +123,7 @@ pub fn run_synthetic(args: SyntheticArgs) -> Result<()> { plan.seed, args.format, biallelic_choices.as_ref().map(|choices| choices.as_ref()), + illumina_manifest.as_ref().map(|manifest| manifest.as_ref()), ) }) .collect::>>() @@ -171,6 +173,7 @@ fn write_single_file( seed: Option, format: SyntheticFormat, biallelic_choices: Option<&HashMap>, + illumina_manifest: Option<&IlluminaProbeManifest>, ) -> Result { if let Some(parent) = path.parent() { if !parent.as_os_str().is_empty() { @@ -201,18 +204,40 @@ fn write_single_file( .context("write header")?; } SyntheticFormat::Illumina => { - let ref_rsids: HashSet = references.iter().map(|r| r.rsid).collect(); - let extra = overlay_assignments - .keys() - .filter(|rsid| !ref_rsids.contains(rsid)) - .count(); - let num_snps = references.len() + extra; + let num_snps = if let Some(manifest) = illumina_manifest { + count_illumina_manifest_outputs(manifest, references, overlays) + } else { + let ref_rsids: HashSet = references.iter().map(|r| r.rsid).collect(); + let extra = overlay_assignments + .keys() + .filter(|rsid| !ref_rsids.contains(rsid)) + .count(); + references.len() + extra + }; writer .write_all(illumina_header(num_snps).as_bytes()) .context("write header")?; } } + if format == SyntheticFormat::Illumina { + if let Some(manifest) = illumina_manifest { + return write_illumina_manifest_file( + &mut writer, + manifest, + references, + overlays, + &mut overlay_assignments, + &default_frequencies, + no_call_frequency, + no_call_token, + biallelic_choices, + &sample_id, + &mut rng, + ); + } + } + let mut written = 0usize; for reference in references { if let Some(assignment) = overlay_assignments.remove(&reference.rsid) { @@ -227,6 +252,7 @@ fn write_single_file( format, &sample_id, &format!("rs{}", assignment.spec.rsid), + &format!("rs{}", assignment.spec.rsid), &assignment.spec.chromosome, assignment.spec.position, &genotype, @@ -251,6 +277,7 @@ fn write_single_file( format, &sample_id, &format!("rs{}", reference.rsid), + &format!("rs{}", reference.rsid), &reference.chromosome, reference.position, &genotype, @@ -273,6 +300,7 @@ fn write_single_file( format, &sample_id, &format!("rs{}", assignment.spec.rsid), + &format!("rs{}", assignment.spec.rsid), &assignment.spec.chromosome, assignment.spec.position, &genotype, @@ -306,6 +334,201 @@ fn illumina_header(num_snps: usize) -> String { ) } +#[allow(clippy::too_many_arguments)] +fn write_illumina_manifest_file( + writer: &mut BufWriter, + manifest: &IlluminaProbeManifest, + references: &[ReferenceVariant], + overlays: &[OverlaySpec], + overlay_assignments: &mut HashMap, + default_frequencies: &GenotypeFrequencies, + no_call_frequency: f64, + no_call_token: &str, + biallelic_choices: Option<&HashMap>, + sample_id: &str, + rng: &mut StdRng, +) -> Result { + let references_by_rsid = references + .iter() + .map(|reference| (reference.rsid, reference)) + .collect::>(); + let references_by_locus = references + .iter() + .map(|reference| { + ( + (reference.chromosome.clone(), reference.position), + reference, + ) + }) + .collect::>(); + let overlays_by_locus = overlays + .iter() + .map(|overlay| ((overlay.chromosome.clone(), overlay.position), overlay.rsid)) + .collect::>(); + let mut genotype_by_rsid: HashMap = HashMap::new(); + let mut represented_rsids: HashSet = HashSet::new(); + let mut explicit_probe_rsids: HashSet = HashSet::new(); + let mut written = 0usize; + + for probe in &manifest.probes { + let rsid = resolve_probe_rsid(probe, &references_by_locus, &overlays_by_locus); + if let Some(explicit_rsid) = probe.rsid { + explicit_probe_rsids.insert(explicit_rsid); + } + + let genotype = if let Some(rsid) = rsid { + represented_rsids.insert(rsid); + if let Some(existing) = genotype_by_rsid.get(&rsid) { + existing.clone() + } else { + let raw = if let Some(assignment) = overlay_assignments.get(&rsid) { + assignment.genotype.clone() + } else if let Some(reference) = references_by_rsid.get(&rsid) { + synthesize_reference_genotype( + reference, + default_frequencies, + biallelic_choices, + rng, + ) + } else { + synthesize_genotype_from_design(&probe.design, default_frequencies, rng) + }; + genotype_by_rsid.insert(rsid, raw.clone()); + raw + } + } else { + synthesize_genotype_from_design(&probe.design, default_frequencies, rng) + }; + let genotype = apply_no_call(&genotype, no_call_frequency, no_call_token, rng); + + write_row( + writer, + SyntheticFormat::Illumina, + sample_id, + &probe.probe_id, + &probe.probe_id, + &probe.chromosome, + probe.position, + &genotype, + &probe.design, + rng, + )?; + if let Some(rsid) = rsid { + overlay_assignments.remove(&rsid); + } + written += 1; + } + + for reference in references { + if explicit_probe_rsids.contains(&reference.rsid) { + continue; + } + if let Some(clean_rsids) = &manifest.clean_rsids { + if !clean_rsids.contains(&reference.rsid) { + continue; + } + } + let genotype = if let Some(assignment) = overlay_assignments.remove(&reference.rsid) { + assignment.genotype + } else { + synthesize_reference_genotype(reference, default_frequencies, biallelic_choices, rng) + }; + let genotype = apply_no_call(&genotype, no_call_frequency, no_call_token, rng); + let (design_reference, design_alts) = + design_parts_for_reference(reference, biallelic_choices); + let design = design_for(Some(&design_reference), &design_alts, &genotype); + let rsid_label = format!("rs{}", reference.rsid); + write_row( + writer, + SyntheticFormat::Illumina, + sample_id, + &rsid_label, + &rsid_label, + &reference.chromosome, + reference.position, + &genotype, + &design, + rng, + )?; + written += 1; + } + + for assignment in overlay_assignments.values() { + if represented_rsids.contains(&assignment.spec.rsid) { + continue; + } + let genotype = apply_no_call(&assignment.genotype, no_call_frequency, no_call_token, rng); + let rsid_label = format!("rs{}", assignment.spec.rsid); + write_row( + writer, + SyntheticFormat::Illumina, + sample_id, + &rsid_label, + &rsid_label, + &assignment.spec.chromosome, + assignment.spec.position, + &genotype, + &design_for_assignment(assignment, biallelic_choices, &genotype), + rng, + )?; + written += 1; + } + + writer.flush()?; + Ok(written) +} + +fn count_illumina_manifest_outputs( + manifest: &IlluminaProbeManifest, + references: &[ReferenceVariant], + overlays: &[OverlaySpec], +) -> usize { + let represented_rsids = manifest + .probes + .iter() + .filter_map(|probe| probe.rsid) + .collect::>(); + let reference_rsids = references + .iter() + .map(|reference| reference.rsid) + .collect::>(); + let clean_references = references + .iter() + .filter(|reference| !represented_rsids.contains(&reference.rsid)) + .filter(|reference| { + manifest + .clean_rsids + .as_ref() + .map(|clean_rsids| clean_rsids.contains(&reference.rsid)) + .unwrap_or(true) + }) + .count(); + let extra_overlays = overlays + .iter() + .filter(|overlay| { + !reference_rsids.contains(&overlay.rsid) && !represented_rsids.contains(&overlay.rsid) + }) + .count(); + manifest.probes.len() + clean_references + extra_overlays +} + +fn resolve_probe_rsid( + probe: &IlluminaProbe, + references_by_locus: &HashMap<(String, i64), &ReferenceVariant>, + overlays_by_locus: &HashMap<(String, i64), i64>, +) -> Option { + probe.rsid.or_else(|| { + references_by_locus + .get(&(probe.chromosome.clone(), probe.position)) + .map(|reference| reference.rsid) + .or_else(|| { + overlays_by_locus + .get(&(probe.chromosome.clone(), probe.position)) + .copied() + }) + }) +} + /// Split a synthetic genotype string into two single-token alleles. /// No-call / unknown -> ("-", "-"); 2-char -> (first, last); 1-char -> dup. fn split_alleles(genotype: &str) -> (String, String) { @@ -451,6 +674,7 @@ fn write_row( format: SyntheticFormat, sample_id: &str, rsid_label: &str, + illumina_snp_name: &str, chromosome: &str, position: i64, genotype: &str, @@ -481,7 +705,7 @@ fn write_row( "{sid}\t\t{snp}\t{design}\t{chr}\t{pos}\t\ {a1}\t{a2}\t{a1}\t{a2}\t{a1}\t{a2}\t{a1}\t{a2}\t{ab1}\t{ab2}\t+", sid = sample_id, - snp = rsid_label, + snp = illumina_snp_name, design = design, chr = chromosome, pos = position, @@ -543,6 +767,79 @@ fn load_overlay_specs(args: &SyntheticArgs) -> Result>> Ok(Some(specs)) } +fn load_illumina_probe_manifest( + args: &SyntheticArgs, + store: &StatsStore, +) -> Result> { + if args.format != SyntheticFormat::Illumina || args.clean_illumina_rsids { + return Ok(None); + } + if let Some(path) = args.illumina_probe_manifest.as_deref() { + return load_illumina_probe_manifest_tsv(path); + } + let probes = store + .all_illumina_probe_manifest()? + .into_iter() + .map(IlluminaProbe::from_record) + .collect::>(); + let clean_rsids = store.all_illumina_clean_rsids()?; + if probes.is_empty() { + Ok(None) + } else { + Ok(Some(IlluminaProbeManifest { + probes, + clean_rsids: Some(clean_rsids), + })) + } +} + +fn load_illumina_probe_manifest_tsv(path: &Path) -> Result> { + let mut reader = csv::ReaderBuilder::new() + .delimiter(b'\t') + .flexible(true) + .from_path(path) + .with_context(|| format!("Open Illumina probe manifest {:?}", path))?; + let mut probes = Vec::new(); + for row in reader.deserialize::() { + let raw = row.with_context(|| format!("Parse Illumina probe manifest {:?}", path))?; + let probe = IlluminaProbe::from_raw(raw)?; + probes.push(probe); + } + if probes.is_empty() { + bail!("Illumina probe manifest {:?} has no probes", path); + } + Ok(Some(IlluminaProbeManifest { + probes, + clean_rsids: None, + })) +} + +fn synthesize_genotype_from_design( + design: &str, + default_frequencies: &GenotypeFrequencies, + rng: &mut StdRng, +) -> String { + let alleles = design_alleles(design); + if alleles.is_empty() { + return "--".to_string(); + } + let reference = alleles[0].clone(); + let alternates = alleles.into_iter().skip(1).collect::>(); + synthesize_genotype_from_parts(&reference, &alternates, default_frequencies, rng) +} + +fn design_alleles(design: &str) -> Vec { + design + .trim() + .trim_start_matches('[') + .trim_end_matches(']') + .split('/') + .map(|allele| allele.trim()) + .filter(|allele| !allele.is_empty() && *allele != "-") + .map(|allele| allele.to_string()) + .collect() +} + fn prepare_overlay_assignments( overlays: &[OverlaySpec], default_frequencies: GenotypeFrequencies, @@ -928,6 +1225,63 @@ struct OverlayVariantRaw { genotype_frequencies: Option, } +#[derive(Debug, Clone)] +struct IlluminaProbeManifest { + probes: Vec, + clean_rsids: Option>, +} + +#[derive(Debug, Clone)] +struct IlluminaProbe { + probe_id: String, + rsid: Option, + design: String, + chromosome: String, + position: i64, +} + +#[derive(Debug, Deserialize)] +struct IlluminaProbeRaw { + probe_id: String, + #[serde(default)] + rsid: String, + design: String, + chrom: String, + pos: i64, +} + +impl IlluminaProbe { + fn from_record(record: IlluminaProbeRecord) -> Self { + Self { + probe_id: record.probe_id, + rsid: record.rsid, + design: record.design, + chromosome: record.chromosome, + position: record.position, + } + } + + fn from_raw(raw: IlluminaProbeRaw) -> Result { + let rsid = parse_optional_rsid(&raw.rsid) + .with_context(|| format!("parse manifest rsid {}", raw.rsid))?; + Ok(Self { + probe_id: raw.probe_id, + rsid, + design: raw.design, + chromosome: raw.chrom, + position: raw.pos, + }) + } +} + +fn parse_optional_rsid(value: &str) -> Result> { + let value = value.trim(); + if value.is_empty() { + return Ok(None); + } + Ok(Some(value.trim_start_matches("rs").parse::()?)) +} + fn synthesize_genotype( reference: &ReferenceVariant, default_frequencies: &GenotypeFrequencies, diff --git a/cli/src/main.rs b/cli/src/main.rs index 430fbb3..b35943c 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -362,6 +362,12 @@ pub struct SyntheticArgs { /// Optional JSON file describing overlay variants to force/include. #[arg(long = "variants-file")] pub variants_file: Option, + /// Optional Illumina probe manifest TSV with probe_id, rsid, design, chrom, pos columns. + #[arg(long = "illumina-probe-manifest")] + pub illumina_probe_manifest: Option, + /// Disable realistic Illumina probe names and emit clean rsid SNP Name values. + #[arg(long = "clean-illumina-rsids", action = ArgAction::SetTrue)] + pub clean_illumina_rsids: bool, /// Inline JSON describing overlay variants (use instead of --variants-file). #[arg(long = "variants-json")] pub variants_json: Option, diff --git a/cli/src/stats.rs b/cli/src/stats.rs index aacfdf9..c058dd6 100644 --- a/cli/src/stats.rs +++ b/cli/src/stats.rs @@ -18,6 +18,15 @@ pub struct ReferenceVariant { pub alternates: String, } +#[derive(Debug, Clone)] +pub struct IlluminaProbeRecord { + pub probe_id: String, + pub rsid: Option, + pub design: String, + pub chromosome: String, + pub position: i64, +} + #[derive(Debug, Clone)] pub struct StatsStore { sqlite_path: PathBuf, @@ -242,6 +251,44 @@ impl StatsStore { Ok(out) } + pub fn all_illumina_probe_manifest(&self) -> Result> { + let conn = self.open_connection()?; + if !table_exists(&conn, "illumina_probe_manifest")? { + return Ok(Vec::new()); + } + let mut stmt = conn.prepare( + "SELECT probe_id, rsid, design, chromosome, position + FROM illumina_probe_manifest + ORDER BY row_order", + )?; + let mut rows = stmt.query([])?; + let mut out = Vec::new(); + while let Some(row) = rows.next()? { + out.push(IlluminaProbeRecord { + probe_id: row.get(0)?, + rsid: row.get(1)?, + design: row.get(2)?, + chromosome: row.get(3)?, + position: row.get(4)?, + }); + } + Ok(out) + } + + pub fn all_illumina_clean_rsids(&self) -> Result> { + let conn = self.open_connection()?; + if !table_exists(&conn, "illumina_clean_rsids")? { + return Ok(HashSet::new()); + } + let mut stmt = conn.prepare("SELECT rsid FROM illumina_clean_rsids")?; + let mut rows = stmt.query([])?; + let mut rsids = HashSet::new(); + while let Some(row) = rows.next()? { + rsids.insert(row.get(0)?); + } + Ok(rsids) + } + pub fn known_rsids(&self) -> Result> { let conn = self.open_connection()?; let mut stmt = conn.prepare( @@ -419,12 +466,27 @@ fn init_schema(conn: &Connection) -> Result<()> { source TEXT NOT NULL DEFAULT 'ensembl_grch38', note TEXT ); + CREATE TABLE IF NOT EXISTS illumina_probe_manifest ( + row_order INTEGER PRIMARY KEY, + probe_id TEXT NOT NULL, + rsid INTEGER, + design TEXT NOT NULL, + chromosome TEXT NOT NULL, + position INTEGER NOT NULL + ); + CREATE TABLE IF NOT EXISTS illumina_clean_rsids ( + rsid INTEGER PRIMARY KEY + ); CREATE INDEX IF NOT EXISTS idx_rsid_reference_pos ON rsid_reference(chromosome, position); CREATE INDEX IF NOT EXISTS idx_grch38_non_rsids_pos ON grch38_non_rsids(chromosome, position); CREATE INDEX IF NOT EXISTS idx_grch38_non_rsids_rsid ON grch38_non_rsids(rsid); + CREATE INDEX IF NOT EXISTS idx_illumina_probe_manifest_rsid + ON illumina_probe_manifest(rsid); + CREATE INDEX IF NOT EXISTS idx_illumina_probe_manifest_pos + ON illumina_probe_manifest(chromosome, position); "#, )?; Ok(()) diff --git a/data/genostats.sqlite b/data/genostats.sqlite index 25886f6..f0bc303 100644 Binary files a/data/genostats.sqlite and b/data/genostats.sqlite differ