From 7d3b7aa6e4d0269e64bcf1e4f842ff4182d992e5 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Fri, 13 Jun 2025 13:28:18 -0600 Subject: [PATCH 01/24] Add scraper for output --- Cargo.toml | 2 +- libs/output_scraper/Cargo.toml | 6 ++++++ libs/output_scraper/src/main.rs | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 libs/output_scraper/Cargo.toml create mode 100644 libs/output_scraper/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 546b30f7..57dcfbe0 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,2 @@ [workspace] -members = ["libs/graph_analyze", "libs/super_source_and_sink","libs/FmAssemblyGraph"] +members = ["libs/graph_analyze", "libs/super_source_and_sink","libs/FmAssemblyGraph", "libs/output_scraper"] diff --git a/libs/output_scraper/Cargo.toml b/libs/output_scraper/Cargo.toml new file mode 100644 index 00000000..5e142277 --- /dev/null +++ b/libs/output_scraper/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "output_scraper" +version = "0.1.0" +edition = "2021" + +[dependencies] diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs new file mode 100644 index 00000000..e7a11a96 --- /dev/null +++ b/libs/output_scraper/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} From 265e8fb8033c6f5f47e8776962117febcc680325 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Mon, 16 Jun 2025 15:29:03 -0600 Subject: [PATCH 02/24] Change needle to waterman command --- findviralstrains_2.smk | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/findviralstrains_2.smk b/findviralstrains_2.smk index 1335f356..cadb0e1f 100755 --- a/findviralstrains_2.smk +++ b/findviralstrains_2.smk @@ -289,7 +289,7 @@ rule Rebuild_3: shell: "python3 {input.script} {input.flow} {input.swg} {params.outtemp}" -# Compares our newly constructed genomes to original covid reference using Needleman-Wunsch # +# Compares our newly constructed genomes to original covid reference using waterman-Wunsch # rule Compare_1: input: rebuilt_genome = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_1_of_1.fasta"), @@ -297,7 +297,7 @@ rule Compare_1: output: compar_file = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_1_of_1_vs_ref.txt") shell: - "needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file}" + "water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file}" # Compares genomes from the two path result to the reference # rule Compare_2: @@ -310,8 +310,8 @@ rule Compare_2: compar_file_2 = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_2_of_2_vs_ref.txt") shell: """ - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1} - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2} + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1} + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2} """ # Compares genomes from the three path result to the reference # @@ -327,7 +327,7 @@ rule Compare_3: compar_file_3 = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_3_of_3_vs_ref.txt") shell: """ - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1} - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2} - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_3} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_3} - """ \ No newline at end of file + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1} + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2} + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_3} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_3} + """ From 37ad2a26131e49f8c173cb40ad3e7567558cb5eb Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 18 Jun 2025 14:47:25 -0600 Subject: [PATCH 03/24] Add initial version of output scraper (to be changed) --- libs/FmAssemblyGraph | 2 +- libs/output_scraper/Cargo.toml | 1 + libs/output_scraper/src/main.rs | 161 +++++++++++++++++++++++++++++++- 3 files changed, 161 insertions(+), 3 deletions(-) diff --git a/libs/FmAssemblyGraph b/libs/FmAssemblyGraph index affc5bc3..120c78ab 160000 --- a/libs/FmAssemblyGraph +++ b/libs/FmAssemblyGraph @@ -1 +1 @@ -Subproject commit affc5bc31d3fc815b0898e91e32a210e54764e6a +Subproject commit 120c78abcc7d6363d69b46f8ca7de71c9a28668c diff --git a/libs/output_scraper/Cargo.toml b/libs/output_scraper/Cargo.toml index 5e142277..8a7e64fa 100644 --- a/libs/output_scraper/Cargo.toml +++ b/libs/output_scraper/Cargo.toml @@ -4,3 +4,4 @@ version = "0.1.0" edition = "2021" [dependencies] +csv = "1.1" diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index e7a11a96..8f67dbf2 100644 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -1,3 +1,160 @@ -fn main() { - println!("Hello, world!"); +use std::path::{Path, PathBuf}; +use std::fs::{self, File}; +use std::io::{BufRead, BufReader, Write}; +use std::collections::BTreeMap; // Changed from HashMap for ordered output + +#[derive(Debug)] +struct AlignmentStats { + file_name: String, + length: usize, + identity_pct: f64, + gaps_pct: f64, + score: f64, + start_position: usize, + end_position: usize, +} + +fn main() -> std::io::Result<()> { + let args: Vec = std::env::args().collect(); + if args.len() != 3 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + let input_dir = Path::new(&args[1]); + let output_path = Path::new(&args[2]); + let mut results = BTreeMap::new(); // Key: subgraph name, Value: Vec of stats + + // Walk through the directory structure + for entry in fs::read_dir(input_dir)? { + let entry = entry?; + let path = entry.path(); + + if path.is_dir() { + if let Some(dir_name) = path.file_name() { + let dir_name = dir_name.to_string_lossy(); + if dir_name.starts_with("subgraph_") { + process_subgraph_dir(&path, &dir_name, &mut results)?; + } + } + } + } + + // Write formatted results + write_formatted_output(output_path, &results)?; + + println!("Successfully processed {} subgraphs, output written to {}", + results.len(), + output_path.display()); + + Ok(()) +} + +fn process_subgraph_dir(dir: &Path, subgraph: &str, results: &mut BTreeMap>) -> std::io::Result<()> { + let mut subgraph_results = Vec::new(); + + for entry in fs::read_dir(dir)? { + let entry = entry?; + let path = entry.path(); + + if path.is_file() { + if let Some(file_name) = path.file_name() { + let file_name = file_name.to_string_lossy(); + if file_name.ends_with("_vs_ref.txt") && file_name.contains("1_of_1") { + if let Ok(stats) = parse_alignment_file(&path) { + subgraph_results.push(stats); + } + } + } + } + } + + if !subgraph_results.is_empty() { + results.insert(subgraph.to_string(), subgraph_results); + } + Ok(()) +} + +fn parse_alignment_file(file_path: &Path) -> std::io::Result { + let file = fs::File::open(file_path)?; + let reader = BufReader::new(file); + + let file_name = file_path.file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown") + .to_string(); + + let mut stats = AlignmentStats { + file_name, + length: 0, + identity_pct: 0.0, + gaps_pct: 0.0, + score: 0.0, + start_position: 0, + end_position: 0, + }; + + for line in reader.lines() { + let line = line?; + + if line.starts_with("# Length: ") { + stats.length = line[10..].trim().parse().unwrap_or(0); + } + else if line.starts_with("# Identity: ") { + let identity_str = line[12..].trim(); + stats.identity_pct = parse_percentage(identity_str); + } + else if line.starts_with("# Gaps: ") { + let gaps_str = line[8..].trim(); + stats.gaps_pct = parse_percentage(gaps_str); + } + else if line.starts_with("# Score: ") { + stats.score = line[9..].trim().parse().unwrap_or(0.0); + } + else if line.starts_with("NC_045512.2") { + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + if stats.start_position == 0 { + stats.start_position = parts[1].parse().unwrap_or(0); + } + stats.end_position = parts.last().and_then(|s| s.parse().ok()).unwrap_or(0); + } + } + } + + Ok(stats) +} + +fn parse_percentage(s: &str) -> f64 { + s.split('(').nth(1) + .and_then(|s| s.split('%').next()) + .and_then(|s| s.trim().parse().ok()) + .unwrap_or(0.0) +} + +fn write_formatted_output(output_path: &Path, results: &BTreeMap>) -> std::io::Result<()> { + let mut file = File::create(output_path)?; + + for (subgraph, stats_vec) in results { + writeln!(file, "╔══════════════════════════════════════╗")?; + writeln!(file, "║ Subgraph: {:<26} ║", subgraph)?; + writeln!(file, "╠══════════════════════════════════════╣")?; + + for stats in stats_vec { + writeln!(file, "║ File: {:<30} ║", stats.file_name)?; + writeln!(file, "║ Length: {:<26} ║", stats.length)?; + writeln!(file, "║ Identity: {:>5.1}% {:<18} ║", + stats.identity_pct, + format!("({}/{})", (stats.identity_pct/100.0 * stats.length as f64) as usize, stats.length))?; + writeln!(file, "║ Gaps: {:>5.1}% {:<20} ║", + stats.gaps_pct, + format!("({}/{})", (stats.gaps_pct/100.0 * stats.length as f64) as usize, stats.length))?; + writeln!(file, "║ Score: {:<26.1} ║", stats.score)?; + writeln!(file, "║ Positions: {}-{:<18} ║", stats.start_position, stats.end_position)?; + writeln!(file, "╠──────────────────────────────────────╣")?; + } + } + + writeln!(file, "╚══════════════════════════════════════╝")?; + Ok(()) } From 872591958954581cca71f05fcaf092d86f297208 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 18 Jun 2025 15:22:12 -0600 Subject: [PATCH 04/24] Change output to cvs --- libs/output_scraper/src/main.rs | 118 +++++++++++++++++++------------- 1 file changed, 69 insertions(+), 49 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 8f67dbf2..b64c8dae 100644 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -1,14 +1,17 @@ use std::path::{Path, PathBuf}; use std::fs::{self, File}; -use std::io::{BufRead, BufReader, Write}; -use std::collections::BTreeMap; // Changed from HashMap for ordered output +use std::io::{BufRead, BufReader}; +use std::collections::BTreeMap; +use csv::Writer; #[derive(Debug)] struct AlignmentStats { - file_name: String, + sample_name: String, length: usize, identity_pct: f64, + identity_count: usize, gaps_pct: f64, + gaps_count: usize, score: f64, start_position: usize, end_position: usize, @@ -17,15 +20,15 @@ struct AlignmentStats { fn main() -> std::io::Result<()> { let args: Vec = std::env::args().collect(); if args.len() != 3 { - eprintln!("Usage: {} ", args[0]); + eprintln!("Usage: {} ", args[0]); std::process::exit(1); } let input_dir = Path::new(&args[1]); let output_path = Path::new(&args[2]); - let mut results = BTreeMap::new(); // Key: subgraph name, Value: Vec of stats + let mut results = BTreeMap::new(); - // Walk through the directory structure + // Process each subgraph directory for entry in fs::read_dir(input_dir)? { let entry = entry?; let path = entry.path(); @@ -34,14 +37,16 @@ fn main() -> std::io::Result<()> { if let Some(dir_name) = path.file_name() { let dir_name = dir_name.to_string_lossy(); if dir_name.starts_with("subgraph_") { - process_subgraph_dir(&path, &dir_name, &mut results)?; + if let Some((sample_name, stats)) = process_subgraph_dir(&path)? { + results.insert(dir_name.to_string(), (sample_name, stats)); + } } } } } - // Write formatted results - write_formatted_output(output_path, &results)?; + // Write CSV output + write_csv_output(output_path, &results)?; println!("Successfully processed {} subgraphs, output written to {}", results.len(), @@ -50,9 +55,7 @@ fn main() -> std::io::Result<()> { Ok(()) } -fn process_subgraph_dir(dir: &Path, subgraph: &str, results: &mut BTreeMap>) -> std::io::Result<()> { - let mut subgraph_results = Vec::new(); - +fn process_subgraph_dir(dir: &Path) -> std::io::Result> { for entry in fs::read_dir(dir)? { let entry = entry?; let path = entry.path(); @@ -61,34 +64,30 @@ fn process_subgraph_dir(dir: &Path, subgraph: &str, results: &mut BTreeMap>() + .join("_"); + return Ok(Some((sample_name, parse_alignment_file(&path)?))); } } } } - - if !subgraph_results.is_empty() { - results.insert(subgraph.to_string(), subgraph_results); - } - Ok(()) + Ok(None) } fn parse_alignment_file(file_path: &Path) -> std::io::Result { let file = fs::File::open(file_path)?; let reader = BufReader::new(file); - - let file_name = file_path.file_name() - .and_then(|n| n.to_str()) - .unwrap_or("unknown") - .to_string(); let mut stats = AlignmentStats { - file_name, + sample_name: String::new(), length: 0, identity_pct: 0.0, + identity_count: 0, gaps_pct: 0.0, + gaps_count: 0, score: 0.0, start_position: 0, end_position: 0, @@ -103,10 +102,12 @@ fn parse_alignment_file(file_path: &Path) -> std::io::Result { else if line.starts_with("# Identity: ") { let identity_str = line[12..].trim(); stats.identity_pct = parse_percentage(identity_str); + stats.identity_count = parse_count(identity_str); } else if line.starts_with("# Gaps: ") { let gaps_str = line[8..].trim(); stats.gaps_pct = parse_percentage(gaps_str); + stats.gaps_count = parse_count(gaps_str); } else if line.starts_with("# Score: ") { stats.score = line[9..].trim().parse().unwrap_or(0.0); @@ -132,29 +133,48 @@ fn parse_percentage(s: &str) -> f64 { .unwrap_or(0.0) } -fn write_formatted_output(output_path: &Path, results: &BTreeMap>) -> std::io::Result<()> { - let mut file = File::create(output_path)?; - - for (subgraph, stats_vec) in results { - writeln!(file, "╔══════════════════════════════════════╗")?; - writeln!(file, "║ Subgraph: {:<26} ║", subgraph)?; - writeln!(file, "╠══════════════════════════════════════╣")?; - - for stats in stats_vec { - writeln!(file, "║ File: {:<30} ║", stats.file_name)?; - writeln!(file, "║ Length: {:<26} ║", stats.length)?; - writeln!(file, "║ Identity: {:>5.1}% {:<18} ║", - stats.identity_pct, - format!("({}/{})", (stats.identity_pct/100.0 * stats.length as f64) as usize, stats.length))?; - writeln!(file, "║ Gaps: {:>5.1}% {:<20} ║", - stats.gaps_pct, - format!("({}/{})", (stats.gaps_pct/100.0 * stats.length as f64) as usize, stats.length))?; - writeln!(file, "║ Score: {:<26.1} ║", stats.score)?; - writeln!(file, "║ Positions: {}-{:<18} ║", stats.start_position, stats.end_position)?; - writeln!(file, "╠──────────────────────────────────────╣")?; - } +fn parse_count(s: &str) -> usize { + s.split('/').next() + .and_then(|s| s.trim().parse().ok()) + .unwrap_or(0) +} + +fn write_csv_output(output_path: &Path, results: &BTreeMap) -> std::io::Result<()> { + let mut writer = Writer::from_path(output_path)?; + + // Write header + writer.write_record(&[ + "Subgraph", + "Sample", + "Length", + "Identity %", + "Identity Count", + "Gaps %", + "Gaps Count", + "Score", + "Start Position", + "End Position", + "Alignment Length", + ])?; + + // Write data with one row per subgraph + for (subgraph, (sample_name, stats)) in results { + let alignment_length = stats.end_position - stats.start_position + 1; + writer.write_record(&[ + subgraph, + sample_name, + &stats.length.to_string(), + &format!("{:.1}", stats.identity_pct), + &stats.identity_count.to_string(), + &format!("{:.1}", stats.gaps_pct), + &stats.gaps_count.to_string(), + &format!("{:.1}", stats.score), + &stats.start_position.to_string(), + &stats.end_position.to_string(), + &alignment_length.to_string(), + ])?; } - - writeln!(file, "╚══════════════════════════════════════╝")?; + + writer.flush()?; Ok(()) } From cc1287b4ed51e9fd273de9864fd18cc224a9c38b Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 18 Jun 2025 15:30:15 -0600 Subject: [PATCH 05/24] Change csv to include # of paths: --- libs/output_scraper/src/main.rs | 37 +++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index b64c8dae..97fa7a36 100644 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -7,6 +7,8 @@ use csv::Writer; #[derive(Debug)] struct AlignmentStats { sample_name: String, + part_number: usize, // First number (1 in "1_of_3") // + total_parts: usize, // Second number (3 in "1_of_3") // length: usize, identity_pct: f64, identity_count: usize, @@ -28,7 +30,7 @@ fn main() -> std::io::Result<()> { let output_path = Path::new(&args[2]); let mut results = BTreeMap::new(); - // Process each subgraph directory + // Process each subgraph directory // for entry in fs::read_dir(input_dir)? { let entry = entry?; let path = entry.path(); @@ -63,13 +65,17 @@ fn process_subgraph_dir(dir: &Path) -> std::io::Result>() .join("_"); - return Ok(Some((sample_name, parse_alignment_file(&path)?))); + + // Extract the subgraph numbers // + let part_numbers = extract_part_numbers(&file_name); + + return Ok(Some((sample_name, parse_alignment_file(&path, part_numbers)?))); } } } @@ -77,12 +83,27 @@ fn process_subgraph_dir(dir: &Path) -> std::io::Result std::io::Result { +fn extract_part_numbers(filename: &str) -> (usize, usize) { + let parts: Vec<&str> = filename.split('_').collect(); + if parts.len() >= 5 { + if let (Ok(current), Ok(total)) = ( + parts[3].parse::(), + parts[5].parse::(), + ) { + return (current, total); + } + } + (1, 1) // Default values if parsing fails // +} + +fn parse_alignment_file(file_path: &Path, part_numbers: (usize, usize)) -> std::io::Result { let file = fs::File::open(file_path)?; let reader = BufReader::new(file); let mut stats = AlignmentStats { sample_name: String::new(), + part_number: part_numbers.0, + total_parts: part_numbers.0, length: 0, identity_pct: 0.0, identity_count: 0, @@ -146,6 +167,8 @@ fn write_csv_output(output_path: &Path, results: &BTreeMap Date: Thu, 19 Jun 2025 14:45:26 -0600 Subject: [PATCH 06/24] Change to run on all samples and all subgraphs --- libs/output_scraper/src/main.rs | 117 +++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 40 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 97fa7a36..7553d8d3 100644 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -7,8 +7,9 @@ use csv::Writer; #[derive(Debug)] struct AlignmentStats { sample_name: String, - part_number: usize, // First number (1 in "1_of_3") // - total_parts: usize, // Second number (3 in "1_of_3") // + subgraph_name: String, // Added to track subgraph directory + part_number: usize, + total_parts: usize, length: usize, identity_pct: f64, identity_count: usize, @@ -28,36 +29,60 @@ fn main() -> std::io::Result<()> { let input_dir = Path::new(&args[1]); let output_path = Path::new(&args[2]); - let mut results = BTreeMap::new(); + let mut results = Vec::new(); - // Process each subgraph directory // - for entry in fs::read_dir(input_dir)? { - let entry = entry?; - let path = entry.path(); + // Process each sample directory + for sample_entry in fs::read_dir(input_dir)? { + let sample_entry = sample_entry?; + let sample_path = sample_entry.path(); - if path.is_dir() { - if let Some(dir_name) = path.file_name() { - let dir_name = dir_name.to_string_lossy(); - if dir_name.starts_with("subgraph_") { - if let Some((sample_name, stats)) = process_subgraph_dir(&path)? { - results.insert(dir_name.to_string(), (sample_name, stats)); + if sample_path.is_dir() { + let sample_name = sample_path.file_name() + .unwrap_or_default() + .to_string_lossy() + .to_string(); + + // Process each subgraph directory in the sample directory + for subgraph_entry in fs::read_dir(&sample_path)? { + let subgraph_entry = subgraph_entry?; + let subgraph_path = subgraph_entry.path(); + + if subgraph_path.is_dir() { + if let Some(dir_name) = subgraph_path.file_name() { + let subgraph_name = dir_name.to_string_lossy().to_string(); + if subgraph_name.starts_with("subgraph_") { + if let Some(stats_vec) = process_subgraph_dir(&subgraph_path, &sample_name, &subgraph_name)? { + results.extend(stats_vec); + } + } } } } + + // Also check for files directly in the sample directory (like subgraph_0 might be missing) + if let Some(stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root")? { + results.extend(stats_vec); + } } } // Write CSV output write_csv_output(output_path, &results)?; - println!("Successfully processed {} subgraphs, output written to {}", + println!("Successfully processed {} alignment files, output written to {}", results.len(), output_path.display()); Ok(()) } -fn process_subgraph_dir(dir: &Path) -> std::io::Result> { +fn process_subgraph_dir(dir: &Path, sample_name: &str, subgraph_name: &str) -> std::io::Result>> { + process_files_in_dir(dir, sample_name, subgraph_name) +} + +fn process_files_in_dir(dir: &Path, sample_name: &str, subgraph_name: &str) -> std::io::Result>> { + let mut stats_vec = Vec::new(); + for entry in fs::read_dir(dir)? { let entry = entry?; let path = entry.path(); @@ -66,44 +91,56 @@ fn process_subgraph_dir(dir: &Path) -> std::io::Result>() - .join("_"); - - // Extract the subgraph numbers // + // Extract the part numbers let part_numbers = extract_part_numbers(&file_name); - return Ok(Some((sample_name, parse_alignment_file(&path, part_numbers)?))); + stats_vec.push(parse_alignment_file( + &path, + sample_name.to_string(), + subgraph_name.to_string(), + part_numbers + )?); } } } } - Ok(None) + + if stats_vec.is_empty() { + Ok(None) + } else { + Ok(Some(stats_vec)) + } } fn extract_part_numbers(filename: &str) -> (usize, usize) { let parts: Vec<&str> = filename.split('_').collect(); - if parts.len() >= 5 { - if let (Ok(current), Ok(total)) = ( - parts[3].parse::(), - parts[5].parse::(), - ) { - return (current, total); + for i in 0..parts.len() { + if parts[i] == "of" && i > 0 && i < parts.len() - 1 { + if let (Ok(current), Ok(total)) = ( + parts[i-1].parse::(), + parts[i+1].parse::(), + ) { + return (current, total); + } } } - (1, 1) // Default values if parsing fails // + (1, 1) // Default values if parsing fails } -fn parse_alignment_file(file_path: &Path, part_numbers: (usize, usize)) -> std::io::Result { +fn parse_alignment_file( + file_path: &Path, + sample_name: String, + subgraph_name: String, + part_numbers: (usize, usize) +) -> std::io::Result { let file = fs::File::open(file_path)?; let reader = BufReader::new(file); let mut stats = AlignmentStats { - sample_name: String::new(), + sample_name, + subgraph_name, part_number: part_numbers.0, - total_parts: part_numbers.0, + total_parts: part_numbers.1, length: 0, identity_pct: 0.0, identity_count: 0, @@ -160,13 +197,13 @@ fn parse_count(s: &str) -> usize { .unwrap_or(0) } -fn write_csv_output(output_path: &Path, results: &BTreeMap) -> std::io::Result<()> { +fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io::Result<()> { let mut writer = Writer::from_path(output_path)?; // Write header writer.write_record(&[ - "Subgraph", "Sample", + "Subgraph", "Part", "Total Parts", "Length", @@ -180,12 +217,12 @@ fn write_csv_output(output_path: &Path, results: &BTreeMap Date: Thu, 19 Jun 2025 14:51:28 -0600 Subject: [PATCH 07/24] Change csv header and sort output --- libs/output_scraper/src/main.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 7553d8d3..4a51983d 100644 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -7,7 +7,7 @@ use csv::Writer; #[derive(Debug)] struct AlignmentStats { sample_name: String, - subgraph_name: String, // Added to track subgraph directory + subgraph_name: String, part_number: usize, total_parts: usize, length: usize, @@ -59,13 +59,21 @@ fn main() -> std::io::Result<()> { } } - // Also check for files directly in the sample directory (like subgraph_0 might be missing) + // Also check for files directly in the sample directory if let Some(stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root")? { results.extend(stats_vec); } } } + // Sort results by sample, then subgraph, then total parts, then part number + results.sort_by(|a, b| { + a.sample_name.cmp(&b.sample_name) + .then(a.subgraph_name.cmp(&b.subgraph_name)) + .then(a.total_parts.cmp(&b.total_parts)) + .then(a.part_number.cmp(&b.part_number)) + }); + // Write CSV output write_csv_output(output_path, &results)?; @@ -91,7 +99,6 @@ fn process_files_in_dir(dir: &Path, sample_name: &str, subgraph_name: &str) -> s if let Some(file_name) = path.file_name() { let file_name = file_name.to_string_lossy(); if file_name.ends_with("_vs_ref.txt") { - // Extract the part numbers let part_numbers = extract_part_numbers(&file_name); stats_vec.push(parse_alignment_file( @@ -200,12 +207,11 @@ fn parse_count(s: &str) -> usize { fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io::Result<()> { let mut writer = Writer::from_path(output_path)?; - // Write header writer.write_record(&[ "Sample", "Subgraph", - "Part", - "Total Parts", + "Path", + "Total Paths", "Length", "Identity %", "Identity Count", @@ -217,7 +223,6 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: "Alignment Length", ])?; - // Write data for stats in results { let alignment_length = stats.end_position - stats.start_position + 1; writer.write_record(&[ From a7da5a5e99903b253b910e8f0974102f5ea0e49b Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 19 Jun 2025 15:41:50 -0600 Subject: [PATCH 08/24] Add runtime and objective values --- libs/output_scraper/src/main.rs | 78 ++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 4a51983d..67430c4f 100644 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -18,6 +18,14 @@ struct AlignmentStats { score: f64, start_position: usize, end_position: usize, + runtime: f64, + objective_value: f64, +} + +#[derive(Debug)] +struct DecompStats { + runtime: f64, + objective_value: f64, } fn main() -> std::io::Result<()> { @@ -28,6 +36,7 @@ fn main() -> std::io::Result<()> { } let input_dir = Path::new(&args[1]); + let decomp_dir = input_dir.join("../decomp_results"); let output_path = Path::new(&args[2]); let mut results = Vec::new(); @@ -51,7 +60,8 @@ fn main() -> std::io::Result<()> { if let Some(dir_name) = subgraph_path.file_name() { let subgraph_name = dir_name.to_string_lossy().to_string(); if subgraph_name.starts_with("subgraph_") { - if let Some(stats_vec) = process_subgraph_dir(&subgraph_path, &sample_name, &subgraph_name)? { + if let Some(mut stats_vec) = process_subgraph_dir(&subgraph_path, &sample_name, &subgraph_name)? { + add_decomp_stats(&decomp_dir, &mut stats_vec)?; results.extend(stats_vec); } } @@ -60,13 +70,14 @@ fn main() -> std::io::Result<()> { } // Also check for files directly in the sample directory - if let Some(stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root")? { + if let Some(mut stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root")? { + add_decomp_stats(&decomp_dir, &mut stats_vec)?; results.extend(stats_vec); } } } - // Sort results by sample, then subgraph, then total parts, then part number + // Sort results results.sort_by(|a, b| { a.sample_name.cmp(&b.sample_name) .then(a.subgraph_name.cmp(&b.subgraph_name)) @@ -84,6 +95,57 @@ fn main() -> std::io::Result<()> { Ok(()) } +fn add_decomp_stats(decomp_dir: &Path, stats_vec: &mut Vec) -> std::io::Result<()> { + for stat in stats_vec { + if let Some(decomp_stats) = get_decomp_stats(decomp_dir, &stat.sample_name, &stat.subgraph_name, stat.part_number)? { + stat.runtime = decomp_stats.runtime; + stat.objective_value = decomp_stats.objective_value; + } + } + Ok(()) +} + +fn get_decomp_stats(decomp_dir: &Path, sample_name: &str, subgraph_name: &str, part_number: usize) -> std::io::Result> { + let pattern = format!("{}_{}_{}.paths", sample_name, subgraph_name, part_number); + + for entry in fs::read_dir(decomp_dir)? { + let entry = entry?; + let path = entry.path(); + + if let Some(file_name) = path.file_name() { + let file_name = file_name.to_string_lossy(); + if file_name.ends_with(&pattern) { + return parse_decomp_file(&path); + } + } + } + + Ok(None) +} + +fn parse_decomp_file(file_path: &Path) -> std::io::Result> { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + + let mut runtime = 0.0; + let mut objective_value = 0.0; + + for line in reader.lines() { + let line = line?; + + if line.starts_with("Runtime: ") { + runtime = line.split_whitespace().nth(1).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } else if line.starts_with("Objective Value: ") { + objective_value = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } + } + + Ok(Some(DecompStats { + runtime, + objective_value, + })) +} + fn process_subgraph_dir(dir: &Path, sample_name: &str, subgraph_name: &str) -> std::io::Result>> { process_files_in_dir(dir, sample_name, subgraph_name) } @@ -131,7 +193,7 @@ fn extract_part_numbers(filename: &str) -> (usize, usize) { } } } - (1, 1) // Default values if parsing fails + (1, 1) } fn parse_alignment_file( @@ -140,7 +202,7 @@ fn parse_alignment_file( subgraph_name: String, part_numbers: (usize, usize) ) -> std::io::Result { - let file = fs::File::open(file_path)?; + let file = File::open(file_path)?; let reader = BufReader::new(file); let mut stats = AlignmentStats { @@ -156,6 +218,8 @@ fn parse_alignment_file( score: 0.0, start_position: 0, end_position: 0, + runtime: 0.0, + objective_value: 0.0, }; for line in reader.lines() { @@ -221,6 +285,8 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: "Start Position", "End Position", "Alignment Length", + "Runtime (s)", + "Objective Value", ])?; for stats in results { @@ -239,6 +305,8 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: &stats.start_position.to_string(), &stats.end_position.to_string(), &alignment_length.to_string(), + &format!("{:.4}", stats.runtime), + &format!("{:.6}", stats.objective_value), ])?; } From 28acbc4ef630ed8f87308294339b84972bcf4f26 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Fri, 20 Jun 2025 10:29:55 -0600 Subject: [PATCH 09/24] Change where decomp time is pulled from --- libs/output_scraper/src/main.rs | 80 +++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 67430c4f..3aeb4990 100644 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -1,7 +1,7 @@ use std::path::{Path, PathBuf}; use std::fs::{self, File}; use std::io::{BufRead, BufReader}; -use std::collections::BTreeMap; +use std::collections::HashMap; use csv::Writer; #[derive(Debug)] @@ -22,7 +22,7 @@ struct AlignmentStats { objective_value: f64, } -#[derive(Debug)] +#[derive(Debug, Clone)] struct DecompStats { runtime: f64, objective_value: f64, @@ -40,7 +40,9 @@ fn main() -> std::io::Result<()> { let output_path = Path::new(&args[2]); let mut results = Vec::new(); - // Process each sample directory + let decomp_stats_map = build_decomp_stats_map(&decomp_dir)?; + + // Process each sample dir // for sample_entry in fs::read_dir(input_dir)? { let sample_entry = sample_entry?; let sample_path = sample_entry.path(); @@ -51,7 +53,7 @@ fn main() -> std::io::Result<()> { .to_string_lossy() .to_string(); - // Process each subgraph directory in the sample directory + // Process each subgraph dir // for subgraph_entry in fs::read_dir(&sample_path)? { let subgraph_entry = subgraph_entry?; let subgraph_path = subgraph_entry.path(); @@ -61,7 +63,7 @@ fn main() -> std::io::Result<()> { let subgraph_name = dir_name.to_string_lossy().to_string(); if subgraph_name.starts_with("subgraph_") { if let Some(mut stats_vec) = process_subgraph_dir(&subgraph_path, &sample_name, &subgraph_name)? { - add_decomp_stats(&decomp_dir, &mut stats_vec)?; + add_decomp_stats(&decomp_stats_map, &mut stats_vec); results.extend(stats_vec); } } @@ -69,15 +71,15 @@ fn main() -> std::io::Result<()> { } } - // Also check for files directly in the sample directory + // Also check for files // if let Some(mut stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root")? { - add_decomp_stats(&decomp_dir, &mut stats_vec)?; + add_decomp_stats(&decomp_stats_map, &mut stats_vec); results.extend(stats_vec); } } } - // Sort results + // Sort results by sample, then subgraph, then total parts, then part number // results.sort_by(|a, b| { a.sample_name.cmp(&b.sample_name) .then(a.subgraph_name.cmp(&b.subgraph_name)) @@ -85,7 +87,7 @@ fn main() -> std::io::Result<()> { .then(a.part_number.cmp(&b.part_number)) }); - // Write CSV output + // Write csv // write_csv_output(output_path, &results)?; println!("Successfully processed {} alignment files, output written to {}", @@ -95,18 +97,8 @@ fn main() -> std::io::Result<()> { Ok(()) } -fn add_decomp_stats(decomp_dir: &Path, stats_vec: &mut Vec) -> std::io::Result<()> { - for stat in stats_vec { - if let Some(decomp_stats) = get_decomp_stats(decomp_dir, &stat.sample_name, &stat.subgraph_name, stat.part_number)? { - stat.runtime = decomp_stats.runtime; - stat.objective_value = decomp_stats.objective_value; - } - } - Ok(()) -} - -fn get_decomp_stats(decomp_dir: &Path, sample_name: &str, subgraph_name: &str, part_number: usize) -> std::io::Result> { - let pattern = format!("{}_{}_{}.paths", sample_name, subgraph_name, part_number); +fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result> { + let mut map = HashMap::new(); for entry in fs::read_dir(decomp_dir)? { let entry = entry?; @@ -114,13 +106,41 @@ fn get_decomp_stats(decomp_dir: &Path, sample_name: &str, subgraph_name: &str, p if let Some(file_name) = path.file_name() { let file_name = file_name.to_string_lossy(); - if file_name.ends_with(&pattern) { - return parse_decomp_file(&path); + if file_name.ends_with(".paths") { + if let Some((sample_name, subgraph_name)) = parse_decomp_filename(&file_name) { + if let Some(stats) = parse_decomp_file(&path)? { + map.insert((sample_name, subgraph_name), stats); + } + } } } } - Ok(None) + Ok(map) +} + +fn parse_decomp_filename(filename: &str) -> Option<(String, String)> { + let parts: Vec<&str> = filename.split('_').collect(); + if parts.len() >= 4 { + let sample_end = parts.len() - 3; + let sample_name = parts[..sample_end].join("_"); + let subgraph_name = format!("{}_{}", parts[sample_end], parts[sample_end + 1]); + return Some((sample_name, subgraph_name)); + } + None +} + +fn add_decomp_stats( + decomp_stats_map: &HashMap<(String, String), DecompStats>, + stats_vec: &mut Vec +) { + for stat in stats_vec { + let key = (stat.sample_name.clone(), stat.subgraph_name.clone()); + if let Some(decomp_stats) = decomp_stats_map.get(&key) { + stat.runtime = decomp_stats.runtime; + stat.objective_value = decomp_stats.objective_value; + } + } } fn parse_decomp_file(file_path: &Path) -> std::io::Result> { @@ -140,10 +160,14 @@ fn parse_decomp_file(file_path: &Path) -> std::io::Result> { } } - Ok(Some(DecompStats { - runtime, - objective_value, - })) + if runtime > 0.0 || objective_value > 0.0 { + Ok(Some(DecompStats { + runtime, + objective_value, + })) + } else { + Ok(None) + } } fn process_subgraph_dir(dir: &Path, sample_name: &str, subgraph_name: &str) -> std::io::Result>> { From 226d9eec3995c56cc7a31447b9c9930a01633b15 Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Fri, 20 Jun 2025 14:40:22 -0600 Subject: [PATCH 10/24] Fixed objective value in kleast_errors.py --- libs/decompose/kleast_errors.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/libs/decompose/kleast_errors.py b/libs/decompose/kleast_errors.py index 67741259..2d8a9bf2 100755 --- a/libs/decompose/kleast_errors.py +++ b/libs/decompose/kleast_errors.py @@ -89,7 +89,7 @@ def create_k_least_graph(graph, paths): return k_least_graph -def save_paths_to_file(paths, output_path, num_paths, runtime, mip_gap, objective_value, multigraph_decomposer=None): +def save_paths_to_file(paths, output_path, num_paths, runtime, objective_value, multigraph_decomposer=None): """Save path information to a text file in the specified format.""" # Calculate total flow through all paths total_flow = sum(paths['weights']) @@ -97,8 +97,7 @@ def save_paths_to_file(paths, output_path, num_paths, runtime, mip_gap, objectiv with open(output_path, 'w') as f: f.write(f"Decomposition into {num_paths} paths\n") f.write(f"Runtime: {runtime:.2f} seconds\n") - f.write(f"MIP Gap: {mip_gap:.6f}\n") - f.write(f"Objective Value: {objective_value:.6f}\n") + f.write(f"Objective Value: {objective_value}\n") f.write(f"Number of Paths: {num_paths}\n") f.write("Paths and Weights:\n") @@ -299,8 +298,8 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua # Get solver statistics runtime = time.time() - start_time - mip_gap = k_least.model.MIPGap if hasattr(k_least, 'model') else 1.0 - objective_value = k_least.model.ObjVal if hasattr(k_least, 'model') else 0.0 + #mip_gap = k_least.model.MIPGap #if hasattr(k_least, 'model') else 1.0 + objective_value = k_least.get_objective_value if visualize: @@ -318,7 +317,6 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua output_path, num_paths, runtime, - mip_gap, objective_value, multigraph_decomposer=decomposer ) From bd109ac683c96016a218a2fa9080400fff994d9f Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Fri, 20 Jun 2025 15:53:47 -0600 Subject: [PATCH 11/24] Fixed data type for objective value in kleasterrors.py --- libs/decompose/kleast_errors.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libs/decompose/kleast_errors.py b/libs/decompose/kleast_errors.py index b047d08b..c19d22de 100755 --- a/libs/decompose/kleast_errors.py +++ b/libs/decompose/kleast_errors.py @@ -306,12 +306,15 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore) k_least.solve() paths = k_least.get_solution(remove_empty_paths=True) - + + # Get solver statistics runtime = time.time() - start_time #mip_gap = k_least.model.MIPGap #if hasattr(k_least, 'model') else 1.0 - objective_value = k_least.get_objective_value + objective_value = k_least.get_objective_value() + + print(f'objective: {type(objective_value)}') if visualize: From 7ce8be8bd371482f8dcedc79033a1f94e57cee81 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Mon, 23 Jun 2025 11:36:29 -0600 Subject: [PATCH 12/24] Change permissions --- libs/output_scraper/src/main.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 libs/output_scraper/src/main.rs diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs old mode 100644 new mode 100755 From 6002937e98c496467e6d5a453b9d0f4ef3830e22 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 26 Jun 2025 09:24:18 -0600 Subject: [PATCH 13/24] Edit output scraper to count nodes and edges --- libs/output_scraper/src/main.rs | 192 +++++++++++++++++++++++++------- 1 file changed, 149 insertions(+), 43 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 3aeb4990..aede0bc3 100755 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -1,7 +1,7 @@ use std::path::{Path, PathBuf}; use std::fs::{self, File}; use std::io::{BufRead, BufReader}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use csv::Writer; #[derive(Debug)] @@ -20,6 +20,10 @@ struct AlignmentStats { end_position: usize, runtime: f64, objective_value: f64, + nodes: usize, + edges: usize, + sources: usize, + sinks: usize, } #[derive(Debug, Clone)] @@ -28,6 +32,14 @@ struct DecompStats { objective_value: f64, } +#[derive(Debug, Default)] +struct GraphData { + nodes: HashSet, + edges: usize, + sources: usize, + sinks: usize, +} + fn main() -> std::io::Result<()> { let args: Vec = std::env::args().collect(); if args.len() != 3 { @@ -37,12 +49,22 @@ fn main() -> std::io::Result<()> { let input_dir = Path::new(&args[1]); let decomp_dir = input_dir.join("../decomp_results"); + let graphs_dir = input_dir.join("../graphs"); let output_path = Path::new(&args[2]); let mut results = Vec::new(); + println!("Starting processing with:"); + println!("- Input directory: {}", input_dir.display()); + println!("- Decomp directory: {}", decomp_dir.display()); + println!("- Graphs directory: {}", graphs_dir.display()); + println!("- Output CSV: {}", output_path.display()); + + println!("\nBuilding decomp stats map..."); let decomp_stats_map = build_decomp_stats_map(&decomp_dir)?; + println!("Found {} decomp results", decomp_stats_map.len()); - // Process each sample dir // + // Process each sample directory // + println!("\nProcessing sample directories..."); for sample_entry in fs::read_dir(input_dir)? { let sample_entry = sample_entry?; let sample_path = sample_entry.path(); @@ -53,7 +75,10 @@ fn main() -> std::io::Result<()> { .to_string_lossy() .to_string(); - // Process each subgraph dir // + println!("\nProcessing sample: {}", sample_name); + + // Process each subgraph directory // + println!("Processing subgraph directories..."); for subgraph_entry in fs::read_dir(&sample_path)? { let subgraph_entry = subgraph_entry?; let subgraph_path = subgraph_entry.path(); @@ -62,7 +87,9 @@ fn main() -> std::io::Result<()> { if let Some(dir_name) = subgraph_path.file_name() { let subgraph_name = dir_name.to_string_lossy().to_string(); if subgraph_name.starts_with("subgraph_") { - if let Some(mut stats_vec) = process_subgraph_dir(&subgraph_path, &sample_name, &subgraph_name)? { + println!(" Processing subgraph: {}", subgraph_name); + if let Some(mut stats_vec) = process_subgraph_dir(&subgraph_path, &sample_name, &subgraph_name, &graphs_dir)? { + println!(" Found {} alignment files", stats_vec.len()); add_decomp_stats(&decomp_stats_map, &mut stats_vec); results.extend(stats_vec); } @@ -71,8 +98,9 @@ fn main() -> std::io::Result<()> { } } - // Also check for files // - if let Some(mut stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root")? { + println!("Checking for root-level alignment files..."); + if let Some(mut stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root", &graphs_dir)? { + println!(" Found {} root-level alignment files", stats_vec.len()); add_decomp_stats(&decomp_stats_map, &mut stats_vec); results.extend(stats_vec); } @@ -87,19 +115,117 @@ fn main() -> std::io::Result<()> { .then(a.part_number.cmp(&b.part_number)) }); - // Write csv // + // Write CSV // + println!("\nWriting output to {}...", output_path.display()); write_csv_output(output_path, &results)?; - println!("Successfully processed {} alignment files, output written to {}", + println!("\nSuccessfully processed {} alignment files, output written to {}", results.len(), output_path.display()); Ok(()) } +fn parse_graph_file(file_path: &Path) -> std::io::Result { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + let mut graph_data = GraphData::default(); + + let mut lines = reader.lines().skip(1); + + while let Some(Ok(line)) = lines.next() { + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + if let (Ok(from_node), Ok(to_node)) = (parts[0].parse::(), parts[1].parse::()) { + graph_data.nodes.insert(from_node); + graph_data.nodes.insert(to_node); + graph_data.edges += 1; + + // Count sources (edges from node 0) // + if from_node == 0 { + graph_data.sources += 1; + } + // Count sinks (edges to node 1) // + if to_node == 1 { + graph_data.sinks += 1; + } + } + } + } + + Ok(graph_data) +} + +fn process_subgraph_dir(dir: &Path, sample_name: &str, subgraph_name: &str, graphs_dir: &Path) -> std::io::Result>> { + process_files_in_dir(dir, sample_name, subgraph_name, graphs_dir) +} + +fn process_files_in_dir(dir: &Path, sample_name: &str, subgraph_name: &str, graphs_dir: &Path) -> std::io::Result>> { + let mut stats_vec = Vec::new(); + + println!(" Scanning directory: {}", dir.display()); + + for entry in fs::read_dir(dir)? { + let entry = entry?; + let path = entry.path(); + + if path.is_file() { + if let Some(file_name) = path.file_name() { + let file_name = file_name.to_string_lossy(); + if file_name.ends_with("_vs_ref.txt") { + println!(" Found alignment file: {}", file_name); + let part_numbers = extract_part_numbers(&file_name); + + let mut stats = parse_alignment_file( + &path, + sample_name.to_string(), + subgraph_name.to_string(), + part_numbers + )?; + + // Find and parse graph file + let subgraph_num = subgraph_name.trim_start_matches("subgraph_"); + let graph_file_path = graphs_dir.join(sample_name) + .join("out.dbg_subgraphs") + .join(format!("graph_{}_compressed.dbg", subgraph_num)); + + if graph_file_path.exists() { + println!(" Parsing graph file: {}", graph_file_path.display()); + let graph_data = parse_graph_file(&graph_file_path)?; + stats.nodes = graph_data.nodes.len(); + stats.edges = graph_data.edges; + stats.sources = graph_data.sources; + stats.sinks = graph_data.sinks; + + println!(" Nodes: {}, Edges: {}, Sources (from 0): {}, Sinks (to 1): {}", + stats.nodes, stats.edges, stats.sources, stats.sinks); + } else { + println!(" Graph file not found: {}", graph_file_path.display()); + } + + println!(" Part {}/{}: length={}, identity={:.1}%, gaps={:.1}%", + stats.part_number, stats.total_parts, stats.length, + stats.identity_pct, stats.gaps_pct); + + stats_vec.push(stats); + } + } + } + } + + if stats_vec.is_empty() { + println!(" No alignment files found"); + Ok(None) + } else { + Ok(Some(stats_vec)) + } +} + fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result> { let mut map = HashMap::new(); + println!("Scanning decomp directory: {}", decomp_dir.display()); + for entry in fs::read_dir(decomp_dir)? { let entry = entry?; let path = entry.path(); @@ -107,8 +233,11 @@ fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result std::io::Result> { } } -fn process_subgraph_dir(dir: &Path, sample_name: &str, subgraph_name: &str) -> std::io::Result>> { - process_files_in_dir(dir, sample_name, subgraph_name) -} - -fn process_files_in_dir(dir: &Path, sample_name: &str, subgraph_name: &str) -> std::io::Result>> { - let mut stats_vec = Vec::new(); - - for entry in fs::read_dir(dir)? { - let entry = entry?; - let path = entry.path(); - - if path.is_file() { - if let Some(file_name) = path.file_name() { - let file_name = file_name.to_string_lossy(); - if file_name.ends_with("_vs_ref.txt") { - let part_numbers = extract_part_numbers(&file_name); - - stats_vec.push(parse_alignment_file( - &path, - sample_name.to_string(), - subgraph_name.to_string(), - part_numbers - )?); - } - } - } - } - - if stats_vec.is_empty() { - Ok(None) - } else { - Ok(Some(stats_vec)) - } -} - fn extract_part_numbers(filename: &str) -> (usize, usize) { let parts: Vec<&str> = filename.split('_').collect(); for i in 0..parts.len() { @@ -244,6 +338,10 @@ fn parse_alignment_file( end_position: 0, runtime: 0.0, objective_value: 0.0, + nodes: 0, + edges: 0, + sources: 0, + sinks: 0, }; for line in reader.lines() { @@ -311,6 +409,10 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: "Alignment Length", "Runtime (s)", "Objective Value", + "Nodes", + "Edges", + "Sources (from 0)", + "Sinks (to 1)", ])?; for stats in results { @@ -331,6 +433,10 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: &alignment_length.to_string(), &format!("{:.4}", stats.runtime), &format!("{:.6}", stats.objective_value), + &stats.nodes.to_string(), + &stats.edges.to_string(), + &stats.sources.to_string(), + &stats.sinks.to_string(), ])?; } From caf34694c0e27f3c1ae85afc25b91e9da37c5c1a Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 26 Jun 2025 09:43:58 -0600 Subject: [PATCH 14/24] Change which file the scraper is looking at --- libs/output_scraper/src/main.rs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index aede0bc3..2c4f38d0 100755 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -59,11 +59,12 @@ fn main() -> std::io::Result<()> { println!("- Graphs directory: {}", graphs_dir.display()); println!("- Output CSV: {}", output_path.display()); + // First collect all decomp stats in a lookup table println!("\nBuilding decomp stats map..."); let decomp_stats_map = build_decomp_stats_map(&decomp_dir)?; println!("Found {} decomp results", decomp_stats_map.len()); - // Process each sample directory // + // Process each sample directory println!("\nProcessing sample directories..."); for sample_entry in fs::read_dir(input_dir)? { let sample_entry = sample_entry?; @@ -77,7 +78,7 @@ fn main() -> std::io::Result<()> { println!("\nProcessing sample: {}", sample_name); - // Process each subgraph directory // + // Process each subgraph directory in the sample directory println!("Processing subgraph directories..."); for subgraph_entry in fs::read_dir(&sample_path)? { let subgraph_entry = subgraph_entry?; @@ -98,6 +99,7 @@ fn main() -> std::io::Result<()> { } } + // Also check for files directly in the sample directory println!("Checking for root-level alignment files..."); if let Some(mut stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root", &graphs_dir)? { println!(" Found {} root-level alignment files", stats_vec.len()); @@ -107,7 +109,7 @@ fn main() -> std::io::Result<()> { } } - // Sort results by sample, then subgraph, then total parts, then part number // + // Sort results by sample, then subgraph, then total parts, then part number results.sort_by(|a, b| { a.sample_name.cmp(&b.sample_name) .then(a.subgraph_name.cmp(&b.subgraph_name)) @@ -115,7 +117,7 @@ fn main() -> std::io::Result<()> { .then(a.part_number.cmp(&b.part_number)) }); - // Write CSV // + // Write CSV output println!("\nWriting output to {}...", output_path.display()); write_csv_output(output_path, &results)?; @@ -131,6 +133,7 @@ fn parse_graph_file(file_path: &Path) -> std::io::Result { let reader = BufReader::new(file); let mut graph_data = GraphData::default(); + // Skip header line let mut lines = reader.lines().skip(1); while let Some(Ok(line)) = lines.next() { @@ -141,11 +144,11 @@ fn parse_graph_file(file_path: &Path) -> std::io::Result { graph_data.nodes.insert(to_node); graph_data.edges += 1; - // Count sources (edges from node 0) // + // Count sources (edges from node 0) if from_node == 0 { graph_data.sources += 1; } - // Count sinks (edges to node 1) // + // Count sinks (edges to node 1) if to_node == 1 { graph_data.sinks += 1; } @@ -183,11 +186,9 @@ fn process_files_in_dir(dir: &Path, sample_name: &str, subgraph_name: &str, grap part_numbers )?; - // Find and parse graph file + // Find and parse graph file in the new format: .super_.dbg let subgraph_num = subgraph_name.trim_start_matches("subgraph_"); - let graph_file_path = graphs_dir.join(sample_name) - .join("out.dbg_subgraphs") - .join(format!("graph_{}_compressed.dbg", subgraph_num)); + let graph_file_path = graphs_dir.join(format!("{}.super_{}.dbg", sample_name, subgraph_num)); if graph_file_path.exists() { println!(" Parsing graph file: {}", graph_file_path.display()); @@ -221,6 +222,11 @@ fn process_files_in_dir(dir: &Path, sample_name: &str, subgraph_name: &str, grap } } +// [Rest of the functions remain exactly the same as in previous implementation...] +// [build_decomp_stats_map, parse_decomp_filename, add_decomp_stats, parse_decomp_file] +// [extract_part_numbers, parse_alignment_file, parse_percentage, parse_count] +// [write_csv_output] + fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result> { let mut map = HashMap::new(); From f3be2c94573e9257bac8b68e08af31d88af5f021 Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Fri, 27 Jun 2025 09:49:18 -0600 Subject: [PATCH 15/24] Cleaned up output in pipeline --- findviralstrains.smk | 2 +- libs/FmAssemblyGraph | 2 +- libs/compress/compress.py | 7 ------- libs/decompose/kleast_errors.py | 12 +++--------- libs/rebuild/rebuild.py | 8 -------- libs/super_source_and_sink/src/main.rs | 5 +---- 6 files changed, 6 insertions(+), 30 deletions(-) diff --git a/findviralstrains.smk b/findviralstrains.smk index 4eeac6a1..a8862bda 100755 --- a/findviralstrains.smk +++ b/findviralstrains.smk @@ -84,7 +84,7 @@ fastq_filenames = set(fastq_filenames) # Deletes duplicate file entrys by conver fastq_filenames = list(fastq_filenames) fastq_filenames = [entry for entry in fastq_filenames if entry != ""] # Remake list with only populated values # -print(fastq_filenames) + ###################### ## HELPER FUNCTIONS ## diff --git a/libs/FmAssemblyGraph b/libs/FmAssemblyGraph index 120c78ab..affc5bc3 160000 --- a/libs/FmAssemblyGraph +++ b/libs/FmAssemblyGraph @@ -1 +1 @@ -Subproject commit 120c78abcc7d6363d69b46f8ca7de71c9a28668c +Subproject commit affc5bc31d3fc815b0898e91e32a210e54764e6a diff --git a/libs/compress/compress.py b/libs/compress/compress.py index cd719268..6196344e 100755 --- a/libs/compress/compress.py +++ b/libs/compress/compress.py @@ -128,9 +128,6 @@ def merge_nodes(forward_edges, reverse_edges, edge_seqs, kmer_length): # Find initial merge candidates candidates = find_merge_candidates(forward_edges, reverse_edges) - print(f"Initial candidates: {len(candidates)}") - - for node in candidates: # Get the source and target nodes @@ -233,7 +230,6 @@ def main(): forward_edges, reverse_edges, edge_seqs, kmer_length = read_graph(input_file) # merge the nodes - print("Merging nodes...") merge_nodes(forward_edges, reverse_edges, edge_seqs, kmer_length) @@ -242,10 +238,7 @@ def main(): # - print("Writing merged graph...") write_merged_graph(output_file, forward_edges) - print(f"Merged graph written to {output_file}") - if __name__ == "__main__": main() \ No newline at end of file diff --git a/libs/decompose/kleast_errors.py b/libs/decompose/kleast_errors.py index c19d22de..aef074dd 100755 --- a/libs/decompose/kleast_errors.py +++ b/libs/decompose/kleast_errors.py @@ -119,7 +119,7 @@ def save_paths_to_file(paths, output_path, num_paths, runtime, objective_value, f.write(f"{path_weight:.6f} {path_str}\n") - print(f"INFO: Path details saved to {output_path}") + def draw_labeled_multigraph(G, attr_name, ax=None, decimal_places=2, paths=None): @@ -267,7 +267,7 @@ def visualize_and_save_graph(graph, output_path, num_paths, base_size=10, paths visualization_file = f"{output_path}_visualization.pdf" plt.savefig(visualization_file, dpi=300, bbox_inches='tight') - print(f"INFO: Visualization saved to {visualization_file}") + def get_all_edges_for_node(graph, node): @@ -314,9 +314,6 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua #mip_gap = k_least.model.MIPGap #if hasattr(k_least, 'model') else 1.0 objective_value = k_least.get_objective_value() - print(f'objective: {type(objective_value)}') - - if visualize: # Visualize the graph visualize_and_save_graph(graph, output_path, num_paths, paths = paths) @@ -345,8 +342,5 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua # Read the input graph graph = read_graph_to_networkx(args.input, min_edge_weight=args.mincount) - # Generate output files for all path counts from max_paths down to 1 - generate_output_files(args.output, graph, args.maxpaths, args.minpaths, visualize=args.visualize) - - print("INFO: Processing completed.") \ No newline at end of file + generate_output_files(args.output, graph, args.maxpaths, args.minpaths, visualize=args.visualize) \ No newline at end of file diff --git a/libs/rebuild/rebuild.py b/libs/rebuild/rebuild.py index b775193d..afcb5ebe 100755 --- a/libs/rebuild/rebuild.py +++ b/libs/rebuild/rebuild.py @@ -72,7 +72,6 @@ def main(path_file, edge_file, bd_outfile): print(f"Skipping path - not enough nodes: {nodes}") continue - print(f"\nProcessing path {counter} of {total_paths}:") genome = "" is_first_node = True @@ -83,16 +82,13 @@ def main(path_file, edge_file, bd_outfile): # Check if this is a special source/sink edge if (from_node, to_node) in special_edges: - print(f"Edge {from_node}->{to_node}: special source/sink edge - no sequence added") continue # Try forward direction first if to_node in sequences.get(from_node, {}): sequence = sequences[from_node][to_node] - print(f"Edge {from_node}->{to_node}: found sequence (length {len(sequence)})") if not is_first_node and len(sequence) > 27: sequence = sequence[27:] - print(f" Trimmed to {len(sequence)} bases") genome += sequence else: # Try reverse complement @@ -100,13 +96,10 @@ def main(path_file, edge_file, bd_outfile): rev_to = from_node if rev_to in sequences.get(rev_from, {}): sequence = reverse_complement(sequences[rev_from][rev_to]) - print(f"Edge {from_node}->{to_node}: found reverse complement (length {len(sequence)})") if not is_first_node and len(sequence) > 27: sequence = sequence[27:] - print(f" Trimmed to {len(sequence)} bases") genome += sequence else: - print(f"WARNING: Edge {from_node}->{to_node} not found in either direction") # Add gap of Ns proportional to expected length gap_size = 100 if (from_node == '0' or to_node == '1') else 30 genome += "N" * gap_size @@ -117,7 +110,6 @@ def main(path_file, edge_file, bd_outfile): output_file = f"{bd_outfile.rsplit('.', 1)[0]}_{counter}_of_{total_paths}.fasta" with open(output_file, 'w') as out_f: out_f.write(f">Weight: {weight}\n{genome}\n") - print(f"Generated {output_file} with {len(genome)} bases") counter += 1 diff --git a/libs/super_source_and_sink/src/main.rs b/libs/super_source_and_sink/src/main.rs index 4d3e914f..81adbbd5 100755 --- a/libs/super_source_and_sink/src/main.rs +++ b/libs/super_source_and_sink/src/main.rs @@ -140,8 +140,5 @@ fn main() { ) .expect("unable to create super sources and sinks"); - println!( - "New nodes and edges with weights written to: {}", - output_file_path.display() - ); + } \ No newline at end of file From f8ed7bedd0a63164bfb1231839c7e9ba17340efd Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Fri, 27 Jun 2025 11:58:20 -0600 Subject: [PATCH 16/24] Added time limit arg to kleast_errors.py for solver --- libs/decompose/kleast_errors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/decompose/kleast_errors.py b/libs/decompose/kleast_errors.py index aef074dd..6e372740 100755 --- a/libs/decompose/kleast_errors.py +++ b/libs/decompose/kleast_errors.py @@ -287,7 +287,7 @@ def get_all_edges_for_node(graph, node): return edges -def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visualize=False): +def generate_output_files(base_output_path, graph, time_limit, max_paths, min_paths=1, visualize=False): """Generate output files for all path counts from max_paths down to min_paths.""" # Extract the base filename without extension base_name = os.path.splitext(base_output_path)[0] @@ -303,7 +303,7 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua edges_to_ignore = get_all_edges_for_node(graph, "0") + get_all_edges_for_node(graph, "1") # Perform k-least errors analysis for current number of paths - k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore) + k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore, time_limit = time_limit) k_least.solve() paths = k_least.get_solution(remove_empty_paths=True) @@ -343,4 +343,4 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua graph = read_graph_to_networkx(args.input, min_edge_weight=args.mincount) # Generate output files for all path counts from max_paths down to 1 - generate_output_files(args.output, graph, args.maxpaths, args.minpaths, visualize=args.visualize) \ No newline at end of file + generate_output_files(args.output, graph, args.timelimit, args.maxpaths, args.minpaths, visualize=args.visualize) \ No newline at end of file From 8b9f572a294924bf83d5d837d65dcf12811fb91b Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Fri, 27 Jun 2025 14:10:48 -0600 Subject: [PATCH 17/24] Added threads to be passed to flowpaths in kleast_errors.py --- libs/decompose/kleast_errors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/decompose/kleast_errors.py b/libs/decompose/kleast_errors.py index 6e372740..3974618a 100755 --- a/libs/decompose/kleast_errors.py +++ b/libs/decompose/kleast_errors.py @@ -287,7 +287,7 @@ def get_all_edges_for_node(graph, node): return edges -def generate_output_files(base_output_path, graph, time_limit, max_paths, min_paths=1, visualize=False): +def generate_output_files(base_output_path, graph, time_limit, threads, max_paths, min_paths=1, visualize=False): """Generate output files for all path counts from max_paths down to min_paths.""" # Extract the base filename without extension base_name = os.path.splitext(base_output_path)[0] @@ -303,7 +303,7 @@ def generate_output_files(base_output_path, graph, time_limit, max_paths, min_pa edges_to_ignore = get_all_edges_for_node(graph, "0") + get_all_edges_for_node(graph, "1") # Perform k-least errors analysis for current number of paths - k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore, time_limit = time_limit) + k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore, time_limit = time_limit, threads = threads) k_least.solve() paths = k_least.get_solution(remove_empty_paths=True) @@ -343,4 +343,4 @@ def generate_output_files(base_output_path, graph, time_limit, max_paths, min_pa graph = read_graph_to_networkx(args.input, min_edge_weight=args.mincount) # Generate output files for all path counts from max_paths down to 1 - generate_output_files(args.output, graph, args.timelimit, args.maxpaths, args.minpaths, visualize=args.visualize) \ No newline at end of file + generate_output_files(args.output, graph, args.timelimit, args.threads, args.maxpaths, args.minpaths, visualize=args.visualize) \ No newline at end of file From 82f090292a8af846dad6c51e64ab8b5a2cb102e1 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 2 Jul 2025 11:45:56 -0600 Subject: [PATCH 18/24] Change scraper to grab separate decomp files --- libs/output_scraper/src/main.rs | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 2c4f38d0..28e00488 100755 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -222,12 +222,7 @@ fn process_files_in_dir(dir: &Path, sample_name: &str, subgraph_name: &str, grap } } -// [Rest of the functions remain exactly the same as in previous implementation...] -// [build_decomp_stats_map, parse_decomp_filename, add_decomp_stats, parse_decomp_file] -// [extract_part_numbers, parse_alignment_file, parse_percentage, parse_count] -// [write_csv_output] - -fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result> { +fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result> { let mut map = HashMap::new(); println!("Scanning decomp directory: {}", decomp_dir.display()); @@ -240,11 +235,11 @@ fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result std::io::Result Option<(String, String)> { +fn parse_decomp_filename(filename: &str) -> Option<(String, String, usize)> { let parts: Vec<&str> = filename.split('_').collect(); if parts.len() >= 4 { let sample_end = parts.len() - 3; let sample_name = parts[..sample_end].join("_"); let subgraph_name = format!("{}_{}", parts[sample_end], parts[sample_end + 1]); - return Some((sample_name, subgraph_name)); + + // Extract total parts from filename (assuming format like "XXX_YYY_Z.paths") + let total_parts = parts.last() + .and_then(|s| s.split('.').next()) + .and_then(|s| s.parse().ok()) + .unwrap_or(1); + + return Some((sample_name, subgraph_name, total_parts)); } None } fn add_decomp_stats( - decomp_stats_map: &HashMap<(String, String), DecompStats>, + decomp_stats_map: &HashMap<(String, String, usize), DecompStats>, stats_vec: &mut Vec ) { for stat in stats_vec { - let key = (stat.sample_name.clone(), stat.subgraph_name.clone()); + let key = ( + stat.sample_name.clone(), + stat.subgraph_name.clone(), + stat.total_parts + ); if let Some(decomp_stats) = decomp_stats_map.get(&key) { stat.runtime = decomp_stats.runtime; stat.objective_value = decomp_stats.objective_value; From b7d934714d14f5ca492861a927ddd4159e0d4e9a Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 3 Jul 2025 10:44:42 -0600 Subject: [PATCH 19/24] Create alignment visualizer --- libs/alignment_vis/alignment_vis.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100755 libs/alignment_vis/alignment_vis.py diff --git a/libs/alignment_vis/alignment_vis.py b/libs/alignment_vis/alignment_vis.py new file mode 100755 index 00000000..e69de29b From 96d5b95da680ecf21755ab20f39dfc66064692e5 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Thu, 3 Jul 2025 15:20:29 -0600 Subject: [PATCH 20/24] First draft out alignment vis --- libs/alignment_vis/alignment_vis.py | 149 ++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/libs/alignment_vis/alignment_vis.py b/libs/alignment_vis/alignment_vis.py index e69de29b..b7e17988 100755 --- a/libs/alignment_vis/alignment_vis.py +++ b/libs/alignment_vis/alignment_vis.py @@ -0,0 +1,149 @@ +import re +import os +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle, Patch +from matplotlib.collections import PatchCollection + +def parse_alignment_file(filename): + try: + with open(filename, 'r') as f: + content = f.read() + + identity_match = re.search(r'Identity:\s+(\d+)/(\d+)\s+\(([\d.]+)%\)', content) + identity = float(identity_match.group(3)) if identity_match else 100.0 + + # Get aligned regions and positions + positions = [] + matches = [] + blocks = re.finditer( + r'NC_045512\.2\s+(\d+)\s+([ACGT]+).*?\n\s+([|.]+)\nWeight\s+\d+\s+([ACGT]+)', + content, + re.DOTALL + ) + + for block in blocks: + start = int(block.group(1)) + ref_seq = block.group(2) + match_str = block.group(3) + end = start + len(ref_seq) - 1 + positions.append((start, end)) + matches.append(match_str) + + return positions, matches, identity + except Exception as e: + print(f"Error parsing {filename}: {str(e)}") + return [], [], 0 + +def find_alignments(root_dir): + """Find all alignment files in subgraph directories""" + alignments = {} + + # Walk through subgraph directories + for subgraph in os.listdir(root_dir): + if not subgraph.startswith('subgraph_'): + continue + + subgraph_dir = os.path.join(root_dir, subgraph) + if not os.path.isdir(subgraph_dir): + continue + + # Find all alignment files in this subgraph + for fname in os.listdir(subgraph_dir): + if not fname.endswith('_vs_ref.txt'): + continue + + # Extract the X_of_Y pattern (e.g., "1_of_1") + parts = fname.split('_') + try: + x_of_y = f"{parts[-4]}_of_{parts[-2]}" + except IndexError: + continue + + full_path = os.path.join(subgraph_dir, fname) + alignments.setdefault(x_of_y, []).append((full_path, subgraph)) + + return alignments + +def plot_alignment_group(group_name, files, genome_length=29903, output_dir="."): + """Plot one group of alignments (e.g., all 1_of_1 files)""" + if not files: + return + + fig, ax = plt.subplots(figsize=(15, 2 + len(files) * 0.5)) + + # Gray background for full genome + ax.add_patch(Rectangle((0, 0), genome_length, len(files) + 1, + color='lightgray', alpha=0.3)) + + # Plot each alignment in the group + for i, (file_path, subgraph) in enumerate(sorted(files), 1): + positions, matches, identity = parse_alignment_file(file_path) + if not positions: + continue + + # Create colored segments + patches = [] + for (start, end), match_str in zip(positions, matches): + for pos, char in zip(range(start, end + 1), match_str): + color = (0, 0.8, 0) if char == '|' else (0.8, 0, 0) # Green or red + patches.append(Rectangle((pos, i - 0.4), 1, 0.8, color=color)) + + ax.add_collection(PatchCollection(patches, match_original=True)) + + # Add labels + fname = os.path.basename(file_path).replace('_vs_ref.txt', '') + ax.text(-1500, i, f"{subgraph}/{fname}", ha='right', va='center', fontsize=8) + ax.text(genome_length + 1500, i, f"{identity:.1f}%", ha='left', va='center', fontsize=8) + + # Add genome scale markers + for x in range(0, genome_length + 1, 5000): + ax.axvline(x, color='gray', linestyle=':', alpha=0.5) + if x > 0: + ax.text(x, 0.2, f"{x//1000}kb", ha='center', fontsize=8) + + ax.set_xlim(-2000, genome_length + 2000) + ax.set_ylim(0, len(files) + 1) + ax.set_yticks([]) + ax.set_xlabel("Genomic Position (bp)") + ax.set_title(f"Alignment Group: {group_name.replace('_', ' ')}") + + plt.tight_layout() + output_path = os.path.join(output_dir, f"{group_name}_alignment.pdf") + plt.savefig(output_path, dpi=300, bbox_inches='tight') + plt.close() + print(f"Saved: {output_path}") + +def main(): + import sys + if len(sys.argv) < 2: + print("Usage: python alignment_vis.py [output_dir]") + print("Example: python alignment_vis.py path/to/E1250_S84_L001/") + return + + input_dir = sys.argv[1].rstrip('/') + output_dir = sys.argv[2] if len(sys.argv) > 2 else "alignment_plots" + + if not os.path.exists(input_dir): + print(f"Error: Directory not found - {input_dir}") + return + + # Create output directory + os.makedirs(output_dir, exist_ok=True) + + # Find and group all alignment files + alignments = find_alignments(input_dir) + if not alignments: + print(f"No valid alignment files found in subgraph directories under {input_dir}") + print("Please verify that:") + print("1. The directory contains subgraph_* folders") + print("2. Those subgraphs contain files matching *_X_of_Y_vs_ref.txt") + return + + # Process each group + for group_name, files in alignments.items(): + plot_alignment_group(group_name, files, output_dir=output_dir) + + print(f"\nAll plots saved to: {os.path.abspath(output_dir)}") + +if __name__ == "__main__": + main() From a595a753e5a13640bfabe634007393125d1c7648 Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Tue, 8 Jul 2025 10:20:47 -0600 Subject: [PATCH 21/24] Prints out total weight of all edges in a graph --- libs/decompose/kleast_errors.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libs/decompose/kleast_errors.py b/libs/decompose/kleast_errors.py index 3974618a..f4e4554a 100755 --- a/libs/decompose/kleast_errors.py +++ b/libs/decompose/kleast_errors.py @@ -96,10 +96,14 @@ def save_paths_to_file(paths, output_path, num_paths, runtime, objective_value, """Save path information to a text file in the specified format.""" # Calculate total flow through all paths total_flow = sum(paths['weights']) + + # sum of all weights on all edges of original graph + total_weight_graph = sum(data['flow'] for u, v, data in graph.edges(data=True)) with open(output_path, 'w') as f: f.write(f"Decomposition into {num_paths} paths\n") f.write(f"Runtime: {runtime:.2f} seconds\n") + f.write(f"Total Flow: {total_weight_graph}\n") f.write(f"Objective Value: {objective_value}\n") f.write(f"Number of Paths: {num_paths}\n") f.write("Paths and Weights:\n") @@ -307,7 +311,9 @@ def generate_output_files(base_output_path, graph, time_limit, threads, max_pat k_least.solve() paths = k_least.get_solution(remove_empty_paths=True) - + + + # Get solver statistics runtime = time.time() - start_time From 2a9c59419e981793798cb06c52f85d33a4b97c97 Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Tue, 8 Jul 2025 14:41:40 -0600 Subject: [PATCH 22/24] added total flow for entire graph to the csv for stats --- libs/output_scraper/src/main.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 28e00488..285d9e53 100755 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -24,12 +24,14 @@ struct AlignmentStats { edges: usize, sources: usize, sinks: usize, + total_flow: f64, } #[derive(Debug, Clone)] struct DecompStats { runtime: f64, objective_value: f64, + total_flow: f64, } #[derive(Debug, Default)] @@ -117,6 +119,7 @@ fn main() -> std::io::Result<()> { .then(a.part_number.cmp(&b.part_number)) }); + // Write CSV output println!("\nWriting output to {}...", output_path.display()); write_csv_output(output_path, &results)?; @@ -128,6 +131,8 @@ fn main() -> std::io::Result<()> { Ok(()) } + + fn parse_graph_file(file_path: &Path) -> std::io::Result { let file = File::open(file_path)?; let reader = BufReader::new(file); @@ -236,9 +241,12 @@ fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result std::io::Result> { let mut runtime = 0.0; let mut objective_value = 0.0; + let mut total_flow = 0.0; for line in reader.lines() { let line = line?; @@ -299,12 +309,16 @@ fn parse_decomp_file(file_path: &Path) -> std::io::Result> { } else if line.starts_with("Objective Value: ") { objective_value = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); } + else if line.starts_with("Total Flow: ") { + total_flow = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } } - if runtime > 0.0 || objective_value > 0.0 { + if runtime > 0.0 || objective_value > 0.0 || total_flow > 0.0 { Ok(Some(DecompStats { runtime, objective_value, + total_flow, })) } else { Ok(None) @@ -354,6 +368,8 @@ fn parse_alignment_file( edges: 0, sources: 0, sinks: 0, + total_flow: 0.0, + }; for line in reader.lines() { @@ -402,6 +418,9 @@ fn parse_count(s: &str) -> usize { .unwrap_or(0) } + + + fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io::Result<()> { let mut writer = Writer::from_path(output_path)?; @@ -425,6 +444,7 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: "Edges", "Sources (from 0)", "Sinks (to 1)", + "Total Flow", ])?; for stats in results { @@ -449,6 +469,7 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: &stats.edges.to_string(), &stats.sources.to_string(), &stats.sinks.to_string(), + &format!("{:.6}", stats.total_flow), ])?; } From c43de57e3ccaf4e606b1c1bb9750e67ed7eccefe Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Wed, 9 Jul 2025 10:11:48 -0600 Subject: [PATCH 23/24] Added column to csv file of explained flow after decomposition --- libs/output_scraper/src/main.rs | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index 285d9e53..d6ad6984 100755 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -25,6 +25,7 @@ struct AlignmentStats { sources: usize, sinks: usize, total_flow: f64, + explained_flow: f64, } #[derive(Debug, Clone)] @@ -32,6 +33,7 @@ struct DecompStats { runtime: f64, objective_value: f64, total_flow: f64, + explained_flow: f64, } #[derive(Debug, Default)] @@ -245,8 +247,8 @@ fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result std::io::Result> { let file = File::open(file_path)?; let reader = BufReader::new(file); let mut runtime = 0.0; let mut objective_value = 0.0; - let mut total_flow = 0.0; + let mut total_flow = 0.0; for line in reader.lines() { let line = line?; @@ -315,16 +319,22 @@ fn parse_decomp_file(file_path: &Path) -> std::io::Result> { } if runtime > 0.0 || objective_value > 0.0 || total_flow > 0.0 { + let explained_flow = if total_flow > 0.0 { + (total_flow - objective_value) / total_flow + } else { + 0.0 + }; + Ok(Some(DecompStats { runtime, objective_value, total_flow, + explained_flow, })) } else { Ok(None) } } - fn extract_part_numbers(filename: &str) -> (usize, usize) { let parts: Vec<&str> = filename.split('_').collect(); for i in 0..parts.len() { @@ -369,6 +379,7 @@ fn parse_alignment_file( sources: 0, sinks: 0, total_flow: 0.0, + explained_flow: 0.0, }; @@ -445,6 +456,7 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: "Sources (from 0)", "Sinks (to 1)", "Total Flow", + "Explained Flow", ])?; for stats in results { @@ -470,6 +482,7 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: &stats.sources.to_string(), &stats.sinks.to_string(), &format!("{:.6}", stats.total_flow), + &format!("{:.6}", stats.explained_flow), ])?; } From 62e60e9fefa727d866c33aa997b9a48cafab5f75 Mon Sep 17 00:00:00 2001 From: joserod0704 Date: Thu, 17 Jul 2025 11:52:23 -0600 Subject: [PATCH 24/24] Scapes weight of paths for table of output: --- libs/output_scraper/src/main.rs | 53 ++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs index d6ad6984..0968081d 100755 --- a/libs/output_scraper/src/main.rs +++ b/libs/output_scraper/src/main.rs @@ -26,6 +26,7 @@ struct AlignmentStats { sinks: usize, total_flow: f64, explained_flow: f64, + weight: f64 } #[derive(Debug, Clone)] @@ -34,6 +35,7 @@ struct DecompStats { objective_value: f64, total_flow: f64, explained_flow: f64, + weight: f64 } #[derive(Debug, Default)] @@ -121,7 +123,6 @@ fn main() -> std::io::Result<()> { .then(a.part_number.cmp(&b.part_number)) }); - // Write CSV output println!("\nWriting output to {}...", output_path.display()); write_csv_output(output_path, &results)?; @@ -247,15 +248,14 @@ fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result std::io::Result> { let mut objective_value = 0.0; let mut total_flow = 0.0; - for line in reader.lines() { - let line = line?; - - if line.starts_with("Runtime: ") { - runtime = line.split_whitespace().nth(1).and_then(|s| s.parse().ok()).unwrap_or(0.0); - } else if line.starts_with("Objective Value: ") { - objective_value = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); - } - else if line.starts_with("Total Flow: ") { - total_flow = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); + + + let mut path_weights = Vec::new(); + let mut parsing_paths = false; + +for line in reader.lines() { + let line = line?; + + if line.starts_with("Runtime: ") { + runtime = line.split_whitespace().nth(1).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } else if line.starts_with("Objective Value: ") { + objective_value = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } else if line.starts_with("Total Flow: ") { + total_flow = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } else if line.starts_with("Paths and Weights:") { + parsing_paths = true; + } else if parsing_paths { + if line.trim().is_empty() { + parsing_paths = false; + } else { + // weight is the first whitespace-separated field + if let Some(weight_str) = line.split_whitespace().next() { + if let Ok(weight) = weight_str.parse::() { + path_weights.push(weight); + } + } } } +} if runtime > 0.0 || objective_value > 0.0 || total_flow > 0.0 { let explained_flow = if total_flow > 0.0 { @@ -325,11 +343,15 @@ fn parse_decomp_file(file_path: &Path) -> std::io::Result> { 0.0 }; + let total_weight: f64 = path_weights.iter().sum(); + + Ok(Some(DecompStats { runtime, objective_value, total_flow, explained_flow, + weight: total_weight, })) } else { Ok(None) @@ -380,6 +402,7 @@ fn parse_alignment_file( sinks: 0, total_flow: 0.0, explained_flow: 0.0, + weight: 0.0 }; @@ -457,6 +480,7 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: "Sinks (to 1)", "Total Flow", "Explained Flow", + "Weight", ])?; for stats in results { @@ -483,6 +507,7 @@ fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io:: &stats.sinks.to_string(), &format!("{:.6}", stats.total_flow), &format!("{:.6}", stats.explained_flow), + &format!("{:.6}", stats.weight), ])?; }