diff --git a/Cargo.toml b/Cargo.toml index 546b30f7..57dcfbe0 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,2 @@ [workspace] -members = ["libs/graph_analyze", "libs/super_source_and_sink","libs/FmAssemblyGraph"] +members = ["libs/graph_analyze", "libs/super_source_and_sink","libs/FmAssemblyGraph", "libs/output_scraper"] diff --git a/findviralstrains.smk b/findviralstrains.smk index 4eeac6a1..a8862bda 100755 --- a/findviralstrains.smk +++ b/findviralstrains.smk @@ -84,7 +84,7 @@ fastq_filenames = set(fastq_filenames) # Deletes duplicate file entrys by conver fastq_filenames = list(fastq_filenames) fastq_filenames = [entry for entry in fastq_filenames if entry != ""] # Remake list with only populated values # -print(fastq_filenames) + ###################### ## HELPER FUNCTIONS ## diff --git a/findviralstrains_2.smk b/findviralstrains_2.smk index 1335f356..cadb0e1f 100755 --- a/findviralstrains_2.smk +++ b/findviralstrains_2.smk @@ -289,7 +289,7 @@ rule Rebuild_3: shell: "python3 {input.script} {input.flow} {input.swg} {params.outtemp}" -# Compares our newly constructed genomes to original covid reference using Needleman-Wunsch # +# Compares our newly constructed genomes to original covid reference using waterman-Wunsch # rule Compare_1: input: rebuilt_genome = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_1_of_1.fasta"), @@ -297,7 +297,7 @@ rule Compare_1: output: compar_file = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_1_of_1_vs_ref.txt") shell: - "needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file}" + "water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file}" # Compares genomes from the two path result to the reference # rule Compare_2: @@ -310,8 +310,8 @@ rule Compare_2: compar_file_2 = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_2_of_2_vs_ref.txt") shell: """ - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1} - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2} + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1} + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2} """ # Compares genomes from the three path result to the reference # @@ -327,7 +327,7 @@ rule Compare_3: compar_file_3 = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_3_of_3_vs_ref.txt") shell: """ - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1} - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2} - needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_3} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_3} - """ \ No newline at end of file + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1} + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2} + water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_3} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_3} + """ diff --git a/libs/alignment_vis/alignment_vis.py b/libs/alignment_vis/alignment_vis.py new file mode 100755 index 00000000..b7e17988 --- /dev/null +++ b/libs/alignment_vis/alignment_vis.py @@ -0,0 +1,149 @@ +import re +import os +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle, Patch +from matplotlib.collections import PatchCollection + +def parse_alignment_file(filename): + try: + with open(filename, 'r') as f: + content = f.read() + + identity_match = re.search(r'Identity:\s+(\d+)/(\d+)\s+\(([\d.]+)%\)', content) + identity = float(identity_match.group(3)) if identity_match else 100.0 + + # Get aligned regions and positions + positions = [] + matches = [] + blocks = re.finditer( + r'NC_045512\.2\s+(\d+)\s+([ACGT]+).*?\n\s+([|.]+)\nWeight\s+\d+\s+([ACGT]+)', + content, + re.DOTALL + ) + + for block in blocks: + start = int(block.group(1)) + ref_seq = block.group(2) + match_str = block.group(3) + end = start + len(ref_seq) - 1 + positions.append((start, end)) + matches.append(match_str) + + return positions, matches, identity + except Exception as e: + print(f"Error parsing {filename}: {str(e)}") + return [], [], 0 + +def find_alignments(root_dir): + """Find all alignment files in subgraph directories""" + alignments = {} + + # Walk through subgraph directories + for subgraph in os.listdir(root_dir): + if not subgraph.startswith('subgraph_'): + continue + + subgraph_dir = os.path.join(root_dir, subgraph) + if not os.path.isdir(subgraph_dir): + continue + + # Find all alignment files in this subgraph + for fname in os.listdir(subgraph_dir): + if not fname.endswith('_vs_ref.txt'): + continue + + # Extract the X_of_Y pattern (e.g., "1_of_1") + parts = fname.split('_') + try: + x_of_y = f"{parts[-4]}_of_{parts[-2]}" + except IndexError: + continue + + full_path = os.path.join(subgraph_dir, fname) + alignments.setdefault(x_of_y, []).append((full_path, subgraph)) + + return alignments + +def plot_alignment_group(group_name, files, genome_length=29903, output_dir="."): + """Plot one group of alignments (e.g., all 1_of_1 files)""" + if not files: + return + + fig, ax = plt.subplots(figsize=(15, 2 + len(files) * 0.5)) + + # Gray background for full genome + ax.add_patch(Rectangle((0, 0), genome_length, len(files) + 1, + color='lightgray', alpha=0.3)) + + # Plot each alignment in the group + for i, (file_path, subgraph) in enumerate(sorted(files), 1): + positions, matches, identity = parse_alignment_file(file_path) + if not positions: + continue + + # Create colored segments + patches = [] + for (start, end), match_str in zip(positions, matches): + for pos, char in zip(range(start, end + 1), match_str): + color = (0, 0.8, 0) if char == '|' else (0.8, 0, 0) # Green or red + patches.append(Rectangle((pos, i - 0.4), 1, 0.8, color=color)) + + ax.add_collection(PatchCollection(patches, match_original=True)) + + # Add labels + fname = os.path.basename(file_path).replace('_vs_ref.txt', '') + ax.text(-1500, i, f"{subgraph}/{fname}", ha='right', va='center', fontsize=8) + ax.text(genome_length + 1500, i, f"{identity:.1f}%", ha='left', va='center', fontsize=8) + + # Add genome scale markers + for x in range(0, genome_length + 1, 5000): + ax.axvline(x, color='gray', linestyle=':', alpha=0.5) + if x > 0: + ax.text(x, 0.2, f"{x//1000}kb", ha='center', fontsize=8) + + ax.set_xlim(-2000, genome_length + 2000) + ax.set_ylim(0, len(files) + 1) + ax.set_yticks([]) + ax.set_xlabel("Genomic Position (bp)") + ax.set_title(f"Alignment Group: {group_name.replace('_', ' ')}") + + plt.tight_layout() + output_path = os.path.join(output_dir, f"{group_name}_alignment.pdf") + plt.savefig(output_path, dpi=300, bbox_inches='tight') + plt.close() + print(f"Saved: {output_path}") + +def main(): + import sys + if len(sys.argv) < 2: + print("Usage: python alignment_vis.py [output_dir]") + print("Example: python alignment_vis.py path/to/E1250_S84_L001/") + return + + input_dir = sys.argv[1].rstrip('/') + output_dir = sys.argv[2] if len(sys.argv) > 2 else "alignment_plots" + + if not os.path.exists(input_dir): + print(f"Error: Directory not found - {input_dir}") + return + + # Create output directory + os.makedirs(output_dir, exist_ok=True) + + # Find and group all alignment files + alignments = find_alignments(input_dir) + if not alignments: + print(f"No valid alignment files found in subgraph directories under {input_dir}") + print("Please verify that:") + print("1. The directory contains subgraph_* folders") + print("2. Those subgraphs contain files matching *_X_of_Y_vs_ref.txt") + return + + # Process each group + for group_name, files in alignments.items(): + plot_alignment_group(group_name, files, output_dir=output_dir) + + print(f"\nAll plots saved to: {os.path.abspath(output_dir)}") + +if __name__ == "__main__": + main() diff --git a/libs/compress/compress.py b/libs/compress/compress.py index cd719268..6196344e 100755 --- a/libs/compress/compress.py +++ b/libs/compress/compress.py @@ -128,9 +128,6 @@ def merge_nodes(forward_edges, reverse_edges, edge_seqs, kmer_length): # Find initial merge candidates candidates = find_merge_candidates(forward_edges, reverse_edges) - print(f"Initial candidates: {len(candidates)}") - - for node in candidates: # Get the source and target nodes @@ -233,7 +230,6 @@ def main(): forward_edges, reverse_edges, edge_seqs, kmer_length = read_graph(input_file) # merge the nodes - print("Merging nodes...") merge_nodes(forward_edges, reverse_edges, edge_seqs, kmer_length) @@ -242,10 +238,7 @@ def main(): # - print("Writing merged graph...") write_merged_graph(output_file, forward_edges) - print(f"Merged graph written to {output_file}") - if __name__ == "__main__": main() \ No newline at end of file diff --git a/libs/decompose/kleast_errors.py b/libs/decompose/kleast_errors.py index a8181aaa..f4e4554a 100755 --- a/libs/decompose/kleast_errors.py +++ b/libs/decompose/kleast_errors.py @@ -92,16 +92,19 @@ def create_k_least_graph(graph, paths): return k_least_graph -def save_paths_to_file(paths, output_path, num_paths, runtime, mip_gap, objective_value, multigraph_decomposer=None): +def save_paths_to_file(paths, output_path, num_paths, runtime, objective_value, multigraph_decomposer=None): """Save path information to a text file in the specified format.""" # Calculate total flow through all paths total_flow = sum(paths['weights']) + + # sum of all weights on all edges of original graph + total_weight_graph = sum(data['flow'] for u, v, data in graph.edges(data=True)) with open(output_path, 'w') as f: f.write(f"Decomposition into {num_paths} paths\n") f.write(f"Runtime: {runtime:.2f} seconds\n") - f.write(f"MIP Gap: {mip_gap:.6f}\n") - f.write(f"Objective Value: {objective_value:.6f}\n") + f.write(f"Total Flow: {total_weight_graph}\n") + f.write(f"Objective Value: {objective_value}\n") f.write(f"Number of Paths: {num_paths}\n") f.write("Paths and Weights:\n") @@ -120,7 +123,7 @@ def save_paths_to_file(paths, output_path, num_paths, runtime, mip_gap, objectiv f.write(f"{path_weight:.6f} {path_str}\n") - print(f"INFO: Path details saved to {output_path}") + def draw_labeled_multigraph(G, attr_name, ax=None, decimal_places=2, paths=None): @@ -268,7 +271,7 @@ def visualize_and_save_graph(graph, output_path, num_paths, base_size=10, paths visualization_file = f"{output_path}_visualization.pdf" plt.savefig(visualization_file, dpi=300, bbox_inches='tight') - print(f"INFO: Visualization saved to {visualization_file}") + def get_all_edges_for_node(graph, node): @@ -288,7 +291,7 @@ def get_all_edges_for_node(graph, node): return edges -def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visualize=False): +def generate_output_files(base_output_path, graph, time_limit, threads, max_paths, min_paths=1, visualize=False): """Generate output files for all path counts from max_paths down to min_paths.""" # Extract the base filename without extension base_name = os.path.splitext(base_output_path)[0] @@ -304,16 +307,18 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua edges_to_ignore = get_all_edges_for_node(graph, "0") + get_all_edges_for_node(graph, "1") # Perform k-least errors analysis for current number of paths - k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore) + k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore, time_limit = time_limit, threads = threads) k_least.solve() paths = k_least.get_solution(remove_empty_paths=True) + + + # Get solver statistics runtime = time.time() - start_time - mip_gap = k_least.model.MIPGap if hasattr(k_least, 'model') else 1.0 - objective_value = k_least.model.ObjVal if hasattr(k_least, 'model') else 0.0 - + #mip_gap = k_least.model.MIPGap #if hasattr(k_least, 'model') else 1.0 + objective_value = k_least.get_objective_value() if visualize: # Visualize the graph @@ -330,7 +335,6 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua output_path, num_paths, runtime, - mip_gap, objective_value, multigraph_decomposer=decomposer ) @@ -344,8 +348,5 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua # Read the input graph graph = read_graph_to_networkx(args.input, min_edge_weight=args.mincount) - # Generate output files for all path counts from max_paths down to 1 - generate_output_files(args.output, graph, args.maxpaths, args.minpaths, visualize=args.visualize) - - print("INFO: Processing completed.") \ No newline at end of file + generate_output_files(args.output, graph, args.timelimit, args.threads, args.maxpaths, args.minpaths, visualize=args.visualize) \ No newline at end of file diff --git a/libs/output_scraper/Cargo.toml b/libs/output_scraper/Cargo.toml new file mode 100644 index 00000000..8a7e64fa --- /dev/null +++ b/libs/output_scraper/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "output_scraper" +version = "0.1.0" +edition = "2021" + +[dependencies] +csv = "1.1" diff --git a/libs/output_scraper/src/main.rs b/libs/output_scraper/src/main.rs new file mode 100755 index 00000000..0968081d --- /dev/null +++ b/libs/output_scraper/src/main.rs @@ -0,0 +1,516 @@ +use std::path::{Path, PathBuf}; +use std::fs::{self, File}; +use std::io::{BufRead, BufReader}; +use std::collections::{HashMap, HashSet}; +use csv::Writer; + +#[derive(Debug)] +struct AlignmentStats { + sample_name: String, + subgraph_name: String, + part_number: usize, + total_parts: usize, + length: usize, + identity_pct: f64, + identity_count: usize, + gaps_pct: f64, + gaps_count: usize, + score: f64, + start_position: usize, + end_position: usize, + runtime: f64, + objective_value: f64, + nodes: usize, + edges: usize, + sources: usize, + sinks: usize, + total_flow: f64, + explained_flow: f64, + weight: f64 +} + +#[derive(Debug, Clone)] +struct DecompStats { + runtime: f64, + objective_value: f64, + total_flow: f64, + explained_flow: f64, + weight: f64 +} + +#[derive(Debug, Default)] +struct GraphData { + nodes: HashSet, + edges: usize, + sources: usize, + sinks: usize, +} + +fn main() -> std::io::Result<()> { + let args: Vec = std::env::args().collect(); + if args.len() != 3 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + let input_dir = Path::new(&args[1]); + let decomp_dir = input_dir.join("../decomp_results"); + let graphs_dir = input_dir.join("../graphs"); + let output_path = Path::new(&args[2]); + let mut results = Vec::new(); + + println!("Starting processing with:"); + println!("- Input directory: {}", input_dir.display()); + println!("- Decomp directory: {}", decomp_dir.display()); + println!("- Graphs directory: {}", graphs_dir.display()); + println!("- Output CSV: {}", output_path.display()); + + // First collect all decomp stats in a lookup table + println!("\nBuilding decomp stats map..."); + let decomp_stats_map = build_decomp_stats_map(&decomp_dir)?; + println!("Found {} decomp results", decomp_stats_map.len()); + + // Process each sample directory + println!("\nProcessing sample directories..."); + for sample_entry in fs::read_dir(input_dir)? { + let sample_entry = sample_entry?; + let sample_path = sample_entry.path(); + + if sample_path.is_dir() { + let sample_name = sample_path.file_name() + .unwrap_or_default() + .to_string_lossy() + .to_string(); + + println!("\nProcessing sample: {}", sample_name); + + // Process each subgraph directory in the sample directory + println!("Processing subgraph directories..."); + for subgraph_entry in fs::read_dir(&sample_path)? { + let subgraph_entry = subgraph_entry?; + let subgraph_path = subgraph_entry.path(); + + if subgraph_path.is_dir() { + if let Some(dir_name) = subgraph_path.file_name() { + let subgraph_name = dir_name.to_string_lossy().to_string(); + if subgraph_name.starts_with("subgraph_") { + println!(" Processing subgraph: {}", subgraph_name); + if let Some(mut stats_vec) = process_subgraph_dir(&subgraph_path, &sample_name, &subgraph_name, &graphs_dir)? { + println!(" Found {} alignment files", stats_vec.len()); + add_decomp_stats(&decomp_stats_map, &mut stats_vec); + results.extend(stats_vec); + } + } + } + } + } + + // Also check for files directly in the sample directory + println!("Checking for root-level alignment files..."); + if let Some(mut stats_vec) = process_files_in_dir(&sample_path, &sample_name, "root", &graphs_dir)? { + println!(" Found {} root-level alignment files", stats_vec.len()); + add_decomp_stats(&decomp_stats_map, &mut stats_vec); + results.extend(stats_vec); + } + } + } + + // Sort results by sample, then subgraph, then total parts, then part number + results.sort_by(|a, b| { + a.sample_name.cmp(&b.sample_name) + .then(a.subgraph_name.cmp(&b.subgraph_name)) + .then(a.total_parts.cmp(&b.total_parts)) + .then(a.part_number.cmp(&b.part_number)) + }); + + // Write CSV output + println!("\nWriting output to {}...", output_path.display()); + write_csv_output(output_path, &results)?; + + println!("\nSuccessfully processed {} alignment files, output written to {}", + results.len(), + output_path.display()); + + Ok(()) +} + + + +fn parse_graph_file(file_path: &Path) -> std::io::Result { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + let mut graph_data = GraphData::default(); + + // Skip header line + let mut lines = reader.lines().skip(1); + + while let Some(Ok(line)) = lines.next() { + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + if let (Ok(from_node), Ok(to_node)) = (parts[0].parse::(), parts[1].parse::()) { + graph_data.nodes.insert(from_node); + graph_data.nodes.insert(to_node); + graph_data.edges += 1; + + // Count sources (edges from node 0) + if from_node == 0 { + graph_data.sources += 1; + } + // Count sinks (edges to node 1) + if to_node == 1 { + graph_data.sinks += 1; + } + } + } + } + + Ok(graph_data) +} + +fn process_subgraph_dir(dir: &Path, sample_name: &str, subgraph_name: &str, graphs_dir: &Path) -> std::io::Result>> { + process_files_in_dir(dir, sample_name, subgraph_name, graphs_dir) +} + +fn process_files_in_dir(dir: &Path, sample_name: &str, subgraph_name: &str, graphs_dir: &Path) -> std::io::Result>> { + let mut stats_vec = Vec::new(); + + println!(" Scanning directory: {}", dir.display()); + + for entry in fs::read_dir(dir)? { + let entry = entry?; + let path = entry.path(); + + if path.is_file() { + if let Some(file_name) = path.file_name() { + let file_name = file_name.to_string_lossy(); + if file_name.ends_with("_vs_ref.txt") { + println!(" Found alignment file: {}", file_name); + let part_numbers = extract_part_numbers(&file_name); + + let mut stats = parse_alignment_file( + &path, + sample_name.to_string(), + subgraph_name.to_string(), + part_numbers + )?; + + // Find and parse graph file in the new format: .super_.dbg + let subgraph_num = subgraph_name.trim_start_matches("subgraph_"); + let graph_file_path = graphs_dir.join(format!("{}.super_{}.dbg", sample_name, subgraph_num)); + + if graph_file_path.exists() { + println!(" Parsing graph file: {}", graph_file_path.display()); + let graph_data = parse_graph_file(&graph_file_path)?; + stats.nodes = graph_data.nodes.len(); + stats.edges = graph_data.edges; + stats.sources = graph_data.sources; + stats.sinks = graph_data.sinks; + + println!(" Nodes: {}, Edges: {}, Sources (from 0): {}, Sinks (to 1): {}", + stats.nodes, stats.edges, stats.sources, stats.sinks); + } else { + println!(" Graph file not found: {}", graph_file_path.display()); + } + + println!(" Part {}/{}: length={}, identity={:.1}%, gaps={:.1}%", + stats.part_number, stats.total_parts, stats.length, + stats.identity_pct, stats.gaps_pct); + + stats_vec.push(stats); + } + } + } + } + + if stats_vec.is_empty() { + println!(" No alignment files found"); + Ok(None) + } else { + Ok(Some(stats_vec)) + } +} + +fn build_decomp_stats_map(decomp_dir: &Path) -> std::io::Result> { + let mut map = HashMap::new(); + + println!("Scanning decomp directory: {}", decomp_dir.display()); + + for entry in fs::read_dir(decomp_dir)? { + let entry = entry?; + let path = entry.path(); + + if let Some(file_name) = path.file_name() { + let file_name = file_name.to_string_lossy(); + if file_name.ends_with(".paths") { + println!(" Found decomp file: {}", file_name); + if let Some((sample_name, subgraph_name, total_parts)) = parse_decomp_filename(&file_name) { + println!(" Sample: {}, Subgraph: {}, Total Parts: {}", + sample_name, subgraph_name, total_parts); + + if let Some(stats) = parse_decomp_file(&path)? { + println!(" Runtime: {:.4}s, Objective: {:.6}, Total Flow: {:.6}, Explained Flow: {:.6}, Weight: {:.6}", + stats.runtime, stats.objective_value, stats.total_flow, stats.explained_flow, stats.weight); + map.insert((sample_name, subgraph_name, total_parts), stats); + } + } + } + } + } + Ok(map) +} + +fn parse_decomp_filename(filename: &str) -> Option<(String, String, usize)> { + let parts: Vec<&str> = filename.split('_').collect(); + if parts.len() >= 4 { + let sample_end = parts.len() - 3; + let sample_name = parts[..sample_end].join("_"); + let subgraph_name = format!("{}_{}", parts[sample_end], parts[sample_end + 1]); + + // Extract total parts from filename (assuming format like "XXX_YYY_Z.paths") + let total_parts = parts.last() + .and_then(|s| s.split('.').next()) + .and_then(|s| s.parse().ok()) + .unwrap_or(1); + + return Some((sample_name, subgraph_name, total_parts)); + } + None +} + +fn add_decomp_stats( + decomp_stats_map: &HashMap<(String, String, usize), DecompStats>, + stats_vec: &mut Vec +) { + for stat in stats_vec { + let key = ( + stat.sample_name.clone(), + stat.subgraph_name.clone(), + stat.total_parts + ); + if let Some(decomp_stats) = decomp_stats_map.get(&key) { + stat.runtime = decomp_stats.runtime; + stat.objective_value = decomp_stats.objective_value; + stat.total_flow = decomp_stats.total_flow; + stat.explained_flow = decomp_stats.explained_flow; + stat.weight = decomp_stats.weight; + + + } + } +} +fn parse_decomp_file(file_path: &Path) -> std::io::Result> { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + + let mut runtime = 0.0; + let mut objective_value = 0.0; + let mut total_flow = 0.0; + + + + let mut path_weights = Vec::new(); + let mut parsing_paths = false; + +for line in reader.lines() { + let line = line?; + + if line.starts_with("Runtime: ") { + runtime = line.split_whitespace().nth(1).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } else if line.starts_with("Objective Value: ") { + objective_value = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } else if line.starts_with("Total Flow: ") { + total_flow = line.split_whitespace().nth(2).and_then(|s| s.parse().ok()).unwrap_or(0.0); + } else if line.starts_with("Paths and Weights:") { + parsing_paths = true; + } else if parsing_paths { + if line.trim().is_empty() { + parsing_paths = false; + } else { + // weight is the first whitespace-separated field + if let Some(weight_str) = line.split_whitespace().next() { + if let Ok(weight) = weight_str.parse::() { + path_weights.push(weight); + } + } + } + } +} + + if runtime > 0.0 || objective_value > 0.0 || total_flow > 0.0 { + let explained_flow = if total_flow > 0.0 { + (total_flow - objective_value) / total_flow + } else { + 0.0 + }; + + let total_weight: f64 = path_weights.iter().sum(); + + + Ok(Some(DecompStats { + runtime, + objective_value, + total_flow, + explained_flow, + weight: total_weight, + })) + } else { + Ok(None) + } +} +fn extract_part_numbers(filename: &str) -> (usize, usize) { + let parts: Vec<&str> = filename.split('_').collect(); + for i in 0..parts.len() { + if parts[i] == "of" && i > 0 && i < parts.len() - 1 { + if let (Ok(current), Ok(total)) = ( + parts[i-1].parse::(), + parts[i+1].parse::(), + ) { + return (current, total); + } + } + } + (1, 1) +} + +fn parse_alignment_file( + file_path: &Path, + sample_name: String, + subgraph_name: String, + part_numbers: (usize, usize) +) -> std::io::Result { + let file = File::open(file_path)?; + let reader = BufReader::new(file); + + let mut stats = AlignmentStats { + sample_name, + subgraph_name, + part_number: part_numbers.0, + total_parts: part_numbers.1, + length: 0, + identity_pct: 0.0, + identity_count: 0, + gaps_pct: 0.0, + gaps_count: 0, + score: 0.0, + start_position: 0, + end_position: 0, + runtime: 0.0, + objective_value: 0.0, + nodes: 0, + edges: 0, + sources: 0, + sinks: 0, + total_flow: 0.0, + explained_flow: 0.0, + weight: 0.0 + + }; + + for line in reader.lines() { + let line = line?; + + if line.starts_with("# Length: ") { + stats.length = line[10..].trim().parse().unwrap_or(0); + } + else if line.starts_with("# Identity: ") { + let identity_str = line[12..].trim(); + stats.identity_pct = parse_percentage(identity_str); + stats.identity_count = parse_count(identity_str); + } + else if line.starts_with("# Gaps: ") { + let gaps_str = line[8..].trim(); + stats.gaps_pct = parse_percentage(gaps_str); + stats.gaps_count = parse_count(gaps_str); + } + else if line.starts_with("# Score: ") { + stats.score = line[9..].trim().parse().unwrap_or(0.0); + } + else if line.starts_with("NC_045512.2") { + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + if stats.start_position == 0 { + stats.start_position = parts[1].parse().unwrap_or(0); + } + stats.end_position = parts.last().and_then(|s| s.parse().ok()).unwrap_or(0); + } + } + } + + Ok(stats) +} + +fn parse_percentage(s: &str) -> f64 { + s.split('(').nth(1) + .and_then(|s| s.split('%').next()) + .and_then(|s| s.trim().parse().ok()) + .unwrap_or(0.0) +} + +fn parse_count(s: &str) -> usize { + s.split('/').next() + .and_then(|s| s.trim().parse().ok()) + .unwrap_or(0) +} + + + + +fn write_csv_output(output_path: &Path, results: &[AlignmentStats]) -> std::io::Result<()> { + let mut writer = Writer::from_path(output_path)?; + + writer.write_record(&[ + "Sample", + "Subgraph", + "Path", + "Total Paths", + "Length", + "Identity %", + "Identity Count", + "Gaps %", + "Gaps Count", + "Score", + "Start Position", + "End Position", + "Alignment Length", + "Runtime (s)", + "Objective Value", + "Nodes", + "Edges", + "Sources (from 0)", + "Sinks (to 1)", + "Total Flow", + "Explained Flow", + "Weight", + ])?; + + for stats in results { + let alignment_length = stats.end_position - stats.start_position + 1; + writer.write_record(&[ + &stats.sample_name, + &stats.subgraph_name, + &stats.part_number.to_string(), + &stats.total_parts.to_string(), + &stats.length.to_string(), + &format!("{:.1}", stats.identity_pct), + &stats.identity_count.to_string(), + &format!("{:.1}", stats.gaps_pct), + &stats.gaps_count.to_string(), + &format!("{:.1}", stats.score), + &stats.start_position.to_string(), + &stats.end_position.to_string(), + &alignment_length.to_string(), + &format!("{:.4}", stats.runtime), + &format!("{:.6}", stats.objective_value), + &stats.nodes.to_string(), + &stats.edges.to_string(), + &stats.sources.to_string(), + &stats.sinks.to_string(), + &format!("{:.6}", stats.total_flow), + &format!("{:.6}", stats.explained_flow), + &format!("{:.6}", stats.weight), + ])?; + } + + writer.flush()?; + Ok(()) +} diff --git a/libs/rebuild/rebuild.py b/libs/rebuild/rebuild.py index b775193d..afcb5ebe 100755 --- a/libs/rebuild/rebuild.py +++ b/libs/rebuild/rebuild.py @@ -72,7 +72,6 @@ def main(path_file, edge_file, bd_outfile): print(f"Skipping path - not enough nodes: {nodes}") continue - print(f"\nProcessing path {counter} of {total_paths}:") genome = "" is_first_node = True @@ -83,16 +82,13 @@ def main(path_file, edge_file, bd_outfile): # Check if this is a special source/sink edge if (from_node, to_node) in special_edges: - print(f"Edge {from_node}->{to_node}: special source/sink edge - no sequence added") continue # Try forward direction first if to_node in sequences.get(from_node, {}): sequence = sequences[from_node][to_node] - print(f"Edge {from_node}->{to_node}: found sequence (length {len(sequence)})") if not is_first_node and len(sequence) > 27: sequence = sequence[27:] - print(f" Trimmed to {len(sequence)} bases") genome += sequence else: # Try reverse complement @@ -100,13 +96,10 @@ def main(path_file, edge_file, bd_outfile): rev_to = from_node if rev_to in sequences.get(rev_from, {}): sequence = reverse_complement(sequences[rev_from][rev_to]) - print(f"Edge {from_node}->{to_node}: found reverse complement (length {len(sequence)})") if not is_first_node and len(sequence) > 27: sequence = sequence[27:] - print(f" Trimmed to {len(sequence)} bases") genome += sequence else: - print(f"WARNING: Edge {from_node}->{to_node} not found in either direction") # Add gap of Ns proportional to expected length gap_size = 100 if (from_node == '0' or to_node == '1') else 30 genome += "N" * gap_size @@ -117,7 +110,6 @@ def main(path_file, edge_file, bd_outfile): output_file = f"{bd_outfile.rsplit('.', 1)[0]}_{counter}_of_{total_paths}.fasta" with open(output_file, 'w') as out_f: out_f.write(f">Weight: {weight}\n{genome}\n") - print(f"Generated {output_file} with {len(genome)} bases") counter += 1 diff --git a/libs/super_source_and_sink/src/main.rs b/libs/super_source_and_sink/src/main.rs index 4d3e914f..81adbbd5 100755 --- a/libs/super_source_and_sink/src/main.rs +++ b/libs/super_source_and_sink/src/main.rs @@ -140,8 +140,5 @@ fn main() { ) .expect("unable to create super sources and sinks"); - println!( - "New nodes and edges with weights written to: {}", - output_file_path.display() - ); + } \ No newline at end of file