Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7d3b7aa
Add scraper for output
hypercomrade Jun 13, 2025
5d5d365
Merge branch 'Add_output_summary' of https://github.com/UM-Applied-Al…
joserod0704 Jun 13, 2025
265e8fb
Change needle to waterman command
hypercomrade Jun 16, 2025
a166078
Merge pull request #26 from UM-Applied-Algorithms-Lab/Waterman
hypercomrade Jun 18, 2025
37ad2a2
Add initial version of output scraper (to be changed)
hypercomrade Jun 18, 2025
8725919
Change output to csv
hypercomrade Jun 18, 2025
cc1287b
Change csv to include # of paths:
hypercomrade Jun 18, 2025
1cb4ab6
Change to run on all samples and all subgraphs
hypercomrade Jun 19, 2025
ae9ed33
Change csv header and sort output
hypercomrade Jun 19, 2025
a7da5a5
Add runtime and objective values
hypercomrade Jun 19, 2025
28acbc4
Change where decomp time is pulled from
hypercomrade Jun 20, 2025
226d9ee
Fixed objective value in kleast_errors.py
joserod0704 Jun 20, 2025
166f996
Merge branch 'dev-main' of https://github.com/UM-Applied-Algorithms-L…
joserod0704 Jun 20, 2025
bd109ac
Fixed data type for objective value in kleasterrors.py
joserod0704 Jun 20, 2025
f445b82
Merge pull request #27 from UM-Applied-Algorithms-Lab/Add_output_summary
hypercomrade Jun 20, 2025
7ce8be8
Change permissions
hypercomrade Jun 23, 2025
6002937
Edit output scraper to count nodes and edges
hypercomrade Jun 26, 2025
caf3469
Change which file the scraper is looking at
hypercomrade Jun 26, 2025
ce0a9ab
Merge pull request #28 from UM-Applied-Algorithms-Lab/Add_output_summary
hypercomrade Jun 26, 2025
f3be2c9
Cleaned up output in pipeline
joserod0704 Jun 27, 2025
7754144
Merge pull request #29 from UM-Applied-Algorithms-Lab/clean_output
hypercomrade Jun 27, 2025
f8ed7be
Added time limit arg to kleast_errors.py for solver
joserod0704 Jun 27, 2025
8b9f572
Added threads to be passed to flowpaths in kleast_errors.py
joserod0704 Jun 27, 2025
66004b4
Merge pull request #30 from UM-Applied-Algorithms-Lab/clean_output
hypercomrade Jul 1, 2025
82f0902
Change scraper to grab separate decomp files
hypercomrade Jul 2, 2025
b7d9347
Create alignment visualizer
hypercomrade Jul 3, 2025
96d5b95
First draft out alignment vis
hypercomrade Jul 3, 2025
a595a75
Prints out total weight of all edges in a graph
joserod0704 Jul 8, 2025
2a9c594
added total flow for entire graph to the csv for stats
joserod0704 Jul 8, 2025
460ccdf
Merge pull request #31 from UM-Applied-Algorithms-Lab/Add_Alignment_Vis
hypercomrade Jul 9, 2025
c43de57
Added column to csv file of explained flow after decomposition
joserod0704 Jul 9, 2025
62e60e9
Scrapes weight of paths for table of output:
joserod0704 Jul 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[workspace]
members = ["libs/graph_analyze", "libs/super_source_and_sink","libs/FmAssemblyGraph"]
members = ["libs/graph_analyze", "libs/super_source_and_sink","libs/FmAssemblyGraph", "libs/output_scraper"]
2 changes: 1 addition & 1 deletion findviralstrains.smk
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ fastq_filenames = set(fastq_filenames) # Deletes duplicate file entrys by conver
fastq_filenames = list(fastq_filenames)

fastq_filenames = [entry for entry in fastq_filenames if entry != ""] # Remake list with only populated values #
print(fastq_filenames)


######################
## HELPER FUNCTIONS ##
Expand Down
16 changes: 8 additions & 8 deletions findviralstrains_2.smk
Original file line number Diff line number Diff line change
Expand Up @@ -289,15 +289,15 @@ rule Rebuild_3:
shell:
"python3 {input.script} {input.flow} {input.swg} {params.outtemp}"

# Compares our newly constructed genomes to original covid reference using Needleman-Wunsch #
# Compares our newly constructed genomes to original covid reference using Smith-Waterman #
rule Compare_1:
input:
rebuilt_genome = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_1_of_1.fasta"),
origin_covid = "reference_genomes/covid19ref.fasta"
output:
compar_file = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_1_of_1_vs_ref.txt")
shell:
"needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file}"
"water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file}"

# Compares genomes from the two path result to the reference #
rule Compare_2:
Expand All @@ -310,8 +310,8 @@ rule Compare_2:
compar_file_2 = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_2_of_2_vs_ref.txt")
shell:
"""
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1}
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2}
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1}
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2}
"""

# Compares genomes from the three path result to the reference #
Expand All @@ -327,7 +327,7 @@ rule Compare_3:
compar_file_3 = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_3_of_3_vs_ref.txt")
shell:
"""
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1}
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2}
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_3} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_3}
"""
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1}
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2}
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_3} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_3}
"""
149 changes: 149 additions & 0 deletions libs/alignment_vis/alignment_vis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import re
import os
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle, Patch
from matplotlib.collections import PatchCollection

def parse_alignment_file(filename):
    """Parse an EMBOSS ``water``/``needle`` alignment output file.

    Extracts the reported percent identity and every aligned reference
    block: its 1-based start/end coordinates plus the match/mismatch
    character string ('|' = match, '.' = mismatch).

    Args:
        filename: Path to a ``*_vs_ref.txt`` alignment file.

    Returns:
        Tuple ``(positions, matches, identity)`` where ``positions`` is a
        list of ``(start, end)`` inclusive reference coordinates,
        ``matches`` is the parallel list of match strings, and
        ``identity`` is the percent identity (100.0 when no Identity
        line is present; ``([], [], 0)`` on any read/parse error).
    """
    try:
        with open(filename, 'r') as f:
            content = f.read()

        # e.g. "Identity:  29000/29903 (97.0%)" -> capture the percentage
        identity_match = re.search(r'Identity:\s+(\d+)/(\d+)\s+\(([\d.]+)%\)', content)
        identity = float(identity_match.group(3)) if identity_match else 100.0

        # Each alignment block: reference line, match-symbol line, then the
        # rebuilt-genome ("Weight") line.  NOTE(review): the pattern is
        # hard-wired to reference NC_045512.2 and ACGT-only sequences.
        positions = []
        matches = []
        blocks = re.finditer(
            r'NC_045512\.2\s+(\d+)\s+([ACGT]+).*?\n\s+([|.]+)\nWeight\s+\d+\s+([ACGT]+)',
            content,
            re.DOTALL
        )

        for block in blocks:
            start = int(block.group(1))
            ref_seq = block.group(2)
            match_str = block.group(3)
            end = start + len(ref_seq) - 1
            positions.append((start, end))
            matches.append(match_str)

        return positions, matches, identity
    except Exception as e:
        # BUG FIX: report the actual file path instead of the literal
        # "(unknown)" placeholder that was left in the message.
        print(f"Error parsing {filename}: {str(e)}")
        return [], [], 0

def find_alignments(root_dir):
    """Find all alignment files in subgraph directories.

    Scans every ``subgraph_*`` directory under *root_dir* for files named
    ``*_X_of_Y_vs_ref.txt`` and groups them by their ``X_of_Y`` label.

    Args:
        root_dir: Sample directory containing ``subgraph_*`` folders.

    Returns:
        Dict mapping ``"X_of_Y"`` -> list of ``(file_path, subgraph_name)``.
    """
    alignments = {}

    # BUG FIX: the old code built the key as parts[-4]_of_parts[-2] from
    # fname.split('_'), but the trailing "_vs_ref.txt" shifts those indices
    # so every file produced the key "of_of_vs" and all alignments landed
    # in a single group.  Match the X_of_Y label explicitly instead.
    group_pattern = re.compile(r'(\d+)_of_(\d+)_vs_ref\.txt$')

    # Walk through subgraph directories
    for subgraph in os.listdir(root_dir):
        if not subgraph.startswith('subgraph_'):
            continue

        subgraph_dir = os.path.join(root_dir, subgraph)
        if not os.path.isdir(subgraph_dir):
            continue

        # Find all alignment files in this subgraph
        for fname in os.listdir(subgraph_dir):
            if not fname.endswith('_vs_ref.txt'):
                continue

            # Extract the X_of_Y pattern (e.g., "1_of_1")
            match = group_pattern.search(fname)
            if match is None:
                continue
            x_of_y = f"{match.group(1)}_of_{match.group(2)}"

            full_path = os.path.join(subgraph_dir, fname)
            alignments.setdefault(x_of_y, []).append((full_path, subgraph))

    return alignments

def plot_alignment_group(group_name, files, genome_length=29903, output_dir="."):
    """Render one group of alignments (e.g., every 1_of_1 file) to a PDF.

    Each alignment occupies one horizontal track on a gray full-genome
    backdrop; matched bases are drawn green and mismatched bases red, with
    the percent identity annotated at the right edge of each track.
    """
    if not files:
        return

    track_count = len(files)
    fig, ax = plt.subplots(figsize=(15, 2 + track_count * 0.5))

    # Light gray backdrop spanning the whole genome.
    ax.add_patch(Rectangle((0, 0), genome_length, track_count + 1,
                           color='lightgray', alpha=0.3))

    # One track per alignment file, bottom to top.
    for row, (file_path, subgraph) in enumerate(sorted(files), 1):
        positions, matches, identity = parse_alignment_file(file_path)
        if not positions:
            continue

        # One 1-bp-wide rectangle per aligned base, colored by match state.
        segments = []
        for (start, end), match_str in zip(positions, matches):
            for base_pos, symbol in zip(range(start, end + 1), match_str):
                rgb = (0, 0.8, 0) if symbol == '|' else (0.8, 0, 0)  # Green or red
                segments.append(Rectangle((base_pos, row - 0.4), 1, 0.8, color=rgb))

        ax.add_collection(PatchCollection(segments, match_original=True))

        # Track label on the left, percent identity on the right.
        label = os.path.basename(file_path).replace('_vs_ref.txt', '')
        ax.text(-1500, row, f"{subgraph}/{label}", ha='right', va='center', fontsize=8)
        ax.text(genome_length + 1500, row, f"{identity:.1f}%", ha='left', va='center', fontsize=8)

    # Dotted gridlines every 5 kb with position labels.
    for x in range(0, genome_length + 1, 5000):
        ax.axvline(x, color='gray', linestyle=':', alpha=0.5)
        if x > 0:
            ax.text(x, 0.2, f"{x//1000}kb", ha='center', fontsize=8)

    ax.set_xlim(-2000, genome_length + 2000)
    ax.set_ylim(0, track_count + 1)
    ax.set_yticks([])
    ax.set_xlabel("Genomic Position (bp)")
    ax.set_title(f"Alignment Group: {group_name.replace('_', ' ')}")

    plt.tight_layout()
    output_path = os.path.join(output_dir, f"{group_name}_alignment.pdf")
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved: {output_path}")

def main():
    """CLI entry point: locate alignment files under a sample directory
    and write one plot per X_of_Y group into the output directory."""
    import sys

    # Guard clause: require at least the input directory argument.
    if len(sys.argv) < 2:
        print("Usage: python alignment_vis.py <input_dir> [output_dir]")
        print("Example: python alignment_vis.py path/to/E1250_S84_L001/")
        return

    input_dir = sys.argv[1].rstrip('/')
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "alignment_plots"

    if not os.path.exists(input_dir):
        print(f"Error: Directory not found - {input_dir}")
        return

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Find and group all alignment files
    alignments = find_alignments(input_dir)
    if not alignments:
        print(f"No valid alignment files found in subgraph directories under {input_dir}")
        print("Please verify that:")
        print("1. The directory contains subgraph_* folders")
        print("2. Those subgraphs contain files matching *_X_of_Y_vs_ref.txt")
        return

    # Render every group that was found.
    for group_name, files in alignments.items():
        plot_alignment_group(group_name, files, output_dir=output_dir)

    print(f"\nAll plots saved to: {os.path.abspath(output_dir)}")

if __name__ == "__main__":
    main()
7 changes: 0 additions & 7 deletions libs/compress/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,6 @@ def merge_nodes(forward_edges, reverse_edges, edge_seqs, kmer_length):
# Find initial merge candidates
candidates = find_merge_candidates(forward_edges, reverse_edges)

print(f"Initial candidates: {len(candidates)}")


for node in candidates:

# Get the source and target nodes
Expand Down Expand Up @@ -233,7 +230,6 @@ def main():
forward_edges, reverse_edges, edge_seqs, kmer_length = read_graph(input_file)

# merge the nodes
print("Merging nodes...")
merge_nodes(forward_edges, reverse_edges, edge_seqs, kmer_length)


Expand All @@ -242,10 +238,7 @@ def main():


#
print("Writing merged graph...")
write_merged_graph(output_file, forward_edges)

print(f"Merged graph written to {output_file}")

if __name__ == "__main__":
main()
31 changes: 16 additions & 15 deletions libs/decompose/kleast_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,19 @@ def create_k_least_graph(graph, paths):

return k_least_graph

def save_paths_to_file(paths, output_path, num_paths, runtime, mip_gap, objective_value, multigraph_decomposer=None):
def save_paths_to_file(paths, output_path, num_paths, runtime, objective_value, multigraph_decomposer=None):
"""Save path information to a text file in the specified format."""
# Calculate total flow through all paths
total_flow = sum(paths['weights'])

# sum of all weights on all edges of original graph
total_weight_graph = sum(data['flow'] for u, v, data in graph.edges(data=True))

with open(output_path, 'w') as f:
f.write(f"Decomposition into {num_paths} paths\n")
f.write(f"Runtime: {runtime:.2f} seconds\n")
f.write(f"MIP Gap: {mip_gap:.6f}\n")
f.write(f"Objective Value: {objective_value:.6f}\n")
f.write(f"Total Flow: {total_weight_graph}\n")
f.write(f"Objective Value: {objective_value}\n")
f.write(f"Number of Paths: {num_paths}\n")
f.write("Paths and Weights:\n")

Expand All @@ -120,7 +123,7 @@ def save_paths_to_file(paths, output_path, num_paths, runtime, mip_gap, objectiv

f.write(f"{path_weight:.6f} {path_str}\n")

print(f"INFO: Path details saved to {output_path}")



def draw_labeled_multigraph(G, attr_name, ax=None, decimal_places=2, paths=None):
Expand Down Expand Up @@ -268,7 +271,7 @@ def visualize_and_save_graph(graph, output_path, num_paths, base_size=10, paths

visualization_file = f"{output_path}_visualization.pdf"
plt.savefig(visualization_file, dpi=300, bbox_inches='tight')
print(f"INFO: Visualization saved to {visualization_file}")



def get_all_edges_for_node(graph, node):
Expand All @@ -288,7 +291,7 @@ def get_all_edges_for_node(graph, node):
return edges


def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visualize=False):
def generate_output_files(base_output_path, graph, time_limit, threads, max_paths, min_paths=1, visualize=False):
"""Generate output files for all path counts from max_paths down to min_paths."""
# Extract the base filename without extension
base_name = os.path.splitext(base_output_path)[0]
Expand All @@ -304,16 +307,18 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua
edges_to_ignore = get_all_edges_for_node(graph, "0") + get_all_edges_for_node(graph, "1")

# Perform k-least errors analysis for current number of paths
k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore)
k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore, time_limit = time_limit, threads = threads)
k_least.solve()
paths = k_least.get_solution(remove_empty_paths=True)





# Get solver statistics
runtime = time.time() - start_time
mip_gap = k_least.model.MIPGap if hasattr(k_least, 'model') else 1.0
objective_value = k_least.model.ObjVal if hasattr(k_least, 'model') else 0.0

#mip_gap = k_least.model.MIPGap #if hasattr(k_least, 'model') else 1.0
objective_value = k_least.get_objective_value()

if visualize:
# Visualize the graph
Expand All @@ -330,7 +335,6 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua
output_path,
num_paths,
runtime,
mip_gap,
objective_value,
multigraph_decomposer=decomposer
)
Expand All @@ -344,8 +348,5 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua
# Read the input graph
graph = read_graph_to_networkx(args.input, min_edge_weight=args.mincount)


# Generate output files for all path counts from max_paths down to 1
generate_output_files(args.output, graph, args.maxpaths, args.minpaths, visualize=args.visualize)

print("INFO: Processing completed.")
generate_output_files(args.output, graph, args.timelimit, args.threads, args.maxpaths, args.minpaths, visualize=args.visualize)
7 changes: 7 additions & 0 deletions libs/output_scraper/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[package]
name = "output_scraper"
version = "0.1.0"
edition = "2021"

[dependencies]
csv = "1.1"
Loading
Loading