Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7d3b7aa
Add scraper for output
hypercomrade Jun 13, 2025
5d5d365
Merge branch 'Add_output_summary' of https://github.com/UM-Applied-Al…
joserod0704 Jun 13, 2025
265e8fb
Change needle to waterman command
hypercomrade Jun 16, 2025
a166078
Merge pull request #26 from UM-Applied-Algorithms-Lab/Waterman
hypercomrade Jun 18, 2025
37ad2a2
Add initial version of output scraper (to be changed)
hypercomrade Jun 18, 2025
8725919
Change output to csv
hypercomrade Jun 18, 2025
cc1287b
Change csv to include # of paths:
hypercomrade Jun 18, 2025
1cb4ab6
Change to run on all samples and all subgraphs
hypercomrade Jun 19, 2025
ae9ed33
Change csv header and sort output
hypercomrade Jun 19, 2025
a7da5a5
Add runtime and objective values
hypercomrade Jun 19, 2025
28acbc4
Change where decomp time is pulled from
hypercomrade Jun 20, 2025
226d9ee
Fixed objective value in kleast_errors.py
joserod0704 Jun 20, 2025
166f996
Merge branch 'dev-main' of https://github.com/UM-Applied-Algorithms-L…
joserod0704 Jun 20, 2025
bd109ac
Fixed data type for objective value in kleasterrors.py
joserod0704 Jun 20, 2025
f445b82
Merge pull request #27 from UM-Applied-Algorithms-Lab/Add_output_summary
hypercomrade Jun 20, 2025
7ce8be8
Change permissions
hypercomrade Jun 23, 2025
6002937
Edit output scraper to count nodes and edges
hypercomrade Jun 26, 2025
caf3469
Change which file the scraper is looking at
hypercomrade Jun 26, 2025
ce0a9ab
Merge pull request #28 from UM-Applied-Algorithms-Lab/Add_output_summary
hypercomrade Jun 26, 2025
f3be2c9
Cleaned up output in pipeline
joserod0704 Jun 27, 2025
7754144
Merge pull request #29 from UM-Applied-Algorithms-Lab/clean_output
hypercomrade Jun 27, 2025
f8ed7be
Added time limit arg to kleast_errors.py for solver
joserod0704 Jun 27, 2025
8b9f572
Added threads to be passed to flowpaths in kleast_errors.py
joserod0704 Jun 27, 2025
66004b4
Merge pull request #30 from UM-Applied-Algorithms-Lab/clean_output
hypercomrade Jul 1, 2025
82f0902
Change scraper to grab separate decomp files
hypercomrade Jul 2, 2025
b7d9347
Create alignment visualizer
hypercomrade Jul 3, 2025
96d5b95
First draft out alignment vis
hypercomrade Jul 3, 2025
a595a75
Prints out total weight of all edges in a graph
joserod0704 Jul 8, 2025
2a9c594
added total flow for entire graph to the csv for stats
joserod0704 Jul 8, 2025
460ccdf
Merge pull request #31 from UM-Applied-Algorithms-Lab/Add_Alignment_Vis
hypercomrade Jul 9, 2025
c43de57
Added column to csv file of explained flow after decomposition
joserod0704 Jul 9, 2025
62e60e9
Scrapes weight of paths for table of output:
joserod0704 Jul 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[workspace]
members = ["libs/graph_analyze", "libs/super_source_and_sink","libs/FmAssemblyGraph"]
members = ["libs/graph_analyze", "libs/super_source_and_sink","libs/FmAssemblyGraph", "libs/output_scraper"]
2 changes: 1 addition & 1 deletion findviralstrains.smk
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ fastq_filenames = set(fastq_filenames) # Deletes duplicate file entrys by conver
fastq_filenames = list(fastq_filenames)

fastq_filenames = [entry for entry in fastq_filenames if entry != ""] # Remake list with only populated values #
print(fastq_filenames)


######################
## HELPER FUNCTIONS ##
Expand Down
16 changes: 8 additions & 8 deletions findviralstrains_2.smk
Original file line number Diff line number Diff line change
Expand Up @@ -289,15 +289,15 @@ rule Rebuild_3:
shell:
"python3 {input.script} {input.flow} {input.swg} {params.outtemp}"

# Compares our newly constructed genomes to original covid reference using Needleman-Wunsch #
# Compares our newly constructed genomes to original covid reference using Smith-Waterman #
rule Compare_1:
input:
rebuilt_genome = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_1_of_1.fasta"),
origin_covid = "reference_genomes/covid19ref.fasta"
output:
compar_file = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_1_of_1_vs_ref.txt")
shell:
"needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file}"
"water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file}"

# Compares genomes from the two path result to the reference #
rule Compare_2:
Expand All @@ -310,8 +310,8 @@ rule Compare_2:
compar_file_2 = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_2_of_2_vs_ref.txt")
shell:
"""
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1}
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2}
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1}
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2}
"""

# Compares genomes from the three path result to the reference #
Expand All @@ -327,7 +327,7 @@ rule Compare_3:
compar_file_3 = bd("output_genomes/{sample}/subgraph_{subgraph}/{sample}_3_of_3_vs_ref.txt")
shell:
"""
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1}
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2}
needle -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_3} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_3}
"""
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_1} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_1}
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_2} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_2}
water -asequence {input.origin_covid} -bsequence {input.rebuilt_genome_3} -gapopen 10 -gapextend 0.5 -outfile {output.compar_file_3}
"""
149 changes: 149 additions & 0 deletions libs/alignment_vis/alignment_vis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import re
import os
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle, Patch
from matplotlib.collections import PatchCollection

def parse_alignment_file(filename):
    """Parse an EMBOSS ``water``/``needle`` alignment output file.

    Extracts the reported percent identity and every aligned reference
    block: its 1-based start/end coordinates plus the match/mismatch
    character string ('|' = match, '.' = mismatch).

    Args:
        filename: Path to a ``*_vs_ref.txt`` alignment file.

    Returns:
        Tuple ``(positions, matches, identity)`` where ``positions`` is a
        list of ``(start, end)`` inclusive reference coordinates,
        ``matches`` is the parallel list of match strings, and
        ``identity`` is the percent identity (100.0 when no Identity
        line is present; ``([], [], 0)`` on any read/parse error).
    """
    try:
        with open(filename, 'r') as f:
            content = f.read()

        # e.g. "Identity:  29000/29903 (97.0%)" -> capture the percentage
        identity_match = re.search(r'Identity:\s+(\d+)/(\d+)\s+\(([\d.]+)%\)', content)
        identity = float(identity_match.group(3)) if identity_match else 100.0

        # Each alignment block: reference line, match-symbol line, then the
        # rebuilt-genome ("Weight") line.  NOTE(review): the pattern is
        # hard-wired to reference NC_045512.2 and ACGT-only sequences.
        positions = []
        matches = []
        blocks = re.finditer(
            r'NC_045512\.2\s+(\d+)\s+([ACGT]+).*?\n\s+([|.]+)\nWeight\s+\d+\s+([ACGT]+)',
            content,
            re.DOTALL
        )

        for block in blocks:
            start = int(block.group(1))
            ref_seq = block.group(2)
            match_str = block.group(3)
            end = start + len(ref_seq) - 1
            positions.append((start, end))
            matches.append(match_str)

        return positions, matches, identity
    except Exception as e:
        # BUG FIX: report the actual file path instead of the literal
        # "(unknown)" placeholder that was left in the message.
        print(f"Error parsing {filename}: {str(e)}")
        return [], [], 0

def find_alignments(root_dir):
    """Find all alignment files in subgraph directories.

    Scans every ``subgraph_*`` directory under *root_dir* for files named
    ``*_X_of_Y_vs_ref.txt`` and groups them by their ``X_of_Y`` label.

    Args:
        root_dir: Sample directory containing ``subgraph_*`` folders.

    Returns:
        Dict mapping ``"X_of_Y"`` -> list of ``(file_path, subgraph_name)``.
    """
    alignments = {}

    # BUG FIX: the old code built the key as parts[-4]_of_parts[-2] from
    # fname.split('_'), but the trailing "_vs_ref.txt" shifts those indices
    # so every file produced the key "of_of_vs" and all alignments landed
    # in a single group.  Match the X_of_Y label explicitly instead.
    group_pattern = re.compile(r'(\d+)_of_(\d+)_vs_ref\.txt$')

    # Walk through subgraph directories
    for subgraph in os.listdir(root_dir):
        if not subgraph.startswith('subgraph_'):
            continue

        subgraph_dir = os.path.join(root_dir, subgraph)
        if not os.path.isdir(subgraph_dir):
            continue

        # Find all alignment files in this subgraph
        for fname in os.listdir(subgraph_dir):
            if not fname.endswith('_vs_ref.txt'):
                continue

            # Extract the X_of_Y pattern (e.g., "1_of_1")
            match = group_pattern.search(fname)
            if match is None:
                continue
            x_of_y = f"{match.group(1)}_of_{match.group(2)}"

            full_path = os.path.join(subgraph_dir, fname)
            alignments.setdefault(x_of_y, []).append((full_path, subgraph))

    return alignments

def plot_alignment_group(group_name, files, genome_length=29903, output_dir="."):
    """Render one group of alignments (e.g., every 1_of_1 file) to a PDF.

    Each alignment occupies one horizontal track on a gray full-genome
    backdrop; matched bases are drawn green and mismatched bases red, with
    the percent identity annotated at the right edge of each track.
    """
    if not files:
        return

    track_count = len(files)
    fig, ax = plt.subplots(figsize=(15, 2 + track_count * 0.5))

    # Light gray backdrop spanning the whole genome.
    ax.add_patch(Rectangle((0, 0), genome_length, track_count + 1,
                           color='lightgray', alpha=0.3))

    # One track per alignment file, bottom to top.
    for row, (file_path, subgraph) in enumerate(sorted(files), 1):
        positions, matches, identity = parse_alignment_file(file_path)
        if not positions:
            continue

        # One 1-bp-wide rectangle per aligned base, colored by match state.
        segments = []
        for (start, end), match_str in zip(positions, matches):
            for base_pos, symbol in zip(range(start, end + 1), match_str):
                rgb = (0, 0.8, 0) if symbol == '|' else (0.8, 0, 0)  # Green or red
                segments.append(Rectangle((base_pos, row - 0.4), 1, 0.8, color=rgb))

        ax.add_collection(PatchCollection(segments, match_original=True))

        # Track label on the left, percent identity on the right.
        label = os.path.basename(file_path).replace('_vs_ref.txt', '')
        ax.text(-1500, row, f"{subgraph}/{label}", ha='right', va='center', fontsize=8)
        ax.text(genome_length + 1500, row, f"{identity:.1f}%", ha='left', va='center', fontsize=8)

    # Dotted gridlines every 5 kb with position labels.
    for x in range(0, genome_length + 1, 5000):
        ax.axvline(x, color='gray', linestyle=':', alpha=0.5)
        if x > 0:
            ax.text(x, 0.2, f"{x//1000}kb", ha='center', fontsize=8)

    ax.set_xlim(-2000, genome_length + 2000)
    ax.set_ylim(0, track_count + 1)
    ax.set_yticks([])
    ax.set_xlabel("Genomic Position (bp)")
    ax.set_title(f"Alignment Group: {group_name.replace('_', ' ')}")

    plt.tight_layout()
    output_path = os.path.join(output_dir, f"{group_name}_alignment.pdf")
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved: {output_path}")

def main():
    """CLI entry point: locate alignment files under a sample directory
    and write one plot per X_of_Y group into the output directory."""
    import sys

    # Guard clause: require at least the input directory argument.
    if len(sys.argv) < 2:
        print("Usage: python alignment_vis.py <input_dir> [output_dir]")
        print("Example: python alignment_vis.py path/to/E1250_S84_L001/")
        return

    input_dir = sys.argv[1].rstrip('/')
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "alignment_plots"

    if not os.path.exists(input_dir):
        print(f"Error: Directory not found - {input_dir}")
        return

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Find and group all alignment files
    alignments = find_alignments(input_dir)
    if not alignments:
        print(f"No valid alignment files found in subgraph directories under {input_dir}")
        print("Please verify that:")
        print("1. The directory contains subgraph_* folders")
        print("2. Those subgraphs contain files matching *_X_of_Y_vs_ref.txt")
        return

    # Render every group that was found.
    for group_name, files in alignments.items():
        plot_alignment_group(group_name, files, output_dir=output_dir)

    print(f"\nAll plots saved to: {os.path.abspath(output_dir)}")

if __name__ == "__main__":
    main()
7 changes: 0 additions & 7 deletions libs/compress/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,6 @@ def merge_nodes(forward_edges, reverse_edges, edge_seqs, kmer_length):
# Find initial merge candidates
candidates = find_merge_candidates(forward_edges, reverse_edges)

print(f"Initial candidates: {len(candidates)}")


for node in candidates:

# Get the source and target nodes
Expand Down Expand Up @@ -233,7 +230,6 @@ def main():
forward_edges, reverse_edges, edge_seqs, kmer_length = read_graph(input_file)

# merge the nodes
print("Merging nodes...")
merge_nodes(forward_edges, reverse_edges, edge_seqs, kmer_length)


Expand All @@ -242,10 +238,7 @@ def main():


#
print("Writing merged graph...")
write_merged_graph(output_file, forward_edges)

print(f"Merged graph written to {output_file}")

if __name__ == "__main__":
main()
31 changes: 16 additions & 15 deletions libs/decompose/kleast_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,19 @@ def create_k_least_graph(graph, paths):

return k_least_graph

def save_paths_to_file(paths, output_path, num_paths, runtime, mip_gap, objective_value, multigraph_decomposer=None):
def save_paths_to_file(paths, output_path, num_paths, runtime, objective_value, multigraph_decomposer=None):
"""Save path information to a text file in the specified format."""
# Calculate total flow through all paths
total_flow = sum(paths['weights'])

# sum of all weights on all edges of original graph
total_weight_graph = sum(data['flow'] for u, v, data in graph.edges(data=True))

with open(output_path, 'w') as f:
f.write(f"Decomposition into {num_paths} paths\n")
f.write(f"Runtime: {runtime:.2f} seconds\n")
f.write(f"MIP Gap: {mip_gap:.6f}\n")
f.write(f"Objective Value: {objective_value:.6f}\n")
f.write(f"Total Flow: {total_weight_graph}\n")
f.write(f"Objective Value: {objective_value}\n")
f.write(f"Number of Paths: {num_paths}\n")
f.write("Paths and Weights:\n")

Expand All @@ -120,7 +123,7 @@ def save_paths_to_file(paths, output_path, num_paths, runtime, mip_gap, objectiv

f.write(f"{path_weight:.6f} {path_str}\n")

print(f"INFO: Path details saved to {output_path}")



def draw_labeled_multigraph(G, attr_name, ax=None, decimal_places=2, paths=None):
Expand Down Expand Up @@ -268,7 +271,7 @@ def visualize_and_save_graph(graph, output_path, num_paths, base_size=10, paths

visualization_file = f"{output_path}_visualization.pdf"
plt.savefig(visualization_file, dpi=300, bbox_inches='tight')
print(f"INFO: Visualization saved to {visualization_file}")



def get_all_edges_for_node(graph, node):
Expand All @@ -288,7 +291,7 @@ def get_all_edges_for_node(graph, node):
return edges


def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visualize=False):
def generate_output_files(base_output_path, graph, time_limit, threads, max_paths, min_paths=1, visualize=False):
"""Generate output files for all path counts from max_paths down to min_paths."""
# Extract the base filename without extension
base_name = os.path.splitext(base_output_path)[0]
Expand All @@ -304,16 +307,18 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua
edges_to_ignore = get_all_edges_for_node(graph, "0") + get_all_edges_for_node(graph, "1")

# Perform k-least errors analysis for current number of paths
k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore)
k_least = fp.kLeastAbsErrors(G=graph, k=num_paths, flow_attr='flow', elements_to_ignore=edges_to_ignore, time_limit = time_limit, threads = threads)
k_least.solve()
paths = k_least.get_solution(remove_empty_paths=True)





# Get solver statistics
runtime = time.time() - start_time
mip_gap = k_least.model.MIPGap if hasattr(k_least, 'model') else 1.0
objective_value = k_least.model.ObjVal if hasattr(k_least, 'model') else 0.0

#mip_gap = k_least.model.MIPGap #if hasattr(k_least, 'model') else 1.0
objective_value = k_least.get_objective_value()

if visualize:
# Visualize the graph
Expand All @@ -330,7 +335,6 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua
output_path,
num_paths,
runtime,
mip_gap,
objective_value,
multigraph_decomposer=decomposer
)
Expand All @@ -344,8 +348,5 @@ def generate_output_files(base_output_path, graph, max_paths, min_paths=1, visua
# Read the input graph
graph = read_graph_to_networkx(args.input, min_edge_weight=args.mincount)


# Generate output files for all path counts from max_paths down to 1
generate_output_files(args.output, graph, args.maxpaths, args.minpaths, visualize=args.visualize)

print("INFO: Processing completed.")
generate_output_files(args.output, graph, args.timelimit, args.threads, args.maxpaths, args.minpaths, visualize=args.visualize)
7 changes: 7 additions & 0 deletions libs/output_scraper/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[package]
name = "output_scraper"
version = "0.1.0"
edition = "2021"

[dependencies]
csv = "1.1"
Loading
Loading