From 29248e8b444e42e55f419aad53eca8c3b8045773 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 9 Nov 2025 17:13:04 +0100 Subject: [PATCH] use tidy tree to generate trees with different origins --- config/configfile.yaml | 12 + scripts/tidy_tree.py | 479 ++++++++++++++++++++++++++++ workflow/snakemake_rules/clades.smk | 11 + workflow/snakemake_rules/core.smk | 39 ++- workflow/snakemake_rules/export.smk | 1 + 5 files changed, 538 insertions(+), 4 deletions(-) create mode 100755 scripts/tidy_tree.py diff --git a/config/configfile.yaml b/config/configfile.yaml index 6d04584..c8bb684 100644 --- a/config/configfile.yaml +++ b/config/configfile.yaml @@ -29,15 +29,27 @@ filter: resolutions: all-time: min_date: 1975-01-01 + root_lineage: + a: "A" + b: "B" 6y: min_date: 6Y background_min_date: 12Y + root_lineage: + a: "A.D" + b: "B.D.4.1" 3y: min_date: 3Y background_min_date: 12Y + root_lineage: + a: "A.D" + b: "B.D.4.1.1" 1y: min_date: 1Y background_min_date: 12Y + root_lineage: + a: "A.D" + b: "B.D.4.1.1" subsample_max_sequences: genome: 3000 diff --git a/scripts/tidy_tree.py b/scripts/tidy_tree.py new file mode 100755 index 0000000..a0ec1a3 --- /dev/null +++ b/scripts/tidy_tree.py @@ -0,0 +1,479 @@ +#!/usr/bin/env python3 +""" +Tidy Tree: Build phylogenetic trees guided by hierarchical lineages. + +This program constructs a phylogenetic tree by: +1. Building subtrees for each lineage (with its sequences + child lineage founders) +2. Stitching subtrees together following the lineage guide tree +""" + +import argparse +import io +import subprocess +import tempfile +from pathlib import Path +from typing import Dict, List, Optional +import pandas as pd +from Bio import SeqIO, Phylo +from Bio.SeqRecord import SeqRecord + + +class LineageNode: + """Represents a node in the lineage guide tree.""" + def __init__(self, name: str): + self.tree = None + self.name = name + self.parent = None + self.children = [] + self.sequences = [] # Sequence IDs belonging to this lineage + self.founder_seq = None # Founder sequence for this lineage + self.subtree = None # Phylogenetic subtree for this lineage + + +def parse_arguments(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description='Build phylogenetic tree guided by lineage hierarchy' + ) + parser.add_argument( + '--alignment', '-s', + required=True, + help='Aligned input sequences in FASTA format' + ) + parser.add_argument( + '--founder-sequences', '-f', + required=False, + help='Aligned lineage founder sequences in FASTA format (if not provided in alignment)' + ) + parser.add_argument( + '--guide-tree', '-g', + required=True, + help='Lineage guide tree in Newick format' + ) + parser.add_argument( + '--lineage-assignments', '-a', + required=True, + help='Table assigning sequences to lineages (TSV format)' + ) + parser.add_argument( + '--seq-id-column', + default='seq_id', + help='Column name for sequence IDs in assignments file (default: seq_id)' + ) + parser.add_argument( + '--lineage-column', + default='lineage', + help='Column name for lineage in assignments file (default: lineage)' + ) + parser.add_argument( + '--output-tree', '-o', + required=True, + help='Output tree in Newick format' + ) + parser.add_argument( + '--iqtree-path', + default='iqtree', + help='Path to IQ-TREE executable (default: iqtree)' + ) + parser.add_argument( + '--model', + default='GTR+G', + help='Substitution model for IQ-TREE (default: GTR+G)' + ) + parser.add_argument( + '--threads', + type=int, + default=1, + help='Number of threads for IQ-TREE (default: 1)' + ) + parser.add_argument( + '--iqtree-args', + default='--ninit 2 -n 2 --epsilon 0.05', + help='Additional arguments to pass to IQ-TREE' + ) + parser.add_argument( + '--keep-founders', + action='store_true', + help='Keep founder sequences as leaves in the final tree' + ) + parser.add_argument('--root-lineage', default=None, help='Root lineage for the tree') + return parser.parse_args() + + +def load_sequences(fasta_path: str) -> Dict[str, SeqRecord]: + """Load sequences from FASTA file into a dictionary.""" + sequences = {} + for record in SeqIO.parse(fasta_path, 'fasta'): + sequences[record.id] = record + return sequences + + +def load_assignments( + assignment_path: str, + seq_id_column: str = 'seq_id', + lineage_column: str = 'lineage' +) -> Dict[str, str]: + """Load sequence-to-lineage assignments from TSV file using pandas.""" + # Read TSV file, handling comments + df = pd.read_csv(assignment_path, sep='\t', comment='#') + + # Check if required columns exist + if seq_id_column not in df.columns: + raise ValueError(f"Column '{seq_id_column}' not found in assignments file. " + f"Available columns: {', '.join(df.columns)}") + if lineage_column not in df.columns: + raise ValueError(f"Column '{lineage_column}' not found in assignments file. " + f"Available columns: {', '.join(df.columns)}") + + # Convert to dictionary + assignments = dict(zip(df[seq_id_column], df[lineage_column])) + + return assignments + + +def parse_guide_tree(newick_path: str, root_lineage: str = None) -> LineageNode: + """Parse the lineage guide tree and build LineageNode structure.""" + tree = Phylo.read(newick_path, 'newick') + if root_lineage: + matches = [x for x in tree.find_clades(name=root_lineage)] + if matches: + tree.root = matches[0] + print(f"Re-rooted lineage tree to '{root_lineage}'") + else: + print(f"Warning: Specified root lineage '{root_lineage}' not found. Using original root.") + + # Convert Bio.Phylo tree to LineageNode structure + def build_lineage_tree(phylo_node, parent=None, save_phylo_node=False): + node = LineageNode(phylo_node.name or f"internal_{id(phylo_node)}") + node.parent = parent + if save_phylo_node: + node.tree = phylo_node + for child in phylo_node.clades: + child_node = build_lineage_tree(child, node) + node.children.append(child_node) + + return node + + return build_lineage_tree(tree.root, save_phylo_node=True) + + +def assign_sequences_to_lineages( + root: LineageNode, + assignments: Dict[str, str], + sequences: Dict[str, SeqRecord] +): + """Assign sequences to their respective lineage nodes.""" + # Create a map of lineage name to node + lineage_map = {} + + def map_lineages(node): + lineage_map[node.name] = node + for child in node.children: + map_lineages(child) + + map_lineages(root) + + # Assign sequences + for seq_id, lineage in assignments.items(): + if lineage in lineage_map and seq_id in sequences: + lineage_map[lineage].sequences.append(seq_id) + + +def assign_founders_to_lineages( + root: LineageNode, + founders: Dict[str, SeqRecord] +): + """Assign founder sequences to their respective lineage nodes.""" + def assign(node): + if node.name in founders: + node.founder_seq = founders[node.name] + for child in node.children: + assign(child) + + assign(root) + + +def run_iqtree( + sequences: List[SeqRecord], + iqtree_path: str, + model: str, + threads: int, + root: str, + extra_args: str = '' +) -> Optional[Phylo.BaseTree.Tree]: + """ + Run IQ-TREE on a set of aligned sequences. + + Returns the resulting phylogenetic tree. + """ + if len(sequences) < 3: + return None + + # Create temporary directory for IQ-TREE files + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir_path = Path(tmpdir) + input_file = tmpdir_path / 'input.fasta' + + # Write sequences to file + SeqIO.write(sequences, input_file, 'fasta') + + # Build IQ-TREE command + cmd = [ + iqtree_path, + '-s', str(input_file), + '-m', model, + '-nt', str(threads), + '-quiet', # Reduce output + '-redo', # Overwrite existing results + ] + + # Add extra arguments if provided + if extra_args: + cmd.extend(extra_args.split()) + + # Run IQ-TREE + try: + result = subprocess.run( + cmd, + cwd=tmpdir, + capture_output=True, + text=True, + check=True + ) + except subprocess.CalledProcessError as e: + print(f"IQ-TREE error: {e.stderr}") + return None + + # Read the resulting tree + tree_file = input_file.with_suffix('.fasta.treefile') + if not tree_file.exists(): + print(f"IQ-TREE did not produce expected tree file: {tree_file}") + return None + + tree = Phylo.read(str(tree_file), 'newick') + tree.root_with_outgroup(root) + return tree + + +def build_subtree( + node: LineageNode, + all_sequences: Dict[str, SeqRecord], + iqtree_path: str, + model: str, + threads: int, + extra_args: str +) -> Optional[Phylo.BaseTree.Tree]: + """ + Build a phylogenetic subtree for a lineage using IQ-TREE. + + Includes: + - Sequences assigned to this lineage + - Founder sequence of this lineage + - Founder sequences of child lineages + """ + # Collect sequences for this subtree + subtree_seqs = [] + + # Add sequences belonging to this lineage + for seq_id in node.sequences: + if seq_id in all_sequences: + subtree_seqs.append(all_sequences[seq_id]) + + # Add founder of this lineage + if node.founder_seq: + subtree_seqs.append(node.founder_seq) + + # Add founders of child lineages + for child in node.children: + if child.founder_seq: + subtree_seqs.append(child.founder_seq) + + # Need at least 3 sequences to build a tree + print(f"Lineage {node.name}: total sequences for subtree: {len(subtree_seqs)}") + print([seq.id for seq in subtree_seqs]) + if len(subtree_seqs) < 3: + if node.name.startswith('internal_'): + tree = Phylo.read(io.StringIO(f"({','.join(seq.id for seq in subtree_seqs)});"), 'newick') + elif len(subtree_seqs) == 2: + tree = Phylo.read(io.StringIO(f"({','.join(seq.id for seq in subtree_seqs)}){node.name};"), 'newick') + elif len(subtree_seqs) == 1: + tree = Phylo.read(io.StringIO(f"{subtree_seqs[0].id};"), 'newick') + else: + print(f"Building subtree for lineage {node.name} with {len(subtree_seqs)} sequences") + + # Build tree using IQ-TREE + tree = run_iqtree(subtree_seqs, iqtree_path, model, threads, root=node.name, extra_args=extra_args) + + return tree + + +def find_clade_by_name(tree, name: str): + """Find a clade/node in the tree by its name.""" + def search(clade): + if clade.name == name: + return clade + for child in clade.clades: + result = search(child) + if result: + return result + return None + + return search(tree.root) + + +def graft_subtree( + parent_tree: Phylo.BaseTree.Tree, + child_tree: Phylo.BaseTree.Tree, + connection_point_id: str, + keep_founders: bool = False +): + """ + Graft a child subtree onto a parent tree at the connection point. + + The connection point is the child lineage's founder sequence, + which should appear as a leaf in the parent tree and as the root + or an internal node in the child tree. + """ + # Find the connection point in the parent tree (should be a leaf) + parent_node = find_clade_by_name(parent_tree, connection_point_id) + if not parent_node: + print(f"Warning: Could not find {connection_point_id} in parent tree") + return + + assert parent_node.is_terminal(), "Connection point in parent tree must be a leaf" + + # Find the connection point in the child tree + child_node = find_clade_by_name(child_tree, connection_point_id) + + if not child_node: + print(f"Warning: Could not find {connection_point_id} in child tree") + return + + # Replace the leaf in parent tree with the subtree from child tree + # If child_node is a leaf in child_tree, nothing to graft + if child_tree.root == child_node and len(child_tree.root.clades) == 0: + # The founder is a leaf in both trees, no grafting needed + return + elif child_tree.root == child_node: + if keep_founders: + clades_to_graft = [c for c in child_tree.root.clades] + child_node.clades = [] # Clear clades to avoid duplication + clades_to_graft.append(child_node) # Retain the founder as a leaf + else: + clades_to_graft = child_tree.root.clades + elif child_tree.root != child_node: + clades_to_graft = [c for c in child_tree.root.clades if (c.name != connection_point_id) or keep_founders] + + # Replace parent_node's children with child_node's children + parent_node.clades = list(clades_to_graft) + if parent_node.name: + parent_node.name = parent_node.name + "_internal" + + +def stitch_subtrees( + root: LineageNode, + all_sequences: Dict[str, SeqRecord], + iqtree_path: str, + model: str, + threads: int, + extra_args: str, + keep_founders: bool = False +) -> Optional[Phylo.BaseTree.Tree]: + """ + Stitch subtrees together following the guide tree structure. + + Process (post-order traversal): + 1. Build subtrees for children first (bottom-up) + 2. Build subtree for current node + 3. Graft child subtrees onto current subtree at founder positions + """ + def process_node(node): + # First process all children (post-order) + for child in node.children: + process_node(child) + + # Build subtree for this node + node.subtree = build_subtree( + node, all_sequences, iqtree_path, model, threads, extra_args + ) + + # If this node has a subtree and children with subtrees, + # graft child subtrees onto this subtree + if node.subtree and node.children: + for child in node.children: + if child.subtree and child.founder_seq: + print(f"Grafting {child.name} subtree onto {node.name} at {child.founder_seq.id}") + graft_subtree(node.subtree, child.subtree, child.founder_seq.id, keep_founders) + else: + print(f"Skipping grafting for {child.name} onto {node.name} (missing subtree or founder)") + + process_node(root) + root.subtree.ladderize() + return root.subtree + + +def main(): + """Main execution function.""" + args = parse_arguments() + + # Load input data + print("Loading sequences...") + sequences = load_sequences(args.alignment) + print(f" Loaded {len(sequences)} sequences") + + print("Parsing lineage guide tree...") + lineage_root = parse_guide_tree(args.guide_tree, args.root_lineage) + + if args.founder_sequences: + print("Loading founder sequences...") + founders = load_sequences(args.founder_sequences) + print(f" Loaded {len(founders)} founder sequences") + else: + founders = {} + lineages_in_guide_tree = [clade.name for clade in lineage_root.tree.find_clades() if clade.name] + for seq_id, seq in sequences.items(): + if seq_id in lineages_in_guide_tree: + founders[seq_id] = seq + print(f" gathered {len(founders)} founder sequences from alignment") + + print("Loading sequence-to-lineage assignments...") + assignments = load_assignments( + args.lineage_assignments, + args.seq_id_column, + args.lineage_column + ) + print(f" Loaded {len(assignments)} assignments") + + # Prepare lineage structure + print("\nAssigning sequences to lineages...") + assign_sequences_to_lineages(lineage_root, assignments, sequences) + assign_founders_to_lineages(lineage_root, founders) + + # Build and stitch trees + print("\nBuilding subtrees using IQ-TREE...") + all_sequences = {**sequences, **founders} + + final_tree = stitch_subtrees( + lineage_root, + all_sequences, + args.iqtree_path, + args.model, + args.threads, + args.iqtree_args, + args.keep_founders + ) + + if final_tree is None: + print("Error: Failed to build final tree") + return 1 + + # Write output + print(f"\nWriting final tree to {args.output_tree}...") + Phylo.write(final_tree, args.output_tree, 'newick') + + print("Done!") + return 0 + + +if __name__ == '__main__': + exit(main()) diff --git a/workflow/snakemake_rules/clades.smk b/workflow/snakemake_rules/clades.smk index 51935df..8d8e85d 100644 --- a/workflow/snakemake_rules/clades.smk +++ b/workflow/snakemake_rules/clades.smk @@ -70,3 +70,14 @@ rule download_clades: """ curl {params.url} --output {output.clades} """ + + +rule download_clade_tree: + output: + clades = "results/clades_consortium_{a_or_b}.nwk" + params: + url = lambda w: f"https://raw.githubusercontent.com/rsv-lineages/lineage-designation-{w.a_or_b.upper()}/main/.auto-generated/lineages.nwk" + shell: + """ + curl {params.url} --output {output.clades} + """ diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index c8e8fd6..fdae17e 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -202,6 +202,9 @@ rule get_nextclade_dataset: nextclade3 dataset get -n {params.ds_name} --output-zip {output.dataset} """ +def get_founders(w): + ref = config["nextclade_attributes"][w.a_or_b]["accession"] + return f"nextclade/config/rsv-{w.a_or_b}/{ref}/founder_sequences.fasta" rule genome_align: message: @@ -210,6 +213,7 @@ rule genome_align: """ input: sequences=rules.combine_samples.output.sequences, + founder_sequences= get_founders, dataset=rules.get_nextclade_dataset.output.dataset, output: alignment=build_dir + "/{a_or_b}/{build_name}/{resolution}/sequences.aligned.fasta", @@ -222,7 +226,7 @@ rule genome_align: shell: """ nextclade3 run -j {threads}\ - {input.sequences} \ + {input.sequences} {input.founder_sequences} \ -D {input.dataset} \ --output-fasta {output.alignment} \ --cds-selection {params.genes} \ @@ -299,24 +303,50 @@ def get_alignment(w): ) +# rule tree: +# message: +# "Building tree" +# input: +# alignment=get_alignment, +# output: +# tree=build_dir + "/{a_or_b}/{build_name}/{resolution}/tree_raw.nwk", +# threads: 4 +# shell: +# """ +# augur tree \ +# --alignment {input.alignment} \ +# --output {output.tree} \ +# --tree-builder-args '-ninit 10 -n 4 -czb' \ +# --nthreads {threads} +# """ + rule tree: message: "Building tree" input: alignment=get_alignment, + guide_tree = "results/clades_consortium_{a_or_b}.nwk", + metadata="data/{a_or_b}/metadata.tsv", output: tree=build_dir + "/{a_or_b}/{build_name}/{resolution}/tree_raw.nwk", + params: + root_lineage=lambda w: config["filter"]["resolutions"][w.resolution]["root_lineage"][w.a_or_b], threads: 4 shell: """ - augur tree \ + python3 scripts/tidy_tree.py \ --alignment {input.alignment} \ --output {output.tree} \ - --tree-builder-args '-ninit 10 -n 4 -czb' \ - --nthreads {threads} + --lineage-assignments {input.metadata} \ + --seq-id-column accession \ + --lineage-column clade \ + --root-lineage {params.root_lineage} \ + --guide-tree {input.guide_tree} \ + --threads {threads} """ + rule refine: message: """ @@ -350,6 +380,7 @@ rule refine: --date-inference {params.date_inference} \ --timetree \ --stochastic-resolve \ + --keep-root \ --use-fft \ --clock-filter-iqd {params.clock_filter_iqd} """ diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index 7dfb3aa..ab1d873 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -54,6 +54,7 @@ rule export: params: title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny", strain_id=config["strain_id_field"], + color_by_metadata= lambda w: "--color-by-metadata clade " if w.build_name!="genome" else "" shell: """ augur export v2 \