Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
293 changes: 293 additions & 0 deletions modular/human_cox1_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
Human COX1 Similarity Filter
Removes sequences with high similarity to human mitochondrial COX1
"""

import os
import sys
import csv
import argparse
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import List, Tuple, Dict
import gc
from datetime import datetime

# Human COX1 reference sequence
HUMAN_COX1 = """ATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGGAACACTATACCTATTATTCGGCG
CATGAGCTGGAGTCCTAGGCACAGCTCTAAGCCTCCTTATTCGAGCCGAGCTGGGCCAGCCAGGCAACCT
TCTAGGTAACGACCACATCTACAACGTTATCGTCACAGCCCATGCATTTGTAATAATCTTCTTCATAGTA
ATACCCATCATAATCGGAGGCTTTGGCAACTGACTAGTTCCCCTAATAATCGGTGCCCCCGATATGGCGT
TTCCCCGCATAAACAACATAAGCTTCTGACTCTTACCTCCCTCTCTCCTACTCCTGCTCGCATCTGCTAT
AGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAGCAGGGAACTACTCCCACCCTGGA
GCCTCCGTAGACCTAACCATCTTCTCCTTACACCTAGCAGGTGTCTCCTCTATCTTAGGGGCCATCAATT
TCATCACAACAATTATCAATATAAAACCCCCTGCCATAACCCAATACCAAACGCCCCTCTTCGTCTGATC
CGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTCCCAGTCCTAGCTGCTGGCATCACTATACTACTA
ACAGACCGCAACCTCAACACCACCTTCTTCGACCCCGCCGGAGGAGGAGACCCCATTCTATACCAACACC
TATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCAGGCTTCGGAATAATCTCCCATAT
TGTAACTTACTACTCCGGAAAAAAAGAACCATTTGGATACATAGGTATGGTCTGAGCTATGATATCAATT
GGCTTCCTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGGAATAGACGTAGACACACGAGCAT
ATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAAGTATTTAGCTGACTCGCCACACT
CCACGGAAGCAATATGAAATGATCTGCTGCAGTGCTCTGAGCCCTAGGATTCATCTTTCTTTTCACCGTA
GGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGTACTACACGACACGTACTACGTTG
TAGCCCACTTCCACTATGTCCTATCAATAGGAGCTGTATTTGCCATCATAGGAGGCTTCATTCACTGATT
TCCCCTATTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCCATTTCACTATCATATTCATCGGC
GTAAATCTAACTTTCTTCCCACAACACTTTCTCGGCCTATCCGGAATGCCCCGACGTTACTCGGACTACC
CCGATGCATACACCACATGAAACATCCTATCATCTGTAGGCTCATTCATTTCTCTAACAGCAGTAATATT
AATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCCTAATAGTAGAAGAACCCTCCATA
AACCTGGAGTGACTATATGGATGCCCCCCACCCTACCACACATTCGAAGAACCCGTATACATAAAATCTA
GA""".replace("\n", "")

HUMAN_COX1_ARRAY = np.array(list(HUMAN_COX1.upper()))

def log_message(message: str, log_file=None, stdout=False):
"""Log message to file and optionally stdout"""
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
formatted_msg = f"[{timestamp}] {message}"

if log_file:
log_file.write(formatted_msg + '\n')
log_file.flush()

if stdout:
print(formatted_msg, flush=True)

def vectorized_sequence_similarity(seq_no_gaps: str, reference_array: np.ndarray, min_overlap: int = 20) -> float:
"""Calculate sequence similarity using vectorized operations"""
if not seq_no_gaps or len(seq_no_gaps) == 0:
return 0.0

seq_array = np.array(list(seq_no_gaps.upper()))
ref_len = min(len(seq_array), len(reference_array))

if ref_len < min_overlap:
return 0.0

seq_part = seq_array[:ref_len]
ref_part = reference_array[:ref_len]

matches = np.sum(seq_part == ref_part)
similarity = matches / ref_len

return similarity

def process_single_file(file_path: str, human_threshold: float, output_dir: str) -> Dict:
"""Process a single FASTA file for human COX1 filtering"""
try:
base_name = os.path.splitext(os.path.basename(file_path))[0]
if '_align' in base_name:
base_name = base_name.replace('_align', '')

# Check if file is empty
if os.path.getsize(file_path) == 0:
return {
'file_path': file_path,
'base_name': base_name,
'status': 'skipped',
'reason': 'empty_file',
'input_count': 0,
'kept_count': 0,
'removed_count': 0,
'removed_sequences': []
}

# Read sequences
try:
records = list(SeqIO.parse(file_path, "fasta"))
except Exception as e:
return {
'file_path': file_path,
'base_name': base_name,
'status': 'error',
'reason': f'parse_error: {str(e)}',
'input_count': 0,
'kept_count': 0,
'removed_count': 0,
'removed_sequences': []
}

if not records:
return {
'file_path': file_path,
'base_name': base_name,
'status': 'skipped',
'reason': 'no_sequences',
'input_count': 0,
'kept_count': 0,
'removed_count': 0,
'removed_sequences': []
}

input_count = len(records)
kept_records = []
removed_sequences = []

# Filter sequences
for record in records:
sequence_no_gaps = str(record.seq).replace('-', '').upper()

if len(sequence_no_gaps) == 0:
# Empty sequence - remove
removed_sequences.append({
'sequence_id': record.id,
'removal_reason': 'empty_sequence',
'human_similarity': 0.0
})
continue

# Calculate human similarity
similarity = vectorized_sequence_similarity(sequence_no_gaps, HUMAN_COX1_ARRAY)

if similarity >= human_threshold:
removed_sequences.append({
'sequence_id': record.id,
'removal_reason': 'human_similar',
'human_similarity': similarity
})
else:
kept_records.append(record)

# Write filtered sequences
if kept_records:
output_file = os.path.join(output_dir, f"{base_name}_human_filtered.fasta")
with open(output_file, 'w') as handle:
SeqIO.write(kept_records, handle, "fasta")

return {
'file_path': file_path,
'base_name': base_name,
'status': 'success',
'reason': 'processed',
'input_count': input_count,
'kept_count': len(kept_records),
'removed_count': len(removed_sequences),
'removed_sequences': removed_sequences,
'output_file': os.path.join(output_dir, f"{base_name}_human_filtered.fasta") if kept_records else None
}

except Exception as e:
return {
'file_path': file_path,
'base_name': base_name,
'status': 'error',
'reason': f'processing_error: {str(e)}',
'input_count': 0,
'kept_count': 0,
'removed_count': 0,
'removed_sequences': []
}

def main():
parser = argparse.ArgumentParser(description='Filter sequences based on human COX1 similarity')
parser.add_argument('--input-log', required=True, help='File containing list of FASTA alignment files')
parser.add_argument('--output-dir', required=True, help='Output directory for filtered files')
parser.add_argument('--filtered-files-list', required=True, help='Output file listing successfully filtered files')
parser.add_argument('--metrics-csv', required=True, help='Output CSV file with filtering metrics')
parser.add_argument('--human-threshold', type=float, default=0.95, help='Human COX1 similarity threshold')
parser.add_argument('--threads', type=int, default=1, help='Number of threads for parallel processing')

args = parser.parse_args()

# Create output directory
os.makedirs(args.output_dir, exist_ok=True)

# Read input files
with open(args.input_log, 'r') as f:
input_files = [line.strip() for line in f if line.strip()]

print(f"Processing {len(input_files)} files with human COX1 similarity threshold {args.human_threshold}")

# Process files in parallel
results = []
with ProcessPoolExecutor(max_workers=args.threads) as executor:
future_to_file = {
executor.submit(process_single_file, file_path, args.human_threshold, args.output_dir): file_path
for file_path in input_files
}

for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
result = future.result()
results.append(result)

if result['status'] == 'success':
print(f"✓ {result['base_name']}: {result['kept_count']}/{result['input_count']} sequences kept")
elif result['status'] == 'skipped':
print(f"⚠ {result['base_name']}: skipped ({result['reason']})")
else:
print(f"✗ {result['base_name']}: error ({result['reason']})")

except Exception as e:
print(f"✗ {file_path}: processing failed - {str(e)}")
results.append({
'file_path': file_path,
'base_name': os.path.basename(file_path),
'status': 'error',
'reason': f'executor_error: {str(e)}',
'input_count': 0,
'kept_count': 0,
'removed_count': 0,
'removed_sequences': []
})

# Write filtered files list
successful_files = []
with open(args.filtered_files_list, 'w') as f:
for result in results:
if result['status'] == 'success' and result.get('output_file') and os.path.exists(result['output_file']):
f.write(result['output_file'] + '\n')
successful_files.append(result['output_file'])

# Write detailed metrics CSV
with open(args.metrics_csv, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['file_path', 'base_name', 'sequence_id', 'removal_reason', 'human_similarity', 'step_name', 'input_count', 'kept_count', 'removed_count'])

for result in results:
# Write file-level summary
writer.writerow([
result['file_path'],
result['base_name'],
'FILE_SUMMARY',
result['reason'],
'',
'human_cox1_filter',
result['input_count'],
result['kept_count'],
result['removed_count']
])

# Write individual removed sequences
for seq_info in result['removed_sequences']:
writer.writerow([
result['file_path'],
result['base_name'],
seq_info['sequence_id'],
seq_info['removal_reason'],
f"{seq_info['human_similarity']:.4f}",
'human_cox1_filter',
'',
'',
''
])

# Summary statistics
total_input = sum(r['input_count'] for r in results)
total_kept = sum(r['kept_count'] for r in results)
total_removed = sum(r['removed_count'] for r in results)
successful_count = len([r for r in results if r['status'] == 'success'])

print(f"\nHuman COX1 Filtering Summary:")
print(f"Files processed successfully: {successful_count}/{len(input_files)}")
print(f"Total input sequences: {total_input}")
print(f"Sequences kept: {total_kept}")
print(f"Sequences removed (human similar): {total_removed}")
print(f"Filtered files written: {len(successful_files)}")

if __name__ == "__main__":
main()