diff --git a/README.md b/README.md index 6bbc774..e847415 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,8 @@ options: -t, --threads THREADS number of alignment threads. A sensible default is chosen automatically (default: 10) + --output-bam save mapped reads to BAM for downstream analysis + (default: False) --force overwrite existing output files (default: False) --airplane disable automatic index download (offline mode) diff --git a/src/hostile/aligner.py b/src/hostile/aligner.py index 439d89c..b7a3efc 100644 --- a/src/hostile/aligner.py +++ b/src/hostile/aligner.py @@ -139,6 +139,7 @@ def gen_clean_cmd( aligner_threads: int, compression_threads: int, force: bool, + output_bam: bool, ) -> str: fastq, output = Path(fastq), Path(output) output.mkdir(exist_ok=True, parents=True) @@ -146,6 +147,13 @@ def gen_clean_cmd( fastq_out_path = output / f"{fastq_stem}.clean.fastq.gz" count_before_path = output / f"{fastq_stem}.reads_in.txt" count_after_path = output / f"{fastq_stem}.reads_out.txt" + mapped_bam_path = output / f"{fastq_stem}.mapped.bam" + + bam_cmd = ( + f" | tee >(samtools view -F 2304 -b - > '{mapped_bam_path}')" + if output_bam + else "" + ) if not stdout and not force and fastq_out_path.exists(): raise FileExistsError( @@ -196,6 +204,8 @@ def gen_clean_cmd( cmd = ( # Align, stream reads to stdout in SAM format f"{alignment_cmd}" + # optional - output bam + f"{bam_cmd}" # Count reads in stream before filtering (2048 + 256 = 2304) f" | tee >(samtools view -F 2304 -c - > '{count_before_path}')" # Discard mapped reads (or inverse) @@ -228,6 +238,7 @@ def gen_paired_clean_cmd( aligner_threads: int, compression_threads: int, force: bool, + output_bam: bool, ) -> str: fastq1, fastq2, output = Path(fastq1), Path(fastq2), Path(output) output.mkdir(exist_ok=True, parents=True) @@ -237,6 +248,13 @@ def gen_paired_clean_cmd( fastq2_out_path = output / f"{fastq2_stem}.clean_2.fastq.gz" count_before_path = output / f"{fastq1_stem}.reads_in.txt" count_after_path = output / f"{fastq1_stem}.reads_out.txt" + mapped_bam_path = output / f"{fastq1_stem.removesuffix('_R1_paired')}.mapped.bam" + + bam_cmd = ( + f" | tee >(samtools view -F 2304 -b - > '{mapped_bam_path}')" + if output_bam + else "" + ) if ( not stdout @@ -320,6 +338,8 @@ def gen_paired_clean_cmd( ) cmd = ( f"{alignment_cmd}" + # optional - output bam + f"{bam_cmd}" f" | tee >(samtools view -F 2304 -c - > '{count_before_path}')" f"{filter_cmd}" f" | tee >(samtools view -F 2304 -c - > '{count_after_path}')" diff --git a/src/hostile/cli.py b/src/hostile/cli.py index 6cf3079..a50d6cc 100644 --- a/src/hostile/cli.py +++ b/src/hostile/cli.py @@ -34,6 +34,7 @@ def clean( force: bool = False, airplane: bool = False, debug: bool = False, + output_bam: bool = False, ) -> None: """ Remove reads aligning to an index from fastq[.gz] input files or stdin. @@ -53,6 +54,7 @@ def clean( :arg force: overwrite existing output files :arg airplane: disable automatic index download (offline mode) :arg debug: show debug messages + :arg output_bam: save mapped reads to BAM for downstream analysis """ if debug: @@ -81,6 +83,7 @@ def clean( threads=threads, force=force, airplane=airplane, + output_bam=output_bam, ) else: stats = lib.clean_fastqs( @@ -96,6 +99,7 @@ def clean( threads=threads, force=force, airplane=airplane, + output_bam=output_bam, ) print( json.dumps(stats, indent=4), diff --git a/src/hostile/lib.py b/src/hostile/lib.py index 20052b8..8ecd3ce 100644 --- a/src/hostile/lib.py +++ b/src/hostile/lib.py @@ -174,6 +174,7 @@ def clean_fastqs( threads: int = util.CPU_COUNT, force: bool = False, airplane: bool = False, + output_bam: bool = False, ): stdin = str(fastqs[0]) == "-" stdout = str(output) == "-" @@ -212,6 +213,7 @@ def clean_fastqs( aligner_threads=aligner_threads, compression_threads=compression_threads, force=force, + output_bam=output_bam, ) for fastq in fastqs ] @@ -252,6 +254,7 @@ def clean_paired_fastqs( threads: int = util.CPU_COUNT, force: bool = False, airplane: bool = False, + output_bam: bool = False, ): stdin = str(fastqs[0][0]) == "-" stdout = str(output) == "-" @@ -292,6 +295,7 @@ def clean_paired_fastqs( aligner_threads=aligner_threads, compression_threads=compression_threads, force=force, + output_bam=output_bam, ) for fastq_pair in fastqs ]