From f98e3c7abb7b9ece11c8cb0ca664e6f4006b3605 Mon Sep 17 00:00:00 2001 From: Ludvig Date: Thu, 11 Dec 2025 14:51:01 +0100 Subject: [PATCH 1/4] Attempts fixing zero-sum issue after outlier removal --- single_sample_correction_factor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/single_sample_correction_factor.py b/single_sample_correction_factor.py index 0ab473b..274ce17 100644 --- a/single_sample_correction_factor.py +++ b/single_sample_correction_factor.py @@ -49,6 +49,8 @@ def construct_GC_bias(sample_lengths, ref_lengths, X): GC_array = GC_array[lower_cnt: upper_cnt+1] GC_array = remove_outlier(GC_array) + if GC_array.sum() == 0: + return np.ones_like(GC_array)[lag:-lag] GC_array = GC_array / np.mean(GC_array) lowess = sm.nonparametric.lowess(GC_array, X, frac=0.1) From 56d9f216e2a66022be422066d2427dd331f5ec7b Mon Sep 17 00:00:00 2001 From: Ludvig Date: Thu, 11 Dec 2025 16:07:23 +0100 Subject: [PATCH 2/4] Makes tlen absolute so both mates are kept --- single_sample_GC_tagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/single_sample_GC_tagging.py b/single_sample_GC_tagging.py index d2ca736..bab57c5 100644 --- a/single_sample_GC_tagging.py +++ b/single_sample_GC_tagging.py @@ -15,7 +15,7 @@ reference_genome_path = sys.argv[8] bin_location_file = sys.argv[9] temp_folder = sys.argv[10] -if os.path.isdir(temp_folder)==False: +if not os.path.isdir(temp_folder): os.mkdir(temp_folder) bin_locations = pd.read_csv(bin_location_file).values.tolist() @@ -47,7 +47,7 @@ def tag_bam(sub_bin_locations): for read in bam_in.fetch(contig=contig, start=start, end=end): if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0): if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name): - length = read.template_length + length = abs(read.template_length) if length>=start_len and length<=end_len: ref_start = read.reference_start + lag ref_end = read.reference_start + length - lag From e689ff925a41e767e700b88ac3ffc814998072e3 Mon Sep 17 00:00:00 2001 From: Ludvig Date: Thu, 11 Dec 2025 16:17:53 +0100 Subject: [PATCH 3/4] Fixes hardcoded min length --- single_sample_GC_tagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/single_sample_GC_tagging.py b/single_sample_GC_tagging.py index bab57c5..af6b927 100644 --- a/single_sample_GC_tagging.py +++ b/single_sample_GC_tagging.py @@ -1,6 +1,6 @@ import pysam import numpy as np -import os, sys, subprocess, time, shutil +import os, sys, subprocess, shutil from multiprocessing import Pool import pandas as pd @@ -60,7 +60,7 @@ def tag_bam(sub_bin_locations): if valid_percent>=0.9: GC_content = float( GC_cnt/total_cnt ) GC_content = round(round(GC_content, 2) * 100) - correction_factor = correction_factors[length-51, GC_content] + correction_factor = correction_factors[length-start_len, GC_content] read.set_tag("GC", correction_factor) bam_out.write(read) return output_bam From 6a2e4403c81d73c9163a779d6510bdc3c5c6427b Mon Sep 17 00:00:00 2001 From: Ludvig Date: Thu, 11 Dec 2025 17:02:14 +0100 Subject: [PATCH 4/4] Uses start_len as offset --- single_sample_corrected_cov.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/single_sample_corrected_cov.py b/single_sample_corrected_cov.py index 714a516..4d40a66 100644 --- a/single_sample_corrected_cov.py +++ b/single_sample_corrected_cov.py @@ -46,7 +46,7 @@ def get_cov(sub_bin_locations): for read in bam_in.fetch(contig=contig, start=start, end=end): if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0): if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name): - length = read.template_length + length = read.template_length # NOTE: Only one read is used if length>=start_len and length<=end_len: ref_start = read.reference_start + lag ref_end = read.reference_start + length - lag @@ -59,7 +59,7 @@ def get_cov(sub_bin_locations): if valid_percent>=0.9: GC_content = float( GC_cnt/total_cnt ) GC_content = round(round(GC_content, 2) * 100) - correction_factor = correction_factors[length-51, GC_content] + correction_factor = correction_factors[length-start_len, GC_content] cov_array[bin_index] += correction_factor return cov_array