Rafeed-bot · LudvigOlsen · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/single_sample_GC_tagging.py b/single_sample_GC_tagging.py
@@ -1,6 +1,6 @@
 import pysam
 import numpy as np
-import os, sys, subprocess, time, shutil
+import os, sys, subprocess, shutil
 from multiprocessing import Pool
 import pandas as pd
 
@@ -15,7 +15,7 @@
 reference_genome_path = sys.argv[8]
 bin_location_file = sys.argv[9]
 temp_folder = sys.argv[10]
-if os.path.isdir(temp_folder)==False:
+if not os.path.isdir(temp_folder):
     os.mkdir(temp_folder)
 
 bin_locations = pd.read_csv(bin_location_file).values.tolist()
@@ -47,7 +47,7 @@ def tag_bam(sub_bin_locations):
                 for read in bam_in.fetch(contig=contig, start=start, end=end): 
                     if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0):
                         if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name):
-                            length = read.template_length
+                            length = abs(read.template_length)
                             if length>=start_len and length<=end_len:
                                 ref_start = read.reference_start + lag
                                 ref_end = read.reference_start + length - lag
@@ -60,7 +60,7 @@ def tag_bam(sub_bin_locations):
                                     if valid_percent>=0.9:
                                         GC_content = float( GC_cnt/total_cnt )
                                         GC_content = round(round(GC_content, 2) * 100)
-                                        correction_factor = correction_factors[length-51, GC_content]
+                                        correction_factor = correction_factors[length-start_len, GC_content]
                                         read.set_tag("GC", correction_factor)
                                         bam_out.write(read)
     return output_bam

diff --git a/single_sample_corrected_cov.py b/single_sample_corrected_cov.py
@@ -46,7 +46,7 @@ def get_cov(sub_bin_locations):
             for read in bam_in.fetch(contig=contig, start=start, end=end): 
                 if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0):
                     if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name):
-                        length = read.template_length
+                        length = read.template_length # NOTE: Only one read is used
                         if length>=start_len and length<=end_len:
                             ref_start = read.reference_start + lag
                             ref_end = read.reference_start + length - lag
@@ -59,7 +59,7 @@ def get_cov(sub_bin_locations):
                                 if valid_percent>=0.9:
                                     GC_content = float( GC_cnt/total_cnt )
                                     GC_content = round(round(GC_content, 2) * 100)
-                                    correction_factor = correction_factors[length-51, GC_content]
+                                    correction_factor = correction_factors[length-start_len, GC_content]
                                     cov_array[bin_index] += correction_factor
     return cov_array
 

diff --git a/single_sample_correction_factor.py b/single_sample_correction_factor.py
@@ -49,6 +49,8 @@ def construct_GC_bias(sample_lengths, ref_lengths, X):
 
     GC_array = GC_array[lower_cnt: upper_cnt+1]
     GC_array = remove_outlier(GC_array)
+    if GC_array.sum() == 0:
+        return np.ones_like(GC_array)[lag:-lag]
     GC_array = GC_array / np.mean(GC_array)
 
     lowess = sm.nonparametric.lowess(GC_array, X, frac=0.1)