From f98e3c7abb7b9ece11c8cb0ca664e6f4006b3605 Mon Sep 17 00:00:00 2001
From: Ludvig <ludvig@clin.au.dk>
Date: Thu, 11 Dec 2025 14:51:01 +0100
Subject: [PATCH 1/4] Attempts fixing zero-sum issue after outlier removal

---
 single_sample_correction_factor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/single_sample_correction_factor.py b/single_sample_correction_factor.py
index 0ab473b..274ce17 100644
--- a/single_sample_correction_factor.py
+++ b/single_sample_correction_factor.py
@@ -49,6 +49,8 @@ def construct_GC_bias(sample_lengths, ref_lengths, X):
     
     GC_array = GC_array[lower_cnt: upper_cnt+1]
     GC_array = remove_outlier(GC_array)
+    if GC_array.sum() == 0:
+        return np.ones_like(GC_array)[lag:-lag]
     GC_array = GC_array / np.mean(GC_array)
     
     lowess = sm.nonparametric.lowess(GC_array, X, frac=0.1)

From 56d9f216e2a66022be422066d2427dd331f5ec7b Mon Sep 17 00:00:00 2001
From: Ludvig <ludvig@clin.au.dk>
Date: Thu, 11 Dec 2025 16:07:23 +0100
Subject: [PATCH 2/4] Makes tlen absolute so both mates are kept

---
 single_sample_GC_tagging.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/single_sample_GC_tagging.py b/single_sample_GC_tagging.py
index d2ca736..bab57c5 100644
--- a/single_sample_GC_tagging.py
+++ b/single_sample_GC_tagging.py
@@ -15,7 +15,7 @@
 reference_genome_path = sys.argv[8]
 bin_location_file = sys.argv[9]
 temp_folder = sys.argv[10]
-if os.path.isdir(temp_folder)==False:
+if not os.path.isdir(temp_folder):
     os.mkdir(temp_folder)
 
 bin_locations = pd.read_csv(bin_location_file).values.tolist()
@@ -47,7 +47,7 @@ def tag_bam(sub_bin_locations):
                 for read in bam_in.fetch(contig=contig, start=start, end=end): 
                     if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0):
                         if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name):
-                            length = read.template_length
+                            length = abs(read.template_length)
                             if length>=start_len and length<=end_len:
                                 ref_start = read.reference_start + lag
                                 ref_end = read.reference_start + length - lag

From e689ff925a41e767e700b88ac3ffc814998072e3 Mon Sep 17 00:00:00 2001
From: Ludvig <ludvig@clin.au.dk>
Date: Thu, 11 Dec 2025 16:17:53 +0100
Subject: [PATCH 3/4] Fixes hardcoded min length

---
 single_sample_GC_tagging.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/single_sample_GC_tagging.py b/single_sample_GC_tagging.py
index bab57c5..af6b927 100644
--- a/single_sample_GC_tagging.py
+++ b/single_sample_GC_tagging.py
@@ -1,6 +1,6 @@
 import pysam
 import numpy as np
-import os, sys, subprocess, time, shutil
+import os, sys, subprocess, shutil
 from multiprocessing import Pool
 import pandas as pd
 
@@ -60,7 +60,7 @@ def tag_bam(sub_bin_locations):
                                     if valid_percent>=0.9:
                                         GC_content = float( GC_cnt/total_cnt )
                                         GC_content = round(round(GC_content, 2) * 100)
-                                        correction_factor = correction_factors[length-51, GC_content]
+                                        correction_factor = correction_factors[length-start_len, GC_content]
                                         read.set_tag("GC", correction_factor)
                                         bam_out.write(read)
     return output_bam

From 6a2e4403c81d73c9163a779d6510bdc3c5c6427b Mon Sep 17 00:00:00 2001
From: Ludvig <ludvig@clin.au.dk>
Date: Thu, 11 Dec 2025 17:02:14 +0100
Subject: [PATCH 4/4] Uses start_len as offset

---
 single_sample_corrected_cov.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/single_sample_corrected_cov.py b/single_sample_corrected_cov.py
index 714a516..4d40a66 100644
--- a/single_sample_corrected_cov.py
+++ b/single_sample_corrected_cov.py
@@ -46,7 +46,7 @@ def get_cov(sub_bin_locations):
             for read in bam_in.fetch(contig=contig, start=start, end=end): 
                 if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0):
                     if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name):
-                        length = read.template_length
+                        length = read.template_length # NOTE: Only one read is used
                         if length>=start_len and length<=end_len:
                             ref_start = read.reference_start + lag
                             ref_end = read.reference_start + length - lag
@@ -59,7 +59,7 @@ def get_cov(sub_bin_locations):
                                 if valid_percent>=0.9:
                                     GC_content = float( GC_cnt/total_cnt )
                                     GC_content = round(round(GC_content, 2) * 100)
-                                    correction_factor = correction_factors[length-51, GC_content]
+                                    correction_factor = correction_factors[length-start_len, GC_content]
                                     cov_array[bin_index] += correction_factor
     return cov_array