diff --git a/single_sample_GC_tagging.py b/single_sample_GC_tagging.py index d2ca736..af6b927 100644 --- a/single_sample_GC_tagging.py +++ b/single_sample_GC_tagging.py @@ -1,6 +1,6 @@ import pysam import numpy as np -import os, sys, subprocess, time, shutil +import os, sys, subprocess, shutil from multiprocessing import Pool import pandas as pd @@ -15,7 +15,7 @@ reference_genome_path = sys.argv[8] bin_location_file = sys.argv[9] temp_folder = sys.argv[10] -if os.path.isdir(temp_folder)==False: +if not os.path.isdir(temp_folder): os.mkdir(temp_folder) bin_locations = pd.read_csv(bin_location_file).values.tolist() @@ -47,7 +47,7 @@ def tag_bam(sub_bin_locations): for read in bam_in.fetch(contig=contig, start=start, end=end): if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0): if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name): - length = read.template_length + length = abs(read.template_length) if length>=start_len and length<=end_len: ref_start = read.reference_start + lag ref_end = read.reference_start + length - lag @@ -60,7 +60,7 @@ def tag_bam(sub_bin_locations): if valid_percent>=0.9: GC_content = float( GC_cnt/total_cnt ) GC_content = round(round(GC_content, 2) * 100) - correction_factor = correction_factors[length-51, GC_content] + correction_factor = correction_factors[length-start_len, GC_content] read.set_tag("GC", correction_factor) bam_out.write(read) return output_bam diff --git a/single_sample_corrected_cov.py b/single_sample_corrected_cov.py index 714a516..4d40a66 100644 --- a/single_sample_corrected_cov.py +++ b/single_sample_corrected_cov.py @@ -46,7 +46,7 @@ def get_cov(sub_bin_locations): for read in bam_in.fetch(contig=contig, start=start, end=end): if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0): if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name): - length = read.template_length + length = read.template_length # NOTE: Only one read is used if length>=start_len and length<=end_len: ref_start = read.reference_start + lag ref_end = read.reference_start + length - lag @@ -59,7 +59,7 @@ def get_cov(sub_bin_locations): if valid_percent>=0.9: GC_content = float( GC_cnt/total_cnt ) GC_content = round(round(GC_content, 2) * 100) - correction_factor = correction_factors[length-51, GC_content] + correction_factor = correction_factors[length-start_len, GC_content] cov_array[bin_index] += correction_factor return cov_array diff --git a/single_sample_correction_factor.py b/single_sample_correction_factor.py index 0ab473b..274ce17 100644 --- a/single_sample_correction_factor.py +++ b/single_sample_correction_factor.py @@ -49,6 +49,8 @@ def construct_GC_bias(sample_lengths, ref_lengths, X): GC_array = GC_array[lower_cnt: upper_cnt+1] GC_array = remove_outlier(GC_array) + if GC_array.sum() == 0: + return np.ones_like(GC_array)[lag:-lag] GC_array = GC_array / np.mean(GC_array) lowess = sm.nonparametric.lowess(GC_array, X, frac=0.1)