Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions single_sample_GC_tagging.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pysam
import numpy as np
import os, sys, subprocess, time, shutil
import os, sys, subprocess, shutil
from multiprocessing import Pool
import pandas as pd

Expand All @@ -15,7 +15,7 @@
reference_genome_path = sys.argv[8]
bin_location_file = sys.argv[9]
temp_folder = sys.argv[10]
if os.path.isdir(temp_folder)==False:
if not os.path.isdir(temp_folder):
os.mkdir(temp_folder)

bin_locations = pd.read_csv(bin_location_file).values.tolist()
Expand Down Expand Up @@ -47,7 +47,7 @@ def tag_bam(sub_bin_locations):
for read in bam_in.fetch(contig=contig, start=start, end=end):
if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0):
if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name):
length = read.template_length
length = abs(read.template_length)
if length>=start_len and length<=end_len:
ref_start = read.reference_start + lag
ref_end = read.reference_start + length - lag
Expand All @@ -60,7 +60,7 @@ def tag_bam(sub_bin_locations):
if valid_percent>=0.9:
GC_content = float( GC_cnt/total_cnt )
GC_content = round(round(GC_content, 2) * 100)
correction_factor = correction_factors[length-51, GC_content]
correction_factor = correction_factors[length-start_len, GC_content]
read.set_tag("GC", correction_factor)
bam_out.write(read)
return output_bam
Expand Down
4 changes: 2 additions & 2 deletions single_sample_corrected_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def get_cov(sub_bin_locations):
for read in bam_in.fetch(contig=contig, start=start, end=end):
if (read.flag & 1024 == 0) and (read.flag & 2048 == 0) and (read.flag & 4 == 0) and (read.flag & 8 == 0):
if read.mapping_quality>=mapq and (read.reference_name == read.next_reference_name):
length = read.template_length
length = read.template_length # NOTE: Only one read is used
if length>=start_len and length<=end_len:
ref_start = read.reference_start + lag
ref_end = read.reference_start + length - lag
Expand All @@ -59,7 +59,7 @@ def get_cov(sub_bin_locations):
if valid_percent>=0.9:
GC_content = float( GC_cnt/total_cnt )
GC_content = round(round(GC_content, 2) * 100)
correction_factor = correction_factors[length-51, GC_content]
correction_factor = correction_factors[length-start_len, GC_content]
cov_array[bin_index] += correction_factor
return cov_array

Expand Down
2 changes: 2 additions & 0 deletions single_sample_correction_factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def construct_GC_bias(sample_lengths, ref_lengths, X):

GC_array = GC_array[lower_cnt: upper_cnt+1]
GC_array = remove_outlier(GC_array)
if GC_array.sum() == 0:
return np.ones_like(GC_array)[lag:-lag]
GC_array = GC_array / np.mean(GC_array)

lowess = sm.nonparametric.lowess(GC_array, X, frac=0.1)
Expand Down