-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgetCommonMotifReadsFromCollapsedFastq.py
More file actions
129 lines (103 loc) · 4.26 KB
/
getCommonMotifReadsFromCollapsedFastq.py
File metadata and controls
129 lines (103 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
## PURPOSE: get reads for certain motifs across certain tumors
## INPUT: manifest data all-tumor-manifest.csv
## collapsed fastq files sample.converted.unpaired.fastq.collapsed
## OUTPUT: table containing reads for specific motif across samples motif.tumor.common.reads.fastq.collapsed.summary.tsv
import os
import os.path
import numpy as np
import pandas as pd
import collections
import subprocess
from pathlib import Path
import time
base_path = '/media/user/2TB (MAC)/Susanna/'
collapsed_ext = '.converted.unpaired.fastq.collapsed'
manifest_file = base_path + 'all-tumor-manifest.csv'
manifest_data =pd.read_csv(manifest_file, header='infer', sep=',')
'''
file = base_path + 'TARGET/TARGET-manifest.csv'
data = pd.read_csv(file, header='infer', sep=',')
data['DISEASE.ABBV'] = 'TARGET'
manifest_data = pd.concat([manifest_data, data])
print(manifest_data.shape)
manifest_data.to_csv(manifest_file, sep=',', index=False)
'''
def getCollapsedFastqDataframe(file):
df = pd.read_table(file, header=None, delim_whitespace=True)
df = df.dropna(axis=1, how='all')
sample = file.split('/')
sample = sample[len(sample)-1]
sample = sample.split('.')[0]
df.columns = ['READS', 'SEQUENCE']
return df
def getManifestID(name, tumor):
id = manifest_data.loc[(manifest_data['DISEASE.ABBV']==tumor) & (manifest_data['NAME']==name)]['ID']
id = id.tolist()[0]
id = str(id)
return str(id)
motifs = ['TGGTTATCTAGCT', 'TTATCAGACTGAT']
mirnas = ['hsa-miR-9-5p-1-2-3', 'hsa-miR-21-5p']
i = 1
motif = motifs[i]
mirna = mirnas[i]
for subdir, dirs, files in os.walk(base_path):
if '/collapsed_fastq' in subdir:
folders = subdir.split('/')
tumor = folders[len(folders)-2]
if tumor in ['THCA', 'STAD', 'SKCM', 'PCPG']:
continue
print(tumor)
summary_file = base_path + 'motif_reads/' + mirna + '/' + motif + '.' + tumor + '.common.reads.fastq.collapsed.summary.tsv'
if Path(summary_file).exists():
summary_data = pd.read_table(summary_file, header='infer', sep='\t')
else:
print('SUMMARY FILE NOT FOUND')
summary_data = pd.DataFrame({'SEQUENCE':[]})
matched_ids = list(summary_data)
common_seqs = list(summary_data['SEQUENCE'])
total_time_start = time.time()
for f in os.listdir(subdir):
time_start = time.time()
if f[0] == '.':
break
patient = f.split('.')[0]
id = getManifestID(patient, tumor)
if id not in matched_ids:
matched_ids.append(id)
if Path(summary_file).exists():
summary_data = pd.read_table(summary_file, header='infer', sep='\t')
else:
print('SUMMARY FILE NOT FOUND')
summary_data = pd.DataFrame({'SEQUENCE':[]})
matched_ids = list(summary_data)
common_seqs = list(summary_data['SEQUENCE'])
#matched_seq = list(summary_data['SEQUENCE'])
summary_data = None
collapsed_file = subdir+'/'+f
collapsed_data = getCollapsedFastqDataframe(collapsed_file)
#print(collapsed_data.shape[0])
if len(common_seqs) > 0:
collapsed_data = collapsed_data[collapsed_data.SEQUENCE.isin(common_seqs)]
num_rows = collapsed_data.shape[0]
#print(collapsed_data.shape[0])
collapsed_data.columns = [str(id), 'SEQUENCE']
match_collapsed_data = collapsed_data #pd.DataFrame(columns = ['READS', 'SEQUENCE'])
match_collapsed_data.columns = [str(id), 'SEQUENCE']
if Path(summary_file).exists():
summary_data = pd.read_table(summary_file, header='infer', sep='\t')
summary_data = pd.merge(summary_data, match_collapsed_data, on='SEQUENCE', sort=False, how='inner')
else:
summary_data = match_collapsed_data
summary_data.to_csv(summary_file, sep='\t', index=False)
summary_data = pd.DataFrame({'SEQUENCE':[]})
time_end = time.time()
#print('TUMOR: ' + tumor + ' SAMPLE: ' + str(patient) + ' TOTAL TIME: ' + str((time_end-time_start)/60) + ' ROWS: ' + str(num_rows))
summary_data = pd.read_table(summary_file, header='infer', sep='\t')
match_summary_data = summary_data.copy()
for index, row in summary_data.iterrows():
sequence = str(row['SEQUENCE'])
if motif not in sequence:
match_summary_data = match_summary_data[match_summary_data.SEQUENCE != sequence]
match_summary_data.to_csv(summary_file, sep='\t', index=False)
total_time_end = time.time()
print('TOTAl TUMOR TIME: ' + str(total_time_end-total_time_start))