forked from SwagarikaGiri/Multi-PredGO
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing_inga.py
More file actions
121 lines (96 loc) · 3.41 KB
/
preprocessing_inga.py
File metadata and controls
121 lines (96 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas as pd
import numpy as np
import csv
query = 'bp'
ont = 'P'
root='data/'
outfile='inga-predictions-'+query+'.csv'
# Get different terms from ontology pickle file
go_term_file=root+query+'.pkl'
terms_df = pd.read_pickle(go_term_file)
terms = terms_df['functions'].values.flatten()
# Get list of accessions passed in testing
test_df = pd.read_pickle(root+'multimodaltest-'+query+'.pkl')
accessions = test_df['accessions'].values.flatten()
inga_pred_file=root+'inga_pred_file.csv'
# Read prediction file from INGA
columns = ['accessions', 'annotations', 'go_ont', 'INGA_score', 'dist_to_ont', 'leaf', 'go_definition']
df = pd.read_csv(inga_pred_file, sep='\t', names=columns)
df = df[['accessions', 'annotations', 'go_ont']]
# Filter dataframe according to its ontology
df = df[df['go_ont'] == ont]
df=df[['accessions','annotations']]
inga_pred_dict=dict()
for i, row in enumerate(df.itertuples()):
key=(row.accessions,row.annotations)
if key in inga_pred_dict:
pass
else:
inga_pred_dict[key]=1
# generate the csv file
# with open(outfile,'w') as outputcsv_file:
# spamwriter = csv.writer(outputcsv_file,delimiter=',')
# col1=['accession_no']
# col1=col1+terms.tolist()
# spamwriter.writerow(col1)
# print len(col1)
# raw_input()
# for accession in accessions:
# col1=[]
# col1.append(accession)
# for goterm in terms:
# if (accession,goterm) in inga_pred_dict:
# col1.append('1')
# else:
# col1.append('0')
# print len(col1)
# spamwriter.writerow(col1)
# ans = pd.DataFrame(columns = ['accessions', 'annotations'])
# with open(outfile,'w') as outputcsv_file:
# spamwriter = csv.writer(outputcsv_file,delimiter=',')
# col1=['accession_no']
# col1=col1+terms.tolist()
# spamwriter.writerow(col1)
# print len(col1)
# raw_input()
# for accession in accessions:
# col1=[]
# col1.append(accession)
# for goterm in terms:
# if (accession,goterm) in inga_pred_dict:
# col1.append('1')
# else:
# col1.append('0')
# print len(col1)
# spamwriter.writerow(col1)
terms_dict = {v: i for i, v in enumerate(terms)}
nb_classes = len(terms)
ans = pd.DataFrame(columns = ['accessions', 'annotations'])
# Get Matrix
res = dict()
# for i, row in enumerate(df.itertuples()):
# if row.annotations in terms:
# if row.accessions in res.keys():
# res[row.accessions][terms_dict[row.annotations]] = 1
# else:
# res[row.accessions] = np.zeros((nb_classes), dtype=np.int32)
# res[row.accessions][terms_dict[row.annotations]] = 1
for accession in accessions:
for term in terms:
if accession in res.keys():
if (accession,term) in inga_pred_dict:
res[accession][terms_dict[term]]=1.00
else:
res[accession][terms_dict[term]]=0.00
else:
res[accession] = np.zeros((nb_classes), dtype=np.int32)
if (accession,term) in inga_pred_dict:
res[accession][terms_dict[term]]=float(1.00)
else:
res[accession][terms_dict[term]]=float(0.00)
for key in res.keys():
ans = ans.append({'accessions': key, 'annotations': res[key]}, ignore_index = True)
print(len(ans))
print ans
raw_input()
ans.to_pickle('pred_'+query+'.pkl')