-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsplit_data.py
More file actions
executable file
·109 lines (90 loc) · 2.93 KB
/
split_data.py
File metadata and controls
executable file
·109 lines (90 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
import sys
import os
from collections import deque
import time
from utils import shuffle, get_gene_ontology
# Directory prefix prepended to every input file name opened by the loaders.
DATA_ROOT = 'data/'
# Directory prefix under which main() writes train.txt and test.txt.
RESULT_ROOT = 'data/swiss/'
# Input file names (relative to DATA_ROOT) read by the protein loaders.
FILES = ('uniprot-swiss-mol-func.txt',)
def load_all_proteins_paac():
    """Map each protein id to the remaining whitespace-separated tokens.

    Every file named in FILES is opened under DATA_ROOT and read line by
    line. The first token of a line is the key and the rest of the tokens
    (kept as a list of strings) are the value. If an id appears more than
    once, the last line read wins.

    NOTE(review): the name suggests the tokens are PAAC feature values;
    nothing in this function checks or converts them -- confirm.

    Returns:
        dict: protein id -> list of the other tokens on that line.

    Raises:
        IndexError: if a line contains no tokens (e.g. a blank line).
    """
    proteins = {}
    for name in FILES:
        with open(DATA_ROOT + name, 'r') as handle:
            for raw in handle:
                tokens = raw.strip().split()
                proteins[tokens[0]] = tokens[1:]
    return proteins
# Residue codes that make is_ok() reject a sequence.
INVALID_ACIDS = set('UOBZJX')
# Sequences shorter than this are rejected by is_ok().
MIN_LEN = 24


def is_ok(seq):
    """Tell whether a sequence passes the length and alphabet filters.

    Args:
        seq: the sequence to check (a string of one-letter residue codes).

    Returns:
        bool: True when seq has at least MIN_LEN characters and none of
        them is in INVALID_ACIDS, otherwise False.
    """
    # isdisjoint walks seq once and stops at the first forbidden residue.
    return len(seq) >= MIN_LEN and INVALID_ACIDS.isdisjoint(seq)
def load_all_proteins():
    """Load the (id, sequence, GO annotation) records that pass is_ok().

    Every file named in FILES is opened under DATA_ROOT. Each line is
    stripped and split on tabs; the first three columns are taken as the
    protein id, the sequence and the annotation string. Records whose
    sequence is rejected by is_ok() are dropped.

    Returns:
        list: (prot_id, seq, gos) tuples of strings, in file order.

    Raises:
        IndexError: if a line has fewer than three tab-separated columns.
    """
    records = []
    for name in FILES:
        with open(DATA_ROOT + name, 'r') as handle:
            for raw in handle:
                columns = raw.strip().split('\t')
                prot_id, seq, gos = columns[0], columns[1], columns[2]
                if not is_ok(seq):
                    continue
                records.append((prot_id, seq, gos))
    return records
def load_train_proteins(root_dir=None):
    """Collect the protein ids listed in every file below a directory tree.

    Each file found by walking the tree is read line by line; a line is
    split on whitespace and its second token is taken as the protein id.
    Blank lines are skipped.

    Args:
        root_dir: directory to walk. Defaults to DATA_ROOT + 'hierarchical/'.

    Returns:
        set: the distinct second-column tokens found across all files.

    Raises:
        IndexError: if a non-blank line has only one token.
    """
    if root_dir is None:
        root_dir = DATA_ROOT + 'hierarchical/'
    prot_set = set()
    for root, _dirs, files in os.walk(root_dir):
        for filename in files:
            # os.walk reports sub-directory roots without a trailing
            # separator, so plain string concatenation would produce a
            # wrong path for any file in a nested directory.
            with open(os.path.join(root, filename), 'r') as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue
                    prot_set.add(fields[1])
    return prot_set
def load_unseen_proteins():
    """Return the set of first-column tokens from DATA_ROOT + 'unseen.txt'.

    Each line is stripped and split on whitespace; only the first token
    (used as the protein id) is kept.

    Returns:
        set: the distinct protein ids found in the file.

    Raises:
        IndexError: if a line contains no tokens (e.g. a blank line).
    """
    with open(DATA_ROOT + 'unseen.txt', 'r') as handle:
        # set() drains the generator before the file is closed.
        return set(raw.strip().split()[0] for raw in handle)
def _write_split(path, prots):
    """Write (prot_id, seq, gos) records to path, one tab-separated line each."""
    with open(path, 'w') as f:
        for prot_id, seq, gos in prots:
            f.write(prot_id + '\t' + seq + '\t' + gos + '\n')


def main(split=0.8):
    """Shuffle all valid proteins and write a train/test split to disk.

    The records returned by load_all_proteins() are shuffled, the first
    int(len * split) of them are written to RESULT_ROOT + 'train.txt' and
    the remainder to RESULT_ROOT + 'test.txt'. Progress and the elapsed
    time are printed to stdout.

    Args:
        split: fraction of the records that goes to the training file.
    """
    start_time = time.time()
    # Single-argument parenthesised print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print('Loading all proteins')
    all_prots = load_all_proteins()
    # NOTE(review): utils.shuffle is assumed to reorder the list in place;
    # its return value is not used here -- confirm against utils.
    shuffle(all_prots)
    train_len = int(len(all_prots) * split)
    _write_split(RESULT_ROOT + 'train.txt', all_prots[:train_len])
    _write_split(RESULT_ROOT + 'test.txt', all_prots[train_len:])
    elapsed = time.time() - start_time
    print('Done in %d seconds' % (elapsed,))


if __name__ == '__main__':
    main()