-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathEncodedPairs_FileGenerator_FinalVerison.py
More file actions
302 lines (226 loc) · 10.8 KB
/
EncodedPairs_FileGenerator_FinalVerison.py
File metadata and controls
302 lines (226 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 5 16:03:53 2016
@author: yiz613
"""
import re
import sqlite3
import time
import random
import math
import pickle
import sys
# Entry point of the script: records the start time, sets the input file names
# and the split ratio, then waits for the user to confirm before processing.
# NOTE(review): indentation was lost in this copy of the file, so where the
# body of main() ends cannot be determined from here - confirm against the
# original before re-indenting.
def main():
# Wall-clock start; the elapsed time is printed at the very end of the run.
start_time = time.time()
global noFound # module-wide counter of drug names that were looked up but not found in drugNameTable
noFound =0
# Input files, expected in the current working directory (see the prompt below).
uris_file = "drugtrain_05pl1_uris.txt"
rdf_file = 'input.nt.fixed6.nt'
encoded_output = "encodedrugs.txt"
ratio = 0.1 # passed to CombineAndShuffle: fraction of the positive pairs and of the negative pairs that goes into the validation set, and again into the test set
'''the ratio is also depend on how many positive and negative pairs in total in orignal RDF file'''
# Blocks until the user answers; an empty answer (just Enter) continues.
startEnter = input("""Please put following files in the same working directory:
'drugtrain_05pl1_uris.txt'
'input.nt.fixed6.nt'
'encodedrugs.txt'
Hit Enter to start (default)>>>""")
if len(startEnter)<1:
print ('\nStart to parsing files...', flush = True)
else:
# Any non-empty answer aborts the run with a non-zero exit status.
sys.exit("Error message")
def LoadDrugName2SQL(uris):
    """Extract drug names from URI lines and load them into drugNameTable.

    For every item of *uris* the text after the first ``#`` (up to the end
    of the line) is taken as the drug name and lower-cased.  Items without
    such a fragment are reported on stdout and skipped.  The collected
    names are passed, in input order, to ``sqlInput``.

    Args:
        uris: iterable of strings, e.g. an open text file with one URI per
            line.
    """
    print('')
    # Compiled once: the same pattern is applied to every line.
    fragment = re.compile('#(.+)')
    DrugList = []
    for i in uris:
        found = fragment.search(i)
        if found is None:
            # NOTE(review): a skipped URI shifts the ids of all later drugs,
            # while add_drugs maps id N to line N-1 of the encoded file -
            # confirm the inputs never contain such lines.
            print('Something is wrong with this uri:\n', i)
            continue
        DrugList.append(found.group(1).lower())
    sqlInput(DrugList)
def sqlInput(DrugList):
    """Insert every name of *DrugList*, lower-cased, into drugNameTable.

    Rows are inserted in list order, so the auto-generated id of a drug
    follows its position in the list.  The transaction is committed once
    all names have been inserted.
    """
    rows = ((name.lower(),) for name in DrugList)
    cur.executemany('''INSERT INTO drugNameTable (drug_Name) VALUES ( ? )''', rows)
    conn.commit()
# modified from Kavitha's extractDataFromFile.py code
def RDF2SQL(rdf_file):
    """Parse RDF lines and store drug-drug relations in MatchTable.

    A line containing ``NoInteraction`` is stored with relation 0; otherwise
    a line containing ``drugDrugInteraction`` is stored with relation 1; all
    other lines are ignored.  Lines that cannot be parsed by ``RDFre`` and
    lines naming a drug that is not in drugNameTable are skipped silently
    (unknown drugs are still counted by ``findDrugFromSql`` in ``noFound``).
    Database errors other than those are no longer swallowed and propagate
    to the caller.

    Args:
        rdf_file: iterable of strings, e.g. an open N-Triples text file.
    """
    insert_sql = '''INSERT INTO MatchTable (relation, subjectID, objectID) VALUES ( ?, ?, ? )'''
    for line1 in rdf_file:
        # 'NoInteraction' is tested first, exactly as before, so a line that
        # contains both markers is stored as a non-interaction.
        if 'NoInteraction' in line1:
            relation = 0
        elif 'drugDrugInteraction' in line1:
            relation = 1
        else:
            continue
        try:
            drug_A, drug_B = RDFre(line1)
        except IndexError:
            # Fewer than three <...> terms, or a URI without a '/name' tail.
            continue
        if drug_A is None or drug_B is None:
            # Unknown drug: MatchTable columns are NOT NULL, so this row
            # could never be stored anyway.
            continue
        cur.execute(insert_sql, (relation, drug_A, drug_B))
    conn.commit()
def RDFre(line2):
    """Return the drugNameTable ids of the subject and object of one triple.

    The first and third ``<...>`` terms of *line2* are taken as subject and
    object; the text after the last ``/`` of each is lower-cased and looked
    up with ``findDrugFromSql``.  An id is ``None`` when the name is unknown.
    Raises IndexError when the line has fewer than three ``<...>`` terms or
    a term has no ``/name`` tail.
    """
    terms = re.findall(r"[<]([^>]*)[>]", line2)
    tails = [re.findall('.+/(.+?)$', term) for term in (terms[0], terms[2])]
    ids = []
    # Subject is resolved before the object is even indexed, so a bad object
    # term still lets the subject lookup run first.
    for tail in tails:
        ids.append(findDrugFromSql(tail[0].lower()))
    return ids[0], ids[1]
def findDrugFromSql(drugsName):
    """Look up the id of *drugsName* in drugNameTable.

    Returns the id, or ``None`` when no row matches; every miss also
    increments the global ``noFound`` counter.
    """
    global noFound
    cur.execute("SELECT id FROM drugNameTable WHERE drug_Name= ?", (str(drugsName),))
    row = cur.fetchone()
    if row is None:
        noFound += 1
        return None
    return row[0]
def CombineAndShuffle(ratio = 0.1):
    """Split the MatchTable rows into shuffled train/validation/test lists.

    Positive (relation = 1) and negative (relation = 0) rows are shuffled
    and split separately: the last ``ratio`` share of each class goes to
    the test list, the share before it to the validation list, and the rest
    to the training list.  Each combined list is shuffled again.

    Returns:
        (train_data, valid_data, test_data, ratio), where each list holds
        the row tuples fetched from MatchTable.
    """
    def _three_way(rows):
        # Cut points are rounded up, so the training part gets the remainder.
        first_cut = int(math.ceil(len(rows) * (1-2*ratio)))
        second_cut = int(math.ceil(len(rows) * (1-ratio)))
        return rows[:first_cut], rows[first_cut:second_cut], rows[second_cut:]

    cur.execute('''SELECT * FROM MatchTable WHERE relation = 1''')
    positives = cur.fetchall()
    cur.execute('''SELECT * FROM MatchTable WHERE relation = 0''')
    negatives = cur.fetchall()

    # Positives are shuffled before negatives to keep the random sequence.
    random.shuffle(positives)
    random.shuffle(negatives)

    pos_train, pos_valid, pos_test = _three_way(positives)
    neg_train, neg_valid, neg_test = _three_way(negatives)

    train_data = pos_train + neg_train
    valid_data = pos_valid + neg_valid
    test_data = pos_test + neg_test

    print("\nShuffling the data sets...", flush= True )
    for combined in (train_data, valid_data, test_data):
        random.shuffle(combined)
    return train_data, valid_data, test_data, ratio
def add_drugs(encoded_file_readlines, tuple_for_drugs_interaction):
    """Build one flat training record for a drug pair.

    Args:
        encoded_file_readlines: list of text lines; line ``i`` holds the
            whitespace-separated numeric encoding of the drug whose SQL id
            is ``i + 1`` (SQL ids start at 1, list indices at 0).
        tuple_for_drugs_interaction: ``(interaction, drugA, drugB)`` where
            the last two items are SQL drug ids.

    Returns:
        ``[int(interaction)] + floats of drugA + floats of drugB`` on
        success.  On any lookup or parse failure a message is printed and
        the integer sentinel ``1`` is returned, which callers test for.
    """
    interaction, drugA, drugB = tuple_for_drugs_interaction
    try:
        # An id below 1 would turn into a negative index and silently pick
        # a line from the END of the file, so reject it explicitly.
        if drugA < 1 or drugB < 1:
            raise IndexError('drug ids start at 1')
        B = encoded_file_readlines[drugB-1].strip('\n')
        A = encoded_file_readlines[drugA-1].strip('\n')
        interaction_list = [int(interaction)]
        interaction_list.extend(map(float, A.split()))
        interaction_list.extend(map(float, B.split()))
        return interaction_list
    except (IndexError, TypeError, ValueError):
        # IndexError: id outside the file; TypeError: id is None or not an
        # int; ValueError: a field that is not a number.
        print (drugA, drugB, "doesn't work. Some problem here")
        return 1
def createFile(tupleList, encoded_file, savedFileName, reduce_size=0):
    """Append one pickled record per usable drug pair to *savedFileName*.

    Each tuple of *tupleList* is turned into a record by ``add_drugs`` and
    dumped as its own pickle object, so the file is a stream of records to
    be read back with repeated ``pickle.load`` calls.  Pairs for which
    ``add_drugs`` returns its failure sentinel ``1`` are skipped.

    Args:
        tupleList: iterable of ``(interaction, drugA, drugB)`` tuples.
        encoded_file: list of encoded drug lines, passed to ``add_drugs``.
        savedFileName: output path; opened in append mode, so records from
            an earlier run are kept and the new ones are added after them.
        reduce_size: when greater than 0, writing stops once the number of
            written records exceeds it.
    """
    cAppended = 0
    print('\nGenerating [ %s ] file right now...\n' %savedFileName)
    # 'with' guarantees the handle is closed even if a dump fails midway.
    with open(savedFileName, 'ab') as f:
        for theTuple in tupleList:
            output = add_drugs(encoded_file, theTuple)
            if output == 1:  # failure sentinel from add_drugs
                continue
            pickle.dump(output, f, pickle.HIGHEST_PROTOCOL)
            cAppended += 1
            if (cAppended % 10000) == 0:
                print ('loaded', cAppended,'drug drug relations',flush = True)
            # NOTE(review): this stops after reduce_size + 1 records, not
            # reduce_size - kept as in the original; confirm the intent.
            if (reduce_size > 0 and reduce_size<cAppended):
                break
####################################################################################################
####################################################################################################
######################### Beginning of the codes ###############################################
####################################################################################################
####################################################################################################
# Pipeline body: builds the SQLite tables, loads drug names and relations,
# splits the relations and writes the three pickle files.
# NOTE(review): uses uris_file, rdf_file, encoded_output and ratio assigned at
# the top of main(), so these statements presumably belong to main()'s body -
# indentation is missing in this copy; confirm.
conn = sqlite3.connect(r'drugNameFile.sqlite')
cur = conn.cursor()
print ('Generating SQL file...')
# Both tables are dropped and recreated, so every run starts from empty tables.
cur.executescript('''
DROP TABLE IF EXISTS drugNameTable;
DROP TABLE IF EXISTS MatchTable;
CREATE TABLE drugNameTable ( id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
drug_Name TEXT UNIQUE NOT NULL);
CREATE TABLE MatchTable ( relation INTEGER NOT NULL,
subjectID INTEGER NOT NULL,
objectID INTEGER NOT NULL)''')
# Step one: load the drug names from the URI file into drugNameTable.
StepOne = open(uris_file, 'r')
LoadDrugName2SQL(StepOne)
# Highest id in drugNameTable, printed in the final report as the number of loaded names.
cur.execute('''SELECT MAX(id) FROM drugNameTable''')
loaded, = cur.fetchall()[0]
#print ('\n-------- Performace Results ------------\n\nTOTAL [ %d ] unique drug names loaded into SQL Table - drugNameTable' % (loaded))
conn.commit()
StepOne.close()
###################Step One done
# Step two: load the drug-drug relations from the RDF file into MatchTable.
StepTwo = open(rdf_file,'r')
RDF2SQL(StepTwo)
conn.commit()
StepTwo.close()
# Total number of relation rows stored, printed in the final report.
cur.execute('''SELECT COUNT(*) FROM MatchTable''')
pair, = cur.fetchall()[0]
#print ('\n[ %d ] pairs of drug-drug relations loaded from RDF to SQL Table - MatchTable: ' % (pair))
# Shuffle and split the relations into train / validation / test lists.
train, val, test, ratio = CombineAndShuffle(ratio)
################### Step Two done
# Step three: read all encoded drug lines into memory and write the pickle files.
StepThree = open(encoded_output)
encoded_file = StepThree.readlines()
# mac, unix, linux, and windows all can generate the file within a folder
# The last argument is createFile's reduce_size, which limits how many records are written.
createFile( val, encoded_file, 'encoded_valid_pickle',10000)
createFile (test, encoded_file, 'encoded_test_pickle',10000)
createFile (train, encoded_file, 'encoded_train_pickle',80000)
StepThree.close()
#####################Step Three done
# Script entry guard followed by the end-of-run report.
# NOTE(review): the report reads loaded, pair, cur, train, val, test, ratio,
# conn and start_time, which are assigned in main() and the pipeline statements
# above; indentation is missing in this copy, so confirm that these statements
# run in the scope where those names exist.
if __name__ == "__main__":
main()
print ('\n-------- Performace Results ------------\n\nTOTAL [ %d ] unique drug names loaded into SQL Table - drugNameTable' % (loaded))
print ('\n[ %d ] pairs of drug-drug relations loaded from RDF to SQL Table - MatchTable: ' % (pair))
# Row count per relation value (1 = interacting, anything else is reported as non-acting).
cur.execute('''SELECT relation, count(relation) FROM MatchTable GROUP by relation''' )
statistics = cur.fetchall()
for item in statistics:
rel , times = item
if rel ==1:
rel = 'Interacting Pairs'
else:
rel = 'Non-acting Pairs'
print ('%s pairs : [ %d ]' % (rel, times))
# noFound is the global counter incremented by findDrugFromSql on every failed lookup.
print ( 'Numbers of drugs are not found in drugNameTable : [', noFound, ']')
# The sizes printed here are the lengths of the split lists, not the number of
# records written: createFile skips failed pairs and may stop early at reduce_size.
print ('\nGenerated Three Files:\nTraining file: [ %d ] pairs\nValid file: [ %d ] pairs\nTest file:[ %d ] pairs\n\nThe ratio for all three files\n[positive pairs out of TOTAL poistive groups]/[negative pairs out of TOTAL negative groups] is\n [ 1 : %.1f ] (or %.2f)' %(len(train), len(val), len(test), 1/ratio, ratio))
conn.close()
# Elapsed wall-clock time since start_time was taken at the top of main().
print ("\n------ %s seconds ------" % ( time.time() - start_time) )
# The string below is never executed; it sketches how to read the generated
# files back: call pickle.load repeatedly on one open file until EOFError.
"""
def load(filename):
with open(filename, "rb") as f:
while True:
try:
yield pickle.load(f)
except EOFError:
break
items = load(myfilename)
"""