-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathqsar_example.py
More file actions
160 lines (138 loc) · 6.47 KB
/
qsar_example.py
File metadata and controls
160 lines (138 loc) · 6.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Go through example code on website for how it works with RNNS
# We can refer to this for help if necessary or docs
# Go through code and on website to get full understanding
# https://cheminformania.com/building-a-simple-qsar-model-using-a-feed-forward-neural-network-in-pytorch/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem
from sklearn.feature_selection import VarianceThreshold
from torch.utils.data import TensorDataset
# Load the exported activity dataset; the CSV must be downloaded manually
# (see the tutorial link in the header comments).
file_path = "./assets/SLC6A4_active_excape_export.csv" # need to download from website link in link
data = pd.read_csv(file_path)
# Parse each SMILES string into an RDKit Mol object stored in a new column.
PandasTools.AddMoleculeColumnToFrame(data,'SMILES','Molecule')
# NOTE(review): the next two expressions are notebook-style inspections; as a
# plain script their results are computed and discarded (nothing is printed).
data[["SMILES","Molecule"]].head(1)
data.Molecule.isna().sum()
def mol2fp(mol):
    """Turn an RDKit molecule into a 4096-slot hashed Morgan fingerprint array.

    Uses radius 2 and folds the counts into 4096 slots; the result is a NumPy
    array (ConvertToNumpyArray populates — and presumably resizes — the array
    in place from the fingerprint).
    """
    fingerprint = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=4096)
    arr = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fingerprint, arr)
    return arr
# Sanity check: fingerprint one molecule and visualize which slots are set.
fp = mol2fp(Chem.MolFromSmiles(data.loc[1,"SMILES"]))
plt.matshow(fp.reshape((64,-1)) > 0)  # 4096 slots rendered as a 64x64 grid
# Fingerprint every molecule, then stack into one (n_samples, 4096) matrix.
data["FPs"] = data.Molecule.apply(mol2fp)
X = np.stack(data.FPs.values)
print(X.shape)
# Splitting data
# Target is pXC50 reshaped to a column vector, as sklearn scalers expect 2-D.
y = data.pXC50.values.reshape((-1,1))
# Hold out 10% as the test set, then carve 5% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.05, random_state=42)
#Normalizing output using standard scaling
# Fit the scaler on the training targets only, then apply the same transform
# to the held-out splits to avoid information leakage.
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
y_validation = scaler.transform(y_validation)
# Drop near-constant fingerprint slots (variance below the 0.05 threshold),
# again fitted on the training split only.
feature_select = VarianceThreshold(threshold=0.05)
X_train = feature_select.fit_transform(X_train)
X_validation = feature_select.transform(X_validation)
X_test = feature_select.transform(X_test)
X_train.shape  # notebook-style inspection; value is discarded in a script
# Use the first GPU when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# If you don't have a GPU, buy a graphics card. I have for a long time used a 1060 GTX, which is not that expensive anymore.
# Move every split onto the chosen device as float32 tensors.
X_train = torch.tensor(X_train, device=device).float()
X_test = torch.tensor(X_test, device=device).float()
X_validation = torch.tensor(X_validation, device=device).float()
y_train = torch.tensor(y_train, device=device).float()
y_test = torch.tensor(y_test, device=device).float()
y_validation = torch.tensor(y_validation, device=device).float()
# Pair features with targets so DataLoader can yield (fps, labels) batches.
train_dataset = TensorDataset(X_train, y_train)
validation_dataset = TensorDataset(X_validation, y_validation)
# Shuffle training batches each epoch; keep validation order deterministic.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=256,
shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=validation_dataset,
batch_size=256,
shuffle=False)
class Net(nn.Module):
    """Feed-forward regression network with three hidden layers.

    Each hidden stage is Linear -> LayerNorm -> LeakyReLU -> Dropout; a final
    linear layer maps to the output size. LayerNorm speeds up training and
    dropout provides regularization.
    """

    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(Net, self).__init__()
        # Three fully connected hidden layers plus the output projection.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)
        # One LayerNorm per hidden layer.
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.ln3 = nn.LayerNorm(hidden_size)
        # Shared activation and dropout modules, reused at every stage.
        self.activation = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Run the three hidden stages in sequence, then the output layer."""
        out = x
        for linear, norm in ((self.fc1, self.ln1),
                             (self.fc2, self.ln2),
                             (self.fc3, self.ln3)):
            out = self.dropout(self.activation(norm(linear(out))))
        return self.fc_out(out)
# --- Hyperparameters ---
input_size = X_train.size()[-1]  # fingerprint bits surviving VarianceThreshold
hidden_size = 1024               # width of each hidden layer
dropout_rate = 0.80              # heavy dropout, paired with the wide layers
output_size = 1                  # single regression target (scaled pXC50)
learning_rate = 0.001            # Adam step size

model = Net(input_size, hidden_size, dropout_rate, output_size)
# BUGFIX: the input tensors were created on `device` but the model was left on
# the CPU, which crashes on a CUDA machine. Move the parameters to match.
model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.train()  # ensure the network is in "train" mode with dropout active
epochs = 200
for e in range(epochs):
    running_loss = 0.0
    for fps, labels in train_loader:
        optimizer.zero_grad()             # reset gradients from the last batch
        output = model(fps)               # forward pass of the mini-batch
        loss = criterion(output, labels)  # MSE on the scaled targets
        loss.backward()                   # backpropagate
        optimizer.step()                  # update the weights
        running_loss += loss.item()
    if e % 10 == 0:
        # BUGFIX: measure validation loss with dropout disabled and without
        # building an autograd graph (the original ran in train mode).
        model.eval()
        with torch.no_grad():
            validation_loss = torch.mean((y_validation - model(X_validation)) ** 2).item()
        model.train()
        # BUGFIX: "lss" -> "loss" in the progress message.
        print("Epoch: %3i Training loss: %0.2F Validation loss: %0.2F"
              % (e, running_loss / len(train_loader), validation_loss))
model.eval()  # Switch to evaluation mode, where dropout is switched off
# NOTE(review): these forward passes still build autograd graphs; consider
# wrapping them in torch.no_grad() to save memory — confirm no gradients are
# needed downstream.
y_pred_train = model(X_train)
y_pred_validation = model(X_validation)
y_pred_test = model(X_test)
def flatten(tensor):
    """Return the tensor's values as a flat 1-D NumPy array on the CPU."""
    detached = tensor.detach().cpu()
    return detached.numpy().flatten()
# Predicted vs. actual (scaled) pXC50 for the test and train splits.
plt.scatter(flatten(y_pred_test), flatten(y_test), alpha=0.5, label="Test")
plt.scatter(flatten(y_pred_train), flatten(y_train), alpha=0.1, label="Train")
plt.legend()
# Identity line: perfect predictions would fall on y = x.
plt.plot([-1.5, 1.5], [-1.5,1.5], c="b")
def predict_smiles(smiles):
    """Predict pXC50 for a single SMILES string using the trained model.

    Runs the same pipeline as training: fingerprint, variance-threshold
    feature selection, forward pass, then inverse of the target scaler so the
    value is reported on the original pXC50 scale.
    """
    raw_fp = mol2fp(Chem.MolFromSmiles(smiles)).reshape(1, -1)
    selected = feature_select.transform(raw_fp)
    model_input = torch.tensor(selected, device=device).float()
    scaled_prediction = model(model_input)
    # Undo the StandardScaler applied to the training targets.
    unscaled = scaler.inverse_transform(scaled_prediction.cpu().detach().numpy())
    return unscaled[0][0]
predict_smiles('Cc1ccc2c(N3CCNCC3)cc(F)cc2n1')