Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 105 additions & 67 deletions QSAR_workflow_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,26 @@
- Prediction for new SMILES
"""

import io
import math
import matplotlib.pyplot as plt
import pandas as pd

import numpy as np

# import matplotlib.pyplot as plt
import pandas as pd
import sklearn.metrics
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.models import Sequential
from numpy.random import seed
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import io
import tensorflow as tf
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn import metrics
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors, MACCSkeys

from keras.layers import Dense

# Disable RDKit logging to keep output clean
RDLogger.DisableLog("rdApp.*")
Expand All @@ -45,14 +44,16 @@
# -------------------------
def rmse(y_true, y_pred):
    """Root-mean-squared-error metric for Keras training/evaluation.

    Computed per sample over the last axis, matching Keras metric
    conventions.
    """
    from tensorflow.keras import backend as K

    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error, axis=-1))


def r_square(y_true, y_pred):
    """Coefficient of determination (R^2) metric for Keras.

    K.epsilon() is added to the denominator to avoid division by zero
    when y_true is constant within a batch.
    """
    from tensorflow.keras import backend as K

    # Residual sum of squares and total sum of squares.
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    # Fixed: the captured block contained two return statements (diff
    # residue); only this one is kept.
    return 1 - SS_res / (SS_tot + K.epsilon())


class RDKit_2D:
Expand All @@ -62,23 +63,25 @@ def __init__(self, smiles):

def compute_2Drdkit(self, name):
    """Compute all RDKit 2-D descriptors for the molecules in self.mols.

    Returns a DataFrame with a leading 'smiles' column followed by one
    column per descriptor in Descriptors._descList.  `name` is unused;
    it is kept for interface compatibility with existing callers.
    """
    # Build the calculator once for every descriptor RDKit knows about.
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [x[0] for x in Descriptors._descList]
    )
    header = calc.GetDescriptorNames()
    # Fixed: the captured block duplicated the `calc` assignment and the
    # df.insert call (a second insert of 'smiles' raises ValueError).
    rdkit_2d_desc = [calc.CalcDescriptors(mol) for mol in self.mols]
    df = pd.DataFrame(rdkit_2d_desc, columns=header)
    df.insert(loc=0, column="smiles", value=self.smiles)
    return df

def compute_MACCS(self, name):
    """Compute 167-bit MACCS fingerprints for the molecules in self.mols.

    Returns a DataFrame with a leading 'smiles' column and columns
    bit0..bit166 holding each fingerprint bit as a '0'/'1' character
    (from ToBitString).  `name` is unused; kept for interface
    compatibility with existing callers.
    """
    header = ["bit" + str(i) for i in range(167)]
    # Fixed: the captured block duplicated the header line and the
    # df2.insert call (a second insert of 'smiles' raises ValueError).
    MACCS_list = [
        list(MACCSkeys.GenMACCSKeys(mol).ToBitString()) for mol in self.mols
    ]
    df2 = pd.DataFrame(MACCS_list, columns=header)
    df2.insert(loc=0, column="smiles", value=self.smiles)
    return df2


Expand All @@ -96,22 +99,22 @@ def compute_MACCS(self, name):

# Compute RDKit descriptors for the model dataset (training data)
# Assumes RDKit_2D accepts a list/series of smiles for initialization
# Fixed: the captured span interleaved old and new diff lines, so several
# statements appeared twice; only one copy of each is kept.
RDKit_descriptor = RDKit_2D(data["smiles"])
x1 = RDKit_descriptor.compute_2Drdkit(data)
x2 = RDKit_descriptor.compute_MACCS(data)
x3 = x2.iloc[:, 1:]  # drop the duplicate 'smiles' column before concat
x4 = pd.concat([data["DockingScore"], x1, x3], axis=1)

print("2-D descriptors and MACCS fingerprints generated for model data:")
print(x4)

# -------------------------
# Prepare features (X) and labels (Y) for model training
# -------------------------
labels = x4["DockingScore"]
features = x4.iloc[:, 3:]  # keep as original
# ensure numeric
features = features.apply(pd.to_numeric, errors="coerce")
print(f"Features shape before cleaning: {features.shape}")

# Clean features: remove inf/NaN, clip extremes
Expand Down Expand Up @@ -155,10 +158,10 @@ def compute_MACCS(self, name):

# Rebuild cleaned df3 used by descriptor functions (ensures format expected by RDKit_2D)
# Fixed: duplicated assignment and duplicated to_csv (diff residue) removed.
df3 = pd.DataFrame({"smiles": valid_smiles})
smiles = df3["smiles"].values  # update variable used later

# Save cleaned .smi (tab-separated) for downstream tools (as before)
df3.to_csv("molecule.smi", sep="\t", header=False, index=False)

# -------------------------
# Compute descriptors for input SMILES — with safe try/except
Expand All @@ -169,22 +172,25 @@ def compute_MACCS(self, name):
x6 = RDKit_descriptor.compute_MACCS(df3)
except Exception as e:
print(
"Error while computing RDKit descriptors. One or more SMILES may be invalid or RDKit raised an error.")
"Error while computing RDKit descriptors. One or more SMILES may be invalid or RDKit raised an error."
)
print(e)
print()

x7 = x6.iloc[:, 1:]
x8 = pd.concat([x5, x7], axis=1)

print('2-D descriptors and MACCS fingerprints generated for input SMILES:')
print("2-D descriptors and MACCS fingerprints generated for input SMILES:")
print(x8)

# -------------------------
# Train/test split and scaling (same logic as original, but cleaned)
# -------------------------

# Split the data with fixed random state for reproducibility.
# Fixed: the captured span contained the split statement twice (diff residue).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, shuffle=True, random_state=0
)

# Fit the scaler on the training set only to avoid test-set leakage.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
Expand Down Expand Up @@ -213,17 +219,17 @@ def compute_MACCS(self, name):
X_test, y_test = X_test[mask_test, :], y_test[mask_test]

# Summarize the shape of the updated training & test dataset
# Fixed: duplicated prints and assignments (diff residue) removed.
print("Updated training & test dataset after removal of outliers")
print("Training set after outlier removal: {}".format(X_train.shape))
print("Test set after outlier removal: {}".format(X_test.shape))

# Convert x4 and data columns to numeric, invalid parsing will be set as NaN
x10 = x8.apply(pd.to_numeric, errors="coerce")
x4 = x4.apply(pd.to_numeric, errors="coerce")

# Extract the columns from 'MaxAbsEStateIndex' onwards
descriptor_columns = x10.loc[:, "MaxAbsEStateIndex":]
model_data_columns = x4.loc[:, "MaxAbsEStateIndex":]

# Initialize a flag to check if all values are within the range
all_in = True
Expand All @@ -233,50 +239,63 @@ def compute_MACCS(self, name):
if column in model_data_columns.columns:
min_value = model_data_columns[column].min()
max_value = model_data_columns[column].max()
if not ((descriptor_columns[column] >= min_value) & (
descriptor_columns[column] <= max_value)).all():
if not (
(descriptor_columns[column] >= min_value)
& (descriptor_columns[column] <= max_value)
).all():
all_in = False
break

# Determine the result based on the all_in flag
applicability_domain_result = "IN" if all_in else "OUT"

# Create the final DataFrame with input SMILES and applicability domain result
# Fixed: the DataFrame construction and prints appeared twice (diff residue).
final_df = pd.DataFrame(
    {"Input SMILES": smiles, "Applicability Domain": applicability_domain_result}
)

print("Applicability domain information for input SMILES")
# Display results in the Streamlit app
print("**Applicability domain (PCA & Isolation forest)**")
print(final_df)

# Define the model
# Fixed: the captured span interleaved old and new diff lines, so the
# layer adds, compile, EarlyStopping, fit, and prediction statements each
# appeared twice (which would e.g. stack 8 Dense layers and train twice).
# One copy of each is kept.
model = Sequential()
model.add(Dense(600, input_dim=X_train.shape[1], activation="relu"))
model.add(Dense(100, activation="relu"))
model.add(Dense(100, activation="relu"))
model.add(Dense(1, activation="linear"))
model.compile(
    loss="mean_squared_error", optimizer="adam", metrics=["mae", "mape", rmse, r_square]
)

# Enable early stopping based on r_square (maximize validation R^2)
earlystopping = EarlyStopping(
    monitor="val_r_square", patience=200, verbose=1, mode="max"
)

# Train the model
result = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=400,
    shuffle=True,
    verbose=2,
    validation_data=(X_test, y_test),
    callbacks=[earlystopping],
)

# Predict on new data (skip the 'smiles' column and first column, as before)
x9 = sc.transform(x8.iloc[:, 2:])
ACEpredict = model.predict(x9)
deepdock = pd.DataFrame(ACEpredict, columns=["DeepDock Prediction"])
deepdock.insert(loc=0, column="smiles", value=smiles)
# Set a subheader and display the regression
print("Prediction:\nBinding affinity of input data against selected protein target\n ")
print("**DeepDock Prediction**")
print(deepdock)
print("Prediction Created Successfully!")

# Make predictions with the neural network
y_pred = model.predict(X_test)
Expand All @@ -286,31 +305,50 @@ def compute_MACCS(self, name):

# Model summary
# Fixed: the summary write and each metric print appeared twice in the
# captured span (diff residue); one copy of each is kept.
s = io.StringIO()
model.summary(print_fn=lambda x: s.write(x + "\n"))
model_summary = s.getvalue()
s.close()

print("Model summary")
print(model_summary)

print("Model prediction evaluation:")

# Print statistical figures of merit for training set
print("Trained_error_rate:")
print(
    "Mean absolute error (MAE): %f"
    % sklearn.metrics.mean_absolute_error(y_train, x_pred)
)
print(
    "Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_train, x_pred)
)
print(
    "Root mean squared error (RMSE): %f"
    % math.sqrt(sklearn.metrics.mean_squared_error(y_train, x_pred))
)
print(
    "Coefficient of determination ($R^2$): %f"
    % sklearn.metrics.r2_score(y_train, x_pred)
)

# Print statistical figures of merit for test set
print("Test_error_rate:")
print(
    "Mean absolute error (MAE): %f"
    % sklearn.metrics.mean_absolute_error(y_test, y_pred)
)
print(
    "Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_test, y_pred)
)
print(
    "Root mean squared error (RMSE): %f"
    % math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred))
)
print(
    "Coefficient of determination ($R^2$): %f"
    % sklearn.metrics.r2_score(y_test, y_pred)
)

# Print final results
print("Done.")

2 changes: 2 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Run the QSAR workflow script inside the project's TensorFlow GPU container,
# mounting the current directory as /workspace so the script and its data are
# visible inside the container.
docker run --gpus all --workdir /workspace -v "$(pwd)":/workspace chiral.sakuracr.jp/tensorflow:2025_12_02_v2 python QSAR_workflow_sample.py