From e9c3233cc0737234836f8a25fa0e4e795c2e4df7 Mon Sep 17 00:00:00 2001 From: Qin Wan Date: Tue, 2 Dec 2025 06:53:27 +0000 Subject: [PATCH 1/2] add: run.sh --- run.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100755 run.sh diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..14aaa76 --- /dev/null +++ b/run.sh @@ -0,0 +1,2 @@ +#!/bin/bash +docker run --gpus all --workdir /workspace -v "$(pwd)":/workspace chiral.sakuracr.jp/tensorflow:2025_12_02_v2 python QSAR_workflow_sample.py From 963f3e603aa212bdec874100f5e02b277f82daef Mon Sep 17 00:00:00 2001 From: Qin Wan Date: Wed, 10 Dec 2025 03:18:13 +0000 Subject: [PATCH 2/2] Update QSAR_workflow_sample.py --- QSAR_workflow_sample.py | 172 ++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 67 deletions(-) diff --git a/QSAR_workflow_sample.py b/QSAR_workflow_sample.py index ef759f8..90357fa 100644 --- a/QSAR_workflow_sample.py +++ b/QSAR_workflow_sample.py @@ -14,27 +14,26 @@ - Prediction for new SMILES """ +import io import math -import matplotlib.pyplot as plt -import pandas as pd + import numpy as np + +# import matplotlib.pyplot as plt +import pandas as pd import sklearn.metrics +import tensorflow as tf from keras.callbacks import EarlyStopping +from keras.layers import Dense from keras.models import Sequential from numpy.random import seed +from rdkit import Chem, RDLogger +from rdkit.Chem import Descriptors, MACCSkeys +from rdkit.ML.Descriptors import MoleculeDescriptors from sklearn.decomposition import PCA from sklearn.ensemble import IsolationForest -from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -import io +import tensorflow as tf -from rdkit.ML.Descriptors import MoleculeDescriptors -from sklearn import metrics -from rdkit import Chem, RDLogger -from rdkit.Chem import Descriptors, MACCSkeys - -from keras.layers import Dense # Disable RDKit logging to
keep output clean RDLogger.DisableLog("rdApp.*") @@ -45,14 +44,16 @@ # ------------------------- def rmse(y_true, y_pred): from tensorflow.keras import backend as K + return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) def r_square(y_true, y_pred): from tensorflow.keras import backend as K + SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) - return (1 - SS_res / (SS_tot + K.epsilon())) + return 1 - SS_res / (SS_tot + K.epsilon()) class RDKit_2D: @@ -62,23 +63,25 @@ def __init__(self, smiles): def compute_2Drdkit(self, name): rdkit_2d_desc = [] - calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList]) + calc = MoleculeDescriptors.MolecularDescriptorCalculator( + [x[0] for x in Descriptors._descList] + ) header = calc.GetDescriptorNames() for i in range(len(self.mols)): ds = calc.CalcDescriptors(self.mols[i]) rdkit_2d_desc.append(ds) df = pd.DataFrame(rdkit_2d_desc, columns=header) - df.insert(loc=0, column='smiles', value=self.smiles) + df.insert(loc=0, column="smiles", value=self.smiles) return df def compute_MACCS(self, name): MACCS_list = [] - header = ['bit' + str(i) for i in range(167)] + header = ["bit" + str(i) for i in range(167)] for i in range(len(self.mols)): ds = list(MACCSkeys.GenMACCSKeys(self.mols[i]).ToBitString()) MACCS_list.append(ds) df2 = pd.DataFrame(MACCS_list, columns=header) - df2.insert(loc=0, column='smiles', value=self.smiles) + df2.insert(loc=0, column="smiles", value=self.smiles) return df2 @@ -96,22 +99,22 @@ def compute_MACCS(self, name): # Compute RDKit descriptors for the model dataset (training data) # Assumes RDKit_2D accepts a list/series of smiles for initialization -RDKit_descriptor = RDKit_2D(data['smiles']) +RDKit_descriptor = RDKit_2D(data["smiles"]) x1 = RDKit_descriptor.compute_2Drdkit(data) x2 = RDKit_descriptor.compute_MACCS(data) x3 = x2.iloc[:, 1:] -x4 = pd.concat([data['DockingScore'], x1, x3], axis=1) +x4 = 
pd.concat([data["DockingScore"], x1, x3], axis=1) -print('2-D descriptors and MACCS fingerprints generated for model data:') +print("2-D descriptors and MACCS fingerprints generated for model data:") print(x4) # ------------------------- # Prepare features (X) and labels (Y) for model training # ------------------------- -labels = x4['DockingScore'] +labels = x4["DockingScore"] features = x4.iloc[:, 3:] # keep as original # ensure numeric -features = features.apply(pd.to_numeric, errors='coerce') +features = features.apply(pd.to_numeric, errors="coerce") print(f"Features shape before cleaning: {features.shape}") # Clean features: remove inf/NaN, clip extremes @@ -155,10 +158,10 @@ def compute_MACCS(self, name): # Rebuild cleaned df3 used by descriptor functions (ensures format expected by RDKit_2D) df3 = pd.DataFrame({"smiles": valid_smiles}) -smiles = df3['smiles'].values # update variable used later +smiles = df3["smiles"].values # update variable used later # Save cleaned .smi (tab-separated) for downstream tools (as before) -df3.to_csv('molecule.smi', sep='\t', header=False, index=False) +df3.to_csv("molecule.smi", sep="\t", header=False, index=False) # ------------------------- # Compute descriptors for input SMILES — with safe try/except @@ -169,14 +172,15 @@ def compute_MACCS(self, name): x6 = RDKit_descriptor.compute_MACCS(df3) except Exception as e: print( - "Error while computing RDKit descriptors. One or more SMILES may be invalid or RDKit raised an error.") + "Error while computing RDKit descriptors. One or more SMILES may be invalid or RDKit raised an error." 
+ ) print(e) print() x7 = x6.iloc[:, 1:] x8 = pd.concat([x5, x7], axis=1) -print('2-D descriptors and MACCS fingerprints generated for input SMILES:') +print("2-D descriptors and MACCS fingerprints generated for input SMILES:") print(x8) # ------------------------- @@ -184,7 +188,9 @@ def compute_MACCS(self, name): # ------------------------- # Split the data with fixed random state -X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, shuffle=True, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, Y, test_size=0.30, shuffle=True, random_state=0 +) sc = StandardScaler() X_train = sc.fit_transform(X_train) @@ -213,17 +219,17 @@ def compute_MACCS(self, name): X_test, y_test = X_test[mask_test, :], y_test[mask_test] # Summarize the shape of the updated training & test dataset -print('Updated training & test dataset after removal of outliers') +print("Updated training & test dataset after removal of outliers") print("Training set after outlier removal: {}".format(X_train.shape)) print("Test set after outlier removal: {}".format(X_test.shape)) # Convert x4 and data columns to numeric, invalid parsing will be set as NaN -x10 = x8.apply(pd.to_numeric, errors='coerce') -x4 = x4.apply(pd.to_numeric, errors='coerce') +x10 = x8.apply(pd.to_numeric, errors="coerce") +x4 = x4.apply(pd.to_numeric, errors="coerce") # Extract the columns from 'MaxAbsEStateIndex' onwards -descriptor_columns = x10.loc[:, 'MaxAbsEStateIndex':] -model_data_columns = x4.loc[:, 'MaxAbsEStateIndex':] +descriptor_columns = x10.loc[:, "MaxAbsEStateIndex":] +model_data_columns = x4.loc[:, "MaxAbsEStateIndex":] # Initialize a flag to check if all values are within the range all_in = True @@ -233,50 +239,63 @@ def compute_MACCS(self, name): if column in model_data_columns.columns: min_value = model_data_columns[column].min() max_value = model_data_columns[column].max() - if not ((descriptor_columns[column] >= min_value) & ( - descriptor_columns[column] <= 
max_value)).all(): + if not ( + (descriptor_columns[column] >= min_value) + & (descriptor_columns[column] <= max_value) + ).all(): all_in = False break # Determine the result based on the all_in flag -applicability_domain_result = 'IN' if all_in else 'OUT' +applicability_domain_result = "IN" if all_in else "OUT" # Create the final DataFrame with input SMILES and applicability domain result -final_df = pd.DataFrame({ - 'Input SMILES': smiles, - 'Applicability Domain': applicability_domain_result -}) +final_df = pd.DataFrame( + {"Input SMILES": smiles, "Applicability Domain": applicability_domain_result} +) -print('Applicability domain information for input SMILES') +print("Applicability domain information for input SMILES") # Display results in the Streamlit app print("**Applicability domain (PCA & Isolation forest)**") print(final_df) # Define the model model = Sequential() -model.add(Dense(600, input_dim=X_train.shape[1], activation='relu')) -model.add(Dense(100, activation='relu')) -model.add(Dense(100, activation='relu')) -model.add(Dense(1, activation='linear')) -model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae', 'mape', rmse, r_square]) +model.add(Dense(600, input_dim=X_train.shape[1], activation="relu")) +model.add(Dense(100, activation="relu")) +model.add(Dense(100, activation="relu")) +model.add(Dense(1, activation="linear")) +model.compile( + loss="mean_squared_error", optimizer="adam", metrics=["mae", "mape", rmse, r_square] +) # Enable early stopping based on r_square -earlystopping = EarlyStopping(monitor='val_r_square', patience=200, verbose=1, mode='max') +earlystopping = EarlyStopping( + monitor="val_r_square", patience=200, verbose=1, mode="max" +) # Train the model -result = model.fit(X_train, y_train, epochs=200, batch_size=400, shuffle=True, verbose=2, - validation_data=(X_test, y_test), callbacks=[earlystopping]) +result = model.fit( + X_train, + y_train, + epochs=200, + batch_size=400, + shuffle=True, + verbose=2, + 
validation_data=(X_test, y_test), + callbacks=[earlystopping], +) # Predict on new data x9 = sc.transform(x8.iloc[:, 2:]) ACEpredict = model.predict(x9) -deepdock = pd.DataFrame(ACEpredict, columns=['DeepDock Prediction']) -deepdock.insert(loc=0, column='smiles', value=smiles) +deepdock = pd.DataFrame(ACEpredict, columns=["DeepDock Prediction"]) +deepdock.insert(loc=0, column="smiles", value=smiles) # Set a subheader and display the regression -print('Prediction:\nBinding affinity of input data against selected protein target\n ') +print("Prediction:\nBinding affinity of input data against selected protein target\n ") print("**DeepDock Prediction**") print(deepdock) -print('Prediction Created Successfully!') +print("Prediction Created Successfully!") # Make predictions with the neural network y_pred = model.predict(X_test) @@ -286,31 +305,50 @@ def compute_MACCS(self, name): # Model summary s = io.StringIO() -model.summary(print_fn=lambda x: s.write(x + '\n')) +model.summary(print_fn=lambda x: s.write(x + "\n")) model_summary = s.getvalue() s.close() -print('Model summary') +print("Model summary") print(model_summary) -print('Model prediction evaluation:') +print("Model prediction evaluation:") # Print statistical figures of merit for training set -print('Trained_error_rate:') -print("Mean absolute error (MAE): %f" % sklearn.metrics.mean_absolute_error(y_train, x_pred)) -print("Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_train, x_pred)) -print("Root mean squared error (RMSE): %f" % math.sqrt( - sklearn.metrics.mean_squared_error(y_train, x_pred))) -print("Coefficient of determination ($R^2$): %f" % sklearn.metrics.r2_score(y_train, x_pred)) +print("Trained_error_rate:") +print( + "Mean absolute error (MAE): %f" + % sklearn.metrics.mean_absolute_error(y_train, x_pred) +) +print( + "Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_train, x_pred) +) +print( + "Root mean squared error (RMSE): %f" + % 
math.sqrt(sklearn.metrics.mean_squared_error(y_train, x_pred)) +) +print( + "Coefficient of determination ($R^2$): %f" + % sklearn.metrics.r2_score(y_train, x_pred) +) # Print statistical figures of merit for test set -print('Test_error_rate:') -print("Mean absolute error (MAE): %f" % sklearn.metrics.mean_absolute_error(y_test, y_pred)) -print("Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_test, y_pred)) -print("Root mean squared error (RMSE): %f" % math.sqrt( - sklearn.metrics.mean_squared_error(y_test, y_pred))) -print("Coefficient of determination ($R^2$): %f" % sklearn.metrics.r2_score(y_test, y_pred)) +print("Test_error_rate:") +print( + "Mean absolute error (MAE): %f" + % sklearn.metrics.mean_absolute_error(y_test, y_pred) +) +print( + "Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_test, y_pred) +) +print( + "Root mean squared error (RMSE): %f" + % math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)) +) +print( + "Coefficient of determination ($R^2$): %f" + % sklearn.metrics.r2_score(y_test, y_pred) +) # Print final results print("Done.") -