From e9c3233cc0737234836f8a25fa0e4e795c2e4df7 Mon Sep 17 00:00:00 2001 From: Qin Wan Date: Tue, 2 Dec 2025 06:53:27 +0000 Subject: [PATCH 1/2] add: run.sh --- run.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100755 run.sh diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..14aaa76 --- /dev/null +++ b/run.sh @@ -0,0 +1,2 @@ +#!/bin/bash +docker run --gpus all --workdir /workspace -v "$(pwd)":/workspace chiral.sakuracr.jp/tensorflow:2025_12_02_v2 python QSAR_workflow_sample.py From 963f3e603aa212bdec874100f5e02b277f82daef Mon Sep 17 00:00:00 2001 From: Qin Wan Date: Wed, 10 Dec 2025 03:18:13 +0000 Subject: [PATCH 2/2] Update QSAR_workflow_sample.py --- QSAR_workflow_sample.py | 172 ++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 67 deletions(-) diff --git a/QSAR_workflow_sample.py b/QSAR_workflow_sample.py index ef759f8..90357fa 100644 --- a/QSAR_workflow_sample.py +++ b/QSAR_workflow_sample.py @@ -14,27 +14,26 @@ - Prediction for new SMILES """ +import io import math -import matplotlib.pyplot as plt -import pandas as pd + import numpy as np + +# import matplotlib.pyplot as plt +import pandas as pd import sklearn.metrics +import tensorflow as tf from keras.callbacks import EarlyStopping +from keras.layers import Dense from keras.models import Sequential from numpy.random import seed +from rdkit import Chem, RDLogger +from rdkit.Chem import Descriptors, MACCSkeys +from rdkit.ML.Descriptors import MoleculeDescriptors from sklearn.decomposition import PCA from sklearn.ensemble import IsolationForest -from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -import io +import tensorflow as tf -from rdkit.ML.Descriptors import MoleculeDescriptors -from sklearn import metrics -from rdkit import Chem, RDLogger -from rdkit.Chem import Descriptors, MACCSkeys - -from keras.layers import Dense # Disable RDKit logging to
keep output clean RDLogger.DisableLog("rdApp.*") @@ -45,14 +44,16 @@ # ------------------------- def rmse(y_true, y_pred): from tensorflow.keras import backend as K + return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) def r_square(y_true, y_pred): from tensorflow.keras import backend as K + SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) - return (1 - SS_res / (SS_tot + K.epsilon())) + return 1 - SS_res / (SS_tot + K.epsilon()) class RDKit_2D: @@ -62,23 +63,25 @@ def __init__(self, smiles): def compute_2Drdkit(self, name): rdkit_2d_desc = [] - calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList]) + calc = MoleculeDescriptors.MolecularDescriptorCalculator( + [x[0] for x in Descriptors._descList] + ) header = calc.GetDescriptorNames() for i in range(len(self.mols)): ds = calc.CalcDescriptors(self.mols[i]) rdkit_2d_desc.append(ds) df = pd.DataFrame(rdkit_2d_desc, columns=header) - df.insert(loc=0, column='smiles', value=self.smiles) + df.insert(loc=0, column="smiles", value=self.smiles) return df def compute_MACCS(self, name): MACCS_list = [] - header = ['bit' + str(i) for i in range(167)] + header = ["bit" + str(i) for i in range(167)] for i in range(len(self.mols)): ds = list(MACCSkeys.GenMACCSKeys(self.mols[i]).ToBitString()) MACCS_list.append(ds) df2 = pd.DataFrame(MACCS_list, columns=header) - df2.insert(loc=0, column='smiles', value=self.smiles) + df2.insert(loc=0, column="smiles", value=self.smiles) return df2 @@ -96,22 +99,22 @@ def compute_MACCS(self, name): # Compute RDKit descriptors for the model dataset (training data) # Assumes RDKit_2D accepts a list/series of smiles for initialization -RDKit_descriptor = RDKit_2D(data['smiles']) +RDKit_descriptor = RDKit_2D(data["smiles"]) x1 = RDKit_descriptor.compute_2Drdkit(data) x2 = RDKit_descriptor.compute_MACCS(data) x3 = x2.iloc[:, 1:] -x4 = pd.concat([data['DockingScore'], x1, x3], axis=1) +x4 = 
pd.concat([data["DockingScore"], x1, x3], axis=1) -print('2-D descriptors and MACCS fingerprints generated for model data:') +print("2-D descriptors and MACCS fingerprints generated for model data:") print(x4) # ------------------------- # Prepare features (X) and labels (Y) for model training # ------------------------- -labels = x4['DockingScore'] +labels = x4["DockingScore"] features = x4.iloc[:, 3:] # keep as original # ensure numeric -features = features.apply(pd.to_numeric, errors='coerce') +features = features.apply(pd.to_numeric, errors="coerce") print(f"Features shape before cleaning: {features.shape}") # Clean features: remove inf/NaN, clip extremes @@ -155,10 +158,10 @@ def compute_MACCS(self, name): # Rebuild cleaned df3 used by descriptor functions (ensures format expected by RDKit_2D) df3 = pd.DataFrame({"smiles": valid_smiles}) -smiles = df3['smiles'].values # update variable used later +smiles = df3["smiles"].values # update variable used later # Save cleaned .smi (tab-separated) for downstream tools (as before) -df3.to_csv('molecule.smi', sep='\t', header=False, index=False) +df3.to_csv("molecule.smi", sep="\t", header=False, index=False) # ------------------------- # Compute descriptors for input SMILES — with safe try/except @@ -169,14 +172,15 @@ def compute_MACCS(self, name): x6 = RDKit_descriptor.compute_MACCS(df3) except Exception as e: print( - "Error while computing RDKit descriptors. One or more SMILES may be invalid or RDKit raised an error.") + "Error while computing RDKit descriptors. One or more SMILES may be invalid or RDKit raised an error." 
+ ) print(e) print() x7 = x6.iloc[:, 1:] x8 = pd.concat([x5, x7], axis=1) -print('2-D descriptors and MACCS fingerprints generated for input SMILES:') +print("2-D descriptors and MACCS fingerprints generated for input SMILES:") print(x8) # ------------------------- @@ -184,7 +188,9 @@ def compute_MACCS(self, name): # ------------------------- # Split the data with fixed random state -X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, shuffle=True, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, Y, test_size=0.30, shuffle=True, random_state=0 +) sc = StandardScaler() X_train = sc.fit_transform(X_train) @@ -213,17 +219,17 @@ def compute_MACCS(self, name): X_test, y_test = X_test[mask_test, :], y_test[mask_test] # Summarize the shape of the updated training & test dataset -print('Updated training & test dataset after removal of outliers') +print("Updated training & test dataset after removal of outliers") print("Training set after outlier removal: {}".format(X_train.shape)) print("Test set after outlier removal: {}".format(X_test.shape)) # Convert x4 and data columns to numeric, invalid parsing will be set as NaN -x10 = x8.apply(pd.to_numeric, errors='coerce') -x4 = x4.apply(pd.to_numeric, errors='coerce') +x10 = x8.apply(pd.to_numeric, errors="coerce") +x4 = x4.apply(pd.to_numeric, errors="coerce") # Extract the columns from 'MaxAbsEStateIndex' onwards -descriptor_columns = x10.loc[:, 'MaxAbsEStateIndex':] -model_data_columns = x4.loc[:, 'MaxAbsEStateIndex':] +descriptor_columns = x10.loc[:, "MaxAbsEStateIndex":] +model_data_columns = x4.loc[:, "MaxAbsEStateIndex":] # Initialize a flag to check if all values are within the range all_in = True @@ -233,50 +239,63 @@ def compute_MACCS(self, name): if column in model_data_columns.columns: min_value = model_data_columns[column].min() max_value = model_data_columns[column].max() - if not ((descriptor_columns[column] >= min_value) & ( - descriptor_columns[column] <= 
max_value)).all(): + if not ( + (descriptor_columns[column] >= min_value) + & (descriptor_columns[column] <= max_value) + ).all(): all_in = False break # Determine the result based on the all_in flag -applicability_domain_result = 'IN' if all_in else 'OUT' +applicability_domain_result = "IN" if all_in else "OUT" # Create the final DataFrame with input SMILES and applicability domain result -final_df = pd.DataFrame({ - 'Input SMILES': smiles, - 'Applicability Domain': applicability_domain_result -}) +final_df = pd.DataFrame( + {"Input SMILES": smiles, "Applicability Domain": applicability_domain_result} +) -print('Applicability domain information for input SMILES') +print("Applicability domain information for input SMILES") # Display results in the Streamlit app print("**Applicability domain (PCA & Isolation forest)**") print(final_df) # Define the model model = Sequential() -model.add(Dense(600, input_dim=X_train.shape[1], activation='relu')) -model.add(Dense(100, activation='relu')) -model.add(Dense(100, activation='relu')) -model.add(Dense(1, activation='linear')) -model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae', 'mape', rmse, r_square]) +model.add(Dense(600, input_dim=X_train.shape[1], activation="relu")) +model.add(Dense(100, activation="relu")) +model.add(Dense(100, activation="relu")) +model.add(Dense(1, activation="linear")) +model.compile( + loss="mean_squared_error", optimizer="adam", metrics=["mae", "mape", rmse, r_square] +) # Enable early stopping based on r_square -earlystopping = EarlyStopping(monitor='val_r_square', patience=200, verbose=1, mode='max') +earlystopping = EarlyStopping( + monitor="val_r_square", patience=200, verbose=1, mode="max" +) # Train the model -result = model.fit(X_train, y_train, epochs=200, batch_size=400, shuffle=True, verbose=2, - validation_data=(X_test, y_test), callbacks=[earlystopping]) +result = model.fit( + X_train, + y_train, + epochs=200, + batch_size=400, + shuffle=True, + verbose=2, + 
validation_data=(X_test, y_test), + callbacks=[earlystopping], +) # Predict on new data x9 = sc.transform(x8.iloc[:, 2:]) ACEpredict = model.predict(x9) -deepdock = pd.DataFrame(ACEpredict, columns=['DeepDock Prediction']) -deepdock.insert(loc=0, column='smiles', value=smiles) +deepdock = pd.DataFrame(ACEpredict, columns=["DeepDock Prediction"]) +deepdock.insert(loc=0, column="smiles", value=smiles) # Set a subheader and display the regression -print('Prediction:\nBinding affinity of input data against selected protein target\n ') +print("Prediction:\nBinding affinity of input data against selected protein target\n ") print("**DeepDock Prediction**") print(deepdock) -print('Prediction Created Successfully!') +print("Prediction Created Successfully!") # Make predictions with the neural network y_pred = model.predict(X_test) @@ -286,31 +305,50 @@ def compute_MACCS(self, name): # Model summary s = io.StringIO() -model.summary(print_fn=lambda x: s.write(x + '\n')) +model.summary(print_fn=lambda x: s.write(x + "\n")) model_summary = s.getvalue() s.close() -print('Model summary') +print("Model summary") print(model_summary) -print('Model prediction evaluation:') +print("Model prediction evaluation:") # Print statistical figures of merit for training set -print('Trained_error_rate:') -print("Mean absolute error (MAE): %f" % sklearn.metrics.mean_absolute_error(y_train, x_pred)) -print("Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_train, x_pred)) -print("Root mean squared error (RMSE): %f" % math.sqrt( - sklearn.metrics.mean_squared_error(y_train, x_pred))) -print("Coefficient of determination ($R^2$): %f" % sklearn.metrics.r2_score(y_train, x_pred)) +print("Trained_error_rate:") +print( + "Mean absolute error (MAE): %f" + % sklearn.metrics.mean_absolute_error(y_train, x_pred) +) +print( + "Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_train, x_pred) +) +print( + "Root mean squared error (RMSE): %f" + % 
math.sqrt(sklearn.metrics.mean_squared_error(y_train, x_pred)) +) +print( + "Coefficient of determination ($R^2$): %f" + % sklearn.metrics.r2_score(y_train, x_pred) +) # Print statistical figures of merit for test set -print('Test_error_rate:') -print("Mean absolute error (MAE): %f" % sklearn.metrics.mean_absolute_error(y_test, y_pred)) -print("Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_test, y_pred)) -print("Root mean squared error (RMSE): %f" % math.sqrt( - sklearn.metrics.mean_squared_error(y_test, y_pred))) -print("Coefficient of determination ($R^2$): %f" % sklearn.metrics.r2_score(y_test, y_pred)) +print("Test_error_rate:") +print( + "Mean absolute error (MAE): %f" + % sklearn.metrics.mean_absolute_error(y_test, y_pred) +) +print( + "Mean squared error (MSE): %f" % sklearn.metrics.mean_squared_error(y_test, y_pred) +) +print( + "Root mean squared error (RMSE): %f" + % math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)) +) +print( + "Coefficient of determination ($R^2$): %f" + % sklearn.metrics.r2_score(y_test, y_pred) +) # Print final results print("Done.") -