diff --git a/.gitignore b/.gitignore index 43432f9..e834b95 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,3 @@ __pycache__/ .DS_Store Notebooks/ config/ - -BAFU_e_CMYK_pos_hoch.pdf -my_figure.pdf -my_figure.png diff --git a/backend/Dockerfile b/backend/Dockerfile index 4da816d..7a2b91e 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,7 +1,11 @@ -FROM python:3.9 +FROM python:3.11 COPY requirements.txt app/requirements.txt WORKDIR /app RUN pip install -r requirements.txt +RUN apt-get update && \ + apt-get install -y openjdk-17-jre && \ + apt-get clean && \ + update-ca-certificates -f \ COPY . /app EXPOSE 8000 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000" ,"--reload"] \ No newline at end of file diff --git a/backend/app.py b/backend/app.py index 3f568f2..123d3cf 100644 --- a/backend/app.py +++ b/backend/app.py @@ -1,10 +1,5 @@ from fastapi import FastAPI -import joblib -import numpy as np - -from descriptors import get_maccs_fingerprints - -model_pipeline = joblib.load('pepper_pipeline_model.pkl') +from predict_target_endpoint import predict app = FastAPI() @app.get("/") @@ -13,11 +8,4 @@ async def read_root(): @app.get('/predict/') async def serve_foo(smiles: str): - smiles_list = smiles.split(',') - - # Calculate the MACCS fingerprints for the input data - X = get_maccs_fingerprints(smiles_list) - - # Use the pipeline to make predictions - predicted_logB = model_pipeline.predict(X) - return np.round((10**predicted_logB ) *100).tolist() \ No newline at end of file + return predict(smiles.split(',')) diff --git a/backend/descriptors.py b/backend/descriptors.py deleted file mode 100644 index f53fc6c..0000000 --- a/backend/descriptors.py +++ /dev/null @@ -1,6 +0,0 @@ -from molfeat.trans.fp import FPVecTransformer - -def get_maccs_fingerprints(smiles): - transformer = FPVecTransformer(kind='maccs', dtype=float) - maccs = transformer(smiles) - return maccs diff --git a/backend/pepper_object_wwtp_optimized_trained_model.pkl b/backend/pepper_object_wwtp_optimized_trained_model.pkl new file mode 100644 index 0000000..79f67d3 Binary files /dev/null and b/backend/pepper_object_wwtp_optimized_trained_model.pkl differ diff --git a/backend/pepper_pipeline_model.pkl b/backend/pepper_pipeline_model.pkl deleted file mode 100644 index 49ddf9f..0000000 Binary files a/backend/pepper_pipeline_model.pkl and /dev/null differ diff --git a/backend/predict_target_endpoint.py b/backend/predict_target_endpoint.py new file mode 100644 index 0000000..b54d54c --- /dev/null +++ b/backend/predict_target_endpoint.py @@ -0,0 +1,26 @@ +from pepper_lab.predict import Predict +import pandas as pd + +def predict(input_smiles): + data = pd.DataFrame(input_smiles, columns=["SMILES"]) + pepper_predict = Predict(renku=True) + predictions_df = pepper_predict.predict_endpoint('pepper_object_wwtp_optimized_trained_model.pkl', + input_model_format='pickle', input_smiles=data, + input_smiles_type='dataframe') # The backend accepts single molecules + + # Select what to show in the app + logb = predictions_df['logB_predicted'] + breakthrough_perc = (10**logb)*100 + rounded_b_perc = round(breakthrough_perc, 1) + predictions_df['Breakthrough (%)'] = rounded_b_perc + + confidence = predictions_df['{}_predicted'.format(pepper_predict.model.target_variable_std_name)] + rounded_confidence = round(confidence, 2) + predictions_df['Confidence 0-1'] = rounded_confidence + + predictions_df = predictions_df[[pepper_predict.model.compound_name, + pepper_predict.model.smiles_name, + 'Breakthrough (%)', + 'Confidence 0-1']] + predictions_df.fillna("", inplace=True) + return predictions_df.to_dict(orient="list") diff --git a/backend/requirements.txt b/backend/requirements.txt index 0d68c94..011ca46 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,6 +1,5 @@ -fastapi==0.115.2 -joblib==1.4.2 -molfeat==0.10.0 -numpy==1.26.0 -scikit-learn==1.3.2 -uvicorn==0.32.0 \ No newline at end of file +fastapi>=0.115.2,<1.0.0 +pandas>=2.2.3,<3.0.0 +pepper-lab>=1.1.0 +rdkit>=2024.9.5,<2025.0.0 +uvicorn>=0.32.0,<1.0.0 \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index d91c77a..389deff 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,4 @@ -version: '3.9' +version: '3.12' services: frontend: build: streamlit diff --git a/streamlit/Dockerfile b/streamlit/Dockerfile index 59384ff..ecaebb4 100644 --- a/streamlit/Dockerfile +++ b/streamlit/Dockerfile @@ -1,13 +1,12 @@ -FROM python:3.9 - +FROM python:3.12 COPY requirements.txt app/requirements.txt WORKDIR /app - RUN pip install -r requirements.txt +RUN apt-get update && \ + apt-get install -y openjdk-17-jre && \ + apt-get clean && \ + update-ca-certificates -f COPY . /app - EXPOSE 8501 - ENTRYPOINT ["streamlit","run"] - CMD ["app.py"] \ No newline at end of file diff --git a/streamlit/app.py b/streamlit/app.py index 34fbfd6..6e9eb53 100644 --- a/streamlit/app.py +++ b/streamlit/app.py @@ -1,39 +1,69 @@ -import pandas as pd import streamlit as st -import requests +import pandas as pd from rdkit.Chem import PandasTools +import requests + +from utils import image_from_mol +@st.cache_data +def convert_df(df): + # IMPORTANT: Cache the conversion to prevent computation on every rerun + return df.to_csv().encode("utf-8") +example_csv = pd.read_csv('test_pepper_app.csv') def main(): # Streamlit app title st.title("PEPPER: an app to Predict Environmental Pollutant PERsistence ") + st.markdown(""" + Currently we support the prediction of the expected percentage breakthrough of micropollutants from + conventional wastewater treatment, that is, the percentage that potentially escapes the plant + without being successfully removed. Visit section [Learn more](https://pepper-app.streamlit.app/Learn_more) + for further details. + """) + # Upload CSV file uploaded_file = st.file_uploader("Upload a CSV file with chemical substance data", type="csv") + + csv = convert_df(example_csv) + + st.sidebar.download_button( + label="Download example file", + data=csv, + file_name="pepper_example.csv", + mime="text/csv", + ) + if uploaded_file is not None: # Load the uploaded data - input_data = pd.read_csv(uploaded_file) + df = pd.read_csv(uploaded_file) # Show the input data - st.write("Uploaded data:", input_data) + st.write(" ### Uploaded data:", df) - response = requests.request("get", "http://backend:8000/predict/", - params={"smiles": ",".join(input_data.SMILES)}) - - # Show it as a dataframe - predictions_df = pd.DataFrame(input_data) - predictions_df['Breakthrough (%)'] = response.json() + print('Start predictions') + # Calculate using pepper-lab + response = requests.request("get", "http://backend:8000/predict/", params={"smiles": ",".join(df.SMILES)}).json() + df = pd.DataFrame.from_dict(response) + + PandasTools.AddMoleculeColumnToFrame(df, 'SMILES', 'Structure') + df["Structure"] = df["Structure"].apply(image_from_mol) + df.drop(columns='SMILES', inplace=True) # Show the predictions - st.write("Predictions:", predictions_df) - - # PandasTools.AddMoleculeColumnToFrame(predictions_df, smilesCol='SMILES') - # predictions_df.rename(columns={'ROMol': 'Structure'}) - # predictions_df.drop(columns='SMILES', inplace=True) + st.markdown(""" ### Predictions: """) + config = { + "Structure": st.column_config.ImageColumn(width="medium"), + } + st.dataframe(df, column_config=config, row_height=100) - st.markdown(predictions_df.to_html(escape=False), unsafe_allow_html=True) + # st.write(""" + # 📢⚠️ The frame below shows the predictions along chemical structures. + # We are working to give you the chemical structures as part of the file to be downloaded. """) + # + # st.markdown(predictions_df.to_html(escape=False), unsafe_allow_html=True) if __name__ == '__main__': diff --git a/streamlit/packages.txt b/streamlit/packages.txt deleted file mode 100644 index 131a1a5..0000000 --- a/streamlit/packages.txt +++ /dev/null @@ -1 +0,0 @@ -libxrender1 diff --git a/streamlit/requirements.txt b/streamlit/requirements.txt index dc77083..bda6f1f 100644 --- a/streamlit/requirements.txt +++ b/streamlit/requirements.txt @@ -1,9 +1,4 @@ -joblib==1.4.2 -molfeat==0.10.0 -pandas==2.2.3 -streamlit==1.38.0 -scipy==1.12.0 -matplotlib==3.6.2 -numpy==1.26.0 -scikit-learn==1.3.0 -rdkit==2024.03.5 +pandas>=2.2.3,<3.0.0 +requests +rdkit>=2024.9.5,<2025.0.0 +streamlit>=1.41.1 \ No newline at end of file diff --git a/streamlit/test_pepper_app.csv b/streamlit/test_pepper_app.csv new file mode 100644 index 0000000..7e3a875 --- /dev/null +++ b/streamlit/test_pepper_app.csv @@ -0,0 +1,125 @@ +SMILES,Compound +O=c1[nH]cnc2c1ncn2C1CC(O)C(CO)O1,2'-Deoxyinosine +CC(C)C(O)(CC(=O)O)C(=O)O,2-Isopropylmalic acid +OCC1OC(OC2C(CO)OC(O)C(O)C2O)C(O)C(O)C1O,alpha-Lactose +O=P(c1ccccc1)(c1ccccc1)c1ccccc1,triphenylphosphineoxide +CCCCC(=O)N(Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1)C(C(=O)O)C(C)C,Valsartan +CC(C)(CC(=O)O)CC(=O)O,"3,3-Dimethylglutaric acid" +COCCc1ccc(OCC(O)CNC(C)C)cc1,Metoprolol +NC(Cc1ccc(O)cc1)C(=O)O,L-Tyrosine +O=c1[nH]sc2ccccc12,"1,2-Benzisothiazolin-3-one" +CC(CCC(=O)O)C1CCC2C3CCC4CC(O)CCC4(C)C3CC(O)C12C,Deoxycholic acid +O=C(O)C1CC(=O)N(Cc2ccco2)C1,1-(2-Furylmethyl)-5-oxopyrrolidine-3-carboxylic acid +Cc1ccc(C(N)=O)cn1,6-Methylnicotinamide +CNCCCC(C#N)(c1ccc(OC)c(OC)c1)C(C)C,"2-(3,4-Dimethoxyphenyl)-5-methylamino-2-isopropylvaleronitrile" +O=C(O)C(O)Cc1ccccc1,3-Phenyllactic acid +CC(C)CC(N)C(=O)NC(C(=O)O)C(C)C,Leucylvaline +CCOP(=O)(OCC)OCC,Triethyl phosphate +Nc1c2c(=O)[nH]c(O)cc2nn1-c1ccccc1,"3-amino-2-phenyl-2H-pyrazolo[4,3-c]pyridine-4,6-diol" +CC1CN2CC(C)OB(O1)OC(C)C2,Triisopropanolamine cyclic borate +CC(O)(CC(=O)O)C(=O)O,citramalic acid +N#Cc1cc(Br)c(O)c(Br)c1,Bromoxynil +CCCCNS(=O)(=O)c1ccccc1,N-Butylbenzenesulfonamide +O=c1cc[nH]c2ccccc12,4-Hydroxyquinoline +O=C1CC2(CCCCC2)CN1,"3,3-pentamethylene-4-butyrolactam" +CC(O)COC(C)COC(C)COC(C)COC(C)COC(C)COC(C)COC(C)COC(C)COC(C)COC(C)CO,Undecapropylene glycol +COc1cc(CC(N)C(=O)O)ccc1O,3-Methoxytyrosine +CC(Cc1ccc(C(C)C(=O)O)cc1)C(=O)O,Carboxyibuprofen +NC(=O)c1ccccc1O,Salicylamide +CC(C)CC(NC(=O)CN)C(=O)O,Glycyl-L-leucine +CCCCCC(O)C(O)CC=CCCCCCCCC(=O)O,(+/-)12(13)-DiHOME +CC(CC(=O)O)C(=O)O,Methylsuccinic acid +COc1cc(C=O)cc(OC)c1O,Syringaldehyde +CCC=CCC(O)C(O)C=CC(O)CCCCCCCC(=O)O,"(10e,15z)-9,12,13-Trihydroxyoctadeca-10,15-dienoic acid" +CC(CCl)OP(=O)(OC(C)CCl)OC(C)CCl,Tris(1-chloro-2-propyl)phosphate +CC(N)C12CC3CC(CC(C3)C1)C2,Rimantadine +CC1CC2OC2C=CC(=O)CC(=O)O1,Decarestrictine F +NC(Cc1ccccc1O)C(=O)O,2-Hydroxyphenylalanine +S=c1[nH]c2ccccc2s1,2-Mercaptobenzothiazole +Oc1ccc2ncccc2c1,6-Quinolinol +O=C(O)C1(O)CC(O)C(O)C(O)C1,D-(-)-Quinic acid +O=C(O)c1cc(O)c(O)c(O)c1,Gallic acid +COc1ccc(-c2coc3cc(O)cc(O)c3c2=O)cc1,Biochanin A +Nc1cccc(O)c1,3-Aminophenol +COc1cc(C(=O)O)ccc1O,Vanillic acid +CC12C=CC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)CO,Prednisolone +COc1cc(O)c(C(=O)c2ccccc2)cc1S(=O)(=O)O,Sulisobenzone +c1ccc2n[nH]nc2c1,Benzotriazole +Cc1nc(-c2ccc(OCC(C)C)c(C#N)c2)sc1C(=O)O,Febuxostat +Cc1cc(O)cc(C)c1S(C)(=O)=O,Methiocarb-TP methiocarb sulfone phenol (M05) +O=CNCc1ccccc1,N-Benzylformamide +OCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCO,"3,6,9,12,15,18,21,24,27,30,33,36,39,42-Tetradecaoxatetratetracontane-1,44-diol" +NC(=O)N1c2ccccc2C(O)C(O)c2ccccc21,"10,11-Dihydro-10,11-dihydroxycarbamazepine" +CC12CCC3c4ccc(OS(=O)(=O)O)cc4CCC3C1CCC2=O,Estrone sulfate +CCC=CCC=CCC=CCC=CCC=CCCCCCC(=O)O,Docosapentaenoic acid +O=C(O)CC1(C(=O)O)CCCCC1,1-(Carboxymethyl)cyclohexanecarboxylic acid +CC(=O)Nc1ccc(C)cc1C,"N-(2,4-Dimethylphenyl)acetamide" +NC(CC(=O)O)C(=O)NC(Cc1ccccc1)C(=O)O,L-Aspartyl-L-phenylalanine +Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(CC(O)C(O)C(O)CO)c2cc1C,Riboflavin +Cc1oc(C(=O)O)cc(=O)c1C,"5,6-dimethyl-4-oxo-4H-pyran-2-carboxylic acid" +CC(=O)NC(CC(C)C)C(=O)O,N-Acetyl-L-leucine +O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,Diclofenac +OCCN(CCO)CCO,Triethanolamine +Cc1ccncc1,4-Picoline +O=C(NC1CCCCC1)NC1CCCCC1,Dicyclohexylurea +CC(C)(Oc1ccc(Cl)cc1)C(=O)O,Clofibric acid +NCC(=O)N1CCCC1C(=O)O,Glycylproline +OCCOCCOCCOCCOCCOCCOCCOCCOCCOCCO,"3,6,9,12,15,18,21,24,27-Nonaoxanonacosane-1,29-diol" +NC(N)=NCCCC(N)C(=O)O,DL-Arginine +CN(C)CC1CCCCC1(O)c1cccc(O)c1,O-Desmethyl-cis-tramadol +CC(=O)Nc1ccc(O)c(C(=O)O)c1,N-Acetyl-5-aminosalicylic acid +CC(C)(C)NC(=O)NCCO,1-(2-Hydroxyethyl)-3-t-butylurea +O=C(O)c1ccc([O-])c(O)c1,"3,4-Dihydroxybenzoate" +Cc1cccc(C(=O)NCC(=O)O)c1,3-Methylhippuric acid +CC(C)CC(NC(=O)OC(C)(C)C)C(N)=O,tert-Butyl N-[1-(aminocarbonyl)-3-methylbutyl]carbamate +CCCCOP(=O)(O)OCCCC,Dibutyl phosphate +Cc1nc(C)c(C)nc1C,"2,3,5,6-Tetramethylpyrazine" +CN(C)CCC=C1c2ccccc2CCc2ccccc21,Amitriptyline +NCC(O)c1ccccc1,Phenylethanolamine +CC(C)NCC(O)COc1ccc(COCCOC(C)C)cc1,Bisoprolol +NC(CCC(=O)O)C(=O)NC(Cc1ccc(O)cc1)C(=O)O,Glutamyltyrosine +COc1ccc(C=O)cc1,4-Methoxybenzaldehyde +COc1cccc(CC(=O)O)c1,3-Methoxyphenylacetic acid +Cc1cc(=O)oc2cc(O)c(O)cc12,"6,7-Dihydroxy-4-methylcoumarin" +NCC1(CC(=O)O)CCCCC1,Gabapentin +O=C(O)c1ccc([N+](=O)[O-])cc1,4-Nitrobenzoic acid +Cc1cc(=O)oc2cc(N)ccc12,7-Amino-4-methylcoumarin +CC=C(C)C(=O)O,Tiglic acid +CCCc1nc(C(C)(C)O)c(C(=O)O)n1Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1,Olmesartan +OCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCO,Dodecaethylene glycol +Nc1ccccc1C(=O)O,Anthranilic acid +CN(C)C(=O)Oc1ccc[n+](C)c1,Pyridostigmine +O=S(=O)(O)c1ccccc1,Benzenesulfonic acid +COc1ccc(C2Sc3ccccc3N(CCN(C)C)C(=O)C2O)cc1,Desacetyl diltiazem +O=C(O)C(c1ccccc1Cl)N1CCc2sccc2C1,Clopidogrel carboxylic acid +Cn1cnc2[nH]c(=O)[nH]c(=O)c21,7-Methylxanthine +CN(CC(=O)O)C(=N)N,Creatine +CCCCCCC=CCCCCCCCC(=O)O,Palmitoleic acid +NC1CCCCC1,Cyclohexylamine +CC1NC(=O)NC1CCCCCC(=O)O,Desthiobiotin +O=C1CCCCN1,2-Piperidone +CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,Bezafibrate +CN1C(=O)CC(C)(c2ccccc2)C1=O,Methsuximide +O=S(=O)(O)c1ccc2nc(-c3ccccc3)[nH]c2c1,Phenylbenzimidazole sulfonic acid +O=C1OC(C(O)CO)C(O)=C1O,Ascorbic acid +CCN(CC)CC(=O)Nc1c(C)cccc1C,Lidocaine +O=C1NS(=O)(=O)c2ccccc21,Saccharin +O=C(O)CCc1ccc(O)cc1,3-(4-Hydroxyphenyl)propionic acid +CCC=CCC(O)C(O)CCC(O)CCCCCCCC(=O)O,"(15Z)-9,12,13-Trihydroxy-15-octadecenoic acid" +CCOP(=O)(Sc1ccccc1)Sc1ccccc1,EDDP +c1ccc2c(c1)[nH]c1cnccc12,Norharman +CCCCCCC(=O)O,Heptanoic acid +NC(CCC(=O)O)C(=O)O,L-Glutamic acid +CN1CC(=O)NC1=O,N-Methylhydantoin +CC(=O)CCc1ccc(O)cc1,4-(4-Hydroxyphenyl)butan-2-one +CCCCCCCC(O)CC(=O)O,3-Hydroxydecanoic acid +c1ccc(-c2cnc[nH]2)cc1,4-Phenylimidazole +CN1CCC23c4c5ccc(O)c4OC2C(O)C=CC3C1C5,Morphine +O=C(O)c1c[nH]c2ccccc12,3-Indolylcarboxylic Acid +CC(C)C1NC(=O)C2CCCN2C1=O,XLUAWXQORJEMBD-UHFFFAOYSA-N +O=C(O)C=Cc1ccc(O)cc1,trans-4-hydroxycinnamic acid +O=S(=O)(c1ccc(O)cc1)c1ccc(O)cc1,"4,4'-Sulfonyldiphenol" +O=C(O)CCCCCCCCCCCC(=O)O,Tridecanedioic acid +CC(N)C(=O)N1CCCC1C(=O)O,L-Alanyl-L-proline +CC(C)N(C(=O)C(=O)O)c1ccccc1,Propachlor OXA +CC1(C)CC(O)CC(C)(C)N1CCO,"4-Hydroxy-1-(2-hydroxyethyl)-2,2,6,6-tetramethylpiperidine" \ No newline at end of file diff --git a/streamlit/utils.py b/streamlit/utils.py new file mode 100644 index 0000000..7f5435d --- /dev/null +++ b/streamlit/utils.py @@ -0,0 +1,12 @@ +import io +import base64 +from rdkit.Chem import Draw + + +def image_from_mol(mol): + img = Draw.MolToImage(mol) + bio = io.BytesIO() + img.save(bio, format="PNG") + img_bytes = bio.getvalue() + base64_str = base64.b64encode(img_bytes).decode("utf-8") + return f"data:image/png;base64,{base64_str}" \ No newline at end of file