example-sphinx-basic/lumache.py at main · aerte/example-sphinx-basic · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os.path as osp

import pandas as pd
from torch_geometric.data import download_url
#from dglchem.utils.data import GraphDataSet

__all__ = [
    'LogP'
]

class LogP(object):
    """A dataset class inspired by the Torchvision datasets such as MNIST. It will download the corrected *logP* Dataset
    from https://github.com/nadinulrich/log_P_prediction/blob/30f2f6ad0d7806a3246a5b3da936aa02478d5202/Dataset_and_Predictions.xlsx
    [1], first introduced in [2], should it not already exist. It then initializes it into a **GraphDataSet** class.

    Parameters:
    ----------
    root: str
        Indicates what the root or working directory is. Default: None
    target_string: str
        A string that indicates which of the features from the dataset should be the 'target'.
    global_features: list of str
        A list of strings indicating any additional features that should be included as global features.
    allowed_atoms: list of str
        List of allowed atom symbols. Default are the AFP atoms.
    only_organic: bool
        Checks if a molecule is ``organic`` counting the number of ``C`` atoms. If set to True, then molecules with less
        than one carbon will be discarded. Default: True
    atom_feature_list: list of str
        List of features to be applied. Default are the AFP atom features.
    bond_feature_list: list of str
        List of features that will be applied. Default are the AFP features
    split: bool
        An indicator if the dataset should be split. Only takes effect if nothing else regarding the split is specified
        and will trigger the default split. Default: False (recommended)
    split_type: str
        Indicates what split should be used. Default: random. The options are:
        [consecutive, random, molecular weight, scaffold, stratified, custom]
    split_frac: array
        Indicates what the split fractions should be. Default: [0.8, 0.1, 0.1]
    custom_split: array
        The custom split that should be applied. Has to be an array matching the length of the filtered smiles,
        where 0 indicates a training sample, 1 a testing sample and 2 a validation sample.
    log: bool
        Decides if the filtering output and other outputs will be shown. Default: False
    save_data_filename: str
        The filename of the saved dataset. If given, the dataset will be automatically saved after processing.
        Default: None

    ----

    References

    [1] Ulrich, N., Goss, KU. & Ebert, A., Exploring the octanol–water partition coefficient dataset using deep learning
    techniques and datasets augmentation., Commun Chem 4, 90 (2021), http://dx.doi.org/10.1038/s42004-021-00528-9

    [2] Mansouri K, Grulke CM, Richard AM, Judson RS, Williams AJ., An automated curation procedure for addressing
    chemical errors and inconsistencies in public datasets used in QSAR modelling., SAR QSAR Environ Res. (2016),
    http://dx.doi.org/10.1080/1062936X.2016.1253611

    """


    def __init__(self, root: str = None, global_features: list or str = None,
                 allowed_atoms: list[str] = None, only_organic: bool = True,
                 atom_feature_list: list[str] = None, bond_feature_list: list[str] = None,
                 split: bool = False, split_type: str = None, split_frac: list[float] = None,
                 custom_split: list[int] = None, log: bool = False, save_data_filename: str =None):


        self.root = './data' if root is None else root

        file_name = 'LogP'

        self.raw_path = self.raw_dir

        if not osp.exists(osp.join(self.raw_path, file_name)):
            download_url(
                'https://github.com/nadinulrich/log_P_prediction/blob/30f2f6ad0d7806a3246a5b3da936aa02478d5202/Dataset_and_Predictions.xlsx?raw=true',
                 folder = self.raw_path,
                 filename= file_name,
                 log = True
            )

            path = osp.join(self.raw_path, file_name)

        else:
            path = osp.join(self.raw_path, file_name)

        df = pd.read_excel(path)
        labels = df.columns[3]
        self.target_name = 'logP'

        #super().__init__(smiles = df.SMILES, target = df[labels], global_features=global_features,
        #                 allowed_atoms = allowed_atoms, only_organic=only_organic,
        #                 atom_feature_list = atom_feature_list,
        #                bond_feature_list = bond_feature_list, split=split, split_type=split_type,
        #                 split_frac=split_frac, custom_split=custom_split, log = log)

        #self.data_name = 'LogP'


        #if save_data_filename is not None:
        #    self.save_data_set(filename=save_data_filename)
        #    self.get_smiles()