diff --git a/.gitignore b/.gitignore
index 2d4daa40..821015bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,10 @@
 .idea*
+
+**/.DS_Store
+*.pyc
+
+**/data/*
+**/models/*
+
+.vscode
+api_key.txt
diff --git a/alec-glisman/ML-Band-Gaps.md b/alec-glisman/ML-Band-Gaps.md
new file mode 100644
index 00000000..2ccb5adc
--- /dev/null
+++ b/alec-glisman/ML-Band-Gaps.md
@@ -0,0 +1,31 @@
+# ML Band Gaps (Materials)
+
+> Ideal candidate: skilled ML data scientist with solid knowledge of materials science.
+
+# Overview
+
+The aim of this task is to create a Python package that implements automatic prediction of electronic band gaps for a set of materials based on training data.
+
+# User story
+
+As a user of this software, I can predict the value of an electronic band gap after passing training data and structural information about the target material.
+
+# Requirements
+
+- suggest the band gap values for a set of materials designated by their crystallographic and stoichiometric properties
+- the code shall be written in a way that facilitates easy addition of other characteristics extracted from simulations (forces, pressures, phonon frequencies, etc.)
+
+# Expectations
+
+- the code shall be able to suggest realistic values for slightly modified geometry sets - e.g., trained on Si and Ge, it should suggest a band gap for Si49Ge51 between those of Si and Ge
+- modular and object-oriented implementation
+- commit early and often - at least once per 24 hours
+
+# Timeline
+
+We leave exact timing to the candidate. Must fit within 5 days total.
+
+# Notes
+
+- use a designated GitHub repository for version control
+- suggested source of training data: materialsproject.org
diff --git a/alec-glisman/README.md b/alec-glisman/README.md
new file mode 100644
index 00000000..28817594
--- /dev/null
+++ b/alec-glisman/README.md
@@ -0,0 +1,66 @@
+# ReWoTes: ML Property Predict
+
+Alec Glisman
+
+## Overview
+
+This directory contains files for the ML Property Predict project for Mat3ra.com.
+
+Input data is downloaded from the Materials Project and cleaned into pandas DataFrames in `src/data_load.py`.
+I chose to download all materials with a band gap below 10 eV from the Materials Project and parsed all data related to their crystallographic and stoichiometric properties.
+Categorical data is converted to numeric data using one-hot encoding, and the data is then scaled using `sklearn.preprocessing.StandardScaler`.
+The input data to the machine-learning models can be augmented with additional Materials Project fields through the `MaterialData` constructor, and external data can be merged in with its `add_data_columns` method.
+The cleaned data is cached to HDF5 through pandas to lower runtime costs during model development.
+
+I chose to pursue two machine-learning architectures: XGBoost and feed-forward, fully connected neural networks.
+XGBoost generally performs better than neural networks when the data set is not large, and it is also much faster to train.
+Neural networks were included for their superior expressivity and serve as a useful comparison to XGBoost.
+In both cases, I employed `KFold` and `RandomizedSearchCV` from `scikit-learn` to cross-validate and select hyperparameters, respectively.
+
+The best XGBoost regressor that I trained is saved at runtime under the `models` directory and has a test-set MSE of 0.646 eV².
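+
+As a sketch of how the archived model might later be reloaded for inference (the `x_new` feature matrix below is hypothetical and must be pre-scaled with the saved scaler):
+
+```python
+import xgboost as xgb
+
+# restore the best estimator written by XGBoostModels.train_models
+model = xgb.XGBRegressor()
+model.load_model("models/xgboost_model.json")
+
+predicted_gaps = model.predict(x_new)  # x_new: pre-scaled features (hypothetical)
+```
+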
+Similarly, the best fully connected neural network I trained is saved at runtime under the `models` directory and has a test-set MSE of 0.817 eV².
+The seed used is provided in `main.py` for reproducibility.
+
+Areas for future work include:
+
+1. Stratified sampling for the test/train split or cross-validation to make sure different space groups are represented properly in each subset.
+2. Further exploration of feed-forward neural network architecture, dropout, and regularization to optimize performance, and an increase in training epochs beyond 40. I used 40 epochs due to computational constraints, but the loss was still noticeably shrinking.
+3. Addition of more data from the Materials Project to lower the inductive bias of the models.
+4. Transfer learning with these models, fine-tuning them to more specific databases, such as silicon semiconductors.
+
+## Usage
+
+A Conda environment file (`requirements.yml`) is provided to set up a Python environment called `ml-band-gaps` with the following command:
+
+```bash
+$ conda env create -f requirements.yml
+```
+
+The overall project can then be run with
+
+```bash
+$ python main.py
+```
+
+Unit tests can be run with pytest as
+
+```bash
+$ pytest tests
+```
+
+Ingested data is cached to the `data` directory, and machine-learning models are cached to the `models` directory.
+Each of these directories is created automatically by the main script.
+
+Note that the data is sourced from the Materials Project, which requires an API key.
+The `api_key.txt` file holding the key is listed in `.gitignore` for security reasons, so users will need to generate their own key and save it to `api_key.txt` in this directory.
+
+## Requirements
+
+- suggest the band gap values for a set of materials designated by their crystallographic and stoichiometric properties
+- the code shall be written in a way that facilitates easy addition of other characteristics extracted from simulations (forces, pressures, phonon frequencies, etc.)
+
+## Expectations
+
+- the code shall be able to suggest realistic values for slightly modified geometry sets - e.g., trained on Si and Ge, it should suggest a band gap for Si49Ge51 between those of Si and Ge
+- modular and object-oriented implementation
+- commit early and often - at least once per 24 hours
diff --git a/alec-glisman/main.py b/alec-glisman/main.py
new file mode 100644
index 00000000..33cf4fb1
--- /dev/null
+++ b/alec-glisman/main.py
@@ -0,0 +1,61 @@
+"""Main script that trains band-gap models on Materials Project data.
+
+The entry point is `main()`, which loads and splits the data, trains and
+evaluates an XGBoost model and a neural network, and does not return any
+value.
+
+Example:
+    python main.py
+
+Note: Before running the script, make sure to provide the API key in a file
+named "api_key.txt" located in the same directory as this script.
+"""
+
+from pathlib import Path
+
+from src.data_load import MaterialData
+from src.models import XGBoostModels, NeuralNetModels
+
+
+def main() -> None:
+    """
+    Main function that executes the script.
+
+    This function performs the following steps:
+    1. Reads the API key from a file.
+    2. Loads data using the MaterialData class.
+    3. Splits the data into training and testing sets.
+    4. Trains and evaluates models using the XGBoostModels class.
+    5. Trains and evaluates models using the NeuralNetModels class.
+    6. Prints a completion message.
+
+    Returns:
+        None
+    """
+    file_path = Path(__file__).resolve().parent
+    seed = 42
+
+    # API key is not included in the code for security reasons
+    with open(file_path / "api_key.txt", "r", encoding="utf-8") as f:
+        api_key = f.read().strip()
+
+    # Load data
+    data = MaterialData(api_key, band_gap=(0.0, 10.0))
+    x_train, x_test, y_train, y_test, _, _ = data.split_data(seed=seed)
+
+    # Train models
+    xgb = XGBoostModels(x_train, y_train, x_test, y_test, save=True)
+    xgb.train_models(seed=seed)
+    xgb.evaluate_model()
+    nn = NeuralNetModels(x_train, y_train, x_test, y_test, save=True)
+    nn.train_models(seed=seed)
+    nn.evaluate_model()
+
+    # Notify user that the script has finished
+    print("Script completed successfully.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/alec-glisman/requirements.yml b/alec-glisman/requirements.yml
new file mode 100644
index 00000000..367fc381
--- /dev/null
+++ b/alec-glisman/requirements.yml
@@ -0,0 +1,35 @@
+name: ml-band-gaps
+channels:
+  - conda-forge
+dependencies:
+  - pip
+  - tqdm
+  - joblib
+  - numpy
+  - pandas
+  - pytables
+  - scipy
+  - scikit-learn
+  - xgboost
+  - pytorch
+  - torchvision
+  - skorch
+  - matplotlib
+  - pymatgen
+  - phonopy
+  - ipykernel
+  - ipywidgets
+  - ipympl
+  - pandoc
+  - notebook
+  - jupyter_client
+  - pytest
+  - pytest-cov
+  - pytest-xdist
+  - coverage
+  - autopep8
+  - black
+  - flake8
+  - pip:
+      - "--editable=git+https://github.com/materialsproject/api.git@main#egg=mp-api"
\ No newline at end of file
diff --git a/alec-glisman/src/data_load.py b/alec-glisman/src/data_load.py
new file mode 100644
index 00000000..6a9a1f11
--- /dev/null
+++ b/alec-glisman/src/data_load.py
@@ -0,0 +1,306 @@
+"""
+This module provides a class for loading and processing material data from the
+Materials Project API.
+
+The `MaterialData` class allows users to retrieve material data from the
+Materials Project API, process and clean the data, and perform operations
+such as splitting the data into train and test sets.
+
+Example:
+    ```python
+    from src.data_load import MaterialData
+
+    # load (effectively) all materials: band gap between 0 and 1000 eV
+    data = MaterialData(api_key, band_gap=(0.0, 1000.0))
+    data.get_data()
+
+    # split data into train and test sets for band gap prediction
+    x_train, x_test, y_train, y_test, mpid, scaler = data.split_data()
+    ```
+
+Classes:
+    MaterialData: A class for loading and processing material data from the
+    Materials Project API.
+"""
+
+from pathlib import Path
+
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+from mp_api.client import MPRester
+
+
+class MaterialData:
+    """A class for loading and processing material data from the Materials
+    Project API.
+
+    Extra fields from the Materials Project can be added with the `fields`
+    parameter in the constructor.
+    External data for each material can be added with the `add_data_columns`
+    method.
+
+    Args:
+        api_key (str): The API key for accessing the material data.
+        fields (list, optional): The list of fields to retrieve from the
+            material data. Defaults to None.
+        save (bool, optional): Whether to save the loaded data. Defaults to
+            True.
+        **kwargs: Additional keyword arguments to be passed to the material
+            data API.
+
+    Attributes:
+        api_key (str): The API key for accessing the material data.
+        fields (list): The list of fields to retrieve from the material data.
+        save (bool): Whether to save the loaded data.
+ kwargs (dict): Additional keyword arguments to be passed to the + material data API. + materials (list): The loaded material data. + dataframe (pd.DataFrame): The processed material data. + _dir_output (Path): The output directory for saving the data. + _file_data (Path): The file path for saving the data. + + Methods: + __init__: Initializes the MaterialData object. + __repr__: Returns a string representation of the MaterialData object. + __len__: Returns the number of rows in the material data. + _fetch_materials: Fetches the material data from the API. + get_materials: Returns the loaded material data. + get_data: Returns the processed material data. + split_data: Splits the material data into train and test sets. + add_data_columns: Adds additional columns to the material data. + _extract_data: Extracts and cleans the material data. + _encode_data: Encodes the categorical columns in the material data. + """ + + def __init__(self, api_key: str, fields: list = None, save: bool = True, **kwargs): + """ + Initialize the DataLoad object. + + Parameters: + - api_key (str): The API key for accessing the data. + - fields (list): The list of fields to retrieve from the data. + Defaults to a predefined list of fields. + - save (bool): Flag indicating whether to save the data. + Defaults to True. + - **kwargs: Additional keyword arguments. + + Raises: + - ValueError: If the API key is not provided. + """ + self.api_key: str = api_key + self.fields: list[str] = fields or [ + "material_id", + "composition_reduced", + "symmetry", + "structure", + "band_gap", + ] + self.save: bool = save + self.kwargs: dict = kwargs + + if not isinstance(self.api_key, str) or not self.api_key: + raise ValueError("API key must be provided") + + self.materials: list = None + self.dataframe: pd.DataFrame = None + + self._dir_output: Path = Path("./data") + self._file_data: Path = self._dir_output / "materials_data.hdf5" + + def __repr__(self) -> str: + """Return a string representation of the MaterialData object. + + Returns: + str: A string representation of the MaterialData object. + """ + return ( + f"MaterialData(api_key={self.api_key}, fields={self.fields}" + + f", kwargs={self.kwargs})" + ) + + def __len__(self) -> int: + """Return the number of rows in the material data frame. + + Returns: + int: The number of rows in the material data frame. + """ + return len(self.dataframe) if self.dataframe is not None else 0 + + def _fetch_materials(self) -> None: + """Retrieve the material data from the Materials Project API.""" + with MPRester(self.api_key) as mpr: + self.materials = mpr.materials.summary.search( + fields=self.fields, **self.kwargs + ) + + def get_materials(self) -> list: + """Return the loaded Material Project API data. + + If the data has not been loaded, it will be fetched from the API. + + Returns: + list: Material Project data for each material. + """ + if self.materials is None: + self._fetch_materials() + return self.materials + + def get_data(self) -> pd.DataFrame: + """Return the processed and cleaned material data. + + If the data has been cached, it will be loaded from the file. + Otherwise, the data will be fetched from the API, cleaned, and + saved to the file. 
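+
+        Example:
+            An illustrative call pattern (assumes a valid `api_key`):
+
+            ```python
+            data = MaterialData(api_key, band_gap=(0.0, 10.0))
+            df = data.get_data()  # first call: fetch, clean, and cache
+            df = data.get_data()  # later calls: reuse the cached DataFrame
+            ```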
+ + Returns: + pd.DataFrame: Material data + """ + + # load data if it exists + if self.dataframe is None and self._file_data.exists(): + self.dataframe = pd.read_hdf(self._file_data, key="data") + elif self.dataframe is None: + self._extract_data() + self._encode_data() + + if self.save: + self._dir_output.mkdir(exist_ok=True, parents=True) + self.dataframe.to_hdf(self._file_data, key="data", mode="w") + + return self.dataframe + + def split_data( + self, target: str = "band_gap", test_size: float = 0.2, seed: int = 42 + ) -> tuple: + """Split the material data into train and test sets. + + Parameters: + - target (str): The target column for prediction. Defaults to + "band_gap". + - test_size (float): The proportion of the data to include in the test + set. Defaults to 0.2. + - seed (int): The random seed for splitting the data. Defaults to 42. + + Returns: + - tuple: A tuple containing the train and test sets of the input + features and the target variable, as well as the material IDs, and the + scaler used to scale the data. + """ + if self.dataframe is None: + self.get_data() + + # extract ID for later use + mpid = self.dataframe["id"] + + x = self.dataframe.drop(columns=[target, "id"]).to_numpy(dtype=np.float64) + y = self.dataframe[target].to_numpy(dtype=np.float64).reshape(-1, 1) + + # drop rows with NaN entries in x or y + mask_x = np.isnan(x).any(axis=1) + mask_y = np.isnan(y).flatten() + mask = mask_x | mask_y + x = x[~mask] + y = y[~mask] + mpid = mpid[~mask] + + # test/train split + x_train, x_test, y_train, y_test = train_test_split( + x, y, test_size=test_size, random_state=seed, shuffle=True + ) + + # scale the training and testing data + scaler = StandardScaler() + x_train = scaler.fit_transform(x_train) + x_test = scaler.transform(x_test) + + # save the scaler + if self.save: + scaler_filename = self._dir_output / "scaler.save" + joblib.dump(scaler, scaler_filename) + + return x_train, x_test, y_train, y_test, mpid, scaler + + def add_data_columns(self, data: dict) -> None: + """Add additional columns to the material data. + + Note that the data will be added to the existing data frame and it is + assumed that the data is already encoded if necessary. + + Parameters: + - data (dict): A dictionary of additional columns to add to the data. + """ + if self.dataframe is None: + self._extract_data() + self._encode_data() + + self.dataframe = self.dataframe.assign(**data) + + if self.save: + self.dataframe.to_hdf(self._file_data, key="data", mode="w") + + def _extract_data(self) -> pd.DataFrame: + """Extract and clean the material data from the API into a DataFrame + for analysis. + + Returns: + pd.DataFrame: The cleaned material data. 
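+
+        Example:
+            A single cleaned record has roughly this shape (values are
+            illustrative, not real Materials Project data):
+
+            ```python
+            {
+                "id": "mp-149",             # material identifier
+                "Si": 1.0,                  # reduced-composition counts
+                "crystal_system": "Cubic",  # symmetry labels (encoded later)
+                "symbol": "Fd-3m",
+                "point_group": "m-3m",
+                "a": 3.87, "b": 3.87, "c": 3.87,             # lattice lengths
+                "alpha": 60.0, "beta": 60.0, "gamma": 60.0,  # lattice angles
+                "density": 2.28,
+                "band_gap": 0.61,           # target property (eV)
+            }
+            ```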
+        """
+        if self.materials is None:
+            self._fetch_materials()
+
+        cleaned_data = []
+        for doc in self.materials:
+            # extract the subset of symmetry data used as features
+            keys = ["crystal_system", "symbol", "point_group"]
+            d = doc.symmetry.dict()
+            symmetry = {k: d[k] for k in keys}
+            symmetry["crystal_system"] = symmetry["crystal_system"].value
+
+            # extract the subset of structure (lattice) data
+            lattice = doc.structure.lattice
+            structure = {
+                "a": lattice.a,
+                "b": lattice.b,
+                "c": lattice.c,
+                "alpha": lattice.alpha,
+                "beta": lattice.beta,
+                "gamma": lattice.gamma,
+                "density": doc.structure.density,
+            }
+
+            # combine dicts into a single flat record
+            data = {
+                **{"id": doc.material_id.split("()")[0]},
+                **doc.composition_reduced.as_dict(),
+                **symmetry,
+                **structure,
+                **{"band_gap": doc.band_gap},
+            }
+            cleaned_data.append(data)
+
+        # convert the list of dicts to a DataFrame; elements absent from a
+        # material's composition get a count of 0 rather than NaN
+        self.dataframe = pd.DataFrame(cleaned_data).fillna(0)
+        return self.dataframe
+
+    def _encode_data(self) -> pd.DataFrame:
+        """Encode the categorical columns in the material data.
+
+        Returns:
+            pd.DataFrame: The encoded material data.
+        """
+        if self.dataframe is None:
+            self._extract_data()
+
+        # one-hot encoding for categorical columns
+        self.dataframe = pd.get_dummies(
+            self.dataframe,
+            columns=["crystal_system", "point_group", "symbol"],
+            drop_first=True,
+        )
+
+        return self.dataframe
diff --git a/alec-glisman/src/models.py b/alec-glisman/src/models.py
new file mode 100644
index 00000000..fc710618
--- /dev/null
+++ b/alec-glisman/src/models.py
@@ -0,0 +1,433 @@
+"""
+This module contains classes for training and evaluating materials models.
+
+The module includes the following classes:
+
+- MaterialsModels: A base class for training and evaluating machine learning
+  models on materials data.
+- XGBoostModels: A class for training and evaluating XGBoost models on
+  materials data.
+- NeuralNetModels: A class for training and evaluating PyTorch models on
+  materials data.
+"""
+
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from scipy.stats import uniform, randint
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import KFold, RandomizedSearchCV
+from skorch import NeuralNetRegressor
+import torch
+from torch import nn
+import xgboost as xgb
+
+
+class MaterialsModels:
+    """A class for training and evaluating materials models.
+
+    This class provides a template for training and evaluating machine
+    learning models on materials data. It includes methods for training
+    models with cross-validation and randomized search for hyperparameter
+    tuning, as well as evaluating the trained models.
+
+    Args:
+        x_train (np.ndarray): The training features.
+        y_train (np.ndarray): The training labels.
+        x_test (np.ndarray): The testing features.
+        y_test (np.ndarray): The testing labels.
+        save (bool, optional): Whether to save the trained model and metrics.
+            Defaults to True.
+
+    Attributes:
+        x_train (np.ndarray): The training features.
+        y_train (np.ndarray): The training labels.
+        x_test (np.ndarray): The testing features.
+        y_test (np.ndarray): The testing labels.
+        save (bool): Whether to save the trained model and metrics.
+        metrics (dict): The evaluation metrics.
+        model: The trained model.
+        _dir_out (Path): The output directory for saving the model and metrics.
+
+    Raises:
+        NotImplementedError: If the train_models method is not implemented.
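+
+    Example:
+        Concrete subclasses share a common train/evaluate pattern (a sketch,
+        assuming pre-split NumPy arrays from `MaterialData.split_data`):
+
+        ```python
+        models = XGBoostModels(x_train, y_train, x_test, y_test, save=False)
+        models.train_models(seed=42)
+        metrics = models.evaluate_model()
+        print(metrics["root_mean_squared_error"])
+        ```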
+
+    """
+
+    def __init__(
+        self,
+        x_train: np.ndarray,
+        y_train: np.ndarray,
+        x_test: np.ndarray,
+        y_test: np.ndarray,
+        save: bool = True,
+    ) -> None:
+        """Initialize the MaterialsModels class.
+
+        Args:
+            x_train (np.ndarray): The training features.
+            y_train (np.ndarray): The training labels.
+            x_test (np.ndarray): The testing features.
+            y_test (np.ndarray): The testing labels.
+            save (bool, optional): Whether to save the trained model and
+                metrics. Defaults to True.
+        """
+
+        self.x_train = x_train
+        self.y_train = y_train
+        self.x_test = x_test
+        self.y_test = y_test
+        self.save = save
+
+        self.metrics = None
+        self.model = None
+
+        self._dir_out = Path("./models")
+
+    def train_models(
+        self, param_grid: dict = None, cv: int = 10, seed: int = 42
+    ) -> dict:
+        """Train models with cross-validation and randomized search.
+
+        Subclasses must implement this method.
+
+        Args:
+            param_grid (dict): The parameter distributions for the randomized
+                search. Defaults to None.
+            cv (int, optional): The number of cross-validation folds.
+                Defaults to 10.
+            seed (int, optional): The random seed for cross-validation and
+                the hyperparameter search. Defaults to 42.
+
+        Returns:
+            dict: The trained models.
+
+        Raises:
+            NotImplementedError: If the subclass has not implemented this
+                method.
+        """
+        raise NotImplementedError("Method not implemented")
+
+    def evaluate_model(self) -> dict:
+        """Evaluate the trained model.
+
+        Returns:
+            dict: The evaluation metrics.
+        """
+        y_pred = self.model.predict(self.x_test)
+        mse = mean_squared_error(self.y_test, y_pred)
+
+        self.metrics = {
+            "mean_squared_error": mse,
+            "root_mean_squared_error": np.sqrt(mse),
+        }
+        return self.metrics
+
+
+class XGBoostModels(MaterialsModels):
+    """
+    This class represents a set of XGBoost models for materials data.
+
+    It inherits from the MaterialsModels base class and implements the
+    train_models method specifically for XGBoost models. It uses
+    cross-validation and randomized search for training.
+
+    Attributes:
+        model: The trained model.
+        metrics: The evaluation metrics for the model.
+        x_train: The training data.
+        y_train: The training labels.
+        x_test: The test data.
+        y_test: The test labels.
+    """
+
+    def train_models(
+        self, param_grid: dict = None, cv: int = 5, seed: int = 42
+    ) -> xgb.XGBRegressor:
+        """Train XGBoost models with cross-validation and randomized search.
+
+        Args:
+            param_grid (dict): The parameter distributions for the randomized
+                search. Defaults to None.
+            cv (int, optional): The number of cross-validation folds.
+                Defaults to 5.
+            seed (int, optional): The random seed for cross-validation and
+                the hyperparameter search. Defaults to 42.
+
+        Returns:
+            xgb.XGBRegressor: The best trained XGBoost model.
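+
+        Example:
+            A custom search space may be supplied (the distributions and the
+            `models` instance here are illustrative, not project defaults):
+
+            ```python
+            from scipy.stats import randint, uniform
+
+            models.train_models(
+                param_grid={
+                    "n_estimators": randint(200, 500),
+                    "learning_rate": uniform(0.01, 0.2),
+                },
+                cv=5,
+                seed=42,
+            )
+            ```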
+        """
+        if param_grid is None:
+            param_grid = {
+                "n_estimators": randint(100, 1000),
+                "max_depth": randint(3, 10),
+                "subsample": uniform(0.5, 0.5),
+                "colsample_bytree": uniform(0.5, 0.5),
+            }
+
+        # base estimator; searched parameters override these defaults
+        model = xgb.XGBRegressor(
+            n_estimators=1000,
+            max_depth=7,
+            learning_rate=0.1,
+            colsample_bytree=0.8,
+            subsample=0.8,
+            n_jobs=1,
+        )
+        kfold = KFold(n_splits=cv, random_state=seed, shuffle=True)
+        random_search = RandomizedSearchCV(
+            model,
+            param_distributions=param_grid,
+            n_iter=20,
+            scoring="neg_mean_squared_error",
+            n_jobs=-1,
+            cv=kfold.split(self.x_train, self.y_train),
+            refit=True,
+            verbose=3,
+            random_state=seed,
+        )
+        random_search.fit(self.x_train, self.y_train)
+        best_model = random_search.best_estimator_
+        self.model = best_model
+
+        params = random_search.best_params_
+        print(f"Best Model:\n{best_model}")
+
+        y_pred = best_model.predict(self.x_test)
+        mse = mean_squared_error(self.y_test, y_pred)
+        print(f"Mean Squared Error: {mse}")
+
+        if self.save:
+            self._dir_out.mkdir(exist_ok=True)
+            best_model.save_model(self._dir_out / "xgboost_model.json")
+            pd.DataFrame(params, index=[0]).to_csv(
+                self._dir_out / "xgboost_params.csv", index=False
+            )
+
+        return best_model
+
+    def evaluate_model(self) -> dict:
+        """Evaluate the trained model.
+
+        Returns:
+            dict: The evaluation metrics.
+        """
+        super().evaluate_model()
+
+        if self.save:
+            self._dir_out.mkdir(exist_ok=True)
+            pd.DataFrame(self.metrics, index=[0]).to_csv(
+                self._dir_out / "xgboost_metrics.csv", index=False
+            )
+
+        return self.metrics
+
+
+class NeuralNetModels(MaterialsModels):
+    """
+    This class represents a set of PyTorch models for materials data.
+
+    It inherits from the MaterialsModels base class and implements the
+    train_models method specifically for PyTorch models. It uses
+    cross-validation and randomized search for training.
+
+    Attributes:
+        model: The trained model.
+        metrics: The evaluation metrics for the model.
+        x_train: The training data.
+        y_train: The training labels.
+        x_test: The test data.
+        y_test: The test labels.
+    """
+
+    class Net(nn.Module):
+        """A simple feed-forward, fully connected neural network with
+        configurable depth, width, dropout, and activation."""
+
+        def __init__(
+            self,
+            input_size,
+            hidden_size,
+            output_size,
+            num_layers,
+            dropout,
+            activation=nn.ReLU(),
+        ):
+            """Initialize the neural network.
+
+            Args:
+                input_size (int): The size of the input layer.
+                hidden_size (int): The size of the hidden layers.
+                output_size (int): The size of the output layer.
+                num_layers (int): The number of hidden layers.
+                dropout (float): The dropout rate.
+                activation (nn.Module, optional): The activation function.
+                    Defaults to nn.ReLU().
+            """
+
+            super().__init__()
+            # make a list of hidden layers
+            layers = []
+            # input layer
+            layers.append(nn.Linear(input_size, hidden_size))
+            layers.append(activation)
+            layers.append(nn.Dropout(dropout))
+            # hidden layers
+            for _ in range(1, num_layers):
+                layers.append(nn.Linear(hidden_size, hidden_size))
+                layers.append(activation)
+                layers.append(nn.Dropout(dropout))
+            # output layer
+            layers.append(nn.Linear(hidden_size, output_size))
+            self.net = nn.Sequential(*layers)
+
+        def forward(self, x):
+            """Forward pass of the neural network.
+
+            Args:
+                x (torch.Tensor): The input data.
+
+            Returns:
+                torch.Tensor: The network output.
+            """
+            y = self.net(x)
+            return y
+
+    def __init__(
+        self,
+        x_train: np.ndarray,
+        y_train: np.ndarray,
+        x_test: np.ndarray,
+        y_test: np.ndarray,
+        save: bool = True,
+    ) -> None:
+        """Initialize the NeuralNetModels class.
+ + Args: + x_train (np.ndarray): The training features. + y_train (np.ndarray): The training labels. + x_test (np.ndarray): The testing features. + y_test (np.ndarray): The testing labels. + save (bool, optional): Whether to save the trained model and + metrics. Defaults to True. + """ + super().__init__(x_train, y_train, x_test, y_test, save) + + # convert x and y to torch tensors of type float + self.x_train = torch.tensor(x_train, dtype=torch.float32) + self.y_train = torch.tensor(y_train, dtype=torch.float32) + self.x_test = torch.tensor(x_test, dtype=torch.float32) + self.y_test = torch.tensor(y_test, dtype=torch.float32) + + def train_models( + self, + param_grid: dict = None, + cv: int = 3, + seed: int = 42, + epochs: int = 40, + ) -> NeuralNetRegressor: + """Train PyTorch models with cross-validation and grid search. + + Args: + param_grid (dict): The parameter grid for the grid search. + Defaults to None. + cv (int, optional): The number of cross-validation folds. + Defaults to 3. + seed (int, optional): The random seed for the train-test split. + Defaults to 42. + epochs (int, optional): The number of training epochs. + Defaults to 40. + + Returns: + NeuralNetRegressor: The best trained PyTorch model. + """ + if param_grid is None: + param_grid = { + "module__hidden_size": randint(20, 300), + "module__num_layers": randint(4, 8), + "module__dropout": uniform(0.0, 0.2), + "lr": uniform(0.001, 0.01), + } + input_size = self.x_train.shape[1] + output_size = self.y_train.shape[1] + + # set seed for reproducibility + torch.manual_seed(seed) + + net = NeuralNetRegressor( + module=self.Net, + module__input_size=input_size, + module__hidden_size=15, + module__output_size=output_size, + module__num_layers=4, + module__dropout=0.01, + max_epochs=epochs, + lr=0.1, + optimizer=torch.optim.SGD, + optimizer__weight_decay=0.0005, + device="cuda" if torch.cuda.is_available() else "cpu", + batch_size=256, + ) + + kfold = KFold(n_splits=cv, random_state=seed, shuffle=True) + random_search = RandomizedSearchCV( + net, + param_distributions=param_grid, + n_iter=20, + scoring="neg_mean_squared_error", + n_jobs=3, + cv=kfold.split(self.x_train, self.y_train), + verbose=3, + random_state=seed, + refit=False, + ) + random_search.fit(self.x_train, self.y_train) + + # find the best model from the search manually as refit is False + results = pd.DataFrame(random_search.cv_results_) + best_idx = results["rank_test_score"].idxmin() + best_params = results.loc[best_idx, "params"] + print(f"Best Parameters:\n{best_params}") + bestnet = NeuralNetRegressor( + module=self.Net, + module__input_size=input_size, + module__hidden_size=best_params["module__hidden_size"], + module__output_size=output_size, + module__num_layers=best_params["module__num_layers"], + module__dropout=best_params["module__dropout"], + max_epochs=epochs, + lr=best_params["lr"], + optimizer=torch.optim.SGD, + optimizer__weight_decay=0.0005, + device="cuda" if torch.cuda.is_available() else "cpu", + batch_size=256, + ) + bestnet.fit(self.x_train, self.y_train) + self.model = bestnet + + y_pred = self.model.predict(self.x_test) + mse = mean_squared_error(self.y_test, y_pred) + print(f"Mean Squared Error: {mse}") + + if self.save: + self._dir_out.mkdir(exist_ok=True) + filename = self._dir_out / "neural_network_model.pkl" + with open(filename, "wb") as f: + torch.save(self.model, f) + pd.DataFrame(best_params, index=[0]).to_csv( + self._dir_out / "neural_network_params.csv", index=False + ) + + return self.model + + def evaluate_model(self) -> 
dict:
+        """Evaluate the trained model.
+
+        Returns:
+            dict: The evaluation metrics.
+        """
+        super().evaluate_model()
+
+        if self.save:
+            self._dir_out.mkdir(exist_ok=True)
+            pd.DataFrame(self.metrics, index=[0]).to_csv(
+                self._dir_out / "neural_network_metrics.csv", index=False
+            )
+
+        return self.metrics
diff --git a/alec-glisman/tests/test_data_load.py b/alec-glisman/tests/test_data_load.py
new file mode 100644
index 00000000..e8f1b0f5
--- /dev/null
+++ b/alec-glisman/tests/test_data_load.py
@@ -0,0 +1,82 @@
+"""Test the MaterialData class
+
+This module contains tests for the MaterialData class in the data
+loading module.
+
+Example:
+    To test the MaterialData class, run the following command:
+
+    $ pytest tests/test_data_load.py
+"""
+
+import sys
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+sys.path.append(str(Path(__file__).resolve().parents[1]))
+from src.data_load import MaterialData  # noqa: E402
+
+
+@pytest.fixture(scope="module")
+def api_key() -> str:
+    """Read the API key from a file"""
+    with open("api_key.txt", "r", encoding="utf-8") as f:
+        api = f.read().strip()
+    return api
+
+
+@pytest.fixture(scope="module")
+def data(api_key: str) -> MaterialData:
+    """Return a MaterialData object"""
+    return MaterialData(api_key, save=False, band_gap=(0.5, 0.55))
+
+
+class TestMaterialData:
+    def test_empty_init(self) -> None:
+        """Expect a TypeError when no arguments are passed"""
+        with pytest.raises(TypeError):
+            MaterialData()
+
+    def test_bad_init(self) -> None:
+        """Expect a ValueError when an invalid argument is passed"""
+        with pytest.raises(ValueError):
+            MaterialData(24512)
+
+    def test_init(self, data: MaterialData) -> None:
+        """Expect a MaterialData object to be created"""
+        assert data is not None
+
+        # check that the attributes are set correctly
+        assert data.materials is None
+        assert data.dataframe is None
+        assert len(data) == 0
+
+    def test_repr(self, data: MaterialData) -> None:
+        """Expect the __repr__ method to return a string"""
+        assert isinstance(repr(data), str)
+
+    def test_get_materials(self, data: MaterialData) -> None:
+        """Expect the material data to be fetched, but not saved"""
+        materials = data.get_materials()
+
+        # check that the materials are fetched
+        assert materials is not None
+        assert data.materials is not None
+        assert len(materials) > 0
+
+    def test_get_data(self, data: MaterialData) -> None:
+        """Expect the material data to be fetched, cleaned, and not saved"""
+        data._file_data = Path("temp/materials_data.hdf5")
+        df = data.get_data()
+
+        # check that the data is fetched and cleaned
+        assert df is not None
+        assert data.dataframe is not None
+        assert len(data) > 0
+        assert len(data) == len(df)
+        assert isinstance(data.dataframe, pd.DataFrame)
+
+        # check that the output file is not created
+        assert not data._file_data.exists()
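+
+    def test_split_data(self, data: MaterialData) -> None:
+        """Expect split_data to return paired train/test arrays.
+
+        Illustrative extension of the suite: exercises the documented
+        6-tuple return of `split_data`; assertions assume only the
+        default 80/20 split behaviour.
+        """
+        x_train, x_test, y_train, y_test, _, scaler = data.split_data()
+
+        # features and targets must pair up row-wise in each subset
+        assert len(x_train) == len(y_train)
+        assert len(x_test) == len(y_test)
+
+        # the fitted scaler should transform test features without error
+        assert scaler.transform(x_test).shape == x_test.shape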