"""HW 5 notebook code.

Part 1: study how the training-set share affects LinearRegression quality
on the iris dataset (Builder pattern).
Part 2: a simple voting ensemble over several tree-based classifiers
(Decorator/Composite pattern).
"""
from collections import Counter

import numpy as np
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle


class Builder_Creation:
    """Builds shuffled percentage subsamples of a fixed training set."""

    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def get_subsample(self, df_share):
        """Return a df_share%-sized subsample of (X_train, y_train).

        X and y are shuffled jointly (same permutation) so row/label pairing
        is preserved; random_state is fixed for reproducibility.
        """
        n_samples = int(df_share / 100 * len(self.X_train))
        print("number of observations used for training = ", n_samples)
        return shuffle(self.X_train, self.y_train,
                       random_state=42, n_samples=n_samples)


class Decorator_Structure:
    """Wraps a list of classifiers and fits / predicts them as one group."""

    def __init__(self, classifier_list) -> None:
        self.classifier_list = classifier_list

    def fit(self, feature_matrix, response):
        """Fit every wrapped classifier on the same data."""
        for clf in self.classifier_list:
            clf.fit(feature_matrix, response)

    def predict(self, feature_matrix):
        """Return {classifier: predictions} for every wrapped classifier."""
        return {clf: clf.predict(feature_matrix)
                for clf in self.classifier_list}


if __name__ == "__main__":
    # ---- Part 1: training-share study -----------------------------------
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                        dataset.target)
    # Scale-then-regress pipeline; fitted once per subsample below.
    # (Original created this pipeline and then never used it, fitting a bare
    # LinearRegression twice per iteration — fixed here.)
    pipe_lr = make_pipeline(StandardScaler(), LinearRegression())
    pattern_item = Builder_Creation(X_train, y_train)

    for df_share in range(10, 101, 10):
        curr_X_train, curr_y_train = pattern_item.get_subsample(df_share)
        pipe_lr.fit(curr_X_train, curr_y_train)  # single fit on the subsample
        mse = mean_squared_error(y_test, pipe_lr.predict(X_test))
        print(f'Score for {df_share}%: {mse}')

    # ---- Part 2: voting ensemble ----------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                        dataset.target)
    classifier_list = [
        DecisionTreeClassifier(max_depth=None, min_samples_split=2,
                               random_state=0),
        RandomForestClassifier(n_estimators=10, max_depth=None,
                               min_samples_split=2, random_state=42),
        ExtraTreesClassifier(n_estimators=10, max_depth=None,
                             min_samples_split=2, random_state=0),
        RandomForestClassifier(n_estimators=10),
    ]

    ensemble = Decorator_Structure(classifier_list)
    ensemble.fit(X_train, y_train)

    # Proper majority vote per sample. (Original used floor(mean(labels)),
    # which is not a majority vote — e.g. votes [0,2,2,2] gave class 1 —
    # and was hard-coded to exactly four classifiers.)
    votes = np.vstack(list(ensemble.predict(X_test).values()))
    y_pred = np.array([Counter(col).most_common(1)[0][0] for col in votes.T])
    acc = accuracy_score(y_test, y_pred)
    print(f"Score with ensemble = {acc}")
"""hw5_1.py — study how the training-set share affects LinearRegression
quality on the iris dataset (Builder pattern)."""
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle


class Builder_Creation:
    """Builds shuffled percentage subsamples of a fixed training set."""

    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def get_subsample(self, df_share):
        """Return a df_share%-sized subsample of (X_train, y_train).

        X and y are shuffled jointly (same permutation) so the row/label
        pairing is preserved; random_state is fixed for reproducibility.
        """
        n_samples = int(df_share / 100 * len(self.X_train))
        print("number of observations used for training = ", n_samples)
        return shuffle(self.X_train, self.y_train,
                       random_state=42, n_samples=n_samples)


if __name__ == "__main__":
    # Load iris, then shuffle/split into train and test.
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                        dataset.target)
    # Scale-then-regress pipeline; fitted once per subsample below.
    # (Original created this pipeline, then discarded it and fitted a bare
    # LinearRegression TWICE per iteration — redundant work, fixed here.)
    pipe_lr = make_pipeline(StandardScaler(), LinearRegression())
    pattern_item = Builder_Creation(X_train, y_train)

    for df_share in range(10, 101, 10):
        # Train on the df_share% subsample; report test MSE so the effect
        # of training-set size on quality is visible.
        curr_X_train, curr_y_train = pattern_item.get_subsample(df_share)
        pipe_lr.fit(curr_X_train, curr_y_train)  # single fit
        mse = mean_squared_error(y_test, pipe_lr.predict(X_test))
        print(f'Score for {df_share}%: {mse}')
"""hw5_2.py — voting ensemble over several tree-based classifiers on iris
(Composite pattern)."""
from collections import Counter

import numpy as np
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


class Composite_Structure:
    """Wraps a list of classifiers and fits / predicts them as one group."""

    def __init__(self, classifier_list) -> None:
        self.classifier_list = classifier_list

    def fit(self, feature_matrix, response):
        """Fit every wrapped classifier on the same data."""
        for clf in self.classifier_list:
            clf.fit(feature_matrix, response)

    def predict(self, feature_matrix):
        """Return {classifier: predictions} for every wrapped classifier."""
        return {clf: clf.predict(feature_matrix)
                for clf in self.classifier_list}


if __name__ == "__main__":
    # Load iris, shuffle/split, prepare the member classifiers, then train
    # and evaluate the ensemble.
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                        dataset.target)
    classifier_list = [
        DecisionTreeClassifier(max_depth=None, min_samples_split=2,
                               random_state=0),
        RandomForestClassifier(n_estimators=10, max_depth=None,
                               min_samples_split=2, random_state=42),
        ExtraTreesClassifier(n_estimators=10, max_depth=None,
                             min_samples_split=2, random_state=0),
        RandomForestClassifier(n_estimators=10),
    ]

    ensemble = Composite_Structure(classifier_list)
    ensemble.fit(X_train, y_train)

    # Proper per-sample majority vote. (Original computed
    # floor(mean(labels)), which is not a majority vote — e.g. votes
    # [0, 2, 2, 2] yielded class 1 instead of 2 — and was hard-coded to
    # exactly four classifiers via list_of_lists[0..3]. Counter-based
    # voting works for any number of members.)
    votes = np.vstack(list(ensemble.predict(X_test).values()))
    y_pred = np.array([Counter(col).most_common(1)[0][0] for col in votes.T])
    acc = accuracy_score(y_test, y_pred)
    print(f"Score with ensemble = {acc}")