Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# 21s_msai_python
# 21s_msai_python_HW_5
212 changes: 212 additions & 0 deletions hw_5/HW_5.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 21,
"id": "2563c0e9",
"metadata": {},
"outputs": [],
"source": [
"import sklearn.utils\n",
"from sklearn import datasets\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.utils import shuffle\n",
"import sklearn.metrics as metrics\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import ExtraTreesClassifier\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "c12f8e84",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of observations used for training = 11\n",
"Score for 10%: 0.0754281385286842\n",
"number of observations used for training = 22\n",
"Score for 20%: 0.05711922218712858\n",
"number of observations used for training = 33\n",
"Score for 30%: 0.061250197185758755\n",
"number of observations used for training = 44\n",
"Score for 40%: 0.05634600896238332\n",
"number of observations used for training = 56\n",
"Score for 50%: 0.05917650323357551\n",
"number of observations used for training = 67\n",
"Score for 60%: 0.055275408731475364\n",
"number of observations used for training = 78\n",
"Score for 70%: 0.05446147878810882\n",
"number of observations used for training = 89\n",
"Score for 80%: 0.05355228831140269\n",
"number of observations used for training = 100\n",
"Score for 90%: 0.05313063624422463\n",
"number of observations used for training = 112\n",
"Score for 100%: 0.051750199610600275\n"
]
}
],
"source": [
"class Builder_Creation:\n",
" \n",
" def __init__(self, X_train, y_train):\n",
" self.X_train = X_train\n",
" self.y_train = y_train\n",
"\n",
" def get_subsample(self, df_share):\n",
" \"\"\"\n",
" 1. Copy train dataset\n",
" 2. Shuffle data (don't miss the connection between X_train and y_train)\n",
" 3. Return df_share %-subsample of X_train and y_train\n",
" \"\"\"\n",
" n_samples = int(df_share / 100 * len(self.X_train))\n",
" print(\"number of observations used for training = \", n_samples)\n",
" X_train_sub, y_train_sub = shuffle(self.X_train, self.y_train, random_state=42, n_samples=n_samples)\n",
" return X_train_sub, y_train_sub\n",
"\n",
" \n",
"if __name__ == \"__main__\":\n",
" \"\"\"\n",
" 1. Load iris dataset\n",
" 2. Shuffle data and divide into train / test.\n",
" \"\"\"\n",
"\n",
" dataset = datasets.load_iris()\n",
" features = dataset.data\n",
" targets = dataset.target\n",
" X_train, X_test, y_train, y_test = train_test_split(features, targets)\n",
" pipe_lr = make_pipeline(StandardScaler(), LinearRegression())\n",
" \n",
" pattern_item = Builder_Creation(X_train, y_train)\n",
" \n",
" for df_share in range(10, 101, 10):\n",
" \"\"\"\n",
" 1. Preprocess curr_X_train, curr_y_train in the way you want\n",
" 2. Train Linear Regression on the subsample\n",
" 3. Save or print the score to check how df_share affects the quality\n",
" \"\"\"\n",
" curr_X_train, curr_y_train = pattern_item.get_subsample(df_share)\n",
" pipe_lr = LinearRegression().fit(curr_X_train, curr_y_train)\n",
" pipe_lr.fit(curr_X_train, curr_y_train)\n",
" y_pred_test = pipe_lr.predict(X_test)\n",
" mse = metrics.mean_squared_error(y_test, y_pred_test) \n",
" print(f'Score for {df_share}%: {mse}')"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "01f45f07",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Score with ensemble = 0.9210526315789473\n"
]
}
],
"source": [
"class Decorator_Structure:\n",
" def __init__(self, classifier_list) -> None:\n",
" \"\"\"\n",
"        Initialize a class item with a list of classifiers\n",
" \"\"\"\n",
" self.classifier_list = classifier_list\n",
" \n",
"\n",
" def fit(self, feature_matrix, response):\n",
" \"\"\"\n",
" Fit classifiers from the initialization stage\n",
" \"\"\"\n",
" for c in self.classifier_list:\n",
" c.fit(feature_matrix, response)\n",
"\n",
" def predict(self, feature_matrix):\n",
" \n",
" \"\"\"\n",
"    Collect predictions from all the classifiers and return\n",
"    them keyed by classifier (vote aggregation happens in the caller)\n",
" \"\"\"\n",
" y_pred_dict = {}\n",
" for c in self.classifier_list:\n",
" y_pred = c.predict(feature_matrix)\n",
" y_pred_dict[c] = y_pred\n",
" return y_pred_dict\n",
" \n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" \"\"\"\n",
" 1. Load iris dataset\n",
" 2. Shuffle data and divide into train / test.\n",
" 3. Prepare classifiers to initialize <StructuralPatternName> class.\n",
" 4. Train the ensemble\n",
" \"\"\"\n",
" dataset = datasets.load_iris()\n",
" features = dataset.data\n",
" targets = dataset.target\n",
" X_train, X_test, y_train, y_test = train_test_split(features, targets)\n",
" classifier_list = [\n",
" DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0),\n",
" RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42),\n",
" ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0),\n",
" RandomForestClassifier(n_estimators=10)\n",
" ]\n",
" \n",
" ensemble = Decorator_Structure(classifier_list) \n",
" ensemble.fit(X_train, y_train)\n",
" \n",
" y_pred_dict = ensemble.predict(X_test)\n",
" list_of_lists = []\n",
" for k, v in y_pred_dict.items():\n",
" list_of_lists.append(list(v))\n",
" nume = np.array([sum(i) for i in zip(list_of_lists[0], list_of_lists[1], list_of_lists[2], list_of_lists[3])])\n",
" deno = len(classifier_list)\n",
" y_pred = np.floor(nume / deno)\n",
" acc = accuracy_score(y_test, y_pred)\n",
" print(f\"Score with ensemble = {acc}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f3d7d30e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
61 changes: 61 additions & 0 deletions hw_5/hw5_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import sklearn.utils
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

class Builder_Creation:
    """Builder-pattern helper that hands out shuffled fractions of a training set."""

    def __init__(self, X_train, y_train):
        # Keep references to the full training data; get_subsample never mutates them.
        self.X_train = X_train
        self.y_train = y_train

    def get_subsample(self, df_share):
        """Return a shuffled ``df_share``-percent slice of (X_train, y_train).

        Features and targets are shuffled together (single ``shuffle`` call
        with a fixed seed), so row alignment between the two is preserved.
        """
        subset_size = int(df_share / 100 * len(self.X_train))
        print("number of observations used for training = ", subset_size)
        features_sub, targets_sub = shuffle(self.X_train, self.y_train, random_state=42, n_samples=subset_size)
        return features_sub, targets_sub


if __name__ == "__main__":
    # 1. Load the iris dataset.
    # 2. Split into train / test (train_test_split shuffles by default).
    dataset = datasets.load_iris()
    features = dataset.data
    targets = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(features, targets)

    # Standardize features before the regression step.
    pipe_lr = make_pipeline(StandardScaler(), LinearRegression())

    pattern_item = Builder_Creation(X_train, y_train)

    for df_share in range(10, 101, 10):
        # Train on a df_share%-subsample and report held-out MSE to see how
        # training-set size affects quality.
        curr_X_train, curr_y_train = pattern_item.get_subsample(df_share)
        # Fit the pipeline exactly once.  The previous version discarded the
        # pipeline built above, created a second (unscaled) LinearRegression,
        # and fit it twice in a row — redundant work with no effect on output
        # (OLS predictions are invariant to feature standardization).
        pipe_lr.fit(curr_X_train, curr_y_train)
        y_pred_test = pipe_lr.predict(X_test)
        mse = metrics.mean_squared_error(y_test, y_pred_test)
        print(f'Score for {df_share}%: {mse}')


74 changes: 74 additions & 0 deletions hw_5/hw5_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import sklearn.utils
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

class Composite_Structure:
    """Composite over several classifiers sharing one fit/predict interface."""

    def __init__(self, classifier_list) -> None:
        # Child estimators; each must expose fit() and predict().
        self.classifier_list = classifier_list

    def fit(self, feature_matrix, response):
        """Train every child classifier on the same (X, y) data."""
        for clf in self.classifier_list:
            clf.fit(feature_matrix, response)

    def predict(self, feature_matrix):
        """Collect predictions from every child classifier.

        Returns a dict mapping each classifier object to its prediction
        array.  Note: despite the ensemble context, no vote aggregation
        happens here — the caller is responsible for combining the answers.
        """
        return {clf: clf.predict(feature_matrix) for clf in self.classifier_list}



if __name__ == "__main__":
    # 1. Load the iris dataset.
    # 2. Split into train / test (train_test_split shuffles by default).
    # 3. Build the classifier pool for the Composite_Structure ensemble.
    # 4. Train the ensemble and score a majority vote on the test set.
    dataset = datasets.load_iris()
    features = dataset.data
    targets = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(features, targets)
    classifier_list = [
        DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0),
        RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42),
        ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0),
        RandomForestClassifier(n_estimators=10)
    ]

    ensemble = Composite_Structure(classifier_list)
    ensemble.fit(X_train, y_train)

    y_pred_dict = ensemble.predict(X_test)
    # Stack predictions into shape (n_classifiers, n_samples) and take a true
    # per-sample majority vote.  The previous floor-of-mean aggregation was
    # not a vote: e.g. votes [2, 2, 2, 0] average to 1.5 and floor to class 1,
    # which no classifier chose.  This version also supports any number of
    # classifiers instead of hard-coding indices 0..3.
    votes = np.array(list(y_pred_dict.values()))
    y_pred = np.array([np.bincount(column).argmax() for column in votes.T])
    acc = accuracy_score(y_test, y_pred)
    print(f"Score with ensemble = {acc}")