Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# 21s_msai_python
# 21s_msai_python_HW_5
212 changes: 212 additions & 0 deletions hw_5/HW_5.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 21,
"id": "2563c0e9",
"metadata": {},
"outputs": [],
"source": [
"import sklearn.utils\n",
"from sklearn import datasets\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.utils import shuffle\n",
"import sklearn.metrics as metrics\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import ExtraTreesClassifier\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "c12f8e84",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of observations used for training = 11\n",
"Score for 10%: 0.0754281385286842\n",
"number of observations used for training = 22\n",
"Score for 20%: 0.05711922218712858\n",
"number of observations used for training = 33\n",
"Score for 30%: 0.061250197185758755\n",
"number of observations used for training = 44\n",
"Score for 40%: 0.05634600896238332\n",
"number of observations used for training = 56\n",
"Score for 50%: 0.05917650323357551\n",
"number of observations used for training = 67\n",
"Score for 60%: 0.055275408731475364\n",
"number of observations used for training = 78\n",
"Score for 70%: 0.05446147878810882\n",
"number of observations used for training = 89\n",
"Score for 80%: 0.05355228831140269\n",
"number of observations used for training = 100\n",
"Score for 90%: 0.05313063624422463\n",
"number of observations used for training = 112\n",
"Score for 100%: 0.051750199610600275\n"
]
}
],
"source": [
"class Builder_Creation:\n",
" \n",
" def __init__(self, X_train, y_train):\n",
" self.X_train = X_train\n",
" self.y_train = y_train\n",
"\n",
" def get_subsample(self, df_share):\n",
" \"\"\"\n",
" 1. Copy train dataset\n",
" 2. Shuffle data (don't miss the connection between X_train and y_train)\n",
" 3. Return df_share %-subsample of X_train and y_train\n",
" \"\"\"\n",
" n_samples = int(df_share / 100 * len(self.X_train))\n",
" print(\"number of observations used for training = \", n_samples)\n",
" X_train_sub, y_train_sub = shuffle(self.X_train, self.y_train, random_state=42, n_samples=n_samples)\n",
" return X_train_sub, y_train_sub\n",
"\n",
" \n",
"if __name__ == \"__main__\":\n",
" \"\"\"\n",
" 1. Load iris dataset\n",
" 2. Shuffle data and divide into train / test.\n",
" \"\"\"\n",
"\n",
" dataset = datasets.load_iris()\n",
" features = dataset.data\n",
" targets = dataset.target\n",
" X_train, X_test, y_train, y_test = train_test_split(features, targets)\n",
" pipe_lr = make_pipeline(StandardScaler(), LinearRegression())\n",
" \n",
" pattern_item = Builder_Creation(X_train, y_train)\n",
" \n",
" for df_share in range(10, 101, 10):\n",
" \"\"\"\n",
" 1. Preprocess curr_X_train, curr_y_train in the way you want\n",
" 2. Train Linear Regression on the subsample\n",
" 3. Save or print the score to check how df_share affects the quality\n",
" \"\"\"\n",
" curr_X_train, curr_y_train = pattern_item.get_subsample(df_share)\n",
" pipe_lr = LinearRegression().fit(curr_X_train, curr_y_train)\n",
" pipe_lr.fit(curr_X_train, curr_y_train)\n",
" y_pred_test = pipe_lr.predict(X_test)\n",
" mse = metrics.mean_squared_error(y_test, y_pred_test) \n",
" print(f'Score for {df_share}%: {mse}')"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "01f45f07",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Score with ensemble = 0.9210526315789473\n"
]
}
],
"source": [
"class Decorator_Structure:\n",
" def __init__(self, classifier_list) -> None:\n",
" \"\"\"\n",
"        Initialize a class item with a list of classifiers\n",
" \"\"\"\n",
" self.classifier_list = classifier_list\n",
" \n",
"\n",
" def fit(self, feature_matrix, response):\n",
" \"\"\"\n",
" Fit classifiers from the initialization stage\n",
" \"\"\"\n",
" for c in self.classifier_list:\n",
" c.fit(feature_matrix, response)\n",
"\n",
" def predict(self, feature_matrix):\n",
" \n",
" \"\"\"\n",
"    Collect predictions from all the classifiers and return\n",
"    them keyed by classifier (vote aggregation happens in the caller)\n",
" \"\"\"\n",
" y_pred_dict = {}\n",
" for c in self.classifier_list:\n",
" y_pred = c.predict(feature_matrix)\n",
" y_pred_dict[c] = y_pred\n",
" return y_pred_dict\n",
" \n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" \"\"\"\n",
" 1. Load iris dataset\n",
" 2. Shuffle data and divide into train / test.\n",
" 3. Prepare classifiers to initialize <StructuralPatternName> class.\n",
" 4. Train the ensemble\n",
" \"\"\"\n",
" dataset = datasets.load_iris()\n",
" features = dataset.data\n",
" targets = dataset.target\n",
" X_train, X_test, y_train, y_test = train_test_split(features, targets)\n",
" classifier_list = [\n",
" DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0),\n",
" RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42),\n",
" ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0),\n",
" RandomForestClassifier(n_estimators=10)\n",
" ]\n",
" \n",
" ensemble = Decorator_Structure(classifier_list) \n",
" ensemble.fit(X_train, y_train)\n",
" \n",
" y_pred_dict = ensemble.predict(X_test)\n",
" list_of_lists = []\n",
" for k, v in y_pred_dict.items():\n",
" list_of_lists.append(list(v))\n",
" nume = np.array([sum(i) for i in zip(list_of_lists[0], list_of_lists[1], list_of_lists[2], list_of_lists[3])])\n",
" deno = len(classifier_list)\n",
" y_pred = np.floor(nume / deno)\n",
" acc = accuracy_score(y_test, y_pred)\n",
" print(f\"Score with ensemble = {acc}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f3d7d30e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
61 changes: 61 additions & 0 deletions hw_5/hw5_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import sklearn.utils
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

class Builder_Creation:
    """Builder-pattern helper that hands out shuffled fractions of a training set."""

    def __init__(self, X_train, y_train):
        # Keep references to the full training data; get_subsample never mutates them.
        self.X_train = X_train
        self.y_train = y_train

    def get_subsample(self, df_share):
        """Return a shuffled ``df_share``-percent slice of (X_train, y_train).

        Features and targets are shuffled together (single ``shuffle`` call
        with a fixed seed), so row alignment between the two is preserved.
        """
        subset_size = int(df_share / 100 * len(self.X_train))
        print("number of observations used for training = ", subset_size)
        features_sub, targets_sub = shuffle(self.X_train, self.y_train, random_state=42, n_samples=subset_size)
        return features_sub, targets_sub


if __name__ == "__main__":
    # 1. Load the iris dataset.
    # 2. Split into train / test (train_test_split shuffles by default).
    dataset = datasets.load_iris()
    features = dataset.data
    targets = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(features, targets)

    # Standardize features before the regression step.
    pipe_lr = make_pipeline(StandardScaler(), LinearRegression())

    pattern_item = Builder_Creation(X_train, y_train)

    for df_share in range(10, 101, 10):
        # Train on a df_share%-subsample and report held-out MSE to see how
        # training-set size affects quality.
        curr_X_train, curr_y_train = pattern_item.get_subsample(df_share)
        # Fit the pipeline exactly once.  The previous version discarded the
        # pipeline built above, created a second (unscaled) LinearRegression,
        # and fit it twice in a row — redundant work with no effect on output
        # (OLS predictions are invariant to feature standardization).
        pipe_lr.fit(curr_X_train, curr_y_train)
        y_pred_test = pipe_lr.predict(X_test)
        mse = metrics.mean_squared_error(y_test, y_pred_test)
        print(f'Score for {df_share}%: {mse}')


74 changes: 74 additions & 0 deletions hw_5/hw5_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import sklearn.utils
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

class Composite_Structure:
    """Composite over several classifiers sharing one fit/predict interface."""

    def __init__(self, classifier_list) -> None:
        # Child estimators; each must expose fit() and predict().
        self.classifier_list = classifier_list

    def fit(self, feature_matrix, response):
        """Train every child classifier on the same (X, y) data."""
        for clf in self.classifier_list:
            clf.fit(feature_matrix, response)

    def predict(self, feature_matrix):
        """Collect predictions from every child classifier.

        Returns a dict mapping each classifier object to its prediction
        array.  Note: despite the ensemble context, no vote aggregation
        happens here — the caller is responsible for combining the answers.
        """
        return {clf: clf.predict(feature_matrix) for clf in self.classifier_list}



if __name__ == "__main__":
    # 1. Load the iris dataset.
    # 2. Split into train / test (train_test_split shuffles by default).
    # 3. Build the classifier pool for the Composite_Structure ensemble.
    # 4. Train the ensemble and score a majority vote on the test set.
    dataset = datasets.load_iris()
    features = dataset.data
    targets = dataset.target
    X_train, X_test, y_train, y_test = train_test_split(features, targets)
    classifier_list = [
        DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0),
        RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42),
        ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0),
        RandomForestClassifier(n_estimators=10)
    ]

    ensemble = Composite_Structure(classifier_list)
    ensemble.fit(X_train, y_train)

    y_pred_dict = ensemble.predict(X_test)
    # Stack predictions into shape (n_classifiers, n_samples) and take a true
    # per-sample majority vote.  The previous floor-of-mean aggregation was
    # not a vote: e.g. votes [2, 2, 2, 0] average to 1.5 and floor to class 1,
    # which no classifier chose.  This version also supports any number of
    # classifiers instead of hard-coding indices 0..3.
    votes = np.array(list(y_pred_dict.values()))
    y_pred = np.array([np.bincount(column).argmax() for column in votes.T])
    acc = accuracy_score(y_test, y_pred)
    print(f"Score with ensemble = {acc}")