From 4f687419ea4e983e8679e15e71f896ce2d614b07 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 14:00:49 -0600 Subject: [PATCH 01/19] Add files via upload --- requirements.txt | 5 +++++ setup.py | 13 +++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..904e466 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +numpy +csv +pytest +ipython +matplotlib diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..9aa97b1 --- /dev/null +++ b/setup.py @@ -0,0 +1,13 @@ +from setuptools import setup, find_packages + +setup( + name='GradientBoosting', + version='0.1', + packages=find_packages(), + install_requires=[ + 'numpy', + 'scikit-learn', + 'pytest', + 'matplotlib' + ], +) From 72ae3df6ad30449804c35eb14f47aaf5db005a21 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 14:02:07 -0600 Subject: [PATCH 02/19] Add files via upload --- GradientBoosting/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 165 bytes GradientBoosting/models/GradientBoosting.py | 220 ++++++++++++++++++ GradientBoosting/models/__init__.py | 0 .../GradientBoosting.cpython-312.pyc | Bin 0 -> 8540 bytes .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 172 bytes GradientBoosting/tests/__init__.py | 0 .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 171 bytes ...dientBoosting.cpython-312-pytest-8.3.3.pyc | Bin 0 -> 6198 bytes GradientBoosting/tests/small_test.csv | 51 ++++ .../tests/test_GradientBoosting.py | 118 ++++++++++ 11 files changed, 389 insertions(+) create mode 100644 GradientBoosting/__init__.py create mode 100644 GradientBoosting/__pycache__/__init__.cpython-312.pyc create mode 100644 GradientBoosting/models/GradientBoosting.py create mode 100644 GradientBoosting/models/__init__.py create mode 100644 GradientBoosting/models/__pycache__/GradientBoosting.cpython-312.pyc create mode 100644 GradientBoosting/models/__pycache__/__init__.cpython-312.pyc create mode 100644 GradientBoosting/tests/__init__.py create mode 100644 GradientBoosting/tests/__pycache__/__init__.cpython-312.pyc create mode 100644 GradientBoosting/tests/__pycache__/test_GradientBoosting.cpython-312-pytest-8.3.3.pyc create mode 100644 GradientBoosting/tests/small_test.csv create mode 100644 GradientBoosting/tests/test_GradientBoosting.py diff --git a/GradientBoosting/__init__.py b/GradientBoosting/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GradientBoosting/__pycache__/__init__.cpython-312.pyc b/GradientBoosting/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1cea006bc4845162f33461ec18908ef8d09e095 GIT binary patch literal 165 zcmX@j%ge<81P1L^=^*+sh(HIQS%4zb87dhx8U0o=6fpsLpFwJVxj0+Jgche36~`nc zrW9qylx3!*<`>5V6y;~7CYKl?816-hDVeExB~JPI#U+_}=`r!~nR%Hd@$q^EmA^P_ ea`RJ4b5iY!Sb>Hz0&y{j@sXL4k+Fyw$N~T?>?y$j literal 0 HcmV?d00001 diff --git a/GradientBoosting/models/GradientBoosting.py b/GradientBoosting/models/GradientBoosting.py new file mode 100644 index 0000000..0d64a42 --- /dev/null +++ b/GradientBoosting/models/GradientBoosting.py @@ -0,0 +1,220 @@ +import numpy as np + + +class DecisionTree: + def __init__(self, max_depth=3): + """ + Initialize the DecisionTree with a specified maximum depth. + + Parameters: + - max_depth: Maximum depth of the decision tree. 
+ """ + self.max_depth = max_depth + self.tree = None + + def fit(self, X, y): + """ + Fit a decision tree to the given data. + + Parameters: + - X: Input features (NumPy array). + - y: Target variable (NumPy array). + """ + self.tree = self._grow_tree(X, y) + + def _grow_tree(self, X, y, depth=0): + """ + Recursively grow the decision tree by splitting nodes. + + Parameters: + - X: Input features for the current node. + - y: Target variable for the current node. + - depth: Current depth of the tree. + + Returns: + - A dictionary representing the tree structure. + """ + n_samples, n_features = X.shape + + # Base case: If maximum depth is reached or only one sample remains + if depth >= self.max_depth or n_samples <= 1: + # Create a leaf node with the mean value of the target variable + leaf_value = np.mean(y) + return {'leaf': leaf_value} + + # Find the best feature and threshold to split the data + best_split = self._find_best_split(X, y, n_features) + + # If no valid split is found, return a leaf node + if not best_split: + leaf_value = np.mean(y) + return {'leaf': leaf_value} + + # Recursively grow left and right subtrees + left_indices, right_indices = best_split['left_indices'], best_split['right_indices'] + left_tree = self._grow_tree(X[left_indices], y[left_indices], depth + 1) + right_tree = self._grow_tree(X[right_indices], y[right_indices], depth + 1) + + # Return the split node + return {'feature': best_split['feature'], 'threshold': best_split['threshold'], 'left': left_tree, + 'right': right_tree} + + def _find_best_split(self, X, y, n_features): + """ + Find the best feature and threshold to split the data. + + Parameters: + - X: Input features. + - y: Target variable. + - n_features: Number of features. + + Returns: + - A dictionary containing the best split information, or None if no split is found. + """ + y = np.array(y) # Ensure compatibility with NumPy indexing + best_split = {} + min_mse = float('inf') # Start with a very high MSE + + # Iterate over each feature + for feature_index in range(n_features): + thresholds = np.unique(X[:, feature_index]) # Get all unique values for the feature + for threshold in thresholds: + # Split data into left and right based on the threshold + left_indices = np.where(X[:, feature_index] <= threshold)[0] + right_indices = np.where(X[:, feature_index] > threshold)[0] + + # Skip invalid splits + if len(left_indices) == 0 or len(right_indices) == 0: + continue + + # Calculate mean squared error for the split + mse = self._calculate_mse(y[left_indices], y[right_indices]) + if mse < min_mse: + min_mse = mse + best_split = { + 'feature': feature_index, + 'threshold': threshold, + 'left_indices': left_indices, + 'right_indices': right_indices + } + return best_split if best_split else None + + def _calculate_mse(self, left_y, right_y): + """ + Calculate the mean squared error for a split. + + Parameters: + - left_y: Target values for the left split. + - right_y: Target values for the right split. + + Returns: + - Mean squared error for the split. + """ + left_mse = np.var(left_y) * len(left_y) + right_mse = np.var(right_y) * len(right_y) + return (left_mse + right_mse) / (len(left_y) + len(right_y)) + + def predict(self, X): + """ + Predict target values using the fitted decision tree. + + Parameters: + - X: Input features. + + Returns: + - Predicted target values. + """ + return np.array([self._predict_sample(sample) for sample in X]) + + def _predict_sample(self, sample): + """ + Predict a single sample by traversing the tree. 
+ + Parameters: + - sample: A single input sample. + + Returns: + - Predicted value for the sample. + """ + node = self.tree + while 'leaf' not in node: + # Traverse left or right based on the feature threshold + if sample[node['feature']] <= node['threshold']: + node = node['left'] + else: + node = node['right'] + return node['leaf'] + + +class GradientBoosting: + def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3): + """ + Initialize the GradientBoosting model. + + Parameters: + - n_estimators: Number of decision trees in the ensemble. + - learning_rate: Step size for updating residuals. + - max_depth: Maximum depth of each decision tree. + """ + self.n_estimators = n_estimators + self.learning_rate = learning_rate + self.max_depth = max_depth + self.trees = [] + self.initial_prediction = 0 + + def fit(self, X, y): + """ + Fit the Gradient Boosting model to the data. + + Parameters: + - X: Input features. + - y: Target variable. + + Returns: + - GradientBoostingResults containing the fitted model. + """ + # Initialize the first prediction as the mean of the target variable + self.initial_prediction = np.mean(y) + residuals = y - self.initial_prediction + + # Train `n_estimators` decision trees + for _ in range(self.n_estimators): + tree = DecisionTree(max_depth=self.max_depth) + tree.fit(X, residuals) # Fit tree on current residuals + predictions = tree.predict(X) # Get predictions from the tree + residuals -= self.learning_rate * predictions # Update residuals + self.trees.append(tree) # Store the fitted tree + + return GradientBoostingResults(self.initial_prediction, self.trees, self.learning_rate) + + +class GradientBoostingResults: + def __init__(self, initial_prediction, trees, learning_rate): + """ + Store results of the Gradient Boosting model. + + Parameters: + - initial_prediction: The initial prediction (mean of the target variable). + - trees: List of fitted decision trees. + - learning_rate: Learning rate used for updating residuals. + """ + self.initial_prediction = initial_prediction + self.trees = trees + self.learning_rate = learning_rate + + def predict(self, X): + """ + Predict using the fitted Gradient Boosting model. + + Parameters: + - X: Input features. + + Returns: + - Predicted target values. 
+ """ + # Start with the initial prediction + y_pred = np.full(X.shape[0], self.initial_prediction) + # Add predictions from all trees + for tree in self.trees: + y_pred += self.learning_rate * tree.predict(X) + return y_pred diff --git a/GradientBoosting/models/__init__.py b/GradientBoosting/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GradientBoosting/models/__pycache__/GradientBoosting.cpython-312.pyc b/GradientBoosting/models/__pycache__/GradientBoosting.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9655793bf9d4a685ffb62cb28fdf355851f2ea19 GIT binary patch literal 8540 zcmb_hYit`=cAg<;_!9N7M2Vtg$z#j1MA?$#bd%Jw*Ivh}Y%7jpN8atiu$LKghSJy+ zDbEZYlcj)dkp<{hi%3XY%gq8&HYviyY2!uv!;Af~UZ6l-6a_R^QV~-F4bVk_{IjJ1 zK^OhebM8Di9MN%W^g_D4bLZZ3ALpF!Jm#Mp8$%S7fuA0oKG;K1|A7xyVa&$nNod@p z1WKUO)Op%NouCBITa@6v?J-82_mDmY`dGWqOZt4!=ePTqJRJ{wg_9sYygplBcCsu%U&ie!wYQF&V&j$oQ)WLip-k^ua#H6H1 z*~~dv6q6QRTm{=+_}}E9x=9u3B9(xaQ3QOVttZ%}r|@owGs~%rq)L2RDu|pqBXZ6a zxHlzrhT}P94wg+xqQK4a*QMFqEGLL_>dZ*UCUSVNT9o7L)4p)y4^#7bG#GzMRb}s+5_&Fq;*`v~t1SKQfos1BrwLSCU8++MRbX zVs*2)fmWz~Og(C%y824YLQ^HuSz>+~+=N~#iEOY63K?O`^UwgcP?v2sSVJgemd6sJ zEsXlmzLs%G1zyzL8K-8+MNUiC#0)3!D*sfT{BV>zm6^+_TuS8CoGdEb;AC$0OrGOq zna{_oTo~r^que=Oo)%T^8ZS%yrL_2@vBJ;W6W_^F2Jg6krC)CT@)q9ROBxymyy1!0HaI>&ie#U z(I?SU$W%O)s~}U0>}}j1lzD&AuRIHFzu=9*{GwlA@OM5qAAt4nDqIED3y_f>^VciT z1sG%RAoRp}L35t*>z>yDYqRDaeB?J&Y!LiEW2UHhfd75C4qHNB7n3Ue0r}I0tG4`@$n4q~d%|QRQ3`6|TC`uTYe9CN1(QJ)9O(DgY1A5f!~r zmZoQ_kA72E^`JTfq-U~eAMS|C9?T>Zes(Smy9s3y7Fg&Z+->3-pU#PT$U!TD z^V@NLkeme1tcQ$aU|R{pI{*N?5F{X5C`4Y|4K^1%S{=?cDW||9!s}=qUvk zM<4m9@V3RVo0ChE4;pr@NOunWJoDRF_j}P>(UooQc9kY7k*?*5)wY4v@BobM=vr?d z(%Og02g>}NuC?~VrO;y^)zbCw$x{Xn-09W!eDB`S{e<@N#l_iD7(m{Hr1bl*-8p-w z_>06n@jm|>X)SW56s!PXC6*E^XIJL6{ul0Cz1yR`bmD&d>e<)-DYn)=RSLm2w(nVy zwC?Bb7VhuWj{acv4XG0Ad+eoK`X5nruzxGSA&e4$yH5ZPVa_P^fXo3+9tj|?3S~%gj5u5Qi{PU{VRhYPfd<@q1tycy&wQcq87LcK)D5yA#!%p zkjys}8;a}|cv9$vKPfhVnei0^#b7ZcFh67FnS}+0TKJ5iij3?j0$>Ur08N&E&Iy|1 zpp^tBP@62P%<-htRCYA;L}d*?eI_$sg`Ika-TAhv?V}vH2A4z`fzUz#2VB1{emf8* zvl*2KU&@3*az5i2QU>sNmPf?g#{rm6W-}rur2xpyb4m)A%H}dcbziy{CQZtC@lNcM zZQzLqCT10J z3lhp4kdt>IP5-Qv!7+NHc@2ov;&nY_O&ow|#P;JiEI0kQP4-BjP95I2^&9UTlPABhkg;THEe2 z{h%%W>C~N`wc(@p4m=nht36aW`!85esK|JVf;lmI|wZ1Lbp28R%%b zQ*b*Jg^n6Q$9vGYNjY%C)K2KOOVP%_DlS(03P5Y#`xFYc>*t_g_Yn#VIGGGOU5*j; z)~(Khu?2AE02S!Me}^pNc-)dpqVH(EIpu1OmmvTXWf}AZy&2w66^B0755kB!cDnLl zx>36f7Yg56gskOxhH72Uwi_bBdP>j5p#+zd4~0sbRg@n}eYW ziWcxZLbbHMef`Gud&mB@bNud}<;Z$$Sc?spQ(wehc+fe%)_h_$aN@t9r7$qz&pUrF zP?K%JRS8>Xph3cV_;r_g59D1u*J!yHV?lv9O(cHu7_JV9H+}HNi0nKW7TP4zvS-o@ zz0gPFEPx1K;a63DnCt`;01iTAZrlgPSE|fkgPe%PSXcB?A0&*~queozoJ4X2M)&`g zsU#|}k;R%z)Dl?)5=~Wu7P%82@nmt@nJp-X2-Xy|0Fd=UBz+iMu>5|BTJ5y})) z5HA3OBX8$#>tn@y&?7_z1g{q-EL69c}e)Uiv>uV;BAAn+WAMwc~tX{ zd2gK6Tigk;ntTh=AyPj<=zvr^a*W_2J93PeZAn<`QqzzlN#IOUNH!8TNlM-m_wtO> zA$BDs-CN>F?#>~T^(8u!0C!pfA2SO%NrxX>18@o_Wr&RxGm1D139c#>4Ra9b%ILf% zWbhD2xhYkg1MPq`Dca859Aw*2#lS_E1YkDgF)T^{_Qa>iCuiI+Yt1BLH3^IeJGZYl z)@;QH>{$|3>#dlpg!HYY3gA(LdyHmB9>?kgR+z{lK!)oWQieiVh`I09PICSw&KQCU zRH>Effdt&=fmo$y2!FdOv0WtT37#mk5UT8g6zCU$9ft9d2)BmfpwN*)26PjwCdzaY zUg$o4ZcfYydh`j~i2L1XwV(|YI-MW}Pj4_C6VO>jK~XlhXJ{x|j6%dse*_Nn*YHO9 z1ym$B?Cbw%;QfK}&JT!k?x;lCmznj>{Tlp5_LqVXs^NOL{g z_cqk5=93RGDx=(K2~ts*^r;gR)EK%N!vyn8lgXjUx~0Wblz)Li${#{SassW9o6)7{@;2j>!mX~AbL;(wwf@6v+rC@!eF>ieCYB~P zC^~ou(X(|?Snn9rItI(%`Rwq=hwsw2k7^yGT6pxv_#-cjf!L$eXo6pToFn$dg>#5R z1de`|EZkW{6z`9?4v}mgw&ShmA1hh>S-? 
tP$V%iY?G${jcWbIH|e3bKc=w$<`OeNzd=8yu>R&8-9vY5P*9PT{|6MJ#}5Di literal 0 HcmV?d00001 diff --git a/GradientBoosting/models/__pycache__/__init__.cpython-312.pyc b/GradientBoosting/models/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3e502be697aaed3d382403f1ba2ea0b41b1325f5 GIT binary patch literal 172 zcmX@j%ge<81P1L^=^*+sh(HIQS%4zb87dhx8U0o=6fpsLpFwJVc{^Lhgche36~`nc zrW9qylx3!*<`>5V6y;~7CYKl?816-hDVeExB~JPI#U+_}=`p$aDXBTdG4b)4d6^~g k@p=W7zc_4wf~7gBc15f}vl)T77{vI<%*e=C#0+Es0G7lor~m)} literal 0 HcmV?d00001 diff --git a/GradientBoosting/tests/__init__.py b/GradientBoosting/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GradientBoosting/tests/__pycache__/__init__.cpython-312.pyc b/GradientBoosting/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ed8c667ab8d3bdc7395c1d2c8ebfc44356b516c GIT binary patch literal 171 zcmX@j%ge<81P1L^=^*+sh(HIQS%4zb87dhx8U0o=6fpsLpFwJVc{y9fgche36~`nc zrW9qylx3!*<`>5V6y;~7CYKl?816-hDVeExB~JPI#U+_}=`kg##U;ft@$s2?nI-Y@ kdIgogIBatBQ%ZAE?TT1|Ml%9&F^KVznURsPh#ANN0A_D2RsaA1 literal 0 HcmV?d00001 diff --git a/GradientBoosting/tests/__pycache__/test_GradientBoosting.cpython-312-pytest-8.3.3.pyc b/GradientBoosting/tests/__pycache__/test_GradientBoosting.cpython-312-pytest-8.3.3.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e34f2f6336ba7036cb47c83fedb812e4271abae3 GIT binary patch literal 6198 zcmc&2TWlN0@s2!_$Cuu3$u?z4w&;iaj9+;=w&aIoCADN(p_N`vypw$N@!`FrEip$# zZe4^zQds^9our8J6&Kcr(e}gnYTBZxgSP0&q9gaBz#q+z{|v`Kk$iRb$kkCY;{+&* z9;BVwnc3Od+1c40{?+gIQ1IRV<%{DlRa4ZzaEE-h0`krKfV@Wult9Z=h&G`uY7g6J zD$0Z$be`i3IZci$#G23@@&L5S-l#9+GiB_uKUx+lGjT>Pk5+^#qLraav+t0rqSc}5 zXicc5pszMs7pe!`d6^0|h>e2lTqi{#)<%g9HwuQ))FeGg2iXU>-yfuniX(^@Br&GG z9FHri6dT7h74cGxQ$eZWMwd?JQbL1(C6!=8Ql%t^PE-=D_xnS-|R= z)iM+{SH3)p3-r`KDPh^Exi|;mgSHYr(;USfE3&$QjI$uv=Nh*O6?2u#7T7f=(qu^x zOkmA$Y*yD03JJbaU5g?JhP zn0BWZ!6QIU<7B*8Sd=3QQ$U-L(TgkwQ%Ibzaf>SjXMe44fYEQ_)J%-JBWtJPJ&qwtXSS1&}hl-wqyt=MD)_HfyO&o_MS2Ks zL8AFE_4=PEY6S9Kx+Yy)bYhQMvIK*ZRzj0NX*EJc3WS=d!_<}9ux*?Q+g^vfn6A}o zOP)J7nfzyjRgqpYs|AbBs8_2ar<>>kZ(8~0e`)=4*7zRl>$U11 zu->QDKWY8De`bo$d5I|3s)WE?D;c>WdP+%G3+ul}x>WqO75VY$IibCzWU2V=C@I-z z?I^)Y=7gulxw@8&jr0@Dg6RhJC!_}%w}JR9-KaIr(8@FF0O3Xk31&fI1F>7!h~rx5 zR8JFbWSC$UB-US`l*S^>AxnZ_>Z=4JGoa!TBZO=$Tbor|)p;=H{zE_EkpFtThL8lwijgye&XfxAYEW(0Y`X30OVEp}S`1|z5 z2l(k6v>Uz{2QPUk%Byjt7=BsgQ4AjX9O6~ca7XzGP7o97*;L2sQeYdd0F-S?l$T|* z8t4iu7Yuq5N(5IpE+?Zg#c;+)BVt%J*f3}tk0X;Q#lmtD$^iWG?J*o<5>jvt;Zzbl zQbdChP&@%j98ydWCmvv5LXY7mB&A^d=Py3kgsC>eov%WqlxX-69~0tHPEld1RMQD4 zNCE@-f+Wz53z9(RiGl9mfk3hn#-7|-C{_ZUC;#{xpc*tv*|u&C+7W)B8xB|j)G%&Y zioq%;WijO)L`g9)#LG!hG2ACbMG}&{tfc$|4ZJGHVI2qu8N&BNs^V3shNQ463Wag0 zMpMZJr7Q3Rt85ULRp9x&wb%hNu98yzLA*bNo+|H8Mn^>yh>ryZktiyuswbD4D(fq3 zW8hE{T>y6!rjz2afTRRsaWw$$;Su5|Q-hutctsJB8tCXuOzsIPo;M3f;CYj@J*aeI z%8sD2#U$(|GJq6hb_SKMm|!aOq=TNc;gJ+6273_;i-scseozcsED>}V_Qd3N-0Z;3 zPTcIq&7R;6gd@_hB?KrPMPA?_oKPJm`4v%yDB=WlG9fAmS4W0j7GnrkCx-hlukt;J zkBWwWOj0>0Ph-kh97Pc>bd1_@QRT)&UQHsAi-DBF#gnSx498;-l!n7pidYauxEwGX z<{}V&{u|B<`R%rYUc-e~0lSHCeT3k+f@}gwF%`_0fvtc#6^dIIKXDd^jp3XyH_&iS z=AlECR2kx1IUB!dFbRkj!yc7l(C`z6TZ|>6qIu{XD&AMat|X&|lTReXm|!?%aU4Te z!O3Vnp5OCQV$wVTJTfXdyk=hI@|zh|#tk2^xzVH~L-@d1l;dFtSfhah1Cp}9Au)lt zaXCK9%cf%tH>dDpA|@MU2hEUxcrnk8QBAaq6A77*@hWhF&4wci^}KBOVXL??Nfx;T zubwq*!k9r1=MPdv;p9s&HwTAGFi)Rd7?UO>{w1xXO&%18pm zahBX^A**xk2sU3aTW&S4D=`U+1-`io9H0>}!wdfk3sF2pJ@!!kHP_p24ctDM^Yu@4 zKV+%e=cZ09xxH7Kr<-S9UvRfgy$nR?yQW>)>Z~Ke4 
zSKqqH-rA&Z>z;XQp}uFazJI>HUuSBTTpr!qyx?ldp1 zy}qk`v-OLud*)mB+&r)EKQ_Pj_(E&1J}|gA5Skwdn8i&cv?o$oaNrXfTX*zkTu2#hK1K zuBN4i_7BQ3{(HX0Mc>AG-^T0of^SoXe(c0cW2f1f%6Gg|2OqOk=e}F?@7Yh;&suM* z`lyiGD9$tOQ%5pGGpDm-ptHQ@iZm@{U9ZZ4<5R` zOAr0@F2fP+MKkH;tp$@{D!nY2V>T_%1^%h2o9eq)RWsGQRNaz2_0gGYXBMi1k0_U; zm!5WI=*;e~*9JZsyEb;)@el9kUVU&_KQ*ME9-contRH>ti;mD|%&hQ<{IPs<@7)d0 z-suSG{Ag}%SZAB>u{Dcq^E}&}Yw5avQQz0Q(9)MuN;n20A&&0o+`@=c?v9N8}4@OztcXfb8qC<@H*RckF8#0o95Z3T=UlJWBQ(> z3(d#wvd15mV_P5BV6TA#Gb4KIj=Rjx!WuJYX3Os~+Y5ft%L6&4b-B})W7=`Ybj;cu zv#!9o+=mf!H~tN~afDuEdj5}pI@9y*qYi4r8T#8tEY)Ofp;r8p1WOVaLxu2i#(b z$^1>~Ou&nbR}g-y8J$n0E%WM7x>nQkuYh{xocR*%GNriu0|3m(-^=C8Tv0_*SSh-a zYY7WdI^@-`sK7g9+mrOf1sU4x$r9QSML8z53h zg62E+Ij|-vi!UGzy7*zLJP*x7J5AH~De5xybI+Gl)t6NDS5)^`)b9J#zWdbkUs3BG hImc-F2>o!-PBFer>W;nfpG?Is`X55K3B3s#{sRWldvgE) literal 0 HcmV?d00001 diff --git a/GradientBoosting/tests/small_test.csv b/GradientBoosting/tests/small_test.csv new file mode 100644 index 0000000..bf8442e --- /dev/null +++ b/GradientBoosting/tests/small_test.csv @@ -0,0 +1,51 @@ +x_0,x_1,x_2,y +-2.421348566501347,6.290215260063935,2.516304163087373,10.240119830146476 +8.13465811997068,-6.975968662410185,-3.2810945459842866,-6.8962940548446845 +-0.4531238994261493,0.05889462611191654,-3.592293253611172,14.10428803155231 +3.979832584128687,-8.129001764124755,9.202914789330517,-43.788867687445624 +-4.354231825431758,2.4724749171156333,8.45972163584499,-12.067617018047834 +8.726620980175113,-9.607722575405269,-5.092837184080405,-8.265643240683891 +-0.29136484802189955,8.224663789274086,-3.8193339707565555,32.98185595386334 +1.4118708853910462,6.003042800612462,3.9968255952773095,0.7267789346532836 +0.21525181834957507,-3.321041549359367,-5.352746248495515,11.93444109619503 +4.80226153299567,9.818246112545182,4.936296097738831,3.5995719453822046 +9.71733974143089,0.1440918710436101,8.74993701189404,-34.917122745540794 +4.098687611436789,-9.75205878861841,7.980744101999381,-43.32805584620358 +-2.398060521804659,2.8278192128541733,-1.626174948927721,16.91539285950553 +5.398272903061114,7.583046908728093,2.758295974535457,4.437457748228852 +3.371527871466675,-5.430064318728407,2.1915998058530857,-16.03565826569788 +2.0863644528269365,0.10824916542728857,8.144465640869694,-25.094326089867696 +2.8255940202840684,-2.286321234798363,4.771241059098381,-18.000440202657604 +-8.150227640024978,-4.259315052105519,1.8923353680502952,-1.3930242667026356 +-6.067265316809651,3.6776254617776942,8.4817269440159,-10.278522746897893 +8.64017362219969,9.717801217085075,4.980672567111553,-0.9266647796977245 +-4.636910653452324,0.9373715699813872,4.978170771263397,-3.8217233698137143 +-7.940395120999431,2.953441321061362,-0.9370552302607145,21.291726783530805 +7.692709298116139,-5.485844206553388,-6.019643260327971,2.1873435652525455 +-6.485086441297707,7.06589989184231,-8.842925435171665,50.35981404591074 +5.036321300769028,2.0420739888497152,-4.368234397412891,15.435100617505809 +-2.203566631709222,-6.141030616852454,-1.822186931753599,-0.5890454529472771 +3.2620868350599768,7.851306022896178,-4.479265977335616,27.896949611024628 +6.402611257683294,-4.018677430646336,0.48600102750762986,-12.289355696825485 +5.378501224056757,4.355667003325474,-7.565417868242747,31.017195148404717 +2.0486633392332614,8.253411759540757,-3.966950647644751,29.555547834722987 +2.626017326894857,3.314924154867276,9.810418858378235,-22.85112181951592 +-0.04750452520510429,5.935777040113393,-0.3470621837504506,16.516617979443822 
+-6.775500897482147,-0.8747563332852692,-2.758815934335188,16.55155644731519 +-5.130765599150095,8.959898235120185,1.1701541118251235,22.753375944830324 +9.607901921761815,-9.108821424255002,5.524296399378377,-41.93781490943017 +-2.9201254899877434,5.134928295361929,-9.896226148902585,43.58829658171542 +6.956501039100711,0.8359369151964895,-6.1636372998431295,16.225403196517274 +7.725179239543149,-4.913104095867496,-1.110476120153832,-9.936035489824537 +-6.142683379729563,1.4244393989902058,1.8529074318076262,5.554396424524908 +-2.0474061706133977,-1.2170618863263076,8.899325908803291,-23.596187786238964 +9.359523403637155,3.4124788823300065,-1.4222946765509725,2.4507844709064064 +-8.642800876507275,-9.508822574677566,2.9901775243378577,-16.775543378589024 +-2.470992582133973,5.1672327675732195,-8.753045094764744,40.855147394263106 +-7.756097982925145,5.227601844332813,-3.179199348468109,30.739018818654756 +5.393783291304004,-1.5186710515725927,-7.469139234639499,17.503383657767756 +-7.644671911438172,1.8115363641056241,-6.167155079348694,33.57677356652164 +6.557442460132911,-4.44188855380612,-6.368621306151785,7.435670420087931 +0.21009363927752744,-2.719754693698011,1.0885820356480096,-6.289562485886653 +-8.571672299069252,8.890348599509473,5.468260371802332,15.412904086362603 +7.872454219630789,-3.9905860234116357,0.9068940749874717,-16.017543419998542 diff --git a/GradientBoosting/tests/test_GradientBoosting.py b/GradientBoosting/tests/test_GradientBoosting.py new file mode 100644 index 0000000..db4888f --- /dev/null +++ b/GradientBoosting/tests/test_GradientBoosting.py @@ -0,0 +1,118 @@ +import csv +import numpy as np +import pandas as pd +from GradientBoosting.models.GradientBoosting import GradientBoosting +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error, r2_score +from sklearn.preprocessing import OneHotEncoder, LabelEncoder +import matplotlib.pyplot as plt + + +def test_predict(): + # Initialize GradientBoosting model + # n_estimators: Number of trees in the ensemble + # learning_rate: Step size for updating residuals + # max_depth: Maximum depth of each decision tree + model = GradientBoosting(n_estimators=100, learning_rate=0.1, max_depth=3) + + # Load data from CSV file + # csv_file_path: Path to the dataset + csv_file_path = "GradientBoosting/tests/small_test.csv" + df = pd.read_csv(csv_file_path) # Load data into a pandas DataFrame + + # Separating features (X) and target variable (y) + # X contains all columns except 'y', which is the target + X = df.drop(columns=['y']) + y = df['y'] + + # Handling categorical columns in X + # Identifies categorical columns and applies OneHotEncoding + categorical_cols = X.select_dtypes(include=['object', 'category']).columns + if len(categorical_cols) > 0: + encoder = OneHotEncoder(sparse=False, drop='first') # OneHotEncoder avoids collinearity + # Encode categorical columns and add to DataFrame + X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]), index=X.index) + X_encoded.columns = encoder.get_feature_names_out(categorical_cols) + X = X.drop(columns=categorical_cols) # Drop original categorical columns + X = pd.concat([X, X_encoded], axis=1) # Add encoded columns to X + + # Handling categorical target variable y + # Converts target variable y into numeric using LabelEncoder if categorical + if y.dtype == 'object': + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + + # Convert y to NumPy array to avoid index mismatches + y = np.array(y) + + # Split 
data into training and testing sets (80-20 split) + # Training set is 80%, Testing set is 20% + X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.20, random_state=42) + + # Fit the GradientBoosting model on training data + # Model learns the relationship between X_train and y_train + results = model.fit(X_train, y_train) + + # Predict using the fitted model on the test set + preds = results.predict(X_test) + + # Calculate evaluation metrics + # Mean Squared Error (MSE): Measures the average squared difference between predictions and actual values + mse = mean_squared_error(y_test, preds) + # R-squared (R²): Proportion of variance in the dependent variable explained by the model + r2 = r2_score(y_test, preds) + + # Print evaluation metrics + print(f"Mean Squared Error (MSE): {mse}") + print(f"R-squared (R²): {r2}") + + # **1. Residual Plot** + # Residual = Predicted - Actual + # Visualizes residuals to assess model bias + residuals = preds - y_test + plt.scatter(y_test, residuals) + plt.axhline(y=0, color='r', linestyle='--') # Reference line at residual=0 + plt.xlabel("True Values") + plt.ylabel("Residuals") + plt.title("Residual Plot") + plt.show() + + # **2. Predicted vs. True Values** + # Visualizes how well predictions align with actual values + plt.scatter(y_test, preds) + plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='r', linestyle='--') # Ideal fit line + plt.xlabel("True Values") + plt.ylabel("Predicted Values") + plt.title("True vs. Predicted Values") + plt.show() + + # **3. Learning Curve** + # Visualizes training and testing errors as the number of trees increases + train_errors = [] + test_errors = [] + for i, tree in enumerate(model.trees): + # Calculate predictions for training data using the first i+1 trees + partial_preds_train = np.sum([tree.predict(X_train) for tree in model.trees[:i+1]], axis=0) + train_errors.append(mean_squared_error(y_train, partial_preds_train)) + + # Calculate predictions for testing data using the first i+1 trees + partial_preds_test = np.sum([tree.predict(X_test) for tree in model.trees[:i+1]], axis=0) + test_errors.append(mean_squared_error(y_test, partial_preds_test)) + + # Plot the learning curve + plt.plot(train_errors, label='Training Error') + plt.plot(test_errors, label='Testing Error') + plt.xlabel("Number of Trees") + plt.ylabel("Mean Squared Error") + plt.title("Learning Curve") + plt.legend() # Add a legend to differentiate training and testing errors + plt.show() + + # Dummy assertion to validate test structure + # Ensures that predictions are returned as a NumPy array + assert isinstance(preds, np.ndarray), "Prediction is not an array" + + +# Run the test function if executed directly +if __name__ == "__main__": + test_predict() From e6c00acd7253efc6681ad8b56e59c939959cbc1a Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 14:05:50 -0600 Subject: [PATCH 03/19] Update README.md --- README.md | 68 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index f746e56..ec08baa 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,61 @@ -# Project 2 +## Team Members: +1. Badri Adusumalli A20530163 +2. Bhuvana Chandrika Natharga A20553587 +3. Santhosh Kumar Kathiresan A20546185 +4. 
Sriram Ravichandran A20583347 -Select one of the following two options: -## Boosting Trees -Implement the gradient-boosting tree algorithm (with the usual fit-predict interface) as described in Sections 10.9-10.10 of Elements of Statistical Learning (2nd Edition). Answer the questions below as you did for Project 1. -Put your README below. Answer the following questions. +## How to Run the Code -* What does the model you have implemented do and when should it be used? -* How did you test your model to determine if it is working reasonably correctly? -* What parameters have you exposed to users of your implementation in order to tune performance? (Also perhaps provide some basic usage examples.) -* Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental? +Follow the steps below to set up and run the code on any system. These instructions will guide you through downloading the repository, installing dependencies, and executing the tests. -## Model Selection +### Step 1: Download the Repository -Implement generic k-fold cross-validation and bootstrapping model selection methods. +1. First, download the repository from GitHub to your local machine. You can do this by either: + - Cloning the repository using `git clone` command (recommended): + ```bash + git clone https://github.com/your-username/your-repo-name.git + ``` + Replace `your-username/your-repo-name` with the actual URL of your GitHub repository. -In your README, answer the following questions: + - Alternatively, you can download the ZIP file from GitHub and extract it to your desired location. + +### Step 2: Open Git Bash and Navigate to the Project Folder -* Do your cross-validation and bootstrapping model selectors agree with a simpler model selector like AIC in simple cases (like linear regression)? -* In what cases might the methods you've written fail or give incorrect or undesirable results? -* What could you implement given more time to mitigate these cases or help users of your methods? -* What parameters have you exposed to your users in order to use your model selectors. +1. Open **Git Bash** (or any command line terminal that supports Git) on your computer. +2. Navigate to the directory where the project is located. For example: + ```bash + cd ~/music/project1 + ``` + In this example, we are assuming that the project is located in the `music/project1` directory. Replace this path with the actual path where you have downloaded the repository. -See sections 7.10-7.11 of Elements of Statistical Learning and the lecture notes. Pay particular attention to Section 7.10.2. +### Step 3: Install the Required Dependencies -As usual, above-and-beyond efforts will be considered for bonus points. +1. To run the project, you need to install the necessary dependencies listed in the `requirements.txt` file. +2. Use the following command to install all the required libraries: + ```bash + pip install -r requirements.txt + ``` + - This command tells `pip` to install all the packages specified in the `requirements.txt` file. Make sure you have **Python** and **pip** installed on your system. If not, you will need to install them first. + +### Step 4: Install the Project in "Editable" Mode Using `setup.py` + +1. To allow the project to be used in any location, install it in **editable mode**. This will let Python recognize the `elasticnet` module regardless of your current working directory. +2. Run the following command: + ```bash + pip install -e . 
+ ``` + - The `-e` flag stands for "editable," which allows changes to the source code to be reflected immediately without having to reinstall the package. + - The `.` specifies the current directory, where the `setup.py` file is located. + +### Step 5: Run the Tests to Verify the Installation + +1. Now that the dependencies are installed and the project is set up, you can run the tests to ensure everything is working correctly. +2. Execute the following command to run the test file: + ```bash + pytest -s elasticnet/tests/test_ElasticNetModel.py + ``` + - The `-s` flag ensures that any `print` statements in the test file are displayed in the terminal. + - `pytest` will run the test cases defined in `test_ElasticNetModel.py` to verify the functionality of your ElasticNet implementation. From a1595e04ba51b634eaa914caf8dc0dd2643294f4 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 14:10:38 -0600 Subject: [PATCH 04/19] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ec08baa..795e291 100644 --- a/README.md +++ b/README.md @@ -27,9 +27,9 @@ Follow the steps below to set up and run the code on any system. These instructi 1. Open **Git Bash** (or any command line terminal that supports Git) on your computer. 2. Navigate to the directory where the project is located. For example: ```bash - cd ~/music/project1 + cd ~/videos/project2 ``` - In this example, we are assuming that the project is located in the `music/project1` directory. Replace this path with the actual path where you have downloaded the repository. + In this example, we are assuming that the project is located in the `videos/project2` directory. Replace this path with the actual path where you have downloaded the repository. ### Step 3: Install the Required Dependencies @@ -55,7 +55,7 @@ Follow the steps below to set up and run the code on any system. These instructi 1. Now that the dependencies are installed and the project is set up, you can run the tests to ensure everything is working correctly. 2. Execute the following command to run the test file: ```bash - pytest -s elasticnet/tests/test_ElasticNetModel.py + pytest -s GradientBoosting/tests/test_GradientBoosting.py ``` - The `-s` flag ensures that any `print` statements in the test file are displayed in the terminal. - - `pytest` will run the test cases defined in `test_ElasticNetModel.py` to verify the functionality of your ElasticNet implementation. + - `pytest` will run the test cases defined in `test_GradientBoosting.py` to verify the functionality of your ElasticNet implementation. From 8cee57f51d691db60918db70e4096de0e4e0a130 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 14:11:53 -0600 Subject: [PATCH 05/19] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 795e291..412113e 100644 --- a/README.md +++ b/README.md @@ -58,4 +58,4 @@ Follow the steps below to set up and run the code on any system. These instructi pytest -s GradientBoosting/tests/test_GradientBoosting.py ``` - The `-s` flag ensures that any `print` statements in the test file are displayed in the terminal. - - `pytest` will run the test cases defined in `test_GradientBoosting.py` to verify the functionality of your ElasticNet implementation. + - `pytest` will run the test cases defined in `test_GradientBoosting.py` to verify the functionality of your GradientBoosting implementation. 
From 4cbf2cbe76d88507b75ec289f092fc75cf0762f8 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:04:00 -0600 Subject: [PATCH 06/19] Update GradientBoosting.py --- GradientBoosting/models/GradientBoosting.py | 60 +++++---------------- 1 file changed, 14 insertions(+), 46 deletions(-) diff --git a/GradientBoosting/models/GradientBoosting.py b/GradientBoosting/models/GradientBoosting.py index 0d64a42..bd08de5 100644 --- a/GradientBoosting/models/GradientBoosting.py +++ b/GradientBoosting/models/GradientBoosting.py @@ -36,28 +36,26 @@ def _grow_tree(self, X, y, depth=0): """ n_samples, n_features = X.shape - # Base case: If maximum depth is reached or only one sample remains if depth >= self.max_depth or n_samples <= 1: - # Create a leaf node with the mean value of the target variable leaf_value = np.mean(y) return {'leaf': leaf_value} - # Find the best feature and threshold to split the data best_split = self._find_best_split(X, y, n_features) - # If no valid split is found, return a leaf node if not best_split: leaf_value = np.mean(y) return {'leaf': leaf_value} - # Recursively grow left and right subtrees left_indices, right_indices = best_split['left_indices'], best_split['right_indices'] left_tree = self._grow_tree(X[left_indices], y[left_indices], depth + 1) right_tree = self._grow_tree(X[right_indices], y[right_indices], depth + 1) - # Return the split node - return {'feature': best_split['feature'], 'threshold': best_split['threshold'], 'left': left_tree, - 'right': right_tree} + return { + 'feature': best_split['feature'], + 'threshold': best_split['threshold'], + 'left': left_tree, + 'right': right_tree, + } def _find_best_split(self, X, y, n_features): """ @@ -71,23 +69,18 @@ def _find_best_split(self, X, y, n_features): Returns: - A dictionary containing the best split information, or None if no split is found. """ - y = np.array(y) # Ensure compatibility with NumPy indexing best_split = {} - min_mse = float('inf') # Start with a very high MSE + min_mse = float('inf') - # Iterate over each feature for feature_index in range(n_features): - thresholds = np.unique(X[:, feature_index]) # Get all unique values for the feature + thresholds = np.unique(X[:, feature_index]) for threshold in thresholds: - # Split data into left and right based on the threshold left_indices = np.where(X[:, feature_index] <= threshold)[0] right_indices = np.where(X[:, feature_index] > threshold)[0] - # Skip invalid splits if len(left_indices) == 0 or len(right_indices) == 0: continue - # Calculate mean squared error for the split mse = self._calculate_mse(y[left_indices], y[right_indices]) if mse < min_mse: min_mse = mse @@ -95,7 +88,7 @@ def _find_best_split(self, X, y, n_features): 'feature': feature_index, 'threshold': threshold, 'left_indices': left_indices, - 'right_indices': right_indices + 'right_indices': right_indices, } return best_split if best_split else None @@ -138,7 +131,6 @@ def _predict_sample(self, sample): """ node = self.tree while 'leaf' not in node: - # Traverse left or right based on the feature threshold if sample[node['feature']] <= node['threshold']: node = node['left'] else: @@ -169,38 +161,16 @@ def fit(self, X, y): Parameters: - X: Input features. - y: Target variable. - - Returns: - - GradientBoostingResults containing the fitted model. 
""" - # Initialize the first prediction as the mean of the target variable self.initial_prediction = np.mean(y) residuals = y - self.initial_prediction - # Train `n_estimators` decision trees for _ in range(self.n_estimators): tree = DecisionTree(max_depth=self.max_depth) - tree.fit(X, residuals) # Fit tree on current residuals - predictions = tree.predict(X) # Get predictions from the tree - residuals -= self.learning_rate * predictions # Update residuals - self.trees.append(tree) # Store the fitted tree - - return GradientBoostingResults(self.initial_prediction, self.trees, self.learning_rate) - - -class GradientBoostingResults: - def __init__(self, initial_prediction, trees, learning_rate): - """ - Store results of the Gradient Boosting model. - - Parameters: - - initial_prediction: The initial prediction (mean of the target variable). - - trees: List of fitted decision trees. - - learning_rate: Learning rate used for updating residuals. - """ - self.initial_prediction = initial_prediction - self.trees = trees - self.learning_rate = learning_rate + tree.fit(X, residuals) + predictions = tree.predict(X) + residuals -= self.learning_rate * predictions + self.trees.append(tree) def predict(self, X): """ @@ -210,11 +180,9 @@ def predict(self, X): - X: Input features. Returns: - - Predicted target values. + - Predicted target values as a NumPy array. """ - # Start with the initial prediction y_pred = np.full(X.shape[0], self.initial_prediction) - # Add predictions from all trees for tree in self.trees: y_pred += self.learning_rate * tree.predict(X) return y_pred From 043a1e5d6ab94d5a2b4deb26f29172e883b68439 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:04:37 -0600 Subject: [PATCH 07/19] Add files via upload --- GradientBoosting/models/Check.py | 53 +++++++++++++++++++++++++ GradientBoosting/models/grid_search.py | 54 ++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 GradientBoosting/models/Check.py create mode 100644 GradientBoosting/models/grid_search.py diff --git a/GradientBoosting/models/Check.py b/GradientBoosting/models/Check.py new file mode 100644 index 0000000..16c62ef --- /dev/null +++ b/GradientBoosting/models/Check.py @@ -0,0 +1,53 @@ +import numpy as np + + +def fill_if_null(data): + """ + Fill null values in a DataFrame with the mean of each column. + + Parameters: + - data: pandas DataFrame + + Returns: + - data: pandas DataFrame with nulls filled + """ + null_boy = np.array(data.columns[data.isnull().any()]) + for i in null_boy: + data[i] = data[i].fillna(data[i].mean()) + return data + + +def check_null(data): + """ + Check for null values in a DataFrame and fill them if found. + + Parameters: + - data: pandas DataFrame + + Returns: + - None: Prints the count of null values in each column. + """ + if data.isnull().values.any(): + fill_if_null(data) + print(data.isnull().sum()) + else: + print(data.isnull().sum()) + + +def XandY(data, target_column): + """ + Split the DataFrame into features (X) and target (Y). 
+ + Parameters: + - data: pandas DataFrame + - target_column: str, name of the target column + + Returns: + - X: NumPy array of features + - Y: NumPy array of target + """ + Y = data[target_column].to_numpy() + data.drop(target_column, axis=1, inplace=True) + X = data.to_numpy() + + return [X, Y] diff --git a/GradientBoosting/models/grid_search.py b/GradientBoosting/models/grid_search.py new file mode 100644 index 0000000..36eac82 --- /dev/null +++ b/GradientBoosting/models/grid_search.py @@ -0,0 +1,54 @@ +from GradientBoosting.models.GradientBoosting import GradientBoosting +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error +from itertools import product +import numpy as np + + +def grid_search(X, y, param_grid): + """ + Perform grid search to find the best hyperparameters for the Gradient Boosting model. + + Parameters: + - X: Input features (NumPy array or pandas DataFrame). + - y: Target variable (NumPy array or pandas Series). + - param_grid: Dictionary of hyperparameters to search, e.g., + {'n_estimators': [50, 100], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}. + + Returns: + - A dictionary containing the best hyperparameters and the corresponding evaluation metric. + """ + best_params = None + best_score = float('inf') # Lower score is better (MSE) + + # Generate all combinations of hyperparameters + keys, values = zip(*param_grid.items()) + param_combinations = [dict(zip(keys, v)) for v in product(*values)] + + for params in param_combinations: + # Create and fit the model with the current hyperparameters + model = GradientBoosting( + n_estimators=params['n_estimators'], + learning_rate=params['learning_rate'], + max_depth=params['max_depth'] + ) + + # Split the data into training and testing sets + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) + + # Fit the model + model.fit(X_train, y_train) + + # Evaluate the model on the test set + preds = model.predict(X_test) # Use the trained model for predictions + mse = mean_squared_error(y_test, preds) + + # Update the best parameters if the current score is better + if mse < best_score: + best_score = mse + best_params = params + + return { + 'best_params': best_params, + 'best_score': best_score + } From 2c3ca3b351de3aca53db9eb4626e65f367eb2b98 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:05:18 -0600 Subject: [PATCH 08/19] Update test_GradientBoosting.py --- .../tests/test_GradientBoosting.py | 218 +++++++++--------- 1 file changed, 115 insertions(+), 103 deletions(-) diff --git a/GradientBoosting/tests/test_GradientBoosting.py b/GradientBoosting/tests/test_GradientBoosting.py index db4888f..0cc7ad1 100644 --- a/GradientBoosting/tests/test_GradientBoosting.py +++ b/GradientBoosting/tests/test_GradientBoosting.py @@ -1,118 +1,130 @@ -import csv import numpy as np import pandas as pd -from GradientBoosting.models.GradientBoosting import GradientBoosting -from sklearn.model_selection import train_test_split -from sklearn.metrics import mean_squared_error, r2_score -from sklearn.preprocessing import OneHotEncoder, LabelEncoder import matplotlib.pyplot as plt +import seaborn as sns + +from GradientBoosting.models.GradientBoosting import GradientBoosting +from GradientBoosting.models.grid_search import grid_search +from GradientBoosting.models.Check import check_null, XandY def test_predict(): - # Initialize GradientBoosting model - # n_estimators: Number of trees in the ensemble - # learning_rate: Step size for updating 
residuals - # max_depth: Maximum depth of each decision tree - model = GradientBoosting(n_estimators=100, learning_rate=0.1, max_depth=3) - - # Load data from CSV file - # csv_file_path: Path to the dataset - csv_file_path = "GradientBoosting/tests/small_test.csv" - df = pd.read_csv(csv_file_path) # Load data into a pandas DataFrame - - # Separating features (X) and target variable (y) - # X contains all columns except 'y', which is the target - X = df.drop(columns=['y']) - y = df['y'] - - # Handling categorical columns in X - # Identifies categorical columns and applies OneHotEncoding - categorical_cols = X.select_dtypes(include=['object', 'category']).columns - if len(categorical_cols) > 0: - encoder = OneHotEncoder(sparse=False, drop='first') # OneHotEncoder avoids collinearity - # Encode categorical columns and add to DataFrame - X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]), index=X.index) - X_encoded.columns = encoder.get_feature_names_out(categorical_cols) - X = X.drop(columns=categorical_cols) # Drop original categorical columns - X = pd.concat([X, X_encoded], axis=1) # Add encoded columns to X - - # Handling categorical target variable y - # Converts target variable y into numeric using LabelEncoder if categorical - if y.dtype == 'object': - label_encoder = LabelEncoder() - y = label_encoder.fit_transform(y) - - # Convert y to NumPy array to avoid index mismatches - y = np.array(y) - - # Split data into training and testing sets (80-20 split) - # Training set is 80%, Testing set is 20% - X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.20, random_state=42) - - # Fit the GradientBoosting model on training data - # Model learns the relationship between X_train and y_train - results = model.fit(X_train, y_train) - - # Predict using the fitted model on the test set - preds = results.predict(X_test) + """ + Test the GradientBoosting model with a dataset, evaluate its performance, and visualize results. + """ + + #! If you are going to use "pytest", enable this block + # file_path = "GradientBoosting/tests/small_test.csv" + # df = pd.read_csv(file_path) + # target = 'y' + + #! Comment it out if you are using "pytest" + file_path = input("Please enter the path to your dataset file: ") + + try: + if file_path.endswith('.csv'): + df = pd.read_csv(file_path) + else: + print("Unsupported file format. Please provide a CSV, Excel, JSON, or Parquet file.") + return + except FileNotFoundError: + print("File not found. Please check the path and try again.") + return + + print("\n" + "=" * 40) + print("Dataset Preview:") + print("=" * 40) + print(df.head()) + + #! Uncomment this block if using "pytest" + # target = 'y' + + #! 
Comment out this block if using "pytest" + target = input("Enter the target column name: ") + + # Check and handle null values + check_null(df) + + # Split data into features (X) and target (Y) + X, Y = XandY(df, target) + + # Split data into training and testing sets + np.random.seed(42) + shuffled_indices = np.random.permutation(X.shape[0]) + train_size = int(0.8 * len(shuffled_indices)) + train_indices, test_indices = shuffled_indices[:train_size], shuffled_indices[train_size:] + X_train, X_test = X[train_indices], X[test_indices] + y_train, y_test = Y[train_indices], Y[test_indices] + + # Define hyperparameters for grid search + param_grid = { + 'n_estimators': [50, 100, 150], + 'learning_rate': [0.05, 0.1, 0.2], + 'max_depth': [3, 5, 7] + } + + # Perform grid search to find the best hyperparameters + grid_results = grid_search(X_train, y_train, param_grid) + best_params = grid_results['best_params'] + + print("\n" + "=" * 40) + print("Best Parameters from Grid Search") + print("=" * 40) + print(f"Number of Estimators: {best_params['n_estimators']}") + print(f"Learning Rate: {best_params['learning_rate']}") + print(f"Maximum Depth: {best_params['max_depth']}") + print(f"Best MSE: {grid_results['best_score']:.4f}") + print("=" * 40) + + # Initialize the model with the best parameters + final_model = GradientBoosting( + n_estimators=best_params['n_estimators'], + learning_rate=best_params['learning_rate'], + max_depth=best_params['max_depth'] + ) + + # Train the final model + final_model.fit(X_train, y_train) + final_predictions = final_model.predict(X_test) # Calculate evaluation metrics - # Mean Squared Error (MSE): Measures the average squared difference between predictions and actual values - mse = mean_squared_error(y_test, preds) - # R-squared (R²): Proportion of variance in the dependent variable explained by the model - r2 = r2_score(y_test, preds) - - # Print evaluation metrics - print(f"Mean Squared Error (MSE): {mse}") - print(f"R-squared (R²): {r2}") - - # **1. Residual Plot** - # Residual = Predicted - Actual - # Visualizes residuals to assess model bias - residuals = preds - y_test - plt.scatter(y_test, residuals) - plt.axhline(y=0, color='r', linestyle='--') # Reference line at residual=0 - plt.xlabel("True Values") - plt.ylabel("Residuals") - plt.title("Residual Plot") + mse = np.mean((y_test - final_predictions) ** 2) + rmse = np.sqrt(mse) + r2 = 1 - (np.sum((y_test - final_predictions) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)) + + print("\n" + "=" * 40) + print("Final Model Evaluation") + print("=" * 40) + print(f"Mean Squared Error (MSE): {mse:.4f}") + print(f"Root Mean Squared Error (RMSE): {rmse:.4f}") + print(f"R² Score: {r2:.4f}") + print("=" * 40) + + # Visualization 1: Density Plot of Actual vs Predicted Values + plt.figure(figsize=(8, 6)) + sns.kdeplot(y_test, color='blue', fill=True, label='Actual Values') + sns.kdeplot(final_predictions, color='blue', fill=True, label='Predicted Values') + plt.title('Density Plot of Actual vs Predicted Values') + plt.xlabel('Values') + plt.ylabel('Density') + plt.legend() + plt.grid(True) plt.show() - # **2. Predicted vs. True Values** - # Visualizes how well predictions align with actual values - plt.scatter(y_test, preds) - plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='r', linestyle='--') # Ideal fit line - plt.xlabel("True Values") - plt.ylabel("Predicted Values") - plt.title("True vs. 
Predicted Values") + # Visualization 2: Prediction Error Plot + plt.figure(figsize=(8, 6)) + plt.scatter(y_test, final_predictions, color='green', label='Predicted Values', alpha=0.6) + plt.plot( + [min(y_test), max(y_test)], [min(y_test), max(y_test)], + color='red', linestyle='--', label='Perfect Prediction' + ) + plt.xlabel('Actual Values') + plt.ylabel('Predicted Values') + plt.title('Prediction Error Plot') + plt.legend() + plt.grid(True) plt.show() - # **3. Learning Curve** - # Visualizes training and testing errors as the number of trees increases - train_errors = [] - test_errors = [] - for i, tree in enumerate(model.trees): - # Calculate predictions for training data using the first i+1 trees - partial_preds_train = np.sum([tree.predict(X_train) for tree in model.trees[:i+1]], axis=0) - train_errors.append(mean_squared_error(y_train, partial_preds_train)) - - # Calculate predictions for testing data using the first i+1 trees - partial_preds_test = np.sum([tree.predict(X_test) for tree in model.trees[:i+1]], axis=0) - test_errors.append(mean_squared_error(y_test, partial_preds_test)) - - # Plot the learning curve - plt.plot(train_errors, label='Training Error') - plt.plot(test_errors, label='Testing Error') - plt.xlabel("Number of Trees") - plt.ylabel("Mean Squared Error") - plt.title("Learning Curve") - plt.legend() # Add a legend to differentiate training and testing errors - plt.show() - - # Dummy assertion to validate test structure - # Ensures that predictions are returned as a NumPy array - assert isinstance(preds, np.ndarray), "Prediction is not an array" - -# Run the test function if executed directly if __name__ == "__main__": test_predict() From 0807e4d0d52783f2ca53e69e529a8944ca051155 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:09:20 -0600 Subject: [PATCH 09/19] Update requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 904e466..aae35f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ numpy +pandas +seaborn +itertools csv pytest ipython From e5479743745df20751c9eda9ab062e0ea3e16ac7 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:09:52 -0600 Subject: [PATCH 10/19] Update setup.py --- setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9aa97b1..19fc316 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,10 @@ 'numpy', 'scikit-learn', 'pytest', - 'matplotlib' + 'matplotlib', + 'pandas', + 'seaborn', + 'ipython', + 'itertools' ], ) From 6bbd6c25236583cf6019f65c651614e638730480 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:16:38 -0600 Subject: [PATCH 11/19] Update README.md --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 412113e..ce61e51 100644 --- a/README.md +++ b/README.md @@ -59,3 +59,27 @@ Follow the steps below to set up and run the code on any system. These instructi ``` - The `-s` flag ensures that any `print` statements in the test file are displayed in the terminal. - `pytest` will run the test cases defined in `test_GradientBoosting.py` to verify the functionality of your GradientBoosting implementation. + +### **Step 6: Interactive Input During Testing** + +After running the test command, the script will prompt you to provide necessary inputs for testing the Gradient Boosting model: + +1. 
**Dataset File Path**: + - You will see the following prompt in the terminal: + ``` + Please enter the path to your dataset file: + ``` + - Enter the full path to the dataset file you want to use. Ensure the dataset is in one of the supported formats: + - `.csv` + - `.xlsx` + - `.json` + - `.parquet` + +2. **Target Column Name**: + - After entering the dataset file path, the script will display: + ``` + Enter the target column name: + ``` + - Enter the name of the target column (e.g., `y`) that you wish to use as the dependent variable for training the model. + +**Note**: Make sure to provide accurate paths and column names to avoid errors during execution. From 6309f9bd1af65145e141782d9b679a8b82665654 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:17:27 -0600 Subject: [PATCH 12/19] Update README.md --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index ce61e51..7b8faf9 100644 --- a/README.md +++ b/README.md @@ -69,11 +69,6 @@ After running the test command, the script will prompt you to provide necessary ``` Please enter the path to your dataset file: ``` - - Enter the full path to the dataset file you want to use. Ensure the dataset is in one of the supported formats: - - `.csv` - - `.xlsx` - - `.json` - - `.parquet` 2. **Target Column Name**: - After entering the dataset file path, the script will display: From 165d9f8199dc024d685c6433ebcf9681b2302cbf Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:17:54 -0600 Subject: [PATCH 13/19] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 7b8faf9..6d4d634 100644 --- a/README.md +++ b/README.md @@ -77,4 +77,3 @@ After running the test command, the script will prompt you to provide necessary ``` - Enter the name of the target column (e.g., `y`) that you wish to use as the dependent variable for training the model. -**Note**: Make sure to provide accurate paths and column names to avoid errors during execution. From 558272162a80d698dbf9c77a17572fa60b265d23 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:20:03 -0600 Subject: [PATCH 14/19] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 6d4d634..9fac8ab 100644 --- a/README.md +++ b/README.md @@ -77,3 +77,17 @@ After running the test command, the script will prompt you to provide necessary ``` - Enter the name of the target column (e.g., `y`) that you wish to use as the dependent variable for training the model. + +### **Overview** + +This project implements a Gradient Boosting model for regression tasks. Gradient Boosting is an ensemble learning method that builds a sequence of weak learners, typically decision trees, where each new learner focuses on correcting the residual errors of the previous ones. It is a powerful and flexible technique for regression problems, known for its ability to handle complex datasets and achieve high predictive accuracy. + +### **Key Features** + +- **Iterative Residual Correction**: The model improves predictions iteratively by minimizing the residual errors from previous models. +- **Decision Tree Base Learners**: Utilizes decision trees as weak learners, which are combined to form a strong predictive model. +- **Learning Rate Control**: Incorporates a learning rate to manage the contribution of each tree and prevent overfitting. 
+- **Hyperparameter Optimization**: Supports grid search to tune key hyperparameters such as the number of estimators, learning rate, and maximum tree depth for optimal performance. +- **Robustness and Flexibility**: Handles complex data structures, making it well-suited for various regression tasks, even with non-linear relationships. + + From 79802098ba675c0f1d7bd35e48198a36cabdf497 Mon Sep 17 00:00:00 2001 From: badri-4 Date: Thu, 21 Nov 2024 22:32:20 -0600 Subject: [PATCH 15/19] Update README.md --- README.md | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9fac8ab..bd28b60 100644 --- a/README.md +++ b/README.md @@ -80,14 +80,145 @@ After running the test command, the script will prompt you to provide necessary ### **Overview** +--- + This project implements a Gradient Boosting model for regression tasks. Gradient Boosting is an ensemble learning method that builds a sequence of weak learners, typically decision trees, where each new learner focuses on correcting the residual errors of the previous ones. It is a powerful and flexible technique for regression problems, known for its ability to handle complex datasets and achieve high predictive accuracy. ### **Key Features** +--- + - **Iterative Residual Correction**: The model improves predictions iteratively by minimizing the residual errors from previous models. - **Decision Tree Base Learners**: Utilizes decision trees as weak learners, which are combined to form a strong predictive model. - **Learning Rate Control**: Incorporates a learning rate to manage the contribution of each tree and prevent overfitting. - **Hyperparameter Optimization**: Supports grid search to tune key hyperparameters such as the number of estimators, learning rate, and maximum tree depth for optimal performance. -- **Robustness and Flexibility**: Handles complex data structures, making it well-suited for various regression tasks, even with non-linear relationships. +- **Robustness and Flexibility**: Handles complex data structures, making it well-suited for various regression tasks, even with non-linear relationships. + + +### Gradient Boosting Implementation + +--- + + + +### **1. What does the model you have implemented do, and when should it be used?** + +The Gradient Boosting model is designed to solve **regression tasks** by combining multiple weak learners (decision trees). It minimizes the error iteratively by learning from residuals, which are the differences between predicted and actual values in the dataset. Each new tree added to the model tries to correct the errors made by the previous trees. + +#### **Use Cases** +- **Non-linear Relationships**: Ideal for datasets where relationships between predictors and the target variable are not linear, making traditional linear models unsuitable. +- **High Dimensional Data**: Handles datasets with many features, even when those features have complex interactions. +- **Predictive Accuracy**: Frequently used in competitions (like Kaggle) due to its ability to provide state-of-the-art results in regression tasks. +- **Robustness**: Suitable for scenarios where overfitting must be controlled through learning rates and regularization. + +#### **When to Use It** +- When predictive accuracy is a priority. +- When your dataset exhibits non-linear relationships and interactions between variables. +- When interpretability is less critical (as Gradient Boosting models are complex compared to linear regression). 
+
+---
+
+### **3. What parameters have you exposed to users of your implementation in order to tune performance?**
+
+The implementation allows users to tune the following parameters for performance optimization:
+
+1. **Number of Estimators (`n_estimators`)**:
+   - Specifies the number of decision trees in the ensemble.
+   - More trees generally improve performance but increase computational cost and risk of overfitting.
+   - Example: `n_estimators = 50` or `n_estimators = 150`.
+
+2. **Learning Rate (`learning_rate`)**:
+   - Controls the contribution of each tree to the overall prediction.
+   - A smaller learning rate requires more trees to achieve the same performance but improves generalization.
+   - Example: `learning_rate = 0.05`.
+
+3. **Maximum Depth of Trees (`max_depth`)**:
+   - Restricts the depth of each decision tree, controlling its complexity.
+   - A deeper tree captures more intricate patterns but increases the risk of overfitting.
+   - Example: `max_depth = 3`.
+
+#### **Basic Usage Example**
+```python
+from GradientBoosting.models.GradientBoosting import GradientBoosting
+
+# Initialize the model
+model = GradientBoosting(n_estimators=100, learning_rate=0.1, max_depth=3)
+
+# Fit the model to training data
+model.fit(X_train, y_train)
+
+# Make predictions
+predictions = model.predict(X_test)
+```
+
+#### **Hyperparameter Tuning**
+- Use `grid_search` to automatically find the optimal combination of parameters:
+```python
+from GradientBoosting.models.grid_search import grid_search
+
+param_grid = {
+    'n_estimators': [50, 100, 150],
+    'learning_rate': [0.05, 0.1, 0.2],
+    'max_depth': [3, 5, 7]
+}
+
+best_params = grid_search(X_train, y_train, param_grid)
+```
+
+---
+
+### **4. Are there specific inputs that your implementation has trouble with? Given more time, could you work around these or is it fundamental?**
+
+#### **Current Limitations**:
+1. **Categorical Features**:
+   - The model currently expects numeric inputs and does not support automatic encoding of categorical variables.
+   - **Workaround**: Preprocess categorical data using `OneHotEncoder` or similar techniques before passing it to the model (see the sketch at the end of this section).
+   - **Future Enhancement**: Integrate categorical feature support directly into the model.
+
+2. **Outliers**:
+   - Extreme outliers in the dataset can skew the residuals, affecting the performance of subsequent trees.
+   - **Workaround**: Use preprocessing steps such as outlier removal or robust scaling before fitting the model.
+
+3. **Imbalanced Datasets**:
+   - The current implementation is not optimized for datasets with highly imbalanced target distributions.
+   - **Workaround**: Use techniques like oversampling, undersampling, or appropriate evaluation metrics to address imbalance.
+
+4. **Computational Cost**:
+   - The model may become computationally expensive for large datasets or when using a high number of estimators.
+   - **Workaround**: Use a smaller learning rate and fewer estimators while monitoring performance. Parallelize tree building if possible.
+
+#### **Future Directions**:
+- **Feature Engineering**: Automate feature preprocessing (e.g., handling categorical data and missing values).
+- **Early Stopping**: Implement early stopping to halt training when performance ceases to improve on validation data.
+- **Explainability**: Add tools to interpret feature importance for better model explainability.
+
+
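The categorical-feature and outlier workarounds above can be applied with pandas and NumPy before calling `fit`. The snippet below is a hedged sketch: the file name `train.csv`, the target column `y`, and the clipping percentiles are illustrative assumptions, not fixed choices of the package:

```python
import numpy as np
import pandas as pd

from GradientBoosting.models.GradientBoosting import GradientBoosting

# Hypothetical dataset with a numeric target column "y" and a mix of numeric
# and categorical feature columns.
df = pd.read_csv("train.csv")

# One-hot encode categorical columns, since the model expects numeric inputs.
X = pd.get_dummies(df.drop(columns=["y"]), dtype=float).to_numpy()
y = df["y"].to_numpy(dtype=float)

# Clip extreme target outliers so they do not dominate the residuals.
low, high = np.percentile(y, [1, 99])
y = np.clip(y, low, high)

model = GradientBoosting(n_estimators=100, learning_rate=0.1, max_depth=3)
model.fit(X, y)
```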

From a1db1d510947c63f35cecf4b7baeded0802b03e Mon Sep 17 00:00:00 2001
From: badri-4
Date: Thu, 21 Nov 2024 22:43:54 -0600
Subject: [PATCH 16/19] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bd28b60..0e1c3a2 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-## Team Members:
+## Team Members (Team Falcon):
 1. Badri Adusumalli A20530163
 2. Bhuvana Chandrika Natharga A20553587
 3. Santhosh Kumar Kathiresan A20546185

From 8cbb7bb8ef8ef56406e3eb226d18d2bcefd65001 Mon Sep 17 00:00:00 2001
From: badri-4
Date: Thu, 21 Nov 2024 22:44:19 -0600
Subject: [PATCH 17/19] Update requirements.txt

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index aae35f6..d9ec53e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 numpy
 pandas
 seaborn
-itertools
 csv
 pytest
 ipython

From 3335d57f1364567870e0116798cf9c8ea3357838 Mon Sep 17 00:00:00 2001
From: badri-4
Date: Thu, 21 Nov 2024 22:44:40 -0600
Subject: [PATCH 18/19] Update setup.py

---
 setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 19fc316..5c36803 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,6 @@
         'matplotlib',
         'pandas',
         'seaborn',
-        'ipython',
-        'itertools'
+        'ipython'
     ],
 )

From a59f20e42aab3fdc70c723fba4db80b911a46858 Mon Sep 17 00:00:00 2001
From: badri-4
Date: Thu, 21 Nov 2024 22:48:49 -0600
Subject: [PATCH 19/19] Update requirements.txt

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d9ec53e..0b6271d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 numpy
 pandas
 seaborn
-csv
 pytest
 ipython
 matplotlib