Mo-Khater · Mohamed-Ashraf273 · May 3, 2026 · May 3, 2026 · May 3, 2026 · May 3, 2026
diff --git a/.gitignore b/.gitignore
@@ -37,3 +37,5 @@ models/*.joblib
 models/*.onnx
 
 src/data/*
+.env
+mlruns/
diff --git a/Makefile b/Makefile
@@ -9,7 +9,9 @@ CLEANED_DATA ?= $(PROCESSED_DIR)/cleaned_data.csv
 REPORTS_DIR ?= reports
 OVERWRITE ?=
 PIPELINE_OVERWRITE ?= --overwrite
+PIPELINE_RUNTIME_ID := $(or $(PIPELINE_RUNTIME_ID),$(shell date -u +%Y%m%dT%H%M%SZ))
 PY_FILES := $(shell find src tests -type f -name "*.py")
+export PIPELINE_RUNTIME_ID  # frozen once at parse time so every step shares one run ID
 
 .DEFAULT_GOAL := help
 
@@ -45,7 +47,7 @@ check-python:
 	$(PYTHON) test_environment.py
 
 acquisition:
-	$(PYTHON) $(SRC_DIR)/data/data_acquisition.py
+	PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.data.data_acquisition
 
 cleaning:
 	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.features.preprocessing $(OVERWRITE)
@@ -72,10 +74,10 @@ validate:
 reports: validate
 
 evaluate:
-	$(PYTHON) $(SRC_DIR)/models/predict.py
+	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.models.predict
 
 train:
-	$(PYTHON) $(SRC_DIR)/models/train.py
+	PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.models.train
 
 test:
 	$(PYTHON) -m pytest tests -v
@@ -101,12 +103,16 @@ clean:
 	find . -type f -name "*.py[co]" -delete
 
 pipeline:
-	$(MAKE) validate
-	$(MAKE) cleaning OVERWRITE="$(PIPELINE_OVERWRITE)"
-	$(MAKE) feature_engineering OVERWRITE="$(PIPELINE_OVERWRITE)"
-	$(MAKE) feature_selection OVERWRITE="$(PIPELINE_OVERWRITE)"
-	$(MAKE) eda
-	$(MAKE) train
-	$(MAKE) evaluate
+	@if [ ! -f "$(CLEANED_DATA)" ]; then \
+		echo "Missing $(CLEANED_DATA). Running acquisition first."; \
+		$(MAKE) acquisition PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"; \
+	fi
+	$(MAKE) validate PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
+	$(MAKE) cleaning OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
+	$(MAKE) feature_engineering OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
+	$(MAKE) feature_selection OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
+	$(MAKE) eda PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
+	$(MAKE) train PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
+	$(MAKE) evaluate PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
 
 all: check-python check lint test pipeline
diff --git a/app/House-Price-ntelligence/deployed_model.pkl b/app/House-Price-ntelligence/deployed_model.pkl
diff --git a/config.py b/config.py
@@ -0,0 +1,91 @@
+from pathlib import Path
+
+from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.tree import DecisionTreeClassifier
+
+PROJECT_ROOT = Path(__file__).resolve().parent
+SRC_DIR = PROJECT_ROOT / "src"
+PACKAGE_DIR = SRC_DIR / "house_price_class_prediction"
+
+DATA_DIR = PROJECT_ROOT / "data"
+RAW_DIR = DATA_DIR / "raw"
+INTERIM_DIR = DATA_DIR / "interim"
+PROCESSED_DIR = DATA_DIR / "processed"
+TEMP_DIR = DATA_DIR / "temp"
+
+REPORTS_DIR = PROJECT_ROOT / "reports"
+FIGURES_DIR = REPORTS_DIR / "figures"
+PIPELINE_LOG_PATH = PROJECT_ROOT / "pipeline.log"
+
+MODELS_DIR = PROJECT_ROOT / "models"
+TEST_MODEL_PATH = MODELS_DIR / "test_model.pkl"
+
+APP_DIR = PROJECT_ROOT / "app" / "House-Price-ntelligence"
+DEPLOYED_MODEL_PATH = APP_DIR / "deployed_model.pkl"
+
+MLFLOW_DB_PATH = PACKAGE_DIR / "models" / "mlflow.db"
+
+SELECTED_MODEL_NAME = "random_forest"
+MODEL_CV_FOLDS = 5
+MODEL_SCORING = "f1_macro"
+MODEL_N_JOBS = -1
+
+LOG_MODEL_PIP_REQUIREMENTS = [
+    "mlflow",
+    "scikit-learn",
+    "pandas",
+    "numpy",
+]
+
+MODEL_CONFIGS = {
+    "naive": {
+        "estimator": GaussianNB(),
+        "params": {},
+    },
+    "logistic": {
+        "estimator": Pipeline(
+            [
+                ("scaler", StandardScaler()),
+                (
+                    "model",
+                    LogisticRegression(
+                        random_state=42, class_weight="balanced", max_iter=5000
+                    ),
+                ),
+            ]
+        ),
+        "params": {"model__C": [0.1, 1, 10]},
+    },
+    "knn": {
+        "estimator": Pipeline(
+            [("scaler", StandardScaler()), ("model", KNeighborsClassifier())]
+        ),
+        "params": {
+            "model__n_neighbors": [3, 5, 7, 9, 11, 13],
+            "model__weights": ["uniform", "distance"],
+            "model__metric": ["euclidean", "manhattan"],  # minkowski p=2 == euclidean
+        },
+    },
+    "decision_tree": {
+        "estimator": DecisionTreeClassifier(random_state=42, class_weight="balanced"),
+        "params": {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5]},
+    },
+    "random_forest": {
+        "estimator": RandomForestClassifier(random_state=42, class_weight="balanced"),
+        "params": {
+            "n_estimators": [50, 100],
+            "max_depth": [5, 10],
+            "min_samples_split": [5, 10],
+            "min_samples_leaf": [2, 4],
+        },
+    },
+    "ada_boost": {
+        "estimator": AdaBoostClassifier(random_state=42),
+        "params": {"n_estimators": [50, 100], "learning_rate": [0.5, 1.0]},
+    },
+}
diff --git a/docs/latex/main.pdf b/docs/latex/main.pdf
diff --git a/docs/latex/main.tex b/docs/latex/main.tex
@@ -2,7 +2,15 @@
 \input{preamble.tex}
 
 \title{\projectname\\Preprocessing, Feature Engineering,EDA and model Documentation}
-\author{}
+\author{
+Mohamed Ashraf\\ID: 9220658
+\and
+Nesma Osama\\ID: 9220912
+\and
+Mazen Adel\\ID: 9220625
+\and
+Mohamed Khater\\ID: 9220713
+}
 \date{April 17, 2026}
 
 \begin{document}
@@ -28,27 +36,29 @@ \section{Pipeline Overview}
     \item explore distributions, pairwise relationships, and class separation in the training set.
 \end{enumerate}
 
+\input{sections/business_problem}
 \input{sections/preprocessing}
 \input{sections/build_features}
 \input{sections/eda}
 \input{sections/models}
 \input{sections/results}
+\input{sections/testing}
 \input{sections/ci}
 
 \section{Artifacts}
-\begin{longtable}{p{0.34\textwidth} p{0.58\textwidth}}
+\begin{longtable}{>{\raggedright\arraybackslash}p{0.42\textwidth} p{0.50\textwidth}}
 \toprule
 Artifact & Description \\
 \midrule
-\texttt{data/processed/train\_data.csv} & Preprocessed training split before feature encoding. \\
-\texttt{data/processed/test\_data.csv} & Preprocessed test split before feature encoding. \\
-\texttt{data/processed/duplicate\_rows.csv} & Records removed as duplicates using the defined structural key. \\
-\texttt{data/processed/rejected\_rows.csv} & Records rejected because critical enrichment fields are missing. \\
-\texttt{data/processed/X\_train\_encoded.csv} & Encoded and engineered training features. \\
-\texttt{data/processed/X\_test\_encoded.csv} & Encoded and engineered test features. \\
-\texttt{data/processed/y\_train\_encoded.csv} & Numeric target labels for the training split. \\
-\texttt{data/processed/y\_test\_encoded.csv} & Numeric target labels for the test split. \\
-\texttt{reports/figures/01\_eda.png} to \texttt{33\_eda.png} & Exported EDA visualizations from the analysis notebook. \\
+\path{data/processed/train_data.csv} & Preprocessed training split before feature encoding. \\
+\path{data/processed/test_data.csv} & Preprocessed test split before feature encoding. \\
+\path{data/processed/duplicate_rows.csv} & Records removed as duplicates using the defined structural key. \\
+\path{data/processed/rejected_rows.csv} & Records rejected because critical enrichment fields are missing. \\
+\path{data/processed/X_train_encoded.csv} & Encoded and engineered training features. \\
+\path{data/processed/X_test_encoded.csv} & Encoded and engineered test features. \\
+\path{data/processed/y_train_encoded.csv} & Numeric target labels for the training split. \\
+\path{data/processed/y_test_encoded.csv} & Numeric target labels for the test split. \\
+\path{reports/figures/01_eda.png} to \path{33_eda.png} & Exported EDA visualizations from the analysis notebook. \\
 \bottomrule
 \end{longtable}
 

diff --git a/docs/latex/sections/business_problem.tex b/docs/latex/sections/business_problem.tex
@@ -0,0 +1,56 @@
+\section{Business Problem and Stakeholders}
+
+\subsection{Business Context}
+The project uses a United States residential real estate dataset to classify listings into four price categories: Budget, Mid-Range, Premium, and Luxury. The raw source contains more than 2.2 million listings, and the current modeling dataset uses a balanced 40,000-row sample enriched with city, weather, and nearby point-of-interest variables.
+
+Residential real estate decisions are time sensitive. Buyers, agents, listing platforms, and investors need to compare properties quickly, but raw listing prices alone do not explain whether a property belongs to a low-cost, average, premium, or luxury segment. Two houses with similar physical features can belong to different market segments depending on location, city density, climate, nearby services, and other contextual variables.
+
+\subsection{Business Problem}
+The business problem is to help real estate decision makers automatically identify the market segment of a residential property listing using its physical, geographic, demographic, climate, and amenity features.
+
+More specifically, the project answers the question:
+
+\begin{quote}
+Given a property's listing attributes and local context, can we predict whether the property belongs to the Budget, Mid-Range, Premium, or Luxury category?
+\end{quote}
+
+This is useful because an accurate price-category classifier can support faster listing review, better customer search filters, improved lead routing, and early detection of listings that may need human pricing review.
+
+\subsection{Business Objectives}
+The main business objectives are:
+\begin{itemize}
+    \item classify each residential listing into a clear price segment,
+    \item help buyers and agents compare similar properties within the same market tier,
+    \item support listing platforms in organizing search results and recommendations,
+    \item identify high-value Luxury listings with strong precision and recall,
+    \item flag uncertain adjacent categories, especially Mid-Range and Premium, for human review,
+    \item create a repeatable data and modeling pipeline that can be updated as new listings arrive.
+\end{itemize}
+
+\subsection{Stakeholders}
+\begin{longtable}{p{0.28\textwidth} p{0.62\textwidth}}
+\toprule
+Stakeholder & Interest in the Project \\
+\midrule
+Real estate platforms & Need consistent listing categories for search, recommendations, ranking, and marketplace analytics. \\
+Real estate agents and brokers & Need quick market-segment estimates to prioritize listings, advise sellers, and route leads. \\
+Property buyers & Need clearer filters and comparisons when searching for homes within a target budget or quality tier. \\
+Property sellers & Need feedback on how their listing is positioned in the market compared with similar properties. \\
+Investors and portfolio managers & Need fast screening of properties by segment before deeper financial analysis. \\
+Appraisers and pricing analysts & Can use the model output as an initial signal, while retaining human responsibility for final valuation. \\
+Lenders and mortgage teams & May use segment information as supporting context during risk review and property assessment. \\
+Data science and engineering team & Owns the data pipeline, validation checks, model training, experiment tracking, and deployment workflow. \\
+Compliance and governance reviewers & Need transparency about the data sources, model limitations, and appropriate use of predictions. \\
+\bottomrule
+\end{longtable}
+
+\subsection{How We Are Solving It}
+The project solves the business problem by building a model that learns the relationship between a property's listing details, location context, nearby services, climate conditions, and its market segment. Instead of using only the raw price, the solution combines property attributes with external context so each listing can be assigned to one of four practical categories: Budget, Mid-Range, Premium, or Luxury.
+
+The model is intended to work as a decision-support tool. It gives real estate teams an initial, consistent classification for each listing, helping them organize properties, prioritize high-value cases, and identify listings that may need closer human review. The detailed data preparation, feature engineering, model training, and evaluation steps are described in the following sections of the report.
+
+\subsection{Expected Business Value}
+The model provides a practical decision-support layer rather than replacing human real estate expertise. It can reduce manual screening time, improve consistency across listings, and highlight Luxury properties where misclassification has higher business impact. The current results also show that adjacent middle categories can be harder to separate, so the best operational use is automated support for clear cases plus human review for uncertain cases.
+
+\subsection{Risks and Limitations}
+The main limitations are missing values in original listing fields such as previous sold date, house size, bath, bed, and acre lot; duplicate records in the cleaned export; and outlier values flagged by validation rules. Because the target is derived from price quartiles, the categories are relative market segments rather than official appraisal classes. Model predictions should therefore be used as a screening and prioritization tool, not as a final property valuation.
diff --git a/docs/latex/sections/eda.tex b/docs/latex/sections/eda.tex
@@ -1,12 +1,12 @@
     \section{EDA}
 
     \begin{enumerate}
-        For example california has the highest num of luxuary houses 2509 while missouri show lowest number 74 so if the house from california it is probable to be luxuray
+        \item For example, California has the highest number of Luxury houses (2509), while Missouri has the lowest (74), so a house located in California is more likely to be Luxury.
 
         \begin{figure}[H]
             \centering
-            \includegraphics[width=0.88\textwidth]{sections/figures/city_price_cat.png}
-            \caption{city\_price\_cat}
+            \includegraphics[width=0.88\textwidth]{sections/figures/state_price_cat.png}
+            \caption{state\_price\_cat}
         \end{figure}
 
         \item this image draw a pairplot between weather data which shows the hist of the feature and the scatter plot with the other featre. For example avg\_humidity has positive realtion with avg\_precip. some features has skew like avg\_humidity while other has normal dist like avg\_temp.
@@ -41,7 +41,7 @@
 
         \begin{figure}[H]
             \centering
-            \includegraphics[width=0.88\textwidth]{docs/latex/sections/figures/city_vs_price.png}
-            \caption{state\_vs\_price\_cat}
+            \includegraphics[width=0.88\textwidth]{sections/figures/city_vs_price.png}
+            \caption{city\_vs\_price}
         \end{figure}
     \end{enumerate}
diff --git a/docs/latex/sections/figures/accuracy_test vs accuracy_training.png b/docs/latex/sections/figures/accuracy_test vs accuracy_training.png
diff --git a/docs/latex/sections/figures/f1_macro_test vs f1_macro_training.png b/docs/latex/sections/figures/f1_macro_test vs f1_macro_training.png
diff --git a/docs/latex/sections/figures/luxury_precision_test.png b/docs/latex/sections/figures/luxury_precision_test.png
diff --git a/docs/latex/sections/figures/luxury_recall_test.png b/docs/latex/sections/figures/luxury_recall_test.png
diff --git a/docs/latex/sections/figures/precision_macro_test vs recall_macro_test.png b/docs/latex/sections/figures/precision_macro_test vs recall_macro_test.png
diff --git a/docs/latex/sections/preprocessing.tex b/docs/latex/sections/preprocessing.tex
@@ -1,7 +1,5 @@
 \section{Preprocessing Pipeline}
 
-This section documents the preprocessing decisions applied before model training. It converts the working notes into a structured LaTeX report and reflects the logic used in the project preprocessing workflow.
-
 \subsection{Cleaning Steps and Justification}
 
 \subsubsection{Data Accuracy}
-Original file line number
+Diff line change
@@ Expand Up / @@ -37,3 +37,5 @@ models/*.joblib @@
     models/*.onnx
     src/data/*
+    .env
+    mlruns/