Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@ models/*.joblib
models/*.onnx

src/data/*
.env
mlruns/
26 changes: 16 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ CLEANED_DATA ?= $(PROCESSED_DIR)/cleaned_data.csv
REPORTS_DIR ?= reports
OVERWRITE ?=
PIPELINE_OVERWRITE ?= --overwrite
PIPELINE_RUNTIME_ID ?= $(shell date -u +%Y%m%dT%H%M%SZ)
# `?=` defines a recursively expanded variable, so the $(shell date ...) call
# above would be re-run on every reference and for every exported environment,
# letting the ID drift when a run crosses a second boundary. Re-assigning with
# `:=` freezes whichever value is in effect (the default timestamp, or one
# supplied through the environment or the command line) so that one make
# invocation uses exactly one ID.
PIPELINE_RUNTIME_ID := $(PIPELINE_RUNTIME_ID)
PY_FILES := $(shell find src tests -type f -name "*.py")
# Exported so that recipe commands and sub-makes see the same frozen ID.
export PIPELINE_RUNTIME_ID

.DEFAULT_GOAL := help

Expand Down Expand Up @@ -45,7 +47,7 @@ check-python:
$(PYTHON) test_environment.py

acquisition:
$(PYTHON) $(SRC_DIR)/data/data_acquisition.py
PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.data.data_acquisition

cleaning:
PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.features.preprocessing $(OVERWRITE)
Expand All @@ -72,10 +74,10 @@ validate:
reports: validate

evaluate:
$(PYTHON) $(SRC_DIR)/models/predict.py
PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.models.predict

train:
$(PYTHON) $(SRC_DIR)/models/train.py
PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.models.train

test:
$(PYTHON) -m pytest tests -v
Expand All @@ -101,12 +103,16 @@ clean:
find . -type f -name "*.py[co]" -delete

pipeline:
$(MAKE) validate
$(MAKE) cleaning OVERWRITE="$(PIPELINE_OVERWRITE)"
$(MAKE) feature_engineering OVERWRITE="$(PIPELINE_OVERWRITE)"
$(MAKE) feature_selection OVERWRITE="$(PIPELINE_OVERWRITE)"
$(MAKE) eda
$(MAKE) train
$(MAKE) evaluate
@if [ ! -f "$(CLEANED_DATA)" ]; then \
echo "Missing $(CLEANED_DATA). Running acquisition first."; \
$(MAKE) acquisition PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"; \
fi
$(MAKE) validate PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
$(MAKE) cleaning OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
$(MAKE) feature_engineering OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
$(MAKE) feature_selection OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
$(MAKE) eda PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
$(MAKE) train PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"
$(MAKE) evaluate PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"

all: check-python check lint test pipeline
4 changes: 2 additions & 2 deletions app/House-Price-ntelligence/deployed_model.pkl
Git LFS file not shown
91 changes: 91 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from pathlib import Path

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# ---------------------------------------------------------------------------
# Repository layout. Every path below is anchored to the directory that
# contains this file.
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path(__file__).resolve().parent
SRC_DIR = PROJECT_ROOT.joinpath("src")
PACKAGE_DIR = SRC_DIR.joinpath("house_price_class_prediction")

# Data directories, one per processing stage.
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR, INTERIM_DIR, PROCESSED_DIR, TEMP_DIR = (
    DATA_DIR / stage for stage in ("raw", "interim", "processed", "temp")
)

# Reports, figures and the pipeline log file.
REPORTS_DIR = PROJECT_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")
PIPELINE_LOG_PATH = PROJECT_ROOT.joinpath("pipeline.log")

# Serialized model locations.
MODELS_DIR = PROJECT_ROOT.joinpath("models")
TEST_MODEL_PATH = MODELS_DIR.joinpath("test_model.pkl")

# Application directory and the model file stored inside it.
APP_DIR = PROJECT_ROOT.joinpath("app", "House-Price-ntelligence")
DEPLOYED_MODEL_PATH = APP_DIR.joinpath("deployed_model.pkl")

# MLflow database file, kept under the package's models directory.
MLFLOW_DB_PATH = PACKAGE_DIR.joinpath("models", "mlflow.db")

# ---------------------------------------------------------------------------
# Model selection settings.
# ---------------------------------------------------------------------------
# Name of the chosen model; "random_forest" is one of the MODEL_CONFIGS keys.
SELECTED_MODEL_NAME = "random_forest"
# Presumably forwarded to the cross-validated search as cv / scoring / n_jobs
# (confirm at the call site in the training code).
MODEL_CV_FOLDS = 5
MODEL_SCORING = "f1_macro"
MODEL_N_JOBS = -1

# Package names used as pip requirements when a model is logged.
LOG_MODEL_PIP_REQUIREMENTS = ["mlflow", "scikit-learn", "pandas", "numpy"]

# Candidate classifiers keyed by model name. Each entry provides:
#   "estimator": an unfitted scikit-learn estimator or Pipeline
#   "params":    the hyper-parameter grid to search for that estimator
# Grid keys for the Pipeline entries carry the "model__" prefix because the
# classifier is registered in the pipeline under the step name "model".
MODEL_CONFIGS = {
    # Baseline with nothing to tune.
    "naive": {
        "estimator": GaussianNB(),
        "params": {},
    },
    # Features are standardized before the linear model.
    "logistic": {
        "estimator": Pipeline(
            [
                ("scaler", StandardScaler()),
                (
                    "model",
                    LogisticRegression(
                        random_state=42, class_weight="balanced", max_iter=5000
                    ),
                ),
            ]
        ),
        "params": {"model__C": [0.1, 1, 10]},
    },
    # Features are standardized before the distance-based model.
    "knn": {
        "estimator": Pipeline(
            [("scaler", StandardScaler()), ("model", KNeighborsClassifier())]
        ),
        "params": {
            "model__n_neighbors": [3, 5, 7, 9, 11, 13],
            "model__weights": ["uniform", "distance"],
            # "minkowski" is intentionally not listed: with the estimator's
            # default p=2 it is the same distance as "euclidean", so it only
            # duplicated every candidate and added fits without new information.
            "model__metric": ["euclidean", "manhattan"],
        },
    },
    "decision_tree": {
        "estimator": DecisionTreeClassifier(random_state=42, class_weight="balanced"),
        "params": {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5]},
    },
    "random_forest": {
        "estimator": RandomForestClassifier(random_state=42, class_weight="balanced"),
        "params": {
            "n_estimators": [50, 100],
            "max_depth": [5, 10],
            "min_samples_split": [5, 10],
            "min_samples_leaf": [2, 4],
        },
    },
    "ada_boost": {
        "estimator": AdaBoostClassifier(random_state=42),
        "params": {"n_estimators": [50, 100], "learning_rate": [0.5, 1.0]},
    },
}
Binary file modified docs/latex/main.pdf
Binary file not shown.
32 changes: 21 additions & 11 deletions docs/latex/main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,15 @@
\input{preamble.tex}

\title{\projectname\\Preprocessing, Feature Engineering, EDA, and Model Documentation}
\author{}
\author{
Mohamed Ashraf\\ID: 9220658
\and
Nesma Osama\\ID: 9220912
\and
Mazen Adel\\ID: 9220625
\and
Mohamed Khater\\ID: 9220713
}
\date{April 17, 2026}

\begin{document}
Expand All @@ -28,27 +36,29 @@ \section{Pipeline Overview}
\item explore distributions, pairwise relationships, and class separation in the training set.
\end{enumerate}

\input{sections/business_problem}
\input{sections/preprocessing}
\input{sections/build_features}
\input{sections/eda}
\input{sections/models}
\input{sections/results}
\input{sections/testing}
\input{sections/ci}

\section{Artifacts}
\begin{longtable}{p{0.34\textwidth} p{0.58\textwidth}}
\begin{longtable}{>{\raggedright\arraybackslash}p{0.42\textwidth} p{0.50\textwidth}}
\toprule
Artifact & Description \\
\midrule
\texttt{data/processed/train\_data.csv} & Preprocessed training split before feature encoding. \\
\texttt{data/processed/test\_data.csv} & Preprocessed test split before feature encoding. \\
\texttt{data/processed/duplicate\_rows.csv} & Records removed as duplicates using the defined structural key. \\
\texttt{data/processed/rejected\_rows.csv} & Records rejected because critical enrichment fields are missing. \\
\texttt{data/processed/X\_train\_encoded.csv} & Encoded and engineered training features. \\
\texttt{data/processed/X\_test\_encoded.csv} & Encoded and engineered test features. \\
\texttt{data/processed/y\_train\_encoded.csv} & Numeric target labels for the training split. \\
\texttt{data/processed/y\_test\_encoded.csv} & Numeric target labels for the test split. \\
\texttt{reports/figures/01\_eda.png} to \texttt{33\_eda.png} & Exported EDA visualizations from the analysis notebook. \\
\path{data/processed/train_data.csv} & Preprocessed training split before feature encoding. \\
\path{data/processed/test_data.csv} & Preprocessed test split before feature encoding. \\
\path{data/processed/duplicate_rows.csv} & Records removed as duplicates using the defined structural key. \\
\path{data/processed/rejected_rows.csv} & Records rejected because critical enrichment fields are missing. \\
\path{data/processed/X_train_encoded.csv} & Encoded and engineered training features. \\
\path{data/processed/X_test_encoded.csv} & Encoded and engineered test features. \\
\path{data/processed/y_train_encoded.csv} & Numeric target labels for the training split. \\
\path{data/processed/y_test_encoded.csv} & Numeric target labels for the test split. \\
\path{reports/figures/01_eda.png} to \path{33_eda.png} & Exported EDA visualizations from the analysis notebook. \\
\bottomrule
\end{longtable}

Expand Down
56 changes: 56 additions & 0 deletions docs/latex/sections/business_problem.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
\section{Business Problem and Stakeholders}

\subsection{Business Context}
The project uses a United States residential real estate dataset to classify listings into four price categories: Budget, Mid-Range, Premium, and Luxury. The raw source contains more than 2.2 million listings, and the current modeling dataset uses a balanced 40,000-row sample enriched with city, weather, and nearby point-of-interest variables.

Residential real estate decisions are time sensitive. Buyers, agents, listing platforms, and investors need to compare properties quickly, but raw listing prices alone do not explain whether a property belongs to a low-cost, average, premium, or luxury segment. Two houses with similar physical features can belong to different market segments depending on location, city density, climate, nearby services, and other contextual variables.

\subsection{Business Problem}
The business problem is to help real estate decision makers automatically identify the market segment of a residential property listing using its physical, geographic, demographic, climate, and amenity features.

More specifically, the project answers the question:

\begin{quote}
Given a property's listing attributes and local context, can we predict whether the property belongs to the Budget, Mid-Range, Premium, or Luxury category?
\end{quote}

This is useful because an accurate price-category classifier can support faster listing review, better customer search filters, improved lead routing, and early detection of listings that may need human pricing review.

\subsection{Business Objectives}
The main business objectives are:
\begin{itemize}
\item classify each residential listing into a clear price segment,
\item help buyers and agents compare similar properties within the same market tier,
\item support listing platforms in organizing search results and recommendations,
\item identify high-value Luxury listings with strong precision and recall,
\item flag uncertain adjacent categories, especially Mid-Range and Premium, for human review,
\item create a repeatable data and modeling pipeline that can be updated as new listings arrive.
\end{itemize}

\subsection{Stakeholders}
\begin{longtable}{p{0.28\textwidth} p{0.62\textwidth}}
\toprule
Stakeholder & Interest in the Project \\
\midrule
Real estate platforms & Need consistent listing categories for search, recommendations, ranking, and marketplace analytics. \\
Real estate agents and brokers & Need quick market-segment estimates to prioritize listings, advise sellers, and route leads. \\
Property buyers & Need clearer filters and comparisons when searching for homes within a target budget or quality tier. \\
Property sellers & Need feedback on how their listing is positioned in the market compared with similar properties. \\
Investors and portfolio managers & Need fast screening of properties by segment before deeper financial analysis. \\
Appraisers and pricing analysts & Can use the model output as an initial signal, while retaining human responsibility for final valuation. \\
Lenders and mortgage teams & May use segment information as supporting context during risk review and property assessment. \\
Data science and engineering team & Owns the data pipeline, validation checks, model training, experiment tracking, and deployment workflow. \\
Compliance and governance reviewers & Need transparency about the data sources, model limitations, and appropriate use of predictions. \\
\bottomrule
\end{longtable}

\subsection{How We Are Solving It}
The project solves the business problem by building a model that learns the relationship between a property's listing details, location context, nearby services, climate conditions, and its market segment. Instead of using only the raw price, the solution combines property attributes with external context so each listing can be assigned to one of four practical categories: Budget, Mid-Range, Premium, or Luxury.

The model is intended to work as a decision-support tool. It gives real estate teams an initial, consistent classification for each listing, helping them organize properties, prioritize high-value cases, and identify listings that may need closer human review. The detailed data preparation, feature engineering, model training, and evaluation steps are described in the following sections of the report.

\subsection{Expected Business Value}
The model provides a practical decision-support layer rather than replacing human real estate expertise. It can reduce manual screening time, improve consistency across listings, and highlight Luxury properties where misclassification has higher business impact. The current results also show that adjacent middle categories can be harder to separate, so the best operational use is automated support for clear cases plus human review for uncertain cases.

\subsection{Risks and Limitations}
The main limitations are missing values in original listing fields such as previous sold date, house size, bath, bed, and acre lot; duplicate records in the cleaned export; and outlier values flagged by validation rules. Because the target is derived from price quartiles, the categories are relative market segments rather than official appraisal classes. Model predictions should therefore be used as a screening and prioritization tool, not as a final property valuation.
10 changes: 5 additions & 5 deletions docs/latex/sections/eda.tex
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
\section{EDA}

\begin{enumerate}
For example california has the highest num of luxuary houses 2509 while missouri show lowest number 74 so if the house from california it is probable to be luxuray
\item For example, California has the highest number of Luxury houses (2509), while Missouri has the lowest (74), so a house located in California is likely to be Luxury.

\begin{figure}[H]
\centering
\includegraphics[width=0.88\textwidth]{sections/figures/city_price_cat.png}
\caption{city\_price\_cat}
\includegraphics[width=0.88\textwidth]{sections/figures/state_price_cat.png}
\caption{state\_price\_cat}
\end{figure}

\item This image shows a pair plot of the weather features: the histogram of each feature together with its scatter plots against the other features. For example, avg\_humidity has a positive relationship with avg\_precip. Some features are skewed, such as avg\_humidity, while others are approximately normally distributed, such as avg\_temp.
Expand Down Expand Up @@ -41,7 +41,7 @@

\begin{figure}[H]
\centering
\includegraphics[width=0.88\textwidth]{docs/latex/sections/figures/city_vs_price.png}
\caption{state\_vs\_price\_cat}
\includegraphics[width=0.88\textwidth]{sections/figures/city_vs_price.png}
\caption{city\_vs\_price}
\end{figure}
\end{enumerate}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/latex/sections/figures/luxury_precision_test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/latex/sections/figures/luxury_recall_test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 0 additions & 2 deletions docs/latex/sections/preprocessing.tex
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
\section{Preprocessing Pipeline}

This section documents the preprocessing decisions applied before model training. It converts the working notes into a structured LaTeX report and reflects the logic used in the project preprocessing workflow.

\subsection{Cleaning Steps and Justification}

\subsubsection{Data Accuracy}
Expand Down
Loading
Loading