diff --git a/.gitignore b/.gitignore index 5fc9761..9063744 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,5 @@ models/*.joblib models/*.onnx src/data/* +.env +mlruns/ \ No newline at end of file diff --git a/Makefile b/Makefile index 55b00d8..3b8891c 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,10 @@ CLEANED_DATA ?= $(PROCESSED_DIR)/cleaned_data.csv REPORTS_DIR ?= reports OVERWRITE ?= PIPELINE_OVERWRITE ?= --overwrite +# Resolve the run ID once at parse time so all stages of one run share it. +PIPELINE_RUNTIME_ID := $(or $(PIPELINE_RUNTIME_ID),$(shell date -u +%Y%m%dT%H%M%SZ)) PY_FILES := $(shell find src tests -type f -name "*.py") +export PIPELINE_RUNTIME_ID .DEFAULT_GOAL := help @@ -45,7 +48,7 @@ check-python: $(PYTHON) test_environment.py acquisition: - $(PYTHON) $(SRC_DIR)/data/data_acquisition.py + PYTHONPATH=.:$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.data.data_acquisition cleaning: PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.features.preprocessing $(OVERWRITE) @@ -72,10 +75,10 @@ validate: reports: validate evaluate: - $(PYTHON) $(SRC_DIR)/models/predict.py + PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.models.predict train: - $(PYTHON) $(SRC_DIR)/models/train.py + PYTHONPATH=$(PYTHONPATH) $(PYTHON) -m house_price_class_prediction.models.train test: $(PYTHON) -m pytest tests -v @@ -101,12 +104,16 @@ clean: find . -type f -name "*.py[co]" -delete pipeline: - $(MAKE) validate - $(MAKE) cleaning OVERWRITE="$(PIPELINE_OVERWRITE)" - $(MAKE) feature_engineering OVERWRITE="$(PIPELINE_OVERWRITE)" - $(MAKE) feature_selection OVERWRITE="$(PIPELINE_OVERWRITE)" - $(MAKE) eda - $(MAKE) train - $(MAKE) evaluate + @if [ ! -f "$(CLEANED_DATA)" ]; then \ + echo "Missing $(CLEANED_DATA). 
Running acquisition first."; \ + $(MAKE) acquisition PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)"; \ + fi + $(MAKE) validate PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" + $(MAKE) cleaning OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" + $(MAKE) feature_engineering OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" + $(MAKE) feature_selection OVERWRITE="$(PIPELINE_OVERWRITE)" PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" + $(MAKE) eda PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" + $(MAKE) train PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" + $(MAKE) evaluate PIPELINE_RUNTIME_ID="$(PIPELINE_RUNTIME_ID)" all: check-python check lint test pipeline diff --git a/app/House-Price-ntelligence/deployed_model.pkl b/app/House-Price-ntelligence/deployed_model.pkl index 305ad56..617aa6f 100644 --- a/app/House-Price-ntelligence/deployed_model.pkl +++ b/app/House-Price-ntelligence/deployed_model.pkl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ba53e118e21637edd36446a62d98b94d420a5ae08d7be787b564546e2b8b388 -size 11578613 +oid sha256:e724fc8a029291df885c15d93f9385f277634cf450c5110c9ae52915dfc53cd6 +size 10815739 diff --git a/config.py b/config.py new file mode 100644 index 0000000..13524aa --- /dev/null +++ b/config.py @@ -0,0 +1,91 @@ +from pathlib import Path + +from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.tree import DecisionTreeClassifier + +PROJECT_ROOT = Path(__file__).resolve().parent +SRC_DIR = PROJECT_ROOT / "src" +PACKAGE_DIR = SRC_DIR / "house_price_class_prediction" + +DATA_DIR = PROJECT_ROOT / "data" +RAW_DIR = DATA_DIR / "raw" +INTERIM_DIR = DATA_DIR / "interim" +PROCESSED_DIR = DATA_DIR / "processed" 
+TEMP_DIR = DATA_DIR / "temp" + +REPORTS_DIR = PROJECT_ROOT / "reports" +FIGURES_DIR = REPORTS_DIR / "figures" +PIPELINE_LOG_PATH = PROJECT_ROOT / "pipeline.log" + +MODELS_DIR = PROJECT_ROOT / "models" +TEST_MODEL_PATH = MODELS_DIR / "test_model.pkl" + +APP_DIR = PROJECT_ROOT / "app" / "House-Price-ntelligence" +DEPLOYED_MODEL_PATH = APP_DIR / "deployed_model.pkl" + +MLFLOW_DB_PATH = PACKAGE_DIR / "models" / "mlflow.db" + +SELECTED_MODEL_NAME = "random_forest" +MODEL_CV_FOLDS = 5 +MODEL_SCORING = "f1_macro" +MODEL_N_JOBS = -1 + +LOG_MODEL_PIP_REQUIREMENTS = [ + "mlflow", + "scikit-learn", + "pandas", + "numpy", +] + +MODEL_CONFIGS = { + "naive": { + "estimator": GaussianNB(), + "params": {}, + }, + "logistic": { + "estimator": Pipeline( + [ + ("scaler", StandardScaler()), + ( + "model", + LogisticRegression( + random_state=42, class_weight="balanced", max_iter=5000 + ), + ), + ] + ), + "params": {"model__C": [0.1, 1, 10]}, + }, + "knn": { + "estimator": Pipeline( + [("scaler", StandardScaler()), ("model", KNeighborsClassifier())] + ), + "params": { + "model__n_neighbors": [3, 5, 7, 9, 11, 13], + "model__weights": ["uniform", "distance"], + "model__metric": ["euclidean", "manhattan"],  # minkowski (p=2) == euclidean + }, + }, + "decision_tree": { + "estimator": DecisionTreeClassifier(random_state=42, class_weight="balanced"), + "params": {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5]}, + }, + "random_forest": { + "estimator": RandomForestClassifier(random_state=42, class_weight="balanced"), + "params": { + "n_estimators": [50, 100], + "max_depth": [5, 10], + "min_samples_split": [5, 10], + "min_samples_leaf": [2, 4], + }, + }, + "ada_boost": { + "estimator": AdaBoostClassifier(random_state=42), + "params": {"n_estimators": [50, 100], "learning_rate": [0.5, 1.0]}, + }, +} diff --git a/docs/latex/main.pdf b/docs/latex/main.pdf index e296d32..7f660cb 100644 Binary files a/docs/latex/main.pdf and b/docs/latex/main.pdf differ diff --git a/docs/latex/main.tex 
b/docs/latex/main.tex index 4e4c72b..224d9cc 100644 --- a/docs/latex/main.tex +++ b/docs/latex/main.tex @@ -2,7 +2,15 @@ \input{preamble.tex} \title{\projectname\\Preprocessing, Feature Engineering,EDA and model Documentation} -\author{} +\author{ +Mohamed Ashraf\\ID: 9220658 +\and +Nesma Osama\\ID: 9220912 +\and +Mazen Adel\\ID: 9220625 +\and +Mohamed Khater\\ID: 9220713 +} \date{April 17, 2026} \begin{document} @@ -28,27 +36,29 @@ \section{Pipeline Overview} \item explore distributions, pairwise relationships, and class separation in the training set. \end{enumerate} +\input{sections/business_problem} \input{sections/preprocessing} \input{sections/build_features} \input{sections/eda} \input{sections/models} \input{sections/results} +\input{sections/testing} \input{sections/ci} \section{Artifacts} -\begin{longtable}{p{0.34\textwidth} p{0.58\textwidth}} +\begin{longtable}{>{\raggedright\arraybackslash}p{0.42\textwidth} p{0.50\textwidth}} \toprule Artifact & Description \\ \midrule -\texttt{data/processed/train\_data.csv} & Preprocessed training split before feature encoding. \\ -\texttt{data/processed/test\_data.csv} & Preprocessed test split before feature encoding. \\ -\texttt{data/processed/duplicate\_rows.csv} & Records removed as duplicates using the defined structural key. \\ -\texttt{data/processed/rejected\_rows.csv} & Records rejected because critical enrichment fields are missing. \\ -\texttt{data/processed/X\_train\_encoded.csv} & Encoded and engineered training features. \\ -\texttt{data/processed/X\_test\_encoded.csv} & Encoded and engineered test features. \\ -\texttt{data/processed/y\_train\_encoded.csv} & Numeric target labels for the training split. \\ -\texttt{data/processed/y\_test\_encoded.csv} & Numeric target labels for the test split. \\ -\texttt{reports/figures/01\_eda.png} to \texttt{33\_eda.png} & Exported EDA visualizations from the analysis notebook. 
\\ +\path{data/processed/train_data.csv} & Preprocessed training split before feature encoding. \\ +\path{data/processed/test_data.csv} & Preprocessed test split before feature encoding. \\ +\path{data/processed/duplicate_rows.csv} & Records removed as duplicates using the defined structural key. \\ +\path{data/processed/rejected_rows.csv} & Records rejected because critical enrichment fields are missing. \\ +\path{data/processed/X_train_encoded.csv} & Encoded and engineered training features. \\ +\path{data/processed/X_test_encoded.csv} & Encoded and engineered test features. \\ +\path{data/processed/y_train_encoded.csv} & Numeric target labels for the training split. \\ +\path{data/processed/y_test_encoded.csv} & Numeric target labels for the test split. \\ +\path{reports/figures/01_eda.png} to \path{33_eda.png} & Exported EDA visualizations from the analysis notebook. \\ \bottomrule \end{longtable} diff --git a/docs/latex/sections/business_problem.tex b/docs/latex/sections/business_problem.tex new file mode 100644 index 0000000..704202f --- /dev/null +++ b/docs/latex/sections/business_problem.tex @@ -0,0 +1,56 @@ +\section{Business Problem and Stakeholders} + +\subsection{Business Context} +The project uses a United States residential real estate dataset to classify listings into four price categories: Budget, Mid-Range, Premium, and Luxury. The raw source contains more than 2.2 million listings, and the current modeling dataset uses a balanced 40,000-row sample enriched with city, weather, and nearby point-of-interest variables. + +Residential real estate decisions are time sensitive. Buyers, agents, listing platforms, and investors need to compare properties quickly, but raw listing prices alone do not explain whether a property belongs to a low-cost, average, premium, or luxury segment. 
Two houses with similar physical features can belong to different market segments depending on location, city density, climate, nearby services, and other contextual variables. + +\subsection{Business Problem} +The business problem is to help real estate decision makers automatically identify the market segment of a residential property listing using its physical, geographic, demographic, climate, and amenity features. + +More specifically, the project answers the question: + +\begin{quote} +Given a property's listing attributes and local context, can we predict whether the property belongs to the Budget, Mid-Range, Premium, or Luxury category? +\end{quote} + +This is useful because an accurate price-category classifier can support faster listing review, better customer search filters, improved lead routing, and early detection of listings that may need human pricing review. + +\subsection{Business Objectives} +The main business objectives are: +\begin{itemize} + \item classify each residential listing into a clear price segment, + \item help buyers and agents compare similar properties within the same market tier, + \item support listing platforms in organizing search results and recommendations, + \item identify high-value Luxury listings with strong precision and recall, + \item flag uncertain adjacent categories, especially Mid-Range and Premium, for human review, + \item create a repeatable data and modeling pipeline that can be updated as new listings arrive. +\end{itemize} + +\subsection{Stakeholders} +\begin{longtable}{p{0.28\textwidth} p{0.62\textwidth}} +\toprule +Stakeholder & Interest in the Project \\ +\midrule +Real estate platforms & Need consistent listing categories for search, recommendations, ranking, and marketplace analytics. \\ +Real estate agents and brokers & Need quick market-segment estimates to prioritize listings, advise sellers, and route leads. 
\\ +Property buyers & Need clearer filters and comparisons when searching for homes within a target budget or quality tier. \\ +Property sellers & Need feedback on how their listing is positioned in the market compared with similar properties. \\ +Investors and portfolio managers & Need fast screening of properties by segment before deeper financial analysis. \\ +Appraisers and pricing analysts & Can use the model output as an initial signal, while retaining human responsibility for final valuation. \\ +Lenders and mortgage teams & May use segment information as supporting context during risk review and property assessment. \\ +Data science and engineering team & Owns the data pipeline, validation checks, model training, experiment tracking, and deployment workflow. \\ +Compliance and governance reviewers & Need transparency about the data sources, model limitations, and appropriate use of predictions. \\ +\bottomrule +\end{longtable} + +\subsection{How We Are Solving It} +The project solves the business problem by building a model that learns the relationship between a property's listing details, location context, nearby services, climate conditions, and its market segment. Instead of using only the raw price, the solution combines property attributes with external context so each listing can be assigned to one of four practical categories: Budget, Mid-Range, Premium, or Luxury. + +The model is intended to work as a decision-support tool. It gives real estate teams an initial, consistent classification for each listing, helping them organize properties, prioritize high-value cases, and identify listings that may need closer human review. The detailed data preparation, feature engineering, model training, and evaluation steps are described in the following sections of the report. + +\subsection{Expected Business Value} +The model provides a practical decision-support layer rather than replacing human real estate expertise. 
It can reduce manual screening time, improve consistency across listings, and highlight Luxury properties where misclassification has higher business impact. The current results also show that adjacent middle categories can be harder to separate, so the best operational use is automated support for clear cases plus human review for uncertain cases. + +\subsection{Risks and Limitations} +The main limitations are missing values in original listing fields such as previous sold date, house size, bath, bed, and acre lot; duplicate records in the cleaned export; and outlier values flagged by validation rules. Because the target is derived from price quartiles, the categories are relative market segments rather than official appraisal classes. Model predictions should therefore be used as a screening and prioritization tool, not as a final property valuation. diff --git a/docs/latex/sections/eda.tex b/docs/latex/sections/eda.tex index a84f1f8..86717d0 100644 --- a/docs/latex/sections/eda.tex +++ b/docs/latex/sections/eda.tex @@ -1,12 +1,12 @@ \section{EDA} \begin{enumerate} - For example california has the highest num of luxuary houses 2509 while missouri show lowest number 74 so if the house from california it is probable to be luxuray + \item For example, California has the highest number of Luxury houses (2509) while Missouri shows the lowest (74), so a house located in California is more likely to be Luxury. \begin{figure}[H] \centering - \includegraphics[width=0.88\textwidth]{sections/figures/city_price_cat.png} - \caption{city\_price\_cat} + \includegraphics[width=0.88\textwidth]{sections/figures/state_price_cat.png} + \caption{state\_price\_cat} \end{figure} \item this image draw a pairplot between weather data which shows the hist of the feature and the scatter plot with the other featre. For example avg\_humidity has positive realtion with avg\_precip. some features has skew like avg\_humidity while other has normal dist like avg\_temp. 
@@ -41,7 +41,7 @@ \begin{figure}[H] \centering - \includegraphics[width=0.88\textwidth]{docs/latex/sections/figures/city_vs_price.png} - \caption{state\_vs\_price\_cat} + \includegraphics[width=0.88\textwidth]{sections/figures/city_vs_price.png} + \caption{city\_vs\_price} \end{figure} \end{enumerate} diff --git a/docs/latex/sections/figures/accuracy_test vs accuracy_training.png b/docs/latex/sections/figures/accuracy_test vs accuracy_training.png index 71e1436..83b8794 100644 Binary files a/docs/latex/sections/figures/accuracy_test vs accuracy_training.png and b/docs/latex/sections/figures/accuracy_test vs accuracy_training.png differ diff --git a/docs/latex/sections/figures/f1_macro_test vs f1_macro_training.png b/docs/latex/sections/figures/f1_macro_test vs f1_macro_training.png index de420db..93da244 100644 Binary files a/docs/latex/sections/figures/f1_macro_test vs f1_macro_training.png and b/docs/latex/sections/figures/f1_macro_test vs f1_macro_training.png differ diff --git a/docs/latex/sections/figures/luxury_precision_test.png b/docs/latex/sections/figures/luxury_precision_test.png index 2b2354e..8f52237 100644 Binary files a/docs/latex/sections/figures/luxury_precision_test.png and b/docs/latex/sections/figures/luxury_precision_test.png differ diff --git a/docs/latex/sections/figures/luxury_recall_test.png b/docs/latex/sections/figures/luxury_recall_test.png index 5abb743..8c4d799 100644 Binary files a/docs/latex/sections/figures/luxury_recall_test.png and b/docs/latex/sections/figures/luxury_recall_test.png differ diff --git a/docs/latex/sections/figures/precision_macro_test vs recall_macro_test.png b/docs/latex/sections/figures/precision_macro_test vs recall_macro_test.png index 95ca320..c18a903 100644 Binary files a/docs/latex/sections/figures/precision_macro_test vs recall_macro_test.png and b/docs/latex/sections/figures/precision_macro_test vs recall_macro_test.png differ diff --git a/docs/latex/sections/preprocessing.tex 
b/docs/latex/sections/preprocessing.tex index 7ab04d0..a19a472 100644 --- a/docs/latex/sections/preprocessing.tex +++ b/docs/latex/sections/preprocessing.tex @@ -1,7 +1,5 @@ \section{Preprocessing Pipeline} -This section documents the preprocessing decisions applied before model training. It converts the working notes into a structured LaTeX report and reflects the logic used in the project preprocessing workflow. - \subsection{Cleaning Steps and Justification} \subsubsection{Data Accuracy} diff --git a/docs/latex/sections/results.tex b/docs/latex/sections/results.tex index 9e8f053..a4d4e70 100644 --- a/docs/latex/sections/results.tex +++ b/docs/latex/sections/results.tex @@ -1,62 +1,102 @@ -\section{Results & Evaluation} - -This section describes results of all the models used in this problem - -\subsection{Model performance on training and test data} - -\begin{figure}[H] - \centering - \includegraphics[width=0.9\textwidth]{sections/figures/accuracy_test vs accuracy_training.png} - \caption{Training and Testing Accuracy.} +\section{Results \& Evaluation} + +This section describes results of all the models used in this problem + +\subsection{Model performance on training and test data} + +\begin{figure}[H] + \centering + \includegraphics[width=0.9\textwidth]{sections/figures/accuracy_test vs accuracy_training.png} + \caption{Training and Testing Accuracy.} +\end{figure} + + +\begin{figure}[H] + \centering + \includegraphics[width=0.9\textwidth]{sections/figures/f1_macro_test vs f1_macro_training.png} + \caption{Training and Testing f1 macro.} +\end{figure} + + + +\subsection{Evaluation Metrics} +\begin{itemize} + \item Luxury Precision (precision for Luxury class only) + \item Luxury Recall (recall for Luxury class only) +\end{itemize} +Luxury Precision measures the proportion of correctly identified luxury items among all items predicted as luxury. Low precision may lead to mislabeling non-luxury items, resulting in poor pricing or resource allocation decisions. 
+Luxury Recall measures the proportion of actual luxury items that are correctly identified. Low recall may cause the model to miss valuable luxury cases, leading to lost revenue opportunities. + +\begin{figure}[H] + \centering + \includegraphics[width=0.9\textwidth]{sections/figures/luxury_precision_test.png} + \caption{Testing luxury precision.} +\end{figure} +\begin{figure}[H] + \centering + \includegraphics[width=0.9\textwidth]{sections/figures/luxury_recall_test.png} + \caption{Testing luxury recall.} +\end{figure} + +\subsection{Chart} + + +\begin{figure}[H] + \centering + \includegraphics[width=0.9\textwidth]{sections/figures/f1_macro_val.png} + \caption{Cross validation f1 macro.} +\end{figure} + +\begin{figure}[H] + \centering + \includegraphics[width=0.9\textwidth]{sections/figures/precision_macro_test vs recall_macro_test.png} + \caption{Testing precision and recall macro.} \end{figure} +\subsection{Error analysis} +The updated confusion matrix is shown in Table~\ref{tab:confusion-matrix}. Rows represent the actual class and columns represent the predicted class. - -\begin{figure}[H] - \centering - \includegraphics[width=0.9\textwidth]{sections/figures/f1_macro_test vs f1_macro_training.png} - \caption{Training and Testing f1 macro.} -\end{figure} - - - -\subsection{Evaluation Metrics} -\begin{itemize} - \item Luxury Precision (precision for Luxury class only) - \item Luxury Recall (recall for Luxury class only) -\end{itemize} -Luxury Precision measures the proportion of correctly identified luxury items among all items predicted as luxury. Low precision may lead to mislabeling non-luxury items, resulting in poor pricing or resource allocation decisions. -Luxury Recall measures the proportion of actual luxury items that are correctly identified. Low recall may cause the model to miss valuable luxury cases, leading to lost revenue opportunities. 
- -\begin{figure}[H] - \centering - \includegraphics[width=0.9\textwidth]{sections/figures/luxury_precision_test.png} - \caption{Testing luxury precision.} -\end{figure} -\begin{figure}[H] - \centering - \includegraphics[width=0.9\textwidth]{sections/figures/luxury_recall_test.png} - \caption{Testing luxury recall.} -\end{figure} - -\subsection{Chart} - - -\begin{figure}[H] +\begin{table}[H] \centering - \includegraphics[width=0.9\textwidth]{sections/figures/f1_macro_val.png} - \caption{Cross validation f1 macro.} -\end{figure} + \begin{tabular}{lrrrr} + \toprule + Actual / Predicted & Class 0 & Class 1 & Class 2 & Class 3 \\ + \midrule + Class 0 (Budget) & 1367 & 401 & 67 & 15 \\ + Class 1 (Mid-Range) & 331 & 1077 & 381 & 27 \\ + Class 2 (Premium) & 44 & 399 & 1095 & 242 \\ + Class 3 (Luxury) & 10 & 70 & 392 & 1358 \\ + \bottomrule + \end{tabular} + \caption{Confusion matrix for the selected model.} + \label{tab:confusion-matrix} +\end{table} -\begin{figure}[H] - \centering - \includegraphics[width=0.9\textwidth]{sections/figures/precision_macro_test vs recall_macro_test.png} - \caption{Testing precision and recall macro.} -\end{figure} -\subsection{Error analysis} +The confusion matrix reveals that the model struggles primarily with distinguishing between adjacent classes. The most significant misclassifications occur between: \begin{itemize} - \item The model cannot correctly distinguish between adjacent categories - \item the hardest two class are class 1 and 2 + \item \textbf{Class 1 (Mid-Range) and Class 2 (Premium):} + 381 instances of class 1 are predicted as class 2, and 399 instances of class 2 are predicted as class 1. This represents the largest source of error. + + \item \textbf{Class 2 (Premium) and Class 3 (Luxury):} + 242 premium items are misclassified as luxury, and 392 luxury items are misclassified as premium, indicating overlap between high-end categories. 
+ + \item \textbf{Class 0 (Budget) and Class 1 (Mid-Range):} + 401 budget items are predicted as mid-range, and 331 mid-range items are predicted as budget, showing difficulty in separating lower-price segments. \end{itemize} -\subsection{Business-oriented interpretation} -The best model can predict two categories, which are category 0 and category 3, so we can use this model to predict them, but it requires human review for categories 1 and 2, or more features can be obtained to distinguish them. + + +\section{Business-Oriented Interpretation} + +The model demonstrates reliable performance for class 0 (Budget) and class 3 (Luxury), where both precision and recall are relatively high. This makes the model suitable for automated decision-making in these segments. + +However, the model shows significant confusion between class 1 (Mid-Range) and class 2 (Premium). From a business perspective, this may lead to: + +\begin{itemize} + \item Incorrect pricing decisions (overpricing or underpricing) + \item Reduced customer satisfaction +\end{itemize} + +Therefore, predictions for class 1 and class 2 should be treated with caution. These cases may require human review or additional validation. Further improvements can be achieved by incorporating more discriminative features or refining the model to better separate these closely related categories. + + + diff --git a/docs/latex/sections/testing.tex b/docs/latex/sections/testing.tex new file mode 100644 index 0000000..4c5dd65 --- /dev/null +++ b/docs/latex/sections/testing.tex @@ -0,0 +1,40 @@ +\section{Testing and Coverage} + +\subsection{Testing Objective} +The goal of the test suite is to verify that the project code behaves correctly across the most important data science workflow steps. The tests focus on failure-prone business logic such as target creation, stratified sampling, geospatial enrichment, validation rules, feature engineering, feature selection, model prediction, and visualization output. 
+ +Test coverage measures how much executable source code is exercised by automated tests. Coverage does not prove that the code is bug-free, but it helps identify modules, functions, and branches that are not currently verified by the test suite. + +\subsection{Latest Test Results} +The latest full test command was: +\begin{verbatim} +python -m pytest tests \ + --cov=house_price_class_prediction \ + --cov-report=term-missing +\end{verbatim} + +The result was: +\begin{itemize} + \item 33 tests passed, + \item total package coverage: 51\%, + \item main covered areas: validation rules, deterministic acquisition transforms, feature engineering helpers, POI counting, model prediction helpers, logging utilities, and core plotting behavior. +\end{itemize} + +\begin{longtable}{p{0.62\textwidth} p{0.20\textwidth}} +\toprule +Module & Coverage \\ +\midrule +\texttt{data.data\_acquisition} & 34\% \\ +\texttt{features.add\_schools\_pharmacies\_hospitals} & 31\% \\ +\texttt{features.build\_features} & 59\% \\ +\texttt{features.feature\_selection} & 65\% \\ +\texttt{features.preprocessing} & 42\% \\ +\texttt{models.predict} & 80\% \\ +\texttt{utils.logging\_utils} & 96\% \\ +\texttt{validation.validate\_data} & 80\% \\ +\texttt{visualization.visualize} & 37\% \\ +\bottomrule +\end{longtable} + +\subsection{Remaining Gaps} +The lowest-coverage modules are \path{features.add_schools_pharmacies_hospitals}, \path{data.data_acquisition}, \path{visualization.visualize}, and \path{features.preprocessing}. These modules include external IO, command-line entry points, large batch-processing paths, plotting branches, and expensive operations that require more mocking or fixtures. 
diff --git a/mlflow.db b/mlflow.db index 4e39882..644cc18 100644 Binary files a/mlflow.db and b/mlflow.db differ diff --git a/models/test_model.pkl b/models/test_model.pkl new file mode 100644 index 0000000..c80a08a Binary files /dev/null and b/models/test_model.pkl differ diff --git a/poetry.lock b/poetry.lock index aafcfc7..e00bf0a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -215,19 +215,6 @@ dev = ["duckdb (>=1.0)", "geopandas (>=0.14.3)", "hatch (>=1.13.0)", "ipykernel" doc = ["docutils", "jinja2", "myst-parser", "numpydoc", "pillow", "pydata-sphinx-theme (>=0.14.1)", "scipy", "scipy-stubs ; python_version >= \"3.10\"", "sphinx", "sphinx-autobuild", "sphinx-copybutton", "sphinx-design", "sphinxext-altair"] save = ["vl-convert-python (>=1.9.0)"] -[[package]] -name = "aniso8601" -version = "7.0.0" -description = "A library for parsing ISO 8601 strings." -optional = false -python-versions = "*" -groups = ["main"] -markers = "python_version >= \"3.12\"" -files = [ - {file = "aniso8601-7.0.0-py2.py3-none-any.whl", hash = "sha256:d10a4bf949f619f719b227ef5386e31f49a2b6d453004b21f02661ccc8670c7b"}, - {file = "aniso8601-7.0.0.tar.gz", hash = "sha256:513d2b6637b7853806ae79ffaca6f3e8754bdd547048f5ccc1420aec4b714f1e"}, -] - [[package]] name = "annotated-doc" version = "0.0.4" @@ -289,7 +276,7 @@ version = "26.3.1" description = "The uncompromising code formatter." 
optional = false python-versions = ">=3.10" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "black-26.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:86a8b5035fce64f5dcd1b794cf8ec4d31fe458cf6ce3986a30deb434df82a1d2"}, {file = "black-26.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5602bdb96d52d2d0672f24f6ffe5218795736dd34807fd0fd55ccd6bf206168b"}, @@ -678,7 +665,7 @@ version = "8.3.2" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.10" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "click-8.3.2-py3-none-any.whl", hash = "sha256:1924d2c27c5653561cd2cae4548d1406039cb79b858b747cfea24924bbc1616d"}, {file = "click-8.3.2.tar.gz", hash = "sha256:14162b8b3b3550a7d479eafa77dfd3c38d9dc8951f6f69c78913a8f9a7540fd5"}, @@ -705,12 +692,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["main"] -markers = "platform_system == \"Windows\"" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or sys_platform == \"win32\""} [[package]] name = "contourpy" @@ -804,6 +791,125 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.17.0)", " test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] +[[package]] +name = "coverage" +version = "7.13.5" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = 
"coverage-7.13.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e0723d2c96324561b9aa76fb982406e11d93cdb388a7a7da2b16e04719cf7ca5"}, + {file = "coverage-7.13.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52f444e86475992506b32d4e5ca55c24fc88d73bcbda0e9745095b28ef4dc0cf"}, + {file = "coverage-7.13.5-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:704de6328e3d612a8f6c07000a878ff38181ec3263d5a11da1db294fa6a9bdf8"}, + {file = "coverage-7.13.5-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a1a6d79a14e1ec1832cabc833898636ad5f3754a678ef8bb4908515208bf84f4"}, + {file = "coverage-7.13.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79060214983769c7ba3f0cee10b54c97609dca4d478fa1aa32b914480fd5738d"}, + {file = "coverage-7.13.5-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:356e76b46783a98c2a2fe81ec79df4883a1e62895ea952968fb253c114e7f930"}, + {file = "coverage-7.13.5-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0cef0cdec915d11254a7f549c1170afecce708d30610c6abdded1f74e581666d"}, + {file = "coverage-7.13.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dc022073d063b25a402454e5712ef9e007113e3a676b96c5f29b2bda29352f40"}, + {file = "coverage-7.13.5-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9b74db26dfea4f4e50d48a4602207cd1e78be33182bc9cbf22da94f332f99878"}, + {file = "coverage-7.13.5-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ad146744ca4fd09b50c482650e3c1b1f4dfa1d4792e0a04a369c7f23336f0400"}, + {file = "coverage-7.13.5-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:c555b48be1853fe3997c11c4bd521cdd9a9612352de01fa4508f16ec341e6fe0"}, + {file = "coverage-7.13.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7034b5c56a58ae5e85f23949d52c14aca2cfc6848a31764995b7de88f13a1ea0"}, + {file = 
"coverage-7.13.5-cp310-cp310-win32.whl", hash = "sha256:eb7fdf1ef130660e7415e0253a01a7d5a88c9c4d158bcf75cbbd922fd65a5b58"}, + {file = "coverage-7.13.5-cp310-cp310-win_amd64.whl", hash = "sha256:3e1bb5f6c78feeb1be3475789b14a0f0a5b47d505bfc7267126ccbd50289999e"}, + {file = "coverage-7.13.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66a80c616f80181f4d643b0f9e709d97bcea413ecd9631e1dedc7401c8e6695d"}, + {file = "coverage-7.13.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:145ede53ccbafb297c1c9287f788d1bc3efd6c900da23bf6931b09eafc931587"}, + {file = "coverage-7.13.5-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0672854dc733c342fa3e957e0605256d2bf5934feeac328da9e0b5449634a642"}, + {file = "coverage-7.13.5-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ec10e2a42b41c923c2209b846126c6582db5e43a33157e9870ba9fb70dc7854b"}, + {file = "coverage-7.13.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be3d4bbad9d4b037791794ddeedd7d64a56f5933a2c1373e18e9e568b9141686"}, + {file = "coverage-7.13.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4d2afbc5cc54d286bfb54541aa50b64cdb07a718227168c87b9e2fb8f25e1743"}, + {file = "coverage-7.13.5-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3ad050321264c49c2fa67bb599100456fc51d004b82534f379d16445da40fb75"}, + {file = "coverage-7.13.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7300c8a6d13335b29bb76d7651c66af6bd8658517c43499f110ddc6717bfc209"}, + {file = "coverage-7.13.5-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:eb07647a5738b89baab047f14edd18ded523de60f3b30e75c2acc826f79c839a"}, + {file = "coverage-7.13.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9adb6688e3b53adffefd4a52d72cbd8b02602bfb8f74dcd862337182fd4d1a4e"}, + {file = 
"coverage-7.13.5-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7c8d4bc913dd70b93488d6c496c77f3aff5ea99a07e36a18f865bca55adef8bd"}, + {file = "coverage-7.13.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0e3c426ffc4cd952f54ee9ffbdd10345709ecc78a3ecfd796a57236bfad0b9b8"}, + {file = "coverage-7.13.5-cp311-cp311-win32.whl", hash = "sha256:259b69bb83ad9894c4b25be2528139eecba9a82646ebdda2d9db1ba28424a6bf"}, + {file = "coverage-7.13.5-cp311-cp311-win_amd64.whl", hash = "sha256:258354455f4e86e3e9d0d17571d522e13b4e1e19bf0f8596bcf9476d61e7d8a9"}, + {file = "coverage-7.13.5-cp311-cp311-win_arm64.whl", hash = "sha256:bff95879c33ec8da99fc9b6fe345ddb5be6414b41d6d1ad1c8f188d26f36e028"}, + {file = "coverage-7.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:460cf0114c5016fa841214ff5564aa4864f11948da9440bc97e21ad1f4ba1e01"}, + {file = "coverage-7.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e223ce4b4ed47f065bfb123687686512e37629be25cc63728557ae7db261422"}, + {file = "coverage-7.13.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6e3370441f4513c6252bf042b9c36d22491142385049243253c7e48398a15a9f"}, + {file = "coverage-7.13.5-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:03ccc709a17a1de074fb1d11f217342fb0d2b1582ed544f554fc9fc3f07e95f5"}, + {file = "coverage-7.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3f4818d065964db3c1c66dc0fbdac5ac692ecbc875555e13374fdbe7eedb4376"}, + {file = "coverage-7.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:012d5319e66e9d5a218834642d6c35d265515a62f01157a45bcc036ecf947256"}, + {file = "coverage-7.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8dd02af98971bdb956363e4827d34425cb3df19ee550ef92855b0acb9c7ce51c"}, + {file = "coverage-7.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", 
hash = "sha256:f08fd75c50a760c7eb068ae823777268daaf16a80b918fa58eea888f8e3919f5"}, + {file = "coverage-7.13.5-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:843ea8643cf967d1ac7e8ecd4bb00c99135adf4816c0c0593fdcc47b597fcf09"}, + {file = "coverage-7.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:9d44d7aa963820b1b971dbecd90bfe5fe8f81cff79787eb6cca15750bd2f79b9"}, + {file = "coverage-7.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:7132bed4bd7b836200c591410ae7d97bf7ae8be6fc87d160b2bd881df929e7bf"}, + {file = "coverage-7.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a698e363641b98843c517817db75373c83254781426e94ada3197cabbc2c919c"}, + {file = "coverage-7.13.5-cp312-cp312-win32.whl", hash = "sha256:bdba0a6b8812e8c7df002d908a9a2ea3c36e92611b5708633c50869e6d922fdf"}, + {file = "coverage-7.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:d2c87e0c473a10bffe991502eac389220533024c8082ec1ce849f4218dded810"}, + {file = "coverage-7.13.5-cp312-cp312-win_arm64.whl", hash = "sha256:bf69236a9a81bdca3bff53796237aab096cdbf8d78a66ad61e992d9dac7eb2de"}, + {file = "coverage-7.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ec4af212df513e399cf11610cc27063f1586419e814755ab362e50a85ea69c1"}, + {file = "coverage-7.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:941617e518602e2d64942c88ec8499f7fbd49d3f6c4327d3a71d43a1973032f3"}, + {file = "coverage-7.13.5-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:da305e9937617ee95c2e39d8ff9f040e0487cbf1ac174f777ed5eddd7a7c1f26"}, + {file = "coverage-7.13.5-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:78e696e1cc714e57e8b25760b33a8b1026b7048d270140d25dafe1b0a1ee05a3"}, + {file = "coverage-7.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02ca0eed225b2ff301c474aeeeae27d26e2537942aa0f87491d3e147e784a82b"}, + {file = 
"coverage-7.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:04690832cbea4e4663d9149e05dba142546ca05cb1848816760e7f58285c970a"}, + {file = "coverage-7.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0590e44dd2745c696a778f7bab6aa95256de2cbc8b8cff4f7db8ff09813d6969"}, + {file = "coverage-7.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d7cfad2d6d81dd298ab6b89fe72c3b7b05ec7544bdda3b707ddaecff8d25c161"}, + {file = "coverage-7.13.5-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e092b9499de38ae0fbfbc603a74660eb6ff3e869e507b50d85a13b6db9863e15"}, + {file = "coverage-7.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:48c39bc4a04d983a54a705a6389512883d4a3b9862991b3617d547940e9f52b1"}, + {file = "coverage-7.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:2d3807015f138ffea1ed9afeeb8624fd781703f2858b62a8dd8da5a0994c57b6"}, + {file = "coverage-7.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee2aa19e03161671ec964004fb74b2257805d9710bf14a5c704558b9d8dbaf17"}, + {file = "coverage-7.13.5-cp313-cp313-win32.whl", hash = "sha256:ce1998c0483007608c8382f4ff50164bfc5bd07a2246dd272aa4043b75e61e85"}, + {file = "coverage-7.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:631efb83f01569670a5e866ceb80fe483e7c159fac6f167e6571522636104a0b"}, + {file = "coverage-7.13.5-cp313-cp313-win_arm64.whl", hash = "sha256:f4cd16206ad171cbc2470dbea9103cf9a7607d5fe8c242fdf1edf36174020664"}, + {file = "coverage-7.13.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0428cbef5783ad91fe240f673cc1f76b25e74bbfe1a13115e4aa30d3f538162d"}, + {file = "coverage-7.13.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e0b216a19534b2427cc201a26c25da4a48633f29a487c61258643e89d28200c0"}, + {file = "coverage-7.13.5-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:972a9cd27894afe4bc2b1480107054e062df08e671df7c2f18c205e805ccd806"}, + 
{file = "coverage-7.13.5-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4b59148601efcd2bac8c4dbf1f0ad6391693ccf7a74b8205781751637076aee3"}, + {file = "coverage-7.13.5-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:505d7083c8b0c87a8fa8c07370c285847c1f77739b22e299ad75a6af6c32c5c9"}, + {file = "coverage-7.13.5-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:60365289c3741e4db327e7baff2a4aaacf22f788e80fa4683393891b70a89fbd"}, + {file = "coverage-7.13.5-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1b88c69c8ef5d4b6fe7dea66d6636056a0f6a7527c440e890cf9259011f5e606"}, + {file = "coverage-7.13.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5b13955d31d1633cf9376908089b7cebe7d15ddad7aeaabcbe969a595a97e95e"}, + {file = "coverage-7.13.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:f70c9ab2595c56f81a89620e22899eea8b212a4041bd728ac6f4a28bf5d3ddd0"}, + {file = "coverage-7.13.5-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:084b84a8c63e8d6fc7e3931b316a9bcafca1458d753c539db82d31ed20091a87"}, + {file = "coverage-7.13.5-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad14385487393e386e2ea988b09d62dd42c397662ac2dabc3832d71253eee479"}, + {file = "coverage-7.13.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f2c47b36fe7709a6e83bfadf4eefb90bd25fbe4014d715224c4316f808e59a2"}, + {file = "coverage-7.13.5-cp313-cp313t-win32.whl", hash = "sha256:67e9bc5449801fad0e5dff329499fb090ba4c5800b86805c80617b4e29809b2a"}, + {file = "coverage-7.13.5-cp313-cp313t-win_amd64.whl", hash = "sha256:da86cdcf10d2519e10cabb8ac2de03da1bcb6e4853790b7fbd48523332e3a819"}, + {file = "coverage-7.13.5-cp313-cp313t-win_arm64.whl", hash = "sha256:0ecf12ecb326fe2c339d93fc131816f3a7367d223db37817208905c89bded911"}, + {file = "coverage-7.13.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = 
"sha256:fbabfaceaeb587e16f7008f7795cd80d20ec548dc7f94fbb0d4ec2e038ce563f"}, + {file = "coverage-7.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9bb2a28101a443669a423b665939381084412b81c3f8c0fcfbac57f4e30b5b8e"}, + {file = "coverage-7.13.5-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bd3a2fbc1c6cccb3c5106140d87cc6a8715110373ef42b63cf5aea29df8c217a"}, + {file = "coverage-7.13.5-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6c36ddb64ed9d7e496028d1d00dfec3e428e0aabf4006583bb1839958d280510"}, + {file = "coverage-7.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:380e8e9084d8eb38db3a9176a1a4f3c0082c3806fa0dc882d1d87abc3c789247"}, + {file = "coverage-7.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e808af52a0513762df4d945ea164a24b37f2f518cbe97e03deaa0ee66139b4d6"}, + {file = "coverage-7.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e301d30dd7e95ae068671d746ba8c34e945a82682e62918e41b2679acd2051a0"}, + {file = "coverage-7.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:800bc829053c80d240a687ceeb927a94fd108bbdc68dfbe505d0d75ab578a882"}, + {file = "coverage-7.13.5-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:0b67af5492adb31940ee418a5a655c28e48165da5afab8c7fa6fd72a142f8740"}, + {file = "coverage-7.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c9136ff29c3a91e25b1d1552b5308e53a1e0653a23e53b6366d7c2dcbbaf8a16"}, + {file = "coverage-7.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:cff784eef7f0b8f6cb28804fbddcfa99f89efe4cc35fb5627e3ac58f91ed3ac0"}, + {file = "coverage-7.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:68a4953be99b17ac3c23b6efbc8a38330d99680c9458927491d18700ef23ded0"}, + {file = "coverage-7.13.5-cp314-cp314-win32.whl", hash = 
"sha256:35a31f2b1578185fbe6aa2e74cea1b1d0bbf4c552774247d9160d29b80ed56cc"}, + {file = "coverage-7.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:2aa055ae1857258f9e0045be26a6d62bdb47a72448b62d7b55f4820f361a2633"}, + {file = "coverage-7.13.5-cp314-cp314-win_arm64.whl", hash = "sha256:1b11eef33edeae9d142f9b4358edb76273b3bfd30bc3df9a4f95d0e49caf94e8"}, + {file = "coverage-7.13.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:10a0c37f0b646eaff7cce1874c31d1f1ccb297688d4c747291f4f4c70741cc8b"}, + {file = "coverage-7.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b5db73ba3c41c7008037fa731ad5459fc3944cb7452fc0aa9f822ad3533c583c"}, + {file = "coverage-7.13.5-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:750db93a81e3e5a9831b534be7b1229df848b2e125a604fe6651e48aa070e5f9"}, + {file = "coverage-7.13.5-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9ddb4f4a5479f2539644be484da179b653273bca1a323947d48ab107b3ed1f29"}, + {file = "coverage-7.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8a7a2049c14f413163e2bdabd37e41179b1d1ccb10ffc6ccc4b7a718429c607"}, + {file = "coverage-7.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1c85e0b6c05c592ea6d8768a66a254bfb3874b53774b12d4c89c481eb78cb90"}, + {file = "coverage-7.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:777c4d1eff1b67876139d24288aaf1817f6c03d6bae9c5cc8d27b83bcfe38fe3"}, + {file = "coverage-7.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6697e29b93707167687543480a40f0db8f356e86d9f67ddf2e37e2dfd91a9dab"}, + {file = "coverage-7.13.5-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:8fdf453a942c3e4d99bd80088141c4c6960bb232c409d9c3558e2dbaa3998562"}, + {file = "coverage-7.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = 
"sha256:32ca0c0114c9834a43f045a87dcebd69d108d8ffb666957ea65aa132f50332e2"}, + {file = "coverage-7.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:8769751c10f339021e2638cd354e13adeac54004d1941119b2c96fe5276d45ea"}, + {file = "coverage-7.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cec2d83125531bd153175354055cdb7a09987af08a9430bd173c937c6d0fba2a"}, + {file = "coverage-7.13.5-cp314-cp314t-win32.whl", hash = "sha256:0cd9ed7a8b181775459296e402ca4fb27db1279740a24e93b3b41942ebe4b215"}, + {file = "coverage-7.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:301e3b7dfefecaca37c9f1aa6f0049b7d4ab8dd933742b607765d757aca77d43"}, + {file = "coverage-7.13.5-cp314-cp314t-win_arm64.whl", hash = "sha256:9dacc2ad679b292709e0f5fc1ac74a6d4d5562e424058962c7bb0c658ad25e45"}, + {file = "coverage-7.13.5-py3-none-any.whl", hash = "sha256:34b02417cf070e173989b3db962f7ed56d2f644307b2cf9d5a0f258e13084a61"}, + {file = "coverage-7.13.5.tar.gz", hash = "sha256:c81f6515c4c40141f83f502b07bbfa5c240ba25bbe73da7b33f1e5b6120ff179"}, +] + +[package.extras] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] + [[package]] name = "cryptography" version = "46.0.7" @@ -975,6 +1081,23 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "flake8" +version = "7.3.0" +description = "the modular source code checker: pep8 pyflakes and co" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "flake8-7.3.0-py2.py3-none-any.whl", hash = "sha256:b9696257b9ce8beb888cdbe31cf885c90d31928fe202be0889a7cdafad32f01e"}, + {file = "flake8-7.3.0.tar.gz", hash = "sha256:fe044858146b9fc69b551a4b490d69cf960fcb78ad1edcb84e7fbb1b4a8e3872"}, +] + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.14.0,<2.15.0" +pyflakes = ">=3.4.0,<3.5.0" + [[package]] name = "flask" version = "3.1.3" @@ -999,22 +1122,6 @@ werkzeug = ">=3.1.0" async = 
["asgiref (>=3.2)"] dotenv = ["python-dotenv"] -[[package]] -name = "flask-cors" -version = "5.0.0" -description = "A Flask extension adding a decorator for CORS support" -optional = false -python-versions = "*" -groups = ["main"] -markers = "python_version >= \"3.12\"" -files = [ - {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, - {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, -] - -[package.dependencies] -Flask = ">=0.9" - [[package]] name = "flask-cors" version = "6.0.2" @@ -1022,7 +1129,6 @@ description = "A Flask extension simplifying CORS support" optional = false python-versions = "<4.0,>=3.9" groups = ["main"] -markers = "python_version == \"3.11\"" files = [ {file = "flask_cors-6.0.2-py3-none-any.whl", hash = "sha256:e57544d415dfd7da89a9564e1e3a9e515042df76e12130641ca6f3f2f03b699a"}, {file = "flask_cors-6.0.2.tar.gz", hash = "sha256:6e118f3698249ae33e429760db98ce032a8bf9913638d085ca0f4c5534ad2423"}, @@ -1355,30 +1461,6 @@ rsa = ["rsa (>=3.1.4,<5)"] testing = ["aiohttp (<3.10.0)", "aiohttp (>=3.6.2,<4.0.0)", "aioresponses", "flask", "freezegun", "grpcio", "packaging", "pyjwt (>=2.0)", "pyopenssl (<24.3.0)", "pyopenssl (>=20.0.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-localserver", "pyu2f (>=0.1.5)", "requests (>=2.20.0,<3.0.0)", "responses", "urllib3"] urllib3 = ["packaging", "urllib3"] -[[package]] -name = "graphene" -version = "2.1.9" -description = "GraphQL Framework for Python" -optional = false -python-versions = "*" -groups = ["main"] -markers = "python_version >= \"3.12\"" -files = [ - {file = "graphene-2.1.9-py2.py3-none-any.whl", hash = "sha256:3d446eb1237c551052bc31155cf1a3a607053e4f58c9172b83a1b597beaa0868"}, - {file = "graphene-2.1.9.tar.gz", hash = "sha256:b9f2850e064eebfee9a3ef4a1f8aa0742848d97652173ab44c82cc8a62b9ed93"}, -] - -[package.dependencies] -aniso8601 = ">=3,<=7" 
-graphql-core = ">=2.1,<3" -graphql-relay = ">=2,<3" -six = ">=1.10.0,<2" - -[package.extras] -django = ["graphene-django"] -sqlalchemy = ["graphene-sqlalchemy"] -test = ["coveralls", "fastdiff (==0.2.0)", "iso8601", "mock", "promise", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytz", "six", "snapshottest"] - [[package]] name = "graphene" version = "3.4.3" @@ -1386,7 +1468,6 @@ description = "GraphQL Framework for Python" optional = false python-versions = "*" groups = ["main"] -markers = "python_version == \"3.11\"" files = [ {file = "graphene-3.4.3-py2.py3-none-any.whl", hash = "sha256:820db6289754c181007a150db1f7fff544b94142b556d12e3ebc777a7bf36c71"}, {file = "graphene-3.4.3.tar.gz", hash = "sha256:2a3786948ce75fe7e078443d37f609cbe5bb36ad8d6b828740ad3b95ed1a0aaa"}, @@ -1402,28 +1483,6 @@ typing-extensions = ">=4.7.1,<5" dev = ["coveralls (>=3.3,<5)", "mypy (>=1.10,<2)", "pytest (>=8,<9)", "pytest-asyncio (>=0.16,<2)", "pytest-benchmark (>=4,<5)", "pytest-cov (>=5,<6)", "pytest-mock (>=3,<4)", "ruff (==0.5.0)", "types-python-dateutil (>=2.8.1,<3)"] test = ["coveralls (>=3.3,<5)", "pytest (>=8,<9)", "pytest-asyncio (>=0.16,<2)", "pytest-benchmark (>=4,<5)", "pytest-cov (>=5,<6)", "pytest-mock (>=3,<4)"] -[[package]] -name = "graphql-core" -version = "2.3.2" -description = "GraphQL implementation for Python" -optional = false -python-versions = "*" -groups = ["main"] -markers = "python_version >= \"3.12\"" -files = [ - {file = "graphql-core-2.3.2.tar.gz", hash = "sha256:aac46a9ac524c9855910c14c48fc5d60474def7f99fd10245e76608eba7af746"}, - {file = "graphql_core-2.3.2-py2.py3-none-any.whl", hash = "sha256:44c9bac4514e5e30c5a595fac8e3c76c1975cae14db215e8174c7fe995825bad"}, -] - -[package.dependencies] -promise = ">=2.3,<3" -rx = ">=1.6,<2" -six = ">=1.10.0" - -[package.extras] -gevent = ["gevent (>=1.1)"] -test = ["coveralls (==1.11.1)", "cython (==0.29.17)", "gevent (==1.5.0)", "pyannotate (==1.2.0)", "pytest (==4.6.10)", "pytest-benchmark 
(==3.2.3)", "pytest-cov (==2.8.1)", "pytest-django (==3.9.0)", "pytest-mock (==2.0.0)", "six (==1.14.0)"] - [[package]] name = "graphql-core" version = "3.2.8" @@ -1431,30 +1490,11 @@ description = "GraphQL implementation for Python, a port of GraphQL.js, the Java optional = false python-versions = "<4,>=3.7" groups = ["main"] -markers = "python_version == \"3.11\"" files = [ {file = "graphql_core-3.2.8-py3-none-any.whl", hash = "sha256:cbee07bee1b3ed5e531723685369039f32ff815ef60166686e0162f540f1520c"}, {file = "graphql_core-3.2.8.tar.gz", hash = "sha256:015457da5d996c924ddf57a43f4e959b0b94fb695b85ed4c29446e508ed65cf3"}, ] -[[package]] -name = "graphql-relay" -version = "2.0.1" -description = "Relay implementation for Python" -optional = false -python-versions = "*" -groups = ["main"] -markers = "python_version >= \"3.12\"" -files = [ - {file = "graphql-relay-2.0.1.tar.gz", hash = "sha256:870b6b5304123a38a0b215a79eace021acce5a466bf40cd39fa18cb8528afabb"}, - {file = "graphql_relay-2.0.1-py3-none-any.whl", hash = "sha256:ac514cb86db9a43014d7e73511d521137ac12cf0101b2eaa5f0a3da2e10d913d"}, -] - -[package.dependencies] -graphql-core = ">=2.2,<3" -promise = ">=2.2,<3" -six = ">=1.12" - [[package]] name = "graphql-relay" version = "3.2.0" @@ -1462,7 +1502,6 @@ description = "Relay library for graphql-core" optional = false python-versions = ">=3.6,<4" groups = ["main"] -markers = "python_version == \"3.11\"" files = [ {file = "graphql-relay-3.2.0.tar.gz", hash = "sha256:1ff1c51298356e481a0be009ccdff249832ce53f30559c1338f22a0e0d17250c"}, {file = "graphql_relay-3.2.0-py3-none-any.whl", hash = "sha256:c9b22bd28b170ba1fe674c74384a8ff30a76c8e26f88ac3aa1584dd3179953e5"}, @@ -1636,6 +1675,34 @@ perf = ["ipython"] test = ["flufl.flake8", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["mypy (<1.19) ; platform_python_implementation == \"PyPy\"", "pytest-mypy (>=1.0.1)"] +[[package]] +name = "iniconfig" +version = "2.3.0" 
+description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12"}, + {file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"}, +] + +[[package]] +name = "isort" +version = "7.0.0" +description = "A Python utility / library to sort Python imports." +optional = false +python-versions = ">=3.10.0" +groups = ["dev"] +files = [ + {file = "isort-7.0.0-py3-none-any.whl", hash = "sha256:1bcabac8bc3c36c7fb7b98a76c8abb18e0f841a3ba81decac7691008592499c1"}, + {file = "isort-7.0.0.tar.gz", hash = "sha256:5513527951aadb3ac4292a41a16cbc50dd1642432f5e8c20057d414bdafb4187"}, +] + +[package.extras] +colors = ["colorama"] +plugins = ["setuptools"] + [[package]] name = "itsdangerous" version = "2.2.0" @@ -1808,7 +1875,7 @@ files = [ [package.dependencies] attrs = ">=22.2.0" -jsonschema-specifications = ">=2023.03.6" +jsonschema-specifications = ">=2023.3.6" referencing = ">=0.28.4" rpds-py = ">=0.25.0" @@ -2269,6 +2336,18 @@ python-dateutil = ">=2.7" [package.extras] dev = ["meson-python (>=0.13.1,<0.17.0)", "pybind11 (>=2.13.2,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7,<10)"] +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + [[package]] name = "mdit-py-plugins" version = "0.5.0" @@ -2584,7 +2663,7 @@ version = "1.1.0" description = "Type system extensions for programs checked with the mypy type checker." 
optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505"}, {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"}, @@ -2852,7 +2931,7 @@ version = "26.1" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "packaging-26.1-py3-none-any.whl", hash = "sha256:5d9c0669c6285e491e0ced2eee587eaf67b670d94a19e94e3984a481aba6802f"}, {file = "packaging-26.1.tar.gz", hash = "sha256:f042152b681c4bfac5cae2742a55e103d27ab2ec0f3d88037136b6bfe7c9c5de"}, @@ -2963,7 +3042,7 @@ version = "1.0.4" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723"}, {file = "pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645"}, @@ -3128,12 +3207,28 @@ version = "4.9.6" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.10" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "platformdirs-4.9.6-py3-none-any.whl", hash = "sha256:e61adb1d5e5cb3441b4b7710bea7e4c12250ca49439228cc1021c00dcfac0917"}, {file = "platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a"}, ] +[[package]] +name = "pluggy" +version = "1.6.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, + {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["coverage", "pytest", "pytest-benchmark"] + [[package]] name = "prettytable" version = "3.17.0" @@ -3152,24 +3247,6 @@ wcwidth = "*" [package.extras] tests = ["pytest", "pytest-cov", "pytest-lazy-fixtures"] -[[package]] -name = "promise" -version = "2.3" -description = "Promises/A+ implementation for Python" -optional = false -python-versions = "*" -groups = ["main"] -markers = "python_version >= \"3.12\"" -files = [ - {file = "promise-2.3.tar.gz", hash = "sha256:dfd18337c523ba4b6a58801c164c1904a9d4d1b1747c7d5dbf45b693a49d93d0"}, -] - -[package.dependencies] -six = "*" - -[package.extras] -test = ["coveralls", "futures", "mock", "pytest (>=2.7.3)", "pytest-benchmark", "pytest-cov"] - [[package]] name = "propcache" version = "0.4.1" @@ -3421,6 +3498,18 @@ files = [ [package.dependencies] pyasn1 = ">=0.6.1,<0.7.0" +[[package]] +name = "pycodestyle" +version = "2.14.0" +description = "Python style guide checker" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pycodestyle-2.14.0-py2.py3-none-any.whl", hash = "sha256:dd6bf7cb4ee77f8e016f9c8e74a35ddd9f67e1d5fd4184d86c3b98e07099f42d"}, + {file = 
"pycodestyle-2.14.0.tar.gz", hash = "sha256:c4b5b517d278089ff9d0abdec919cd97262a3367449ea1c8b49b91529167b783"}, +] + [[package]] name = "pycparser" version = "3.0" @@ -3609,6 +3698,33 @@ numpy = ">=1.16.4" carto = ["pydeck-carto"] jupyter = ["ipykernel (>=5.1.2)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"] +[[package]] +name = "pyflakes" +version = "3.4.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pyflakes-3.4.0-py2.py3-none-any.whl", hash = "sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f"}, + {file = "pyflakes-3.4.0.tar.gz", hash = "sha256:b24f96fafb7d2ab0ec5075b7350b3d2d2218eab42003821c06344973d3ea2f58"}, +] + +[[package]] +name = "pygments" +version = "2.20.0" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176"}, + {file = "pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + [[package]] name = "pyparsing" version = "3.3.2" @@ -3624,6 +3740,48 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pytest" +version = "9.0.3" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9"}, + {file = "pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c"}, +] + +[package.dependencies] +colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} +iniconfig = ">=1.0.1" +packaging = ">=22" +pluggy = ">=1.5,<2" 
+pygments = ">=2.7.2" + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-cov" +version = "7.1.0" +description = "Pytest plugin for measuring coverage." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pytest_cov-7.1.0-py3-none-any.whl", hash = "sha256:a0461110b7865f9a271aa1b51e516c9a95de9d696734a2f71e3e78f46e1d4678"}, + {file = "pytest_cov-7.1.0.tar.gz", hash = "sha256:30674f2b5f6351aa09702a9c8c364f6a01c27aae0c1366ae8016160d1efc56b2"}, +] + +[package.dependencies] +coverage = {version = ">=7.10.6", extras = ["toml"]} +pluggy = ">=1.2" +pytest = ">=7" + +[package.extras] +testing = ["process-tests", "pytest-xdist", "virtualenv"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3678,7 +3836,7 @@ version = "0.4.1" description = "A Fast, spec compliant Python 3.14+ tokenizer that runs on older Pythons." optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "pytokens-0.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a44ed93ea23415c54f3face3b65ef2b844d96aeb3455b8a69b3df6beab6acc5"}, {file = "pytokens-0.4.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:add8bf86b71a5d9fb5b89f023a80b791e04fba57960aa790cc6125f7f1d39dfe"}, @@ -4191,18 +4349,6 @@ files = [ {file = "rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84"}, ] -[[package]] -name = "rx" -version = "1.6.3" -description = "Reactive Extensions (Rx) for Python" -optional = false -python-versions = "*" -groups = ["main"] -markers = "python_version >= \"3.12\"" -files = [ - {file = "Rx-1.6.3.tar.gz", hash = "sha256:ca71b65d0fc0603a3b5cfaa9e33f5ba81e4aae10a58491133595088d7734b2da"}, -] - [[package]] name = "scikit-learn" version = "1.8.0" @@ -5159,5 +5305,5 @@ type = ["pytest-mypy"] [metadata] 
lock-version = "2.1" -python-versions = ">=3.11" -content-hash = "5ecee7d5789a8a3ba59c6e542d8cb50c8eef653f7e23a07c4d1eccb84e600474" +python-versions = ">=3.11,<4" +content-hash = "a4b5e6b0cdc9b6fcd26f95dc0fb5cf34039562c61be2f9cce4a54317ba06734f" diff --git a/pyproject.toml b/pyproject.toml index 7c812f2..8dbb91f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = [ {name = "khatora",email = "mohamed.khater03@eng-st.cu.edu.eg"} ] readme = "README.md" -requires-python = ">=3.11" +requires-python = ">=3.11,<4" dependencies = [ "streamlit (>=1.56.0,<2.0.0)", "black (>=26.3.1,<27.0.0)", @@ -25,7 +25,8 @@ dependencies = [ "tqdm (>=4.67.1,<5.0.0)", "category-encoders (>=2.9.0,<3.0.0)", "kaggle (>=2.1.0,<3.0.0)", - "protobuf (<=3.20.3)" + "protobuf (<=3.20.3)", + "graphql-core (>=3.2.0,<3.3.0)" ] [tool.poetry] diff --git a/src/house_price_class_prediction/data/data_acquisition.py b/src/house_price_class_prediction/data/data_acquisition.py index 05964ad..eac23db 100644 --- a/src/house_price_class_prediction/data/data_acquisition.py +++ b/src/house_price_class_prediction/data/data_acquisition.py @@ -1,4 +1,5 @@ import argparse +import logging import os import shutil import subprocess @@ -16,12 +17,8 @@ from geopy.distance import geodesic from tqdm import tqdm -PROJECT_ROOT = Path(__file__).resolve().parents[3] -DATA_DIR = PROJECT_ROOT / "data" -RAW_DIR = DATA_DIR / "raw" -INTERIM_DIR = DATA_DIR / "interim" -PROCESSED_DIR = DATA_DIR / "processed" -TEMP_DIR = DATA_DIR / "temp" +from config import INTERIM_DIR, PROCESSED_DIR, RAW_DIR, TEMP_DIR +from house_price_class_prediction.utils.logging_utils import setup_logging REALTOR_DATASET = "ahmedshahriarsakib/usa-real-estate-dataset" REALTOR_FILENAME = "realtor-data.zip.csv" @@ -32,7 +29,6 @@ def kaggle_credentials_available(): - """Return True when Kaggle credentials are available for CLI downloads.""" if os.getenv("KAGGLE_USERNAME") and os.getenv("KAGGLE_KEY"): return True @@ -41,7 +37,6 @@ def 
kaggle_credentials_available(): def resolve_kaggle_command(): - """Return the best Kaggle CLI command for the active Python environment.""" venv_kaggle = Path(sys.executable).resolve().with_name("kaggle") if venv_kaggle.exists(): return [str(venv_kaggle)] @@ -61,7 +56,6 @@ def resolve_kaggle_command(): def ensure_realtor_dataset(data_path=REALTOR_DATA_PATH): - """Return the realtor dataset path, downloading it from Kaggle if missing.""" data_path = Path(data_path) if data_path.exists(): print(f"Using existing raw dataset: {data_path}") @@ -122,14 +116,12 @@ def ensure_realtor_dataset(data_path=REALTOR_DATA_PATH): def make_temp_output_dir(base_dir=TEMP_DIR): - """Create a unique temp output directory for a data acquisition run.""" base_dir = Path(base_dir) base_dir.mkdir(parents=True, exist_ok=True) return Path(mkdtemp(prefix="data_acquisition_", dir=base_dir)) def seed_output_file(source_path, output_path): - """Copy an existing input file into the temp run folder if it exists.""" source_path = Path(source_path) output_path = Path(output_path) if source_path.exists() and not output_path.exists(): @@ -138,7 +130,7 @@ def seed_output_file(source_path, output_path): def create_target_variable_classification(df): - df = df.dropna(subset=["price"]) + df = df.dropna(subset=["price"]).copy() df["price_category"] = pd.qcut( df["price"], q=4, labels=["Budget", "Mid-Range", "Premium", "Luxury"] ) @@ -146,21 +138,20 @@ def create_target_variable_classification(df): def get_dataset_sample(df): - """Get a sample using stratified sampling""" - df_clean = df.dropna(subset=["price_category", "state"]) - df_sample = ( - df_clean.groupby(["price_category", "state"], group_keys=False) - .apply( - lambda x: x.sample( - n=max(1, int(len(x) / len(df_clean) * 40000)), - replace=max(1, int(len(x) / len(df_clean) * 40000)) > len(x), + sampled_groups = [] + for _, group in df_clean.groupby(["price_category", "state"], observed=False): + group_sample_size = max(1, int(len(group) / 
len(df_clean) * 40000)) + sampled_groups.append( + group.sample( + n=group_sample_size, + replace=group_sample_size > len(group), random_state=42, ) ) - .reset_index(drop=True) - ) + + df_sample = pd.concat(sampled_groups, ignore_index=True) if len(df_sample) >= 40000: df_sample = df_sample.sample(n=40000, random_state=42).reset_index(drop=True) else: @@ -406,6 +397,7 @@ def merge_weather_data(df, climate_data): def data_acquisition_pipeline(output_dir=None): + logging.info("Starting data acquisition pipeline") if output_dir is None: cleaned_output_dir = PROCESSED_DIR interim_output_dir = INTERIM_DIR @@ -466,6 +458,7 @@ def data_acquisition_pipeline(output_dir=None): df = add_services(df, enriched) df.to_csv(cleaned_data_path, index=False) print(f"Cleaned data written to: {cleaned_data_path}") + logging.info("Data acquisition complete: %s", cleaned_data_path) return cleaned_data_path @@ -484,5 +477,6 @@ def parse_args(): if __name__ == "__main__": + setup_logging("data_acquisition") args = parse_args() data_acquisition_pipeline(output_dir=args.output_dir) diff --git a/src/house_price_class_prediction/features/add_schools_pharmacies_hospitals.py b/src/house_price_class_prediction/features/add_schools_pharmacies_hospitals.py index 1ebaac5..ce62879 100644 --- a/src/house_price_class_prediction/features/add_schools_pharmacies_hospitals.py +++ b/src/house_price_class_prediction/features/add_schools_pharmacies_hospitals.py @@ -1,4 +1,5 @@ import argparse +import logging import os import sys import time @@ -10,6 +11,8 @@ import requests from scipy.spatial import cKDTree +from house_price_class_prediction.utils.logging_utils import setup_logging + HIFLD_SOURCES = { "schools": { "url": ( @@ -187,6 +190,7 @@ def parse_args(): def main(): + setup_logging("add_schools_pharmacies_hospitals") args = parse_args() output_path = Path(args.output) if output_path.exists(): @@ -201,6 +205,7 @@ def main(): sys.exit(1) print(f"Loaded {len(df)} rows from {args.input}") + 
logging.info("Loaded %s rows from %s", len(df), args.input) for col_arg, col_name in [(args.lat, "lat"), (args.lng, "lng")]: if col_arg not in df.columns: @@ -227,6 +232,7 @@ def main(): output_path.parent.mkdir(parents=True, exist_ok=True) result.to_csv(output_path, index=False) print(f"\nDone! → {output_path} ({len(result)} rows)") + logging.info("POI enrichment complete: %s", output_path) if __name__ == "__main__": diff --git a/src/house_price_class_prediction/features/build_features.py b/src/house_price_class_prediction/features/build_features.py index a8d544e..44c5c7c 100644 --- a/src/house_price_class_prediction/features/build_features.py +++ b/src/house_price_class_prediction/features/build_features.py @@ -1,4 +1,5 @@ import argparse +import logging from pathlib import Path from tempfile import mkdtemp @@ -6,14 +7,10 @@ import pandas as pd from category_encoders import TargetEncoder -encoding = {"Budget": 0, "Mid-Range": 1, "Premium": 2, "Luxury": 3} +from config import PROCESSED_DIR, TEMP_DIR +from house_price_class_prediction.utils.logging_utils import setup_logging -PROJECT_ROOT = Path(__file__).resolve().parents[3] -DATA_DIR = PROJECT_ROOT / "data" -RAW_DIR = DATA_DIR / "raw" -INTERIM_DIR = DATA_DIR / "interim" -PROCESSED_DIR = DATA_DIR / "processed" -TEMP_DIR = DATA_DIR / "temp" +encoding = {"Budget": 0, "Mid-Range": 1, "Premium": 2, "Luxury": 3} def make_temp_output_dir(base_dir=TEMP_DIR): @@ -126,6 +123,7 @@ def parse_args(): def main(): + setup_logging("build_features") args = parse_args() input_dir = args.input_dir output_dir = args.output_dir @@ -133,6 +131,7 @@ def main(): print(f"Reading feature inputs from: {input_dir}") print(f"Writing feature outputs to: {output_dir}") + logging.info("Building features from %s to %s", input_dir, output_dir) train_df = pd.read_csv(input_dir / "train_data.csv") test_df = pd.read_csv(input_dir / "test_data.csv") @@ -157,6 +156,7 @@ def main(): write_csv_no_overwrite( y_test, output_dir / "y_test_encoded.csv", 
overwrite=args.overwrite ) + logging.info("Feature building complete") if __name__ == "__main__": diff --git a/src/house_price_class_prediction/features/feature_selection.py b/src/house_price_class_prediction/features/feature_selection.py index 82270cc..9358acc 100644 --- a/src/house_price_class_prediction/features/feature_selection.py +++ b/src/house_price_class_prediction/features/feature_selection.py @@ -1,4 +1,5 @@ import argparse +import logging from pathlib import Path from tempfile import mkdtemp @@ -8,12 +9,8 @@ from sklearn.feature_selection import RFECV, VarianceThreshold from sklearn.model_selection import StratifiedKFold -PROJECT_ROOT = Path(__file__).resolve().parents[3] -DATA_DIR = PROJECT_ROOT / "data" -RAW_DIR = DATA_DIR / "raw" -INTERIM_DIR = DATA_DIR / "interim" -PROCESSED_DIR = DATA_DIR / "processed" -TEMP_DIR = DATA_DIR / "temp" +from config import PROCESSED_DIR, TEMP_DIR +from house_price_class_prediction.utils.logging_utils import setup_logging def make_temp_output_dir(base_dir=TEMP_DIR): @@ -105,6 +102,7 @@ def parse_args(): def main(): + setup_logging("feature_selection") args = parse_args() input_dir = args.input_dir output_dir = args.output_dir @@ -112,6 +110,7 @@ def main(): print(f"Reading feature-selection inputs from: {input_dir}") print(f"Writing feature-selection outputs to: {output_dir}") + logging.info("Selecting features from %s to %s", input_dir, output_dir) X_train = pd.read_csv(input_dir / "X_train_encoded.csv") X_test = pd.read_csv(input_dir / "X_test_encoded.csv") @@ -137,6 +136,7 @@ def main(): for filename, df in output_files.items(): write_csv_no_overwrite(df, output_dir / filename, overwrite=args.overwrite) + logging.info("Feature selection complete") if __name__ == "__main__": diff --git a/src/house_price_class_prediction/features/preprocessing.py b/src/house_price_class_prediction/features/preprocessing.py index 4799b4b..1fae1d1 100644 --- a/src/house_price_class_prediction/features/preprocessing.py +++ 
b/src/house_price_class_prediction/features/preprocessing.py @@ -1,4 +1,5 @@ import argparse +import logging import re from pathlib import Path from tempfile import mkdtemp @@ -14,6 +15,9 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder +from config import PROCESSED_DIR, RAW_DIR, TEMP_DIR +from house_price_class_prediction.utils.logging_utils import setup_logging + feature_cols = [ "state", "price_category", @@ -35,14 +39,6 @@ numeric_cols = [col for col in feature_cols if col not in categorical_cols] -PROJECT_ROOT = Path(__file__).resolve().parents[3] -DATA_DIR = PROJECT_ROOT / "data" -RAW_DIR = DATA_DIR / "raw" -INTERIM_DIR = DATA_DIR / "interim" -PROCESSED_DIR = DATA_DIR / "processed" -TEMP_DIR = DATA_DIR / "temp" - - def make_temp_output_dir(base_dir=TEMP_DIR): base_dir = Path(base_dir) base_dir.mkdir(parents=True, exist_ok=True) @@ -677,6 +673,7 @@ def parse_args(): def main(): + setup_logging("preprocessing") args = parse_args() output_dir = args.output_dir output_dir.mkdir(parents=True, exist_ok=True) @@ -701,6 +698,7 @@ def main(): ) print(f"Writing preprocessing outputs to: {output_dir}") + logging.info("Reading cleaned data from: %s", cleaned_data_path) cleaned_data = read_raw_data(cleaned_data_path) uscities_df = read_raw_data(uscities_path) if {"lat", "lng"}.issubset(cleaned_data.columns): @@ -726,6 +724,7 @@ def main(): write_csv_no_overwrite( test_df, output_dir / "test_data.csv", overwrite=args.overwrite ) + logging.info("Preprocessing complete: %s", output_dir) if __name__ == "__main__": diff --git a/src/house_price_class_prediction/models/mlflow.db b/src/house_price_class_prediction/models/mlflow.db index 0a2fbaf..321c1f1 100644 Binary files a/src/house_price_class_prediction/models/mlflow.db and b/src/house_price_class_prediction/models/mlflow.db differ diff --git a/src/house_price_class_prediction/models/predict.py b/src/house_price_class_prediction/models/predict.py index aee93ef..4fe6c3d 100644 --- 
a/src/house_price_class_prediction/models/predict.py +++ b/src/house_price_class_prediction/models/predict.py @@ -1,19 +1,60 @@ +import logging import pickle -from pathlib import Path - -import pandas as pd -from sklearn.metrics import classification_report, confusion_matrix - -PROJECT_ROOT = Path(__file__).resolve().parents[3] -DATA_DIR = PROJECT_ROOT / "data" / "processed" -MODEL_PATH = PROJECT_ROOT / "app" / "House-Price-ntelligence" / "deployed_model.pkl" -y_test = pd.read_csv(DATA_DIR / "y_test_final.csv") -X_test = pd.read_csv(DATA_DIR / "X_test_final.csv") -y_test_ravel = y_test.values.ravel() - -with MODEL_PATH.open("rb") as f: - model = pickle.load(f) -prediction = model.predict(X_test) -cm = confusion_matrix(y_test_ravel, prediction) -print(cm) -print(classification_report(y_test_ravel, prediction)) + +if __package__ is None or __package__ == "": + from pathlib import Path # noqa: E402 + import sys # noqa: E402 + + project_root = Path(__file__).resolve().parents[3] + sys.path.insert(0, str(project_root / "src")) + sys.path.insert(0, str(project_root)) + +import pandas as pd # noqa: E402 +from sklearn.metrics import classification_report, confusion_matrix # noqa: E402 + +from config import PROCESSED_DIR, TEST_MODEL_PATH # noqa: E402 +from house_price_class_prediction.utils.logging_utils import setup_logging # noqa: E402 + + +def read_data(): + logging.info("Reading prediction data from %s", PROCESSED_DIR) + y_test = pd.read_csv(PROCESSED_DIR / "y_test_final.csv") + X_test = pd.read_csv(PROCESSED_DIR / "X_test_final.csv") + y_test_ravel = y_test.values.ravel() + return X_test, y_test_ravel + + +def load_model(): + logging.info("Loading model from %s", TEST_MODEL_PATH) + with TEST_MODEL_PATH.open("rb") as f: + model = pickle.load(f) + return model + + +def predict(model, X_test): + return model.predict(X_test) + + +def print_results(y_test, y_pred): + cm = confusion_matrix(y_test, y_pred) + print(cm) + print(classification_report(y_test, y_pred)) + + +def 
run_prediction(): + setup_logging("predict") + logging.info("Starting prediction") + try: + X_test, y_test_ravel = read_data() + model = load_model() + prediction = predict(model, X_test) + print_results(y_test_ravel, prediction) + logging.info("Prediction complete") + return prediction + except Exception: + logging.exception("Prediction failed") + raise + + +if __name__ == "__main__": + run_prediction() diff --git a/src/house_price_class_prediction/models/train.py b/src/house_price_class_prediction/models/train.py index ea53d3e..2043301 100644 --- a/src/house_price_class_prediction/models/train.py +++ b/src/house_price_class_prediction/models/train.py @@ -1,99 +1,79 @@ import os -from pathlib import Path +import pickle +from dotenv import load_dotenv import mlflow import pandas as pd -from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier -from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report from sklearn.model_selection import GridSearchCV -from sklearn.naive_bayes import GaussianNB -from sklearn.neighbors import KNeighborsClassifier -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.tree import DecisionTreeClassifier -PROJECT_ROOT = Path(__file__).resolve().parents[3] -DATA_DIR = PROJECT_ROOT / "data" / "processed" -X_train = pd.read_csv(DATA_DIR / "X_train_final.csv") -X_test = pd.read_csv(DATA_DIR / "X_test_final.csv") -y_train = pd.read_csv(DATA_DIR / "y_train_final.csv") -y_test = pd.read_csv(DATA_DIR / "y_test_final.csv") +from config import ( + DEPLOYED_MODEL_PATH, + LOG_MODEL_PIP_REQUIREMENTS, + MLFLOW_DB_PATH, + MODEL_CONFIGS, + MODEL_CV_FOLDS, + MODEL_N_JOBS, + MODEL_SCORING, + PROCESSED_DIR, + SELECTED_MODEL_NAME, + TEST_MODEL_PATH, +) +from house_price_class_prediction.utils.logging_utils import setup_logging + +logger = setup_logging("train") +load_dotenv() +logger.info("Loading training data from %s", PROCESSED_DIR) + +X_train = 
pd.read_csv(PROCESSED_DIR / "X_train_final.csv") +X_test = pd.read_csv(PROCESSED_DIR / "X_test_final.csv") +y_train = pd.read_csv(PROCESSED_DIR / "y_train_final.csv") +y_test = pd.read_csv(PROCESSED_DIR / "y_test_final.csv") y_train_ravel = y_train.values.ravel() y_test_ravel = y_test.values.ravel() -models = { - "naive": GaussianNB(), - "logistic": Pipeline( - [ - ("scaler", StandardScaler()), - ( - "model", - LogisticRegression( - random_state=42, class_weight="balanced", max_iter=5000 - ), - ), - ] - ), - "knn": Pipeline([("scaler", StandardScaler()), ("model", KNeighborsClassifier())]), - "decision_tree": DecisionTreeClassifier(random_state=42, class_weight="balanced"), - "random_forest": RandomForestClassifier(random_state=42, class_weight="balanced"), - "ada_boost": AdaBoostClassifier(random_state=42), -} -params = { - "naive": {}, - "logistic": {"model__C": [0.1, 1, 10]}, - "knn": { - "model__n_neighbors": [3, 5, 7, 9, 11, 13], - "model__weights": ["uniform", "distance"], - "model__metric": ["euclidean", "manhattan", "minkowski"], - }, - "decision_tree": {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5]}, - "random_forest": { - "n_estimators": [50, 100], - "max_depth": [5, 10], - "min_samples_split": [5, 10], - "min_samples_leaf": [2, 4], - }, - "ada_boost": {"n_estimators": [50, 100], "learning_rate": [0.5, 1.0]}, -} -default_tracking_uri = f"sqlite:///{PROJECT_ROOT / 'src' / 'house_price_class_prediction' / 'models' / 'mlflow.db'}" +default_tracking_uri = f"sqlite:///{MLFLOW_DB_PATH}" mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", default_tracking_uri)) -mlflow.set_experiment("dataScience_project") +mlflow.set_experiment("house_price_class_prediction") -LOG_MODEL_PIP_REQUIREMENTS = [ - "mlflow", - "scikit-learn", - "pandas", - "numpy", -] +selected_model = None +selected_model_name = SELECTED_MODEL_NAME +selected_model_f1 = None -for model_name, model in models.items(): - model_params = params[model_name] +for model_name, model_config 
in MODEL_CONFIGS.items(): + logger.info("Training model: %s", model_name) + model = model_config["estimator"] + model_params = model_config["params"] model_grid = GridSearchCV( - model, param_grid=model_params, cv=5, scoring="f1_macro", n_jobs=-1 + model, + param_grid=model_params, + cv=MODEL_CV_FOLDS, + scoring=MODEL_SCORING, + n_jobs=MODEL_N_JOBS, ) model_grid.fit(X_train, y_train_ravel) best_params = model_grid.best_params_ best_model = model_grid.best_estimator_ val_macro_f1 = model_grid.best_score_ + training_prediction = best_model.predict(X_train) + prediction = best_model.predict(X_test) + training_report = classification_report( + y_train_ravel, training_prediction, output_dict=True + ) + training_accuracy = training_report["accuracy"] + training_f1_macro = training_report["macro avg"]["f1-score"] report = classification_report(y_test_ravel, prediction, output_dict=True) + print(f"\nModel: {model_name}") + print("testing classification report: " + str(report)) accuracy = report["accuracy"] f1_macro = report["macro avg"]["f1-score"] precision_macro = report["macro avg"]["precision"] recall_macro = report["macro avg"]["recall"] luxury_precision = report["3"]["precision"] luxury_recall = report["3"]["recall"] - training_prediction = best_model.predict(X_train) - training_report = classification_report( - y_train_ravel, training_prediction, output_dict=True - ) - training_accuracy = training_report["accuracy"] - training_f1_macro = training_report["macro avg"]["f1-score"] - print(f"\nModel: {model_name}") - print(classification_report(y_test_ravel, prediction)) + with mlflow.start_run(run_name=model_name): mlflow.log_param("model_name", model_name) mlflow.log_params(best_params) @@ -104,11 +84,33 @@ ) mlflow.log_metric("f1_macro_val", val_macro_f1) mlflow.log_metric("f1_macro_test", f1_macro) - mlflow.log_metric("accuracy_test", accuracy) mlflow.log_metric("f1_macro_training", training_f1_macro) + mlflow.log_metric("accuracy_test", accuracy) 
mlflow.log_metric("accuracy_training", training_accuracy) mlflow.log_metric("precision_macro_test", precision_macro) mlflow.log_metric("recall_macro_test", recall_macro) mlflow.log_metric("luxury_precision_test", luxury_precision) mlflow.log_metric("luxury_recall_test", luxury_recall) print("Finish model ", model_name) + logger.info("Finished model: %s", model_name) + + if model_name == selected_model_name: + selected_model = best_model + selected_model_f1 = f1_macro + +if selected_model is not None: + for model_path in [TEST_MODEL_PATH, DEPLOYED_MODEL_PATH]: + model_path.parent.mkdir(parents=True, exist_ok=True) + with model_path.open("wb") as f: + pickle.dump(selected_model, f) + logger.info( + "Saved selected model %s with f1_macro %.4f to %s", + selected_model_name, + selected_model_f1, + model_path, + ) + logger.info( + "Selected model saved to %s and %s", + TEST_MODEL_PATH, + DEPLOYED_MODEL_PATH, + ) diff --git a/src/house_price_class_prediction/utils/logging_utils.py b/src/house_price_class_prediction/utils/logging_utils.py new file mode 100644 index 0000000..3c171ed --- /dev/null +++ b/src/house_price_class_prediction/utils/logging_utils.py @@ -0,0 +1,130 @@ +import atexit +import logging +import os +from collections import OrderedDict +from datetime import datetime, timezone + +from config import PIPELINE_LOG_PATH + +SECTION_ORDER = [ + "data_acquisition_section", + "preprocessing_section", + "build_features_section", + "feature_selection_section", + "add_schools_pharmacies_hospitals_section", + "train_section", + "predict_section", + "validate_data_section", + "visualize_section", +] + +SECTION_NAMES = { + "data_acquisition": "data_acquisition_section", + "preprocessing": "preprocessing_section", + "build_features": "build_features_section", + "feature_selection": "feature_selection_section", + "add_schools_pharmacies_hospitals": "add_schools_pharmacies_hospitals_section", + "train": "train_section", + "predict": "predict_section", + "validate_data": 
"validate_data_section", + "visualize": "visualize_section", +} + + +_shared_logger = None +_runtime_id = os.getenv("PIPELINE_RUNTIME_ID") or datetime.now(timezone.utc).strftime( + "%Y%m%dT%H%M%SZ" +) + + +def setup_logging(name): + global _shared_logger + + if _shared_logger is None: + _shared_logger = SharedSectionLogger(PIPELINE_LOG_PATH) + atexit.register(_shared_logger.save) + + section = SECTION_NAMES.get(name, f"{name}_section") + _shared_logger.set_section(section, _runtime_id) + + handler = SectionLogHandler(_shared_logger) + handler.setFormatter( + logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s") + ) + logging.basicConfig(level=logging.INFO, handlers=[handler], force=True) + + logger = logging.getLogger(name) + logger.info("Logging to %s [%s]", _shared_logger.log_path, section) + return logger + + +class SharedSectionLogger: + def __init__(self, log_path): + self.log_path = log_path + self.sections = OrderedDict() + self.section_runtime_ids = {} + self.active_section = None + self.load() + + def load(self): + if not self.log_path.exists(): + return + + current_section = None + for line in self.log_path.read_text(encoding="utf-8").splitlines(): + if line.startswith("===== ") and line.endswith(" ====="): + current_section, runtime_id = self.parse_section_header(line) + self.sections.setdefault(current_section, []) + self.section_runtime_ids[current_section] = runtime_id + elif current_section and line: + self.sections[current_section].append(line) + + @staticmethod + def parse_section_header(line): + header = line.removeprefix("===== ").removesuffix(" =====") + if " | runtime_id=" not in header: + return header, "unknown" + section, runtime_id = header.split(" | runtime_id=", 1) + return section, runtime_id + + def set_section(self, section, runtime_id): + self.active_section = section + self.sections[section] = [] + self.section_runtime_ids[section] = runtime_id + + def write(self, message): + if not self.active_section or not 
message: + return + self.sections.setdefault(self.active_section, []) + self.sections[self.active_section].extend(message.rstrip("\n").splitlines()) + + def save(self): + self.log_path.parent.mkdir(parents=True, exist_ok=True) + ordered_sections = [ + section for section in SECTION_ORDER if self.sections.get(section) + ] + ordered_sections.extend( + section + for section in self.sections + if section not in SECTION_ORDER and self.sections.get(section) + ) + + lines = [] + for section in ordered_sections: + runtime_id = self.section_runtime_ids.get(section, "unknown") + lines.append(f"===== {section} | runtime_id={runtime_id} =====") + lines.extend(line for line in self.sections[section] if line) + lines.append("") + + self.log_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") + + +class SectionLogHandler(logging.Handler): + def __init__(self, shared_logger): + super().__init__() + self.shared_logger = shared_logger + + def emit(self, record): + message = self.format(record) + self.shared_logger.write(message) + self.shared_logger.save() diff --git a/src/house_price_class_prediction/validation/validate_data.py b/src/house_price_class_prediction/validation/validate_data.py index c1b5ab0..b37922c 100644 --- a/src/house_price_class_prediction/validation/validate_data.py +++ b/src/house_price_class_prediction/validation/validate_data.py @@ -1,10 +1,13 @@ import argparse import json +import logging from pathlib import Path import numpy as np import pandas as pd +from house_price_class_prediction.utils.logging_utils import setup_logging + VALID_US_STATES = { "alabama", "alaska", @@ -70,7 +73,6 @@ def _json_default(value): def validate_schema(df: pd.DataFrame) -> dict: - """Validata df schema""" expected_columns = { "status": "object", # convert later to String "bed": "number", @@ -150,7 +152,6 @@ def validate_completeness(df: pd.DataFrame) -> dict: def validate_uniqueness(df: pd.DataFrame) -> dict: - """Check for duplicates""" duplicated_rows = 
df.duplicated().sum() cols_to_check = ["city", "state", "house_size", "acre_lot", "bed", "bath"] @@ -209,7 +210,6 @@ def validate_ranges(df: pd.DataFrame) -> dict: def validate_non_negative(df: pd.DataFrame) -> dict: - """Validate columns shouldn't be negative.""" non_negative_columns = [ "bed", "bath", @@ -242,7 +242,6 @@ def validate_non_negative(df: pd.DataFrame) -> dict: def validate_categorical(df: pd.DataFrame) -> dict: - """Validate Categories.""" expected = { "status": {"for_sale", "sold", "ready_to_build"}, "price_category": {"Budget", "Mid-Range", "Premium", "Luxury"}, @@ -279,7 +278,6 @@ def validate_categorical(df: pd.DataFrame) -> dict: def distribution_report(df: pd.DataFrame) -> dict: - """Distribuition estimate validation.""" numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() report = {} @@ -314,7 +312,6 @@ def distribution_report(df: pd.DataFrame) -> dict: def detect_quality_issues(validation: dict) -> list[dict]: - """Report issues.""" issues = [] schema = validation["schema"] @@ -522,11 +519,13 @@ def parse_args(): def main() -> None: + setup_logging("validate_data") args = parse_args() input_path = Path(args.input) out_json = Path(args.out_json) out_txt = Path(args.out_txt) + logging.info("Reading validation input: %s", input_path) df = pd.read_csv(input_path) report = build_validation_report(df, str(input_path)) @@ -542,6 +541,7 @@ def main() -> None: print(f"JSON report: {out_json}") print(f"Text report: {out_txt}") print(f"Quality issues found: {len(report['quality_issues'])}") + logging.info("Validation complete with %s issues", len(report["quality_issues"])) if __name__ == "__main__": diff --git a/src/house_price_class_prediction/visualization/visualize.py b/src/house_price_class_prediction/visualization/visualize.py index 05d5bed..d6b1460 100644 --- a/src/house_price_class_prediction/visualization/visualize.py +++ b/src/house_price_class_prediction/visualization/visualize.py @@ -1,5 +1,5 @@ +import logging import math 
-from pathlib import Path import matplotlib.pyplot as plt import pandas as pd @@ -7,10 +7,8 @@ from pandas.plotting import parallel_coordinates from sklearn.preprocessing import MinMaxScaler -PROJECT_ROOT = Path(__file__).resolve().parents[3] -DATA_DIR = PROJECT_ROOT / "data" -PROCESSED_DIR = DATA_DIR / "processed" -FIGURES_DIR = PROJECT_ROOT / "reports" / "figures" +from config import FIGURES_DIR, PROCESSED_DIR +from house_price_class_prediction.utils.logging_utils import setup_logging def save_current_figure(filename): @@ -317,6 +315,8 @@ def feature_to_target(train_df): def main(): + setup_logging("visualize") + logging.info("Generating visualizations from %s", PROCESSED_DIR) X_train_selected = pd.read_csv(PROCESSED_DIR / "X_train_final.csv") y_train = pd.read_csv(PROCESSED_DIR / "y_train_final.csv") train_df = pd.concat([X_train_selected, y_train], axis=1) @@ -352,6 +352,7 @@ def main(): "price_category" ].astype(str) feature_to_target(train_df=feature_target_df) + logging.info("Visualization complete: %s", FIGURES_DIR) if __name__ == "__main__": diff --git a/test_environment.py b/test_environment.py deleted file mode 100644 index 4294e82..0000000 --- a/test_environment.py +++ /dev/null @@ -1,26 +0,0 @@ -import sys -from pathlib import Path - - -MIN_PYTHON = (3, 11) -REQUIRED_PATHS = [ - Path("pyproject.toml"), - Path("src/house_price_class_prediction"), -] - - -def main() -> None: - if sys.version_info < MIN_PYTHON: - required = ".".join(str(part) for part in MIN_PYTHON) - current = ".".join(str(part) for part in sys.version_info[:3]) - raise SystemExit(f"Python {required}+ is required; found {current}.") - - missing_paths = [str(path) for path in REQUIRED_PATHS if not path.exists()] - if missing_paths: - raise SystemExit(f"Missing required project paths: {missing_paths}") - - print(f"Python {sys.version.split()[0]} environment check passed.") - - -if __name__ == "__main__": - main() diff --git a/tests/test_model_train.py b/tests/test_model_train.py new file 
mode 100644 index 0000000..17b1f62 --- /dev/null +++ b/tests/test_model_train.py @@ -0,0 +1,96 @@ +import pandas as pd +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KNeighborsClassifier +from sklearn.metrics import classification_report + +from config import PROCESSED_DIR + + +def test_data_loading(): + X_train = pd.read_csv(PROCESSED_DIR / "X_train_final.csv") + X_test = pd.read_csv(PROCESSED_DIR / "X_test_final.csv") + y_train = pd.read_csv(PROCESSED_DIR / "y_train_final.csv") + y_test = pd.read_csv(PROCESSED_DIR / "y_test_final.csv") + assert not X_train.empty, "X_train should not be empty" + assert not y_train.empty, "y_train should not be empty" + assert not X_test.empty, "X_test should not be empty" + assert not y_test.empty, "y_test should not be empty" + + +def test_data_shape(): + X_train = pd.read_csv(PROCESSED_DIR / "X_train_final.csv") + X_test = pd.read_csv(PROCESSED_DIR / "X_test_final.csv") + y_train = pd.read_csv(PROCESSED_DIR / "y_train_final.csv") + y_test = pd.read_csv(PROCESSED_DIR / "y_test_final.csv") + assert ( + X_train.shape[0] == y_train.shape[0] + ), "number of data row in X_train and y_train should match" + assert ( + X_test.shape[0] == y_test.shape[0] + ), "number of data row in X_test and y_test should match" + + +def test_cross_validation(): + knn = KNeighborsClassifier() + X_train = pd.DataFrame({"feature1": [1, 2, 3, 4, 5]}) + y_train = pd.Series([0, 0, 1, 1, 1]) + param_grid = { + "n_neighbors": [1, 2], + "weights": ["uniform", "distance"], + "metric": ["euclidean", "manhattan", "minkowski"], + } + grid_search = GridSearchCV(knn, param_grid, cv=2) + grid_search.fit(X_train, y_train) + assert grid_search.best_estimator_ is not None, "best estimator should not be None" + assert grid_search.best_params_ is not None, "best parameters should not be None" + assert grid_search.best_score_ is not None, "best score should not be None" + + +def test_classification_report(): + y_true = [0, 1, 2, 2, 0] + 
y_pred = [0, 0, 2, 2, 1] + report = classification_report(y_true, y_pred, output_dict=True) + assert isinstance(report, dict), "report should be a dictionary" + assert "0" in report, "report should contain class 0" + assert "1" in report, "report should contain class 1" + assert "2" in report, "report should contain class 2" + assert "accuracy" in report, "report should contain accuracy" + assert ( + report["0"]["precision"] is not None + ), "precision for class 0 should not be None" + assert report["0"]["recall"] is not None, "recall for class 0 should not be None" + assert ( + report["0"]["f1-score"] is not None + ), "f1-score for class 0 should not be None" + assert ( + report["1"]["precision"] is not None + ), "precision for class 1 should not be None" + assert report["1"]["recall"] is not None, "recall for class 1 should not be None" + assert ( + report["1"]["f1-score"] is not None + ), "f1-score for class 1 should not be None" + assert ( + report["2"]["precision"] is not None + ), "precision for class 2 should not be None" + assert report["2"]["recall"] is not None, "recall for class 2 should not be None" + assert ( + report["2"]["f1-score"] is not None + ), "f1-score for class 2 should not be None" + assert report["accuracy"] is not None, "accuracy should not be None" + assert report["accuracy"] == 0.6, "accuracy should be 0.6" + + +def test_full_pipeline(): + X_train = pd.DataFrame({"feature1": [1, 2, 3, 4, 5]}) + y_train = pd.Series([0, 0, 1, 1, 1]) + X_test = pd.DataFrame({"feature1": [1, 2, 3]}) + y_test = pd.Series([0, 1, 1]) + knn = KNeighborsClassifier(n_neighbors=3) + knn.fit(X_train, y_train) + y_pred = knn.predict(X_test) + report = classification_report(y_test, y_pred, output_dict=True) + assert isinstance(report, dict), "report should be a dictionary" + assert len(y_pred) == len( + y_test + ), "should predict the same number of samples as in y_test" + assert report["accuracy"] is not None, "accuracy should not be None" diff --git 
a/tests/test_predict.py b/tests/test_predict.py new file mode 100644 index 0000000..cdeb3c3 --- /dev/null +++ b/tests/test_predict.py @@ -0,0 +1,73 @@ +import pickle + +import pandas as pd + +from config import PIPELINE_LOG_PATH, PROCESSED_DIR, TEST_MODEL_PATH +from house_price_class_prediction.models import predict as predict_module + + +class DummyModel: + def __init__(self, predictions): + self.predictions = predictions + + def predict(self, X_test): + return self.predictions[: len(X_test)] + + +def test_predict_module_uses_configured_paths(): + assert predict_module.PROCESSED_DIR == PROCESSED_DIR + assert predict_module.TEST_MODEL_PATH == TEST_MODEL_PATH + + +def test_read_data_reads_expected_files(tmp_path, monkeypatch): + pd.DataFrame({"feature": [1, 2]}).to_csv(tmp_path / "X_test_final.csv", index=False) + pd.DataFrame({"target": [0, 1]}).to_csv(tmp_path / "y_test_final.csv", index=False) + monkeypatch.setattr(predict_module, "PROCESSED_DIR", tmp_path) + + X_test, y_test = predict_module.read_data() + + assert X_test["feature"].tolist() == [1, 2] + assert y_test.tolist() == [0, 1] + + +def test_load_model_reads_expected_path(tmp_path, monkeypatch): + model_path = tmp_path / "test_model.pkl" + model = DummyModel([1, 0]) + with model_path.open("wb") as f: + pickle.dump(model, f) + monkeypatch.setattr(predict_module, "TEST_MODEL_PATH", model_path) + + loaded_model = predict_module.load_model() + + assert isinstance(loaded_model, DummyModel) + assert loaded_model.predictions == [1, 0] + + +def test_predict_returns_model_predictions(): + model = DummyModel([0, 1]) + X_test = pd.DataFrame({"feature": [10, 20]}) + + predictions = predict_module.predict(model, X_test) + + assert predictions == [0, 1] + + +def test_run_prediction_uses_small_test_files(tmp_path, monkeypatch, capsys): + pd.DataFrame({"feature": [1, 2]}).to_csv(tmp_path / "X_test_final.csv", index=False) + pd.DataFrame({"target": [0, 1]}).to_csv(tmp_path / "y_test_final.csv", index=False) + + model_path 
= tmp_path / "test_model.pkl" + with model_path.open("wb") as f: + pickle.dump(DummyModel([0, 1]), f) + + monkeypatch.setattr(predict_module, "PROCESSED_DIR", tmp_path) + monkeypatch.setattr(predict_module, "TEST_MODEL_PATH", model_path) + + predictions = predict_module.run_prediction() + + captured = capsys.readouterr() + log_text = PIPELINE_LOG_PATH.read_text(encoding="utf-8") + assert predictions == [0, 1] + assert "[[1 0]" in captured.out + assert "===== predict_section | runtime_id=" in log_text + assert "Prediction complete" in log_text