From f6d0577893e19f439ff33c1adc23da9a27ffa8e4 Mon Sep 17 00:00:00 2001 From: Marie Date: Thu, 24 Jul 2025 14:24:11 +0200 Subject: [PATCH] some changes to be a bit more explicit --- .../direct_vs_recursive_forecasting.py | 4 ++-- content/python_files/feature_engineering.py | 16 +++++++--------- .../python_files/multiple_horizons_prediction.py | 2 +- .../python_files/single_horizon_prediction.py | 15 ++++++++------- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/content/python_files/direct_vs_recursive_forecasting.py b/content/python_files/direct_vs_recursive_forecasting.py index 297aff1..750fc60 100644 --- a/content/python_files/direct_vs_recursive_forecasting.py +++ b/content/python_files/direct_vs_recursive_forecasting.py @@ -23,9 +23,9 @@ # segments: # # - Segment type "a" has a prefix centered around 0 with low variance and a -# suffix centered around 1. +# suffix centered around 1 with low variance. # - Segment type "b" has a prefix centered around 0 with high variance and a -# suffix centered around -1. +# suffix centered around -1 with low variance. # # Segment of type "a" and "b" are independently sampled, meaning that is not # possible to forecast beyond the length of the segments. However, it should be diff --git a/content/python_files/feature_engineering.py b/content/python_files/feature_engineering.py index a9dddad..c023d01 100644 --- a/content/python_files/feature_engineering.py +++ b/content/python_files/feature_engineering.py @@ -67,7 +67,7 @@ # We wrap the resulting polars dataframe in a `skrub` expression to benefit # from the built-in `skrub.TableReport` display in the notebook. Using the # `skrub` expression system will also be useful for other reasons: all -# operations in this notebook chain operations chained together in a directed +# operations in this notebook are chained together in a directed # acyclic graph that is automatically tracked by `skrub`. 
This allows us to # extract the resulting pipeline and apply it to new data later on, exactly # like a trained scikit-learn pipeline. The main difference is that we do so @@ -115,9 +115,7 @@ def build_historical_time_range( # Let's now load the data records for the time range defined above. # # To avoid network issues when running this notebook, the necessary data files -# have already been downloaded and saved in the `datasets` folder. See the -# README.md file for instructions to download the data manually if you want to -# re-run this notebook with more recent data. +# have already been downloaded and saved in the `datasets` folder. # %% data_source_folder = skrub.var("data_source_folder", "../datasets") @@ -349,9 +347,9 @@ def iqr(col, *, window_size: int): # When working with historical data, we often have access to all the past # measurements in the dataset. However, when we want to use the lagged features # in a forecasting model, we need to be careful about the length of the -# **system lag**: the time between a timestamped measurement is made in the -# real world and the time the record is made available to the downstream -# application (in our case, a deployed predictive pipeline). +# **system lag**. The system lag is the time lapse between the moment a timestamped +# measurement is made in the real world and the moment when the record is made +# available to the downstream application (in our case, a deployed predictive pipeline). # # System lag is rarely explicitly represented in the data sources even if such # delay can be as large as several hours or even days and can sometimes be @@ -386,8 +384,8 @@ def iqr(col, *, window_size: int): # %% [markdown] # -# Let's extract the dates where the inter-quartile range of the load is -# greater than 15,000 MW. +# Let's extract the dates where the inter-quartile range of the load over 7 days is +# greater than 15,000 MW, to investigate the outliers highlighted by the TableReport. 
# %% electricity_lagged.filter(pl.col("load_mw_iqr_7d") > 15_000)[ diff --git a/content/python_files/multiple_horizons_prediction.py b/content/python_files/multiple_horizons_prediction.py index 2e045d7..dfc27f4 100644 --- a/content/python_files/multiple_horizons_prediction.py +++ b/content/python_files/multiple_horizons_prediction.py @@ -44,7 +44,7 @@ # # ## Predicting multiple horizons with a grid of single output models # -# Usually, it is really common to predict values for multiple horizons at once. The most +# It is really common to predict values for multiple horizons at once. The most # naive approach is to train as many models as there are horizons. To achieve this, # scikit-learn provides a meta-estimator called `MultiOutputRegressor` that can be used # to train a single model that predicts multiple horizons at once. diff --git a/content/python_files/single_horizon_prediction.py b/content/python_files/single_horizon_prediction.py index 94106a7..306d825 100644 --- a/content/python_files/single_horizon_prediction.py +++ b/content/python_files/single_horizon_prediction.py @@ -167,7 +167,7 @@ # # In the example below, we define that the training data should be at most 2 years # worth of data and the test data should be 24 weeks long. We also define a gap of -# 1 week between the training. +# 1 week between the training and the testing sets. # # Let's check those statistics by iterating over the different folds provided by the # splitter. @@ -206,7 +206,7 @@ # target variable such as electricity load forecasting. # # We can also look at the R2 score and the Poisson and Gamma deviance which are -# all strictly proper scoring rules for estimation of E[y|X]: in the large +# all strictly proper scoring rules for estimation of $E[y|X]$: in the large # sample limit, minimizers of those metrics all identify the conditional # expectation of the target variable given the features for strictly positive # target variables. 
All those metrics follow the higher is better convention, @@ -214,15 +214,16 @@ # predicts the mean of the target variable for all observations, irrespective # of the features. # -# No that in general, a deviance score of 1.0 is not reachable since it +# Note that in general, a deviance score of 1.0 is not reachable since it # corresponds to a model that always predicts the target value exactly # for all observations. In practice, because there is always a fraction of the # variability in the target variable that is not explained by the information -# available to construct the features. +# available to construct the features, this perfect prediction is impossible. # %% -from sklearn.metrics import make_scorer, mean_absolute_percentage_error, get_scorer -from sklearn.metrics import d2_tweedie_score +from sklearn.metrics import ( +    make_scorer, mean_absolute_percentage_error, get_scorer, d2_tweedie_score +) hgbr_cv_results = hgbr_predictions.skb.cross_validate( @@ -310,7 +311,7 @@ # A true model is navigating between the diagonal and the oracle model. The area between # the diagonal and the Lorenz curve of a model is called the Gini index. # -# For our model, we observe that each oracle model is not far from the diagonal. It +# For our use case, we observe that each oracle model is not far from the diagonal. It # means that the observed values do not contain a couple of large values with high # variability. Therefore, it informs us that the complexity of our problem at hand is # not too high. Looking at the Lorenz curve of each model, we observe that it is quite