4 changes: 2 additions & 2 deletions content/python_files/direct_vs_recursive_forecasting.py
@@ -23,9 +23,9 @@
# segments:
#
# - Segment type "a" has a prefix centered around 0 with low variance and a
# suffix centered around 1.
# suffix centered around 1 with low variance.
# - Segment type "b" has a prefix centered around 0 with high variance and a
# suffix centered around -1.
# suffix centered around -1 with low variance.
#
# Segments of type "a" and "b" are independently sampled, meaning that it is not
# possible to forecast beyond the length of the segments. However, it should be
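The sampling scheme described above can be sketched as follows (the segment lengths, standard deviations, and the generator function are illustrative assumptions, not the notebook's actual code):

```python
import numpy as np

rng = np.random.default_rng(0)


def sample_segment(kind, prefix_len=50, suffix_len=50):
    """Hypothetical generator matching the description of the two segment types."""
    if kind == "a":
        # low-variance prefix around 0, low-variance suffix around 1
        prefix = rng.normal(0.0, 0.1, prefix_len)
        suffix = rng.normal(1.0, 0.1, suffix_len)
    else:
        # high-variance prefix around 0, low-variance suffix around -1
        prefix = rng.normal(0.0, 0.5, prefix_len)
        suffix = rng.normal(-1.0, 0.1, suffix_len)
    return np.concatenate([prefix, suffix])


# Independently sampled segments concatenated into one series.
series = np.concatenate(
    [sample_segment(rng.choice(["a", "b"])) for _ in range(10)]
)
```

Because each segment is drawn independently, knowing the current segment helps forecast its own suffix, but nothing beyond the segment boundary.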
16 changes: 7 additions & 9 deletions content/python_files/feature_engineering.py
@@ -67,7 +67,7 @@
# We wrap the resulting polars dataframe in a `skrub` expression to benefit
# from the built-in `skrub.TableReport` display in the notebook. Using the
# `skrub` expression system will also be useful for other reasons: all
# operations in this notebook chain operations chained together in a directed
# operations in this notebook are chained together in a directed
# acyclic graph that is automatically tracked by `skrub`. This allows us to
# extract the resulting pipeline and apply it to new data later on, exactly
# like a trained scikit-learn pipeline. The main difference is that we do so
@@ -115,9 +115,7 @@ def build_historical_time_range(
# Let's now load the data records for the time range defined above.
#
# To avoid network issues when running this notebook, the necessary data files
# have already been downloaded and saved in the `datasets` folder. See the
# README.md file for instructions to download the data manually if you want to
# re-run this notebook with more recent data.
# have already been downloaded and saved in the `datasets` folder.

# %%
data_source_folder = skrub.var("data_source_folder", "../datasets")
@@ -349,9 +347,9 @@ def iqr(col, *, window_size: int):
# When working with historical data, we often have access to all the past
# measurements in the dataset. However, when we want to use the lagged features
# in a forecasting model, we need to be careful about the length of the
# **system lag**: the time between a timestamped measurement is made in the
# real world and the time the record is made available to the downstream
# application (in our case, a deployed predictive pipeline).
# **system lag**. The system lag is the time lapse between the moment a
# timestamped measurement is made in the real world and the moment the record
# is made available to the downstream application (in our case, a deployed
# predictive pipeline).
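As a toy illustration of how system lag constrains feature engineering (using pandas and made-up column names rather than the notebook's polars pipeline): if records only become available 3 hours after measurement, the most recent lag feature that is safe to use at prediction time is `shift(3)`, not `shift(1)`.

```python
import pandas as pd

# Hourly measurements; assume a worst-case system lag of 3 hours.
idx = pd.date_range("2024-01-01", periods=8, freq="h")
df = pd.DataFrame({"load_mw": [float(i) for i in range(8)]}, index=idx)

system_lag_steps = 3  # illustrative value, rarely documented in real sources
# Only lags of at least `system_lag_steps` are usable by a deployed pipeline.
df["load_mw_lag3"] = df["load_mw"].shift(system_lag_steps)
```

Training on smaller lags than the deployed system can actually observe would leak information and overstate the model's accuracy.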
#
# System lag is rarely explicitly represented in the data sources even if such
# delay can be as large as several hours or even days and can sometimes be
@@ -386,8 +384,8 @@ def iqr(col, *, window_size: int):

# %% [markdown]
#
# Let's extract the dates where the inter-quartile range of the load is
# greater than 15,000 MW.
# Let's extract the dates where the inter-quartile range of the load over 7 days
# is greater than 15,000 MW, to investigate the outliers highlighted by the
# TableReport.

# %%
electricity_lagged.filter(pl.col("load_mw_iqr_7d") > 15_000)[
2 changes: 1 addition & 1 deletion content/python_files/multiple_horizons_prediction.py
@@ -44,7 +44,7 @@
#
# ## Predicting multiple horizons with a grid of single output models
#
# Usually, it is really common to predict values for multiple horizons at once. The most
# It is really common to predict values for multiple horizons at once. The most
# naive approach is to train as many models as there are horizons. To achieve this,
# scikit-learn provides a meta-estimator called `MultiOutputRegressor` that fits
# one single-output model per horizon behind a single estimator interface.
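A minimal sketch of this approach on synthetic data (shapes, base estimator, and names are illustrative): `MultiOutputRegressor` fits one clone of the base estimator per target column, i.e. one per horizon.

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor

# Toy setup: predict the next 3 horizons from 5 lag features.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
Y = rng.normal(size=(100, 3))  # one column per horizon

# One Ridge model is fitted independently per horizon (column of Y).
model = MultiOutputRegressor(Ridge()).fit(X, Y)
pred = model.predict(X)
```

The fitted per-horizon models are available as `model.estimators_`, which makes the "grid of single output models" structure explicit.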
15 changes: 8 additions & 7 deletions content/python_files/single_horizon_prediction.py
@@ -167,7 +167,7 @@
#
# In the example below, we define that the training data should be at most 2 years
# worth of data and the test data should be 24 weeks long. We also define a gap of
# 1 week between the training.
# 1 week between the training and the testing sets.
#
# Let's check those statistics by iterating over the different folds provided by the
# splitter.
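Assuming hourly records, such a splitter can be sketched with scikit-learn's `TimeSeriesSplit` (the notebook's actual splitter may differ; `TimeSeriesSplit` expresses all sizes in numbers of samples):

```python
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

HOURS_PER_WEEK = 24 * 7
splitter = TimeSeriesSplit(
    n_splits=3,
    max_train_size=2 * 52 * HOURS_PER_WEEK,  # at most ~2 years of training data
    test_size=24 * HOURS_PER_WEEK,           # 24 weeks of test data
    gap=HOURS_PER_WEEK,                      # 1 week between train and test
)

n_samples = 3 * 52 * HOURS_PER_WEEK  # ~3 years of hourly records
folds = list(splitter.split(np.zeros(n_samples)))
```

Iterating over `folds` confirms that each test fold starts exactly one week after the last training sample and that the training window never exceeds two years.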
@@ -206,23 +206,24 @@
# target variable such as electricity load forecasting.
#
# We can also look at the R2 score and the Poisson and Gamma deviance which are
# all strictly proper scoring rules for estimation of E[y|X]: in the large
# all strictly proper scoring rules for estimation of $E[y|X]$: in the large
# sample limit, minimizers of those metrics all identify the conditional
# expectation of the target variable given the features for strictly positive
# target variables. All those metrics follow the higher is better convention,
# 1.0 is the maximum reachable score and 0.0 is the score of a model that
# predicts the mean of the target variable for all observations, irrespective
# of the features.
#
# No that in general, a deviance score of 1.0 is not reachable since it
# Note that in general, a deviance score of 1.0 is not reachable since it
# corresponds to a model that always predicts the target value exactly
# for all observations. In practice, because there is always a fraction of the
# variability in the target variable that is not explained by the information
# available to construct the features.
# available to construct the features, this perfect prediction is impossible.

# %%
from sklearn.metrics import make_scorer, mean_absolute_percentage_error, get_scorer
from sklearn.metrics import d2_tweedie_score
from sklearn.metrics import (
make_scorer, mean_absolute_percentage_error, get_scorer, d2_tweedie_score
)


hgbr_cv_results = hgbr_predictions.skb.cross_validate(
@@ -310,7 +311,7 @@
# A real model's curve lies between the diagonal and the oracle model's curve.
# The area between the diagonal and the Lorenz curve of a model is called the
# Gini index.
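A Lorenz curve as described above can be computed in a few lines (values are illustrative): sort the observations, then track the cumulative share of the total against the cumulative share of the population; the oracle model corresponds to ranking observations by their true values.

```python
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0, 10.0])  # made-up observed loads
y_sorted = np.sort(y)

# Cumulative share of the total, from the smallest observation up.
lorenz = np.cumsum(y_sorted) / y_sorted.sum()
# Prepend 0 so the curve starts at the origin.
lorenz = np.concatenate([[0.0], lorenz])
```

Plotting `lorenz` against equally spaced population shares gives the curve; a distribution dominated by a few large values bows the curve far from the diagonal, signalling a harder ranking problem.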
#
# For our model, we observe that each oracle model is not far from the diagonal. It
# For our use case, we observe that each oracle model is not far from the diagonal. It
# means that the observed values do not contain a couple of large values with high
# variability. Therefore, it informs us that the complexity of our problem at hand is
# not too high. Looking at the Lorenz curve of each model, we observe that it is quite