From d8c580b9fc89e200418bcbd95e167ebdbc498492 Mon Sep 17 00:00:00 2001
From: Aaron Spring <draaronspring@proton.me>
Date: Thu, 11 Jun 2026 13:38:03 +0200
Subject: [PATCH] Fix continuous ROC for binary obs with out-of-range forecasts
 (#442)

In `roc(..., bin_edges="continuous")` the binary observations were
categorised with the forecast-derived threshold `i`. When forecast
values fall outside the [0, 1] range of the observations, no observation
ever lands in the event category, so the hit rate is always 0 and the
area collapses (e.g. 0.0 instead of 1.0).

Categorise observations with a fixed split at 0.5 in continuous mode,
varying only the forecast threshold. Results now match
`sklearn.metrics.roc_auc_score` regardless of forecast magnitude or
input dimension ordering.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.rst                           | 12 +++++++++
 xskillscore/core/probabilistic.py       |  7 ++++-
 xskillscore/tests/test_probabilistic.py | 34 +++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e7f4fd45..c4fccc1f 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -3,6 +3,18 @@ Changelog History
 =================
 
 
+xskillscore unreleased
+----------------------
+
+Bug Fixes
+~~~~~~~~~
+- Fixed :py:func:`~xskillscore.roc` with ``bin_edges="continuous"`` returning
+  incorrect areas (down to ``0.0`` instead of ``1.0``) when forecast values fall
+  outside the ``[0, 1]`` range of the binary observations. Observations are now
+  categorised with a fixed split rather than the forecast-derived threshold, so
+  results match ``sklearn.metrics.roc_auc_score``. (:issue:`442`) `Aaron Spring`_
+
+
 xskillscore v0.0.29 (2026-02-18)
 --------------------------------
 
diff --git a/xskillscore/core/probabilistic.py b/xskillscore/core/probabilistic.py
index 7e650360..7134b28d 100644
--- a/xskillscore/core/probabilistic.py
+++ b/xskillscore/core/probabilistic.py
@@ -1310,6 +1310,11 @@ def roc(
 
     # loop over each bin_edge and get true positive rate and false positive rate
     # from contingency
+    if continuous:
+        # observations are binary (0/1); categorise them with a fixed split at 0.5
+        # rather than the forecast-derived threshold ``i`` (which may lie outside
+        # the [0, 1] range of the observations). See GH #442.
+        observation_category_edges = np.array([-np.inf, 0.5, np.inf])
     tpr_list, fpr_list = [], []
     for i in bin_edges:
         dichotomous_category_edges = np.array(
@@ -1318,7 +1323,7 @@ def roc(
         dichotomous_contingency = Contingency(
             observations,
             forecasts,
-            dichotomous_category_edges,
+            observation_category_edges if continuous else dichotomous_category_edges,
             dichotomous_category_edges,
             dim=dim,
         )
diff --git a/xskillscore/tests/test_probabilistic.py b/xskillscore/tests/test_probabilistic.py
index 5147d476..2763a19a 100644
--- a/xskillscore/tests/test_probabilistic.py
+++ b/xskillscore/tests/test_probabilistic.py
@@ -1020,6 +1020,40 @@ def test_roc_bin_edges_continuous_against_sklearn(
         assert (xs_tpr == sk_tpr).all()
 
 
+@pytest.mark.parametrize("drop_intermediate_bool", [False, True])
+def test_roc_continuous_forecast_out_of_unit_range_against_sklearn(drop_intermediate_bool):
+    """Continuous ROC must match sklearn even when forecasts lie outside [0, 1].
+
+    Regression test for GH #442: observations are binary but were categorised
+    with the forecast-derived threshold, so events were never detected once
+    forecast magnitudes exceeded the [0, 1] range of the observations.
+    """
+    np.random.seed(1512)
+    obs_raw = xr.DataArray(
+        np.random.normal(0.5, 0.2, size=(20, 10)),
+        coords=[("time", np.arange(20)), ("points", np.arange(10))],
+    )
+    da_obs = (obs_raw > 0.5).astype(int)
+    # forecast shifted out of [0, 1] per point and with a transposed dim order
+    alpha = xr.DataArray(np.linspace(0, 1, num=10), coords=[("points", np.arange(10))])
+    err = xr.DataArray(np.random.normal(0.0, 0.03, size=20), coords=[("time", np.arange(20))])
+    da_forecast = alpha + obs_raw + err
+
+    xs_area = roc(
+        da_obs,
+        da_forecast,
+        "continuous",
+        dim="time",
+        drop_intermediate=drop_intermediate_bool,
+        return_results="area",
+    )
+    for point in range(da_obs.points.size):
+        sk_area = roc_auc_score(
+            da_obs.isel(points=point).values, da_forecast.isel(points=point).values
+        )
+        np.testing.assert_allclose(xs_area.isel(points=point), sk_area)
+
+
 def test_roc_bin_edges_drop_intermediate(forecast_1d_long, observation_1d_long):
     """Test that drop_intermediate reduces probability_bins in xs.roc ."""
     fp = forecast_1d_long.clip(0, 1)  # prob