From 1f4ff3906ad698047222720fbb7426882f5463df Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Thu, 18 Sep 2025 08:36:28 +0100
Subject: [PATCH 01/12] well the model data is getting through but it's not
 using the yaml properly and now i need to, for regridding

---
 src/CSET/operators/__init__.py        |   2 +
 src/CSET/operators/cell_statistics.py | 561 ++++++++++++++++++++++++++
 src/CSET/recipes/cell_statistics.yaml |  27 ++
 3 files changed, 590 insertions(+)
 create mode 100644 src/CSET/operators/cell_statistics.py
 create mode 100644 src/CSET/recipes/cell_statistics.yaml

diff --git a/src/CSET/operators/__init__.py b/src/CSET/operators/__init__.py
index 8f68de590..0dc5ae949 100644
--- a/src/CSET/operators/__init__.py
+++ b/src/CSET/operators/__init__.py
@@ -28,6 +28,7 @@
 from CSET.operators import (
     ageofair,
     aggregate,
+    cell_statistics,
     collapse,
     constraints,
     convection,
@@ -46,6 +47,7 @@
 __all__ = [
     "ageofair",
     "aggregate",
+    "cell_statistics",
     "collapse",
     "constraints",
     "convection",
diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
new file mode 100644
index 000000000..965317832
--- /dev/null
+++ b/src/CSET/operators/cell_statistics.py
@@ -0,0 +1,561 @@
+"""OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE."""
+
+import iris.analysis
+import iris.coords
+import iris.cube
+import iris.util
+import numpy as np
+from scipy import ndimage
+
+
+# some things from the res job
+#
+# DATES="20200801T0600Z 20200801T1800Z 20200802T0600Z"
+# LAND_SEA_SPLIT="False"
+# NUM_PROC="8"
+# INPUT_DIR="/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_quick"
+# PLOT_DIR="/data/users/byron.blay/RES/test_full/diags"
+# RECIPE="generic_basic_qq_plot.yaml"
+# SHORT_NAMES="ral2 ral3"
+# SITE="metoffice"
+# Y_AXIS="frequency"
+# PLOT_PARAM="$CYLC_TASK_PARAM_cellplots"
+
+
+# is this going to be the cli usage?
+# export $INPUT_DIR=$SCRATCH/RES_RETRIEVAL_CACHE/test_quick
+# export OUTPUT_DIR=$SCRATCH/temp/cset
+# export RECIPE=cell_statistics.yaml
+# cset -v bake --input-dir=$INPUT_DIR --output-dir=$OUTPUT_DIR --recipe=$RECIPE
+#   --LAND_SEA_SPLIT=False
+
+
+def var_filter(cubes, long_name=None, stash=None):
+    """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE."""
+    result = []
+    for cube in cubes:
+        if long_name and cube.long_name == long_name:
+            result.append(cube)
+        elif stash and "STASH" in cube.attributes and cube.attributes["STASH"] == stash:
+            result.append(cube)
+
+    return iris.cube.CubeList(result)
+
+
+def get_cell_stat_vars(cubes):
+    """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE."""
+    vars = {}
+
+    # Large-scale rainfall rate
+    vars["rainfall rate"] = var_filter(
+        cubes, long_name="surface_microphysical_rainfall_rate", stash="m01s04i203"
+    )
+
+    # 1-hourly mean precipitation rate
+    vars["rainfall amount"] = var_filter(
+        cubes, long_name="surface_microphysical_rainfall_amount", stash="m01s04i201"
+    )
+    # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s04i202"))  # snow
+    # todo: RES has a switch for the following
+    # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s05i201"))
+    # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s05i202"))
+
+    # ensure at least one cube found for each variable
+    for k, v in vars.items():
+        assert len(v) > 0, f"No cubes found for variable '{k}'"
+
+    # # strictly one cube per variable?
+    # assert len(vars["Large-scale rainfall rate"]) == 1
+    # assert len(vars["1-hourly mean precipitation rate"]) == 1
+    # vars["Large-scale rainfall rate"] = vars["Large-scale rainfall rate"][0]
+    # vars["1-hourly mean precipitation rate"] = vars["1-hourly mean precipitation rate"][0]
+
+    return vars
+
+
+def caller_thing(cell_attribute, **kwargs):
+    """
+    Create a histogram from the given cube, for every threshold and cell attribute.
+
+    The cube will be "1-hourly mean precipitation rate" or "Large-scale rainfall rate".
+
+
+    Also for:
+        cell attribute
+            effective_radius_in_km, mean_value
+
+    """
+
+    # todo: remove var_name? impedes merge
+
+    # For each model, extract the two variables we're interested in.
+    model_data = {}
+    for model, cubes in kwargs.items():
+        model_data[model] = get_cell_stat_vars(cubes)
+
+    # For now, thresholds and cell attributes are hard coded.
+    thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
+
+    # todo: what about different models? how are they loaded and labelled?
+
+    # # For now, cell attributes are hard coded.
+    # cell_attributes = ["effective_radius_in_km", "mean_value"]
+    #
+    # # For now, hard-code to these two, which is what we see in the res output.
+    # # todo: there is also "time" in the res code
+    # time_groupings = ["forecast_period", "hour"]
+
+    # We need a list of times for each time grouping
+    # For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5
+    # For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z
+    # todo: we should presumably get this from the cube.
+    # assert False
+
+    # todo: we need a land-sea split input?
+    # for now, hard-code as false (this is what the res output had it set to)
+    # land_sea_split = False
+
+    # what about obs? GPM & UK_RADAR_2KM? ignore for now.
+
+    # different bins for the two cell attributes
+    bin_edges = {}
+
+    bin_edges['effective_radius_in_km'] = 10**(np.arange(0.0, 3.12, 0.12))
+    bin_edges['effective_radius_in_km'] = np.insert(bin_edges['effective_radius_in_km'], 0, 0)
+
+    bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12))
+    bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0)
+
+
+
+    model_histograms = {}
+    for model, data in model_data.keys():
+        for threshold in thresholds:
+            something_like_cell_attribute_histogram(
+                cube,
+                attribute=cell_attribute,
+                bin_edges=bin_edges[cell_attribute],
+                threshold=threshold,
+            )
+
+
+    # return a cube for each model
+    return model_histograms
+
+
+def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0):
+    hist_cube = iris.cube.CubeList()
+    coords = get_non_spatial_coords(cube)
+
+    # Express histogram bins as an Iris coordinate
+    bin_centres = 0.5 * (bin_edges[1:] + bin_edges[:-1])
+    bins_as_coord = iris.coords.DimCoord(
+        bin_centres, long_name=attribute, units=cube.units,
+        coord_system=None, bounds=np.column_stack((bin_edges[0:-1], bin_edges[1:])))
+
+    hist_cubes = iris.cube.CubeList()
+
+    for slc in cube.slices_over(coords):
+        # Identify connected cells in this spatial slice
+        cells = find_cells(slc, threshold=threshold)
+
+        if cells:
+            # Extract values of the desired cell attribute
+            cell_attributes = [cell.attributes[attribute] for cell in cells]
+
+            # Construct a histogram of the desired cell attribute
+            hist, _ = np.histogram(cell_attributes)
+        else:
+            # Assign zeros to all bins
+            hist = np.zeros(bin_centres.size).astype(np.int64)
+
+        # Get a list of the non-spatial coordinates for this slice
+        aux_coords = [(coord, []) for coord in get_non_spatial_coords(slc)]
+
+        # Construct a cube to hold the cell statistic histogram for this slice
+        hist_slc = iris.cube.Cube(
+            hist,
+            long_name=("{0:s} cell {1:s} histogram".format(slc.name(), attribute)),
+            units="no_unit",
+            attributes=slc.attributes,
+            cell_methods=slc.cell_methods,
+            dim_coords_and_dims=[(bins_as_coord, 0)],
+            aux_coords_and_dims=aux_coords,
+        )
+
+        hist_cubes.append(hist_slc)
+
+    hist_cube = hist_cubes.merge_cube()
+    return hist_cube
+
+
+def get_spatial_coords(cube):
+    '''Returns the x, y coordinates of an input :class:`iris.cube.Cube`.'''
+    # Usual names for spatial coordinates
+    X_COORD_NAMES = ["longitude", "grid_longitude",
+                     "projection_x_coordinate", "x"]
+    Y_COORD_NAMES = ["latitude", "grid_latitude",
+                     "projection_y_coordinate", "y"]
+
+    # Get a list of coordinate names for the cube
+    coord_names = [coord.name() for coord in cube.coords()]
+
+    # Check which x-coordinate we have, if any
+    x_coords = [coord for coord in coord_names if coord in X_COORD_NAMES]
+    if len(x_coords) != 1:
+        raise ValueError("Could not identify a unique x-coordinate in cube")
+    x_coord = cube.coord(x_coords[0])
+
+    # Check which y-coordinate we have, if any
+    y_coords = [coord for coord in coord_names if coord in Y_COORD_NAMES]
+    if len(y_coords) != 1:
+        raise ValueError("Could not identify a unique y-coordinate in cube")
+    y_coord = cube.coord(y_coords[0])
+
+    return [x_coord, y_coord]
+
+
+def get_non_spatial_coords(cube):
+    """
+    Returns a list of the non-spatial coordinates of an input \
+    :class:`iris.cube.Cube`.
+    """
+    # Get a list of the cube coordinates
+    coords = cube.coords()
+    # Get the spatial coordinates of the cube
+    x_coord, y_coord = get_spatial_coords(cube)
+    # Remove the spatial coordinates from the list of coordinates
+    coords.remove(x_coord)
+    coords.remove(y_coord)
+    return coords
+
+
+def guess_bounds(cube):
+    '''
+    Takes an input :class:`iris.cube.Cube`, guesses bounds on the x, y \
+    coordinates, then returns the cube. Such bounds are often required by \
+    regridding algorithms.
+    '''
+    # Loop over spatial coordinates
+    for axis in ["x", "y"]:
+        coord = cube.coord(axis=axis)
+        # Check that this is not a variable resolution grid
+        # TODO: Does bounds really not work with variable resolution? AVD
+        # seem to think it won't...
+        try:
+            _ = iris.util.regular_step(coord)
+        except:
+            raise ValueError("Cannot guess bounds for a variable "
+                             "resolution grid")
+        # Guess bounds if there aren't any
+        if coord.bounds is None:
+            coord.guess_bounds()
+    return cube
+
+
+def connected_object_labelling(data, threshold=0.0, min_size=1, connectivity=1):
+    '''
+    Finds connected objects in an input array and assigns them unique labels.
+
+    Arguments:
+
+    * **data** - a :class:`numpy.ndarray` array in which to label objects.
+
+    Keyword arguments:
+
+    * **threshold** - if supplied, only regions where the input data exceeds \
+                      the threshold will be considered when searching for \
+                      connected objects.
+    * **min_size** - minimum size in grids points for connected objects. Must \
+                     be an integer >= 1.
+    * **connectivity** - given a particular grid point, all grid points up to \
+                         a squared distance of connectivity away are considered \
+                         neighbours. Connectivity may range from 1 (only direct \
+                         neighbours are considered) to :attr:`data.ndim`.
+
+    Returns:
+
+    * **label_array** - an integer array where each unique object in the input \
+                        array has a unique label in the returned array.
+    * **num_objects** - the number of objects found.
+    '''
+    # Apply the supplied threshold to the data to generate a binary array
+    binary_data = data.copy()
+    if np.ma.is_masked(binary_data):
+        # If the data is masked, replace any masked values with (threshold - 1)
+        # thus guaranteeing they will be below the threshold and thus set to 0
+        # below
+        binary_data = np.ma.filled(binary_data, fill_value=(threshold - 1))
+    # Set values above and below the threshold to 1 and 0, respectively
+    indices_above = (binary_data > threshold)
+    indices_below = (binary_data <= threshold)
+    binary_data[indices_above] = 1
+    binary_data[indices_below] = 0
+
+    # Construct a structuring element that defines how the neighbours of
+    # a grid point are assigned
+    structure_element = ndimage.morphology.generate_binary_structure(
+        data.ndim, connectivity)
+
+    # Label distinct (connected) objects in the binary array
+    label_array, num_objects = ndimage.measurements.label(
+        binary_data,
+        structure=structure_element)
+
+    # Throw away any objects smaller than min_size
+    if min_size < 1:
+        raise ValueError('"min_size" must be 1 or greater')
+    elif min_size > 1:
+        labels = np.unique(label_array)
+        # Discard the background (which will be labelled as 0)
+        labels = labels[(labels > 0)]
+        # Loop over distinct objects
+        for label in labels:
+            # Find the indices of the grid points comprising this object
+            indices = np.where(label_array == label)
+            # If this object is smaller than min_size, set it as background
+            if indices[0].size < min_size:
+                label_array[indices] = 0
+                num_objects -= 1
+
+    return label_array, num_objects
+
+
+def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
+    """
+    Finds connected objects (i.e. cells) in spatial slices of a given \
+    :class:`iris.cube.Cube`.
+
+    Arguments:
+
+    * **cube** - an input :class:`iris.cube.Cube` object.
+
+    Keyword arguments:
+
+    * **threshold** - if supplied, only regions where the input data exceeds \
+                      the threshold will be considered when identifying cells.
+    * **area_threshold** - minimum area in km^2 that cells must have.
+    * **connectivity** - given a particular grid point, all grid points up to a \
+                         squared distance of connectivity away are considered \
+                         neighbours. Connectivity may range from 1 (only \
+                         direct neighbours are considered) to \
+                         :attr:`cube.data.ndim`.
+
+    Returns:
+
+    * **cells** - a :class:`iris.cube.CubeList` of \
+                  :class:`iris.cube.Cube` objects, each one corresponding to \
+                  an identified cell.
+    """
+
+    # Convert input area threshold from km^2 to m^2
+    M_IN_KM = 1000
+    area_threshold = (float(M_IN_KM)**2) * area_threshold
+
+    # Get x, y coordinates of input cube
+    x_coord, y_coord = get_spatial_coords(cube)
+    x, y = iris.analysis.cartography.get_xy_grids(cube)
+
+    # Guess x, y coordinate bounds
+    cube = guess_bounds(cube)
+
+    # Loop over 2D spatial slices of the input cube and find cells in each
+    # slice
+    grid_areas = None
+    cells = iris.cube.CubeList()
+    coords = get_non_spatial_coords(cube)
+    for slc in cube.slices_over(coords):
+        if grid_areas is None:
+            # Area of grid cells, in m^2
+            grid_areas = iris.analysis.cartography.area_weights(slc)
+
+        # Store a list of the non-spatial coordinates for this slice
+        aux_coords = [(coord, []) for coord in
+                      get_non_spatial_coords(slc)]
+
+        # Find and label cells
+        # Call connected object labelling function based on
+        # scipy.ndimage.measurements.label
+        cell_label_array, _ = connected_object_labelling(
+            slc.data,
+            threshold=threshold,
+            min_size=1,
+            connectivity=connectivity)
+
+        # Get a list of unique cell labels
+        cell_labels = np.unique(cell_label_array)
+        # Discard background (which has a label of 0)
+        cell_labels = cell_labels[(cell_labels > 0)]
+        # Loop over cell and store their properties
+        for cell_label in cell_labels:
+            # Find the indices of the grid points comprising this cell
+            cell_indices = np.where(cell_label_array == cell_label)
+            cell_x = x[cell_indices]
+            cell_y = y[cell_indices]
+            cell_values = slc.data[cell_indices]
+            cell_grid_areas = grid_areas[cell_indices]
+
+            # There should not be any masked data present in cells!
+            if np.ma.is_masked(cell_values):
+                raise ValueError("Masked data found in cell {0:d}"
+                                 .format(cell_label))
+
+            # If cell area is less than area_threshold, discard it
+            # (by setting its label to the background value)
+            cell_area = np.sum(cell_grid_areas, dtype=np.float64)
+            if cell_area < area_threshold:
+                cell_label_array[cell_indices] = 0
+                continue
+
+            # Estimate cell centre position
+            # TODO Is there a better way of doing this? C.O.M?
+            cell_centre = (np.mean(cell_x, dtype=np.float64),
+                           np.mean(cell_y, dtype=np.float64))
+            # Area-weighted mean value in cell
+            cell_mean = (np.sum((cell_grid_areas * cell_values),
+                                dtype=np.float64)
+                         / cell_area)
+            # Convert cell area from m^2 to km^2...
+            cell_area /= (float(M_IN_KM)**2)
+            # ...and then cell effective radius in km
+            cell_radius = np.sqrt(cell_area / np.pi)
+
+            # Create an Iris cube to store this cell
+            cell_cube = iris.cube.Cube(
+                cell_values,
+                long_name="{:s} cell".format(cube.name()),
+                units=cube.units,
+                attributes=cube.attributes,
+                cell_methods=cube.cell_methods,
+                aux_coords_and_dims=aux_coords)
+
+            # Set up x, y coordinates describing the grid points in the cell...
+            cell_x_coord = iris.coords.AuxCoord(
+                cell_x,
+                standard_name=x_coord.standard_name,
+                long_name=x_coord.long_name,
+                units=x_coord.units,
+                bounds=None,
+                attributes=x_coord.attributes,
+                coord_system=x_coord.coord_system)
+            cell_y_coord = iris.coords.AuxCoord(
+                cell_y,
+                standard_name=y_coord.standard_name,
+                long_name=y_coord.long_name,
+                units=y_coord.units,
+                bounds=None,
+                attributes=y_coord.attributes,
+                coord_system=y_coord.coord_system)
+            # ...and add them to the cell cube
+            cell_cube.add_aux_coord(cell_x_coord, 0)
+            cell_cube.add_aux_coord(cell_y_coord, 0)
+
+            # Set up a coordinate describing the areas of grid cells in
+            # the cell object...
+            cell_grid_area_coord = iris.coords.AuxCoord(cell_grid_areas,
+                                                        long_name="grid_areas",
+                                                        units="m2")
+            #...and add it to the cell cube
+            cell_cube.add_aux_coord(cell_grid_area_coord, 0)
+
+            # Finally add some attriubtes to the cube that describe some
+            # useful information about the cell
+            #cell_cube.attributes["label"] = int(cell_label)
+            cell_cube.attributes["centre"] = cell_centre
+            cell_cube.attributes["area_in_km2"] = cell_area
+            cell_cube.attributes["effective_radius_in_km"] = cell_radius
+            cell_cube.attributes["mean_value"] = cell_mean
+            #cell_cube.attributes["indices"] = cell_indices
+            cells.append(cell_cube)
+
+    return cells
+
+
+
+
+# ignore
+
+# def cell_statistics(LAND_SEA_SPLIT: bool):
+
+
+# cell_attribute:
+#   effective_radius_in_km
+#   mean_value
+
+
+# RAIN_RATE = diagnostics.Field(
+#     long_name="Large-scale rainfall rate",
+#     short_name="rain_rate",
+#     stash=[diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 203))],
+#     units="mm h-1",
+#     unit_conversion_factor=3600.0,
+#     thresholds=[0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] if USE_LFRIC_CODE else None,
+#     regrid_method="two_stage"
+#     )
+
+
+# PRECIP_1HR_MEAN = diagnostics.Field(
+#     long_name="1-hourly mean precipitation rate",
+#     short_name="1hr_mean_precip",
+#     stash=PRECIP_1HR_MEAN_STASH,
+#     stash_operator=diagnostics.STASH_OPERATORS["ADD"],
+#     mean_period=1,
+#     use_running_accumulations=not USE_LFRIC_CODE,
+#     units="mm h-1",
+#     observations=["GPM", "UK_radar_2km", "Darwin_radar_rain_2.5km"],
+#     thresholds=[0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0],
+#     regrid_method="two_stage"
+#     )
+
+# PRECIP_1HR_MEAN_STASH = [
+#         diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 201)),
+#         diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 202))]
+# if not USE_LFRIC_CODE:
+#     PRECIP_1HR_MEAN_STASH.extend([
+#         diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 5, 201), is_optional=True),
+#         diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 5, 202), is_optional=True),
+#     ])
+
+
+# choose the thresholds according to the variable
+# todo: should these be [optionally] parameterised?
+
+
+# make a cube list of cell_attribute_histogram() for each cube
+
+
+# return for plotting
+# return some_cube
+
+
+"""
+
+category: Diagnostics
+title: cell statistics plot
+description: |
+  TBD.
+
+steps:
+
+  - operator: cell_statistics.caller_thing
+
+    # "effective_radius_in_km" or "mean_value"
+    cell_attribute: $CELL_ATTRIBUTE
+
+    uk_ctrl_um:
+      operator: read.read_cubes
+      file_paths:
+        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp"
+        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp"
+    uk_expt_lfric:
+      operator: read.read_cubes
+      file_paths:
+        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc"
+        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc"
+
+  - operator: write.write_cube_to_nc
+    overwrite: True
+
+  - operator: plot.plot_histogram_series
+
+"""
diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml
new file mode 100644
index 000000000..d31c1937a
--- /dev/null
+++ b/src/CSET/recipes/cell_statistics.yaml
@@ -0,0 +1,27 @@
+category: Diagnostics
+title: cell statistics plot
+description: |
+  TBD.
+
+steps:
+
+  - operator: cell_statistics.caller_thing
+
+    # "effective_radius_in_km" or "mean_value"
+    cell_attribute: $CELL_ATTRIBUTE
+
+    uk_ctrl_um:
+      operator: read.read_cubes
+      file_paths:
+        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp"
+        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp"
+    uk_expt_lfric:
+      operator: read.read_cubes
+      file_paths:
+        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc"
+        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc"
+
+  - operator: write.write_cube_to_nc
+    overwrite: True
+
+  - operator: plot.plot_histogram_series

From 53b547ef55c1ba6197479bd341c38ea369287e48 Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Mon, 22 Sep 2025 10:59:06 +0100
Subject: [PATCH 02/12] - add regrid to yaml - brought more code in from res -
 think i need to remove the outer cubes loop

---
 src/CSET/operators/cell_statistics.py | 640 ++++++++++++++++++++++----
 src/CSET/recipes/cell_statistics.yaml |  26 +-
 2 files changed, 556 insertions(+), 110 deletions(-)

diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index 965317832..6449de083 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -1,12 +1,23 @@
 """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE."""
 
+import datetime
+import inspect
+import itertools
+from copy import deepcopy
+from typing import Callable
+
+import cftime
 import iris.analysis
 import iris.coords
 import iris.cube
 import iris.util
 import numpy as np
+from iris.cube import CubeList
 from scipy import ndimage
 
+M_IN_KM = 1000
+HOUR_IN_SECONDS = 3600
+
 
 # some things from the res job
 #
@@ -31,7 +42,6 @@
 
 
 def var_filter(cubes, long_name=None, stash=None):
-    """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE."""
     result = []
     for cube in cubes:
         if long_name and cube.long_name == long_name:
@@ -43,7 +53,6 @@ def var_filter(cubes, long_name=None, stash=None):
 
 
 def get_cell_stat_vars(cubes):
-    """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE."""
     vars = {}
 
     # Large-scale rainfall rate
@@ -73,87 +82,523 @@ def get_cell_stat_vars(cubes):
     return vars
 
 
-def caller_thing(cell_attribute, **kwargs):
+def get_bin_edges(cell_attribute):
+    # todo: we probably want to let the user specify the bins
+    if cell_attribute == "effective_radius_in_km":
+        bin_edges = 10 ** (np.arange(0.0, 3.12, 0.12))
+        bin_edges = np.insert(bin_edges, 0, 0)
+    elif cell_attribute == "mean_value":
+        bin_edges = 10 ** (np.arange(-1, 2.7, 0.12))
+        bin_edges = np.insert(bin_edges, 0, 0)
+    else:
+        raise ValueError(f"Unknown cell attribute '{cell_attribute}'")
+
+    return bin_edges
+
+
+def identify_unique_times(cubelist, time_coord_name):
     """
-    Create a histogram from the given cube, for every threshold and cell attribute.
+    Given a :class:`iris.cube.CubeList`, this finds the set of unique times \
+    which occur across all cubes in the cubelist.
 
-    The cube will be "1-hourly mean precipitation rate" or "Large-scale rainfall rate".
+    Arguments:
 
+    * **cubelist** - a :class:`iris.cube.CubeList` of :class:`iris.cube.Cube` \
+                     objects.
+    * **time_coord_name** - the name of the time coordinate to select, \
+                            typically "time", "forecast_period" or "hour".
 
-    Also for:
-        cell attribute
-            effective_radius_in_km, mean_value
+    Returns
+    -------
+    * **time_coord** - an :class:`iris.coords.Coord` instance containing the \
+                       unique times that occur across the cubes in the \
+                       input cubelist.
+    """
+    times = []
+    time_unit = None
+    # Loop over cubes
+    for cube in cubelist:
+        # Extract the desired time coordinate from the cube
+        time_coord = cube.coord(time_coord_name)
+
+        # Get the units for the specifed time coordinate
+        if time_unit is None:
+            time_unit = time_coord.units
+
+        # Store the time coordinate points
+        times.extend(time_coord.points)
+
+    # Construct a list of unique times...
+    times = sorted(list(set(times)))
+    # ...and store them in a new time coordinate
+    time_coord = iris.coords.DimCoord(times, units=time_unit)
+    time_coord.rename(time_coord_name)
+
+    return time_coord
+
+
+def add_categorised_coord(
+    cube: iris.cube.Cube,
+    name: str,
+    from_coord: iris.coords.DimCoord | iris.coords.AuxCoord | str,
+    category_function: Callable,
+    units: str = "1",
+) -> None:
+    """Add a new coordinate to a cube, by categorising an existing one.
+
+    Make a new :class:`iris.coords.AuxCoord` from mapped values, and add
+    it to the cube.
+
+    Parameters
+    ----------
+    cube :
+        The cube containing 'from_coord'. The new coord will be added into it.
+    name :
+        Name of the created coordinate.
+    from_coord :
+        Coordinate in 'cube', or the name of one.
+    category_function :
+        Function(coordinate, value), returning a category value for a coordinate
+        point-value. If ``value`` has a type hint :obj:`cftime.datetime`, the
+        coordinate points are translated to :obj:`cftime.datetime` s before
+        calling ``category_function``.
+    units :
+        Units of the category value, typically 'no_unit' or '1'.
+    """
+    # Interpret coord, if given as a name
+    coord = cube.coord(from_coord) if isinstance(from_coord, str) else from_coord
+
+    if len(cube.coords(name)) > 0:
+        msg = 'A coordinate "%s" already exists in the cube.' % name
+        raise ValueError(msg)
+
+    # Translate the coordinate points to cftime datetimes if requested.
+    value_param = list(inspect.signature(category_function).parameters.values())[1]
+    if issubclass(value_param.annotation, cftime.datetime):
+        points = coord.units.num2date(coord.points, only_use_cftime_datetimes=True)
+    else:
+        points = coord.points
+
+    # Construct new coordinate by mapping values, using numpy.vectorize to
+    # support multi-dimensional coords.
+    # Test whether the result contains strings. If it does we must manually
+    # force the dtype because of a numpy bug (see numpy #3270 on GitHub).
+    result = category_function(coord, points.ravel()[0])
+    if isinstance(result, str):
+        str_vectorised_fn = np.vectorize(category_function, otypes=[object])
+
+        def vectorised_fn(*args):
+            # Use a common type for string arrays (N.B. limited to 64 chars).
+            return str_vectorised_fn(*args).astype("|U64")
+
+    else:
+        vectorised_fn = np.vectorize(category_function)
+    new_coord = iris.coords.AuxCoord(
+        vectorised_fn(coord, points),
+        units=units,
+        attributes=coord.attributes.copy(),
+    )
+    new_coord.rename(name)
 
+    # Add into the cube
+    cube.add_aux_coord(new_coord, cube.coord_dims(coord))
+
+
+def hour_from_time(coord, point):
+    """
+    Category function to calculate the hour given a time, for use in \
+    :func:`iris.coord_categorisation.add_categorised_coord`.
     """
+    time = coord.units.num2date(point)
+    day_start = datetime.datetime(time.year, time.month, time.day)
+    seconds_since_day_start = (time - day_start).total_seconds()
+    hours_since_day_start = seconds_since_day_start / float(HOUR_IN_SECONDS)
+    return hours_since_day_start
 
-    # todo: remove var_name? impedes merge
 
-    # For each model, extract the two variables we're interested in.
-    model_data = {}
-    for model, cubes in kwargs.items():
-        model_data[model] = get_cell_stat_vars(cubes)
+def remove_duplicates(cubelist):
+    """
+    Removes any duplicate :class:`iris.cube.Cube` objects from an \
+    :class:`iris.cube.CubeList`.
+    """
+    # Nothing to do if the cubelist is empty
+    if not cubelist:
+        return cubelist
+    # Build up a list of indices of the cubes to remove because they are
+    # duplicated
+    indices_to_remove = []
+    for i in range(len(cubelist) - 1):
+        cube_i = cubelist[i]
+        for j in range(i + 1, len(cubelist)):
+            cube_j = cubelist[j]
+            if cube_i == cube_j:
+                if j not in indices_to_remove:
+                    indices_to_remove.append(j)
+    # Only keep unique cubes
+    cubelist = iris.cube.CubeList(
+        [cube for index, cube in enumerate(cubelist) if index not in indices_to_remove]
+    )
+    return cubelist
 
-    # For now, thresholds and cell attributes are hard coded.
-    thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
 
-    # todo: what about different models? how are they loaded and labelled?
+def extract_unique(cubes, group):
+    # Preparation of data for averaging and plotting
+    if group == "time":
+        # Identify a unique set of times
+        times = identify_unique_times(cubes, group)
 
-    # # For now, cell attributes are hard coded.
-    # cell_attributes = ["effective_radius_in_km", "mean_value"]
-    #
-    # # For now, hard-code to these two, which is what we see in the res output.
-    # # todo: there is also "time" in the res code
-    # time_groupings = ["forecast_period", "hour"]
+        # Now extract data at these times
+        time_constraint = iris.Constraint(
+            coord_values={group: lambda cell: cell.point in times.points}
+        )
+        cubes = cubes.extract(time_constraint)
+
+        # Remove other time coordinates to allow a cube merge
+        # later
+        for cube in cubes:
+            cube.remove_coord("forecast_reference_time")
+            cube.remove_coord("forecast_period")
+    elif group == "forecast_period":
+        # Identify a unique set of lead times
+        times = identify_unique_times(cubes, group)
+
+        # Remove other time coordinates to allow a cube
+        # merge later
+        for cube in cubes:
+            cube.remove_coord("forecast_reference_time")
+            cube.remove_coord("time")
+    elif group == "hour":
+        # Categorise the time coordinate of each cube into
+        # hours
+        for cube in cubes:
+            add_categorised_coord(
+                cube, "hour", cube.coord("time"), hour_from_time, units="hour"
+            )
 
-    # We need a list of times for each time grouping
-    # For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5
-    # For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z
-    # todo: we should presumably get this from the cube.
-    # assert False
+        # Identify a unique set of times of day
+        times = identify_unique_times(cubes, group)
 
-    # todo: we need a land-sea split input?
-    # for now, hard-code as false (this is what the res output had it set to)
-    # land_sea_split = False
+        # Now extract data at these times
+        time_constraint = iris.Constraint(
+            coord_values={group: lambda cell: cell.point in times.points}
+        )
+        cubes = cubes.extract(time_constraint)
 
-    # what about obs? GPM & UK_RADAR_2KM? ignore for now.
+        # Remove other time coordinates to allow a cube
+        # merge later
+        for cube in cubes:
+            cube.remove_coord("forecast_reference_time")
+            cube.remove_coord("time")
+            cube.remove_coord("forecast_period")
 
-    # different bins for the two cell attributes
-    bin_edges = {}
+    # Remove any duplicate cubes to allow a successful merge
+    # Note this is typcially because we have the same set of
+    # observations associated with more than one model
+    cubes = remove_duplicates(cubes)
 
-    bin_edges['effective_radius_in_km'] = 10**(np.arange(0.0, 3.12, 0.12))
-    bin_edges['effective_radius_in_km'] = np.insert(bin_edges['effective_radius_in_km'], 0, 0)
+    return cubes, times
 
-    bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12))
-    bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0)
 
+def remove_cell_method(cube, cell_method):
+    """
+    Removes the supplied :class:`iris.coords.CellMethod` from an input
+    :class:`iris.cube.Cube`, then returns the cube.
+    """
+    cell_methods = [cm for cm in cube.cell_methods if cm != cell_method]
+    cube.cell_methods = ()
+    for cm in cell_methods:
+        cube.add_cell_method(cm)
+    return cube
+
+
+def aggregate_at_time(input_params):
+    """
+    Extracts data valid at a given time from each cube in a list of cubes, \
+    then performs an aggregation operation (e.g. mean) across this data.
+
+    Arguments (passed in as a tuple to allow parallelisation):
+
+    * **input_params** - a four-element tuple consisting of:
+
+      * **cubes** - a :class:`iris.cube.CubeList` holding the \
+                    :class:`iris.cube.Cube` objects to process.
+      * **time_coord** - the time at which the aggregation should be performed, \
+                         supplied as an :class:`iris.coords.Coord` object.
+      * **aggregator** - the aggregator to use, which can be any from \
+                         :mod:`iris.analysis`.
+      * **percentile** - the value of the percentile rank at which to extract \
+                         values, if the chosen aggregator is \
+                         :class:`iris.analysis.PERCENTILE`. For other \
+                         aggregators this not used.
+
+    Returns
+    -------
+    * **aggregated_cubes** - an :class:`iris.cube.CubeList` of \
+                             :class:`iris.cube.Cube` objects holding the \
+                             aggregated data.
+    """
+    # Unpack input parameters tuple
+    cubes = input_params[0]
+    time_coord = input_params[1]
+    aggregator = input_params[2]
+    # TODO: Can we improve the handling of this with keyword arguments?
+    percentile = input_params[3]
+
+    # Check the supplied time coordinate to make sure it corresponds to a
+    # single time only
+    if len(time_coord.points) != 1:
+        raise ValueError("Time coordinate should specify a single time only")
+
+    # Remove any duplicate cubes in the input cubelist otherwise this
+    # will break the aggregation
+    cubes = remove_duplicates(cubes)
+
+    # Name of the supplied time coordinate
+    time_coord_name = time_coord.name()
+
+    # Extract cubes matching the time specified by the supplied time coordinate.
+    # Even though the source coord might have floats for its points,
+    # the cell here will have cftime objects, such as DatetimeGregorian,
+    # so we can't just compare against the time coord's points.
+    time_constraint = iris.Constraint(
+        coord_values={time_coord_name: lambda cell: cell.point in time_coord.cells()}
+    )
+    cubes_at_time = cubes.extract(time_constraint)
+
+    # Add a temporary "number" coordinate to uniquely label the different
+    # data points at this time.
+    # An example of when there can be multiple data points at the time of
+    # interest is if the time coordinate represents the hour of day.
+    number = 0
+    numbered_cubes = iris.cube.CubeList()
+    for cube in cubes_at_time:
+        for slc in cube.slices_over(time_coord_name):
+            number_coord = iris.coords.AuxCoord(number, long_name="number")
+            slc.add_aux_coord(number_coord)
+            numbered_cubes.append(slc)
+            number += 1
+    cubes_at_time = numbered_cubes
+
+    # Merge
+    cubes_at_time = cubes_at_time.merge()
+
+    # For each cube in the cubelist, aggregate over all cases at this time
+    # using the supplied aggregator
+    aggregated_cubes = iris.cube.CubeList()
+    for cube in cubes_at_time:
+        # If there was only a single data point at this time, then "number"
+        # will be a scalar coordinate. If so, make it a dimension coordinate
+        # to allow collapsing below
+        if not cube.coord_dims("number"):
+            cube = iris.util.new_axis(cube, scalar_coord="number")
+
+        # Store the total number of data points found at this time
+        num_cases = cube.coord("number").points.size
+        num_cases_coord = iris.coords.AuxCoord(num_cases, long_name="num_cases")
+        cube.add_aux_coord(num_cases_coord)
+
+        # Do aggregation across the temporary "number" coordinate
+        if isinstance(aggregator, type(iris.analysis.PERCENTILE)):
+            cube = cube.collapsed("number", aggregator, percent=percentile)
+        else:
+            cube = cube.collapsed("number", aggregator)
+
+        # Now remove the "number" coordinate...
+        cube.remove_coord("number")
+        # ...and associated cell method
+        cell_method = iris.coords.CellMethod(aggregator.name(), coords="number")
+        cube = remove_cell_method(cube, cell_method)
+
+        aggregated_cubes.append(cube)
+
+    return aggregated_cubes
+
+
+def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_name):
+    """
+    For each :class:`iris.cube.Cube` in a given :class:`iris.cube.CubeList`, \
+    this extends (by repetition) a specified scalar coordinate along the \
+    dimension corresponding to a specified dimension coordinate.
+    """
+    for cube in cubelist:
+        scalar_coord = cube.coord(scalar_coord_name)
+        # Check the coordinate referenced by scalar_coord_name is indeed
+        # a scalar coordinate. Otherwise nothing to do.
+        if scalar_coord.points.size == 1:
+            # Get the data value held by the scalar coordinate...
+            scalar_coord_val = scalar_coord.points[0]
+            # ...then remove it from the cube
+            cube.remove_coord(scalar_coord)
+            # Extract the dimension coordinate matching dim_coord_name
+            dim_coord = cube.coord(dim_coord_name)
+            # Get the dimension spanned by this dimension coordinate...
+            dim = cube.coord_dims(dim_coord_name)[0]
+            # ...and its length
+            dim_size = dim_coord.points.size
+            # Construct an auxillary coordinate by replicating the data
+            # value from the scalar coordinate to match the size of the
+            # specified dimension coordinate
+            scalar_coord = iris.coords.AuxCoord(
+                np.repeat(scalar_coord_val, dim_size), long_name=scalar_coord_name
+            )
+
+            # Add the new auxillary coordinate to the cube
+            cube.add_aux_coord(scalar_coord, dim)
+
+    return cubelist
+
+
+def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
+    """
+    Create a histogram from each given cube, for every threshold and cell attribute.
+
+    The cube will be "1-hourly mean precipitation rate" or "Large-scale rainfall rate".
 
+    Arguments
+    ---------
+    cubes: iris.cube.CubeList
+        Cube(s) to filter. Will be "1-hourly mean precipitation rate" or "Large-scale rainfall rate".
+        Assumed to be one cube per model, with the first being the control.
+    cell_attribute: str
+        "effective_radius_in_km" or "mean_value"
+    time_grouping: str
+        "forecast_period", "hour" or "time"
 
-    model_histograms = {}
-    for model, data in model_data.keys():
+    """
+    if not isinstance(cubes, CubeList):
+        cubes = CubeList([cubes])
+
+    # For now, thresholds are hard coded.
+    thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
+
+    # Get a list of the common forecast periods and hours
+    # todo: For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5
+    # todo: For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z
+    forecast_periods = set(cubes[0].coord("forecast_period").points)
+    hours = set(cubes[0].coord("time").points)  # todo: check it's actually hourly?
+    for cube in cubes[1:]:
+        forecast_periods.intersection_update(set(cube.coord("forecast_period").points))
+        hours.intersection_update(set(cube.coord("time").points))
+
+    # todo: we need a land-sea split input?
+    # for now, hard-code as false (this is what the res output had it set to)
+    land_sea_split = False
+
+    # what about obs? GPM & UK_RADAR_2KM? ignore for now.
+
+    # different bins for the two cell attributes
+    # bin_edges = {}
+    #
+    # bin_edges['effective_radius_in_km'] = 10**(np.arange(0.0, 3.12, 0.12))
+    # bin_edges['effective_radius_in_km'] = np.insert(bin_edges['effective_radius_in_km'], 0, 0)
+    #
+    # bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12))
+    # bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0)
+
+    # model_histograms = {}
+    for cube in cubes:
         for threshold in thresholds:
-            something_like_cell_attribute_histogram(
+            # todo: check var_name has been removed by this point, as RES removed it to help with merge
+            hist_cubes = something_like_cell_attribute_histogram(
                 cube,
                 attribute=cell_attribute,
-                bin_edges=bin_edges[cell_attribute],
+                bin_edges=get_bin_edges(cell_attribute),
                 threshold=threshold,
             )
 
+            # todo: res does a deep copy at this point for some reason - seems uneceeary? double check.
+
+            # todo: RES calls this here, but we get a cube, not a cubelist so we can't.
+            # RES used it's own version because Iris' was slow at the time. todo: is it fast now?
+            # hist_cubes = cube_utils.extract_overlapping(cubes_group, "forecast_period")
+            hist_cubes = hist_cubes.extract_overlapping("forecast_period")
+
+            # Now we take the cell statistics and aggregate by "time_grouping".
+
+            # We should deep copy the cubes here becuase there is cube wrangling below,
+            # and it's possible that we'll be using the same cubes across iterations in this loop.
+            # (as extract doesn't always create a new cube).
+            hist_cubes = deepcopy(hist_cubes)
+
+            # Res comment for this bit is: preparation of data for averaging and plotting
+            hist_cubes, times = extract_unique(hist_cubes, time_grouping)
+
+            # Sum cell statistic histograms at each time in parallel
+            input_params = [
+                (hist_cubes, time, iris.analysis.SUM, None) for time in times
+            ]
+            result_list = [
+                aggregate_at_time(input_param) for input_param in input_params
+            ]
+            cubes_group = iris.cube.CubeList(itertools.chain.from_iterable(result_list))
+            cubes_group = cubes_group.merge()
+
+            # If the number of cases at each time is the same, the
+            # above merge results in a scalar coordinate representing
+            # the number of cases. Replace this scalar coordinate with
+            # an auxillary coordinate that has the same length as the
+            # time coordinate
+            cubes_group = repeat_scalar_coord_along_dim_coord(
+                cubes_group, "num_cases", time_grouping
+            )
 
-    # return a cube for each model
-    return model_histograms
+            # At this point, RES extracts every time point into a separate cube list and plots it.
+            for time in times:
+                # Extract histogram at this time
+                time_constraint = iris.Constraint(
+                    coord_values={time_grouping: lambda cell: cell.point in time.points}
+                )
+                cubes_at_time = cubes_group.extract(time_constraint)
 
+                # todo: RES creates a plot title here.
+                # Perhaps we should add a plot title attribute to each cube here?
+                if time_grouping == "forecast_time":
+                    title = "T+{0:.1f}".format(time.points[0])
+                elif time_grouping == "hour":
+                    title = "{0:.1f}Z".format(time.points[0])
+                elif time_grouping == "time":
+                    time_unit = time.units
+                    datetime = time_unit.num2date(time.points[0])
+                    title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime)
 
-def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0):
-    hist_cube = iris.cube.CubeList()
-    coords = get_non_spatial_coords(cube)
+                # todo: RES has some analysis of the number of cases used to construct the histogram at this point.
+
+                # todo: RES plots each cube here.
+                for cube in cubes_at_time:
+                    # todo: Normalise histogram?
+                    # if y_axis == "relative_frequency":
+                    #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
+
+                    print(f'adding "{title}" plot for "{cube.name()}"')
+
+            # Sum all histograms. This creates the data for the "all" time point.
+            for cube in cubes_group:
+                cube = cube.collapsed(time_grouping, iris.analysis.SUM)
+
+                # todo: Normalise histogram?
+                # if y_axis == "relative_frequency":
+                #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
+
+                print(f'adding "all" plot for collapsed "{cube.name()}"')
+
+    # return cubes to plot. todo: in what exact arrangement?
+    return None
 
+
+def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0):
     # Express histogram bins as an Iris coordinate
     bin_centres = 0.5 * (bin_edges[1:] + bin_edges[:-1])
     bins_as_coord = iris.coords.DimCoord(
-        bin_centres, long_name=attribute, units=cube.units,
-        coord_system=None, bounds=np.column_stack((bin_edges[0:-1], bin_edges[1:])))
+        bin_centres,
+        long_name=attribute,
+        units=cube.units,
+        coord_system=None,
+        bounds=np.column_stack((bin_edges[0:-1], bin_edges[1:])),
+    )
 
+    data_min, data_max = None, None
     hist_cubes = iris.cube.CubeList()
+    coords = get_non_spatial_coords(cube)
 
     for slc in cube.slices_over(coords):
         # Identify connected cells in this spatial slice
@@ -163,8 +608,14 @@ def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshol
             # Extract values of the desired cell attribute
             cell_attributes = [cell.attributes[attribute] for cell in cells]
 
+            # Store the minimum/maximum values of the cell attribute
+            if data_min is None or np.min(cell_attributes) < data_min:
+                data_min = np.min(cell_attributes)
+            if data_max is None or np.max(cell_attributes) > data_max:
+                data_max = np.max(cell_attributes)
+
             # Construct a histogram of the desired cell attribute
-            hist, _ = np.histogram(cell_attributes)
+            hist, _ = np.histogram(cell_attributes, bin_edges)
         else:
             # Assign zeros to all bins
             hist = np.zeros(bin_centres.size).astype(np.int64)
@@ -190,12 +641,10 @@ def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshol
 
 
 def get_spatial_coords(cube):
-    '''Returns the x, y coordinates of an input :class:`iris.cube.Cube`.'''
+    """Returns the x, y coordinates of an input :class:`iris.cube.Cube`."""
     # Usual names for spatial coordinates
-    X_COORD_NAMES = ["longitude", "grid_longitude",
-                     "projection_x_coordinate", "x"]
-    Y_COORD_NAMES = ["latitude", "grid_latitude",
-                     "projection_y_coordinate", "y"]
+    X_COORD_NAMES = ["longitude", "grid_longitude", "projection_x_coordinate", "x"]
+    Y_COORD_NAMES = ["latitude", "grid_latitude", "projection_y_coordinate", "y"]
 
     # Get a list of coordinate names for the cube
     coord_names = [coord.name() for coord in cube.coords()]
@@ -231,11 +680,11 @@ def get_non_spatial_coords(cube):
 
 
 def guess_bounds(cube):
-    '''
+    """
     Takes an input :class:`iris.cube.Cube`, guesses bounds on the x, y \
     coordinates, then returns the cube. Such bounds are often required by \
     regridding algorithms.
-    '''
+    """
     # Loop over spatial coordinates
     for axis in ["x", "y"]:
         coord = cube.coord(axis=axis)
@@ -245,8 +694,7 @@ def guess_bounds(cube):
         try:
             _ = iris.util.regular_step(coord)
         except:
-            raise ValueError("Cannot guess bounds for a variable "
-                             "resolution grid")
+            raise ValueError("Cannot guess bounds for a variable resolution grid")
         # Guess bounds if there aren't any
         if coord.bounds is None:
             coord.guess_bounds()
@@ -254,7 +702,7 @@ def guess_bounds(cube):
 
 
 def connected_object_labelling(data, threshold=0.0, min_size=1, connectivity=1):
-    '''
+    """
     Finds connected objects in an input array and assigns them unique labels.
 
     Arguments:
@@ -273,12 +721,12 @@ def connected_object_labelling(data, threshold=0.0, min_size=1, connectivity=1):
                          neighbours. Connectivity may range from 1 (only direct \
                          neighbours are considered) to :attr:`data.ndim`.
 
-    Returns:
-
+    Returns
+    -------
     * **label_array** - an integer array where each unique object in the input \
                         array has a unique label in the returned array.
     * **num_objects** - the number of objects found.
-    '''
+    """
     # Apply the supplied threshold to the data to generate a binary array
     binary_data = data.copy()
     if np.ma.is_masked(binary_data):
@@ -287,20 +735,21 @@ def connected_object_labelling(data, threshold=0.0, min_size=1, connectivity=1):
         # below
         binary_data = np.ma.filled(binary_data, fill_value=(threshold - 1))
     # Set values above and below the threshold to 1 and 0, respectively
-    indices_above = (binary_data > threshold)
-    indices_below = (binary_data <= threshold)
+    indices_above = binary_data > threshold
+    indices_below = binary_data <= threshold
     binary_data[indices_above] = 1
     binary_data[indices_below] = 0
 
     # Construct a structuring element that defines how the neighbours of
     # a grid point are assigned
     structure_element = ndimage.morphology.generate_binary_structure(
-        data.ndim, connectivity)
+        data.ndim, connectivity
+    )
 
     # Label distinct (connected) objects in the binary array
     label_array, num_objects = ndimage.measurements.label(
-        binary_data,
-        structure=structure_element)
+        binary_data, structure=structure_element
+    )
 
     # Throw away any objects smaller than min_size
     if min_size < 1:
@@ -341,16 +790,14 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
                          direct neighbours are considered) to \
                          :attr:`cube.data.ndim`.
 
-    Returns:
-
+    Returns
+    -------
     * **cells** - a :class:`iris.cube.CubeList` of \
                   :class:`iris.cube.Cube` objects, each one corresponding to \
                   an identified cell.
     """
-
     # Convert input area threshold from km^2 to m^2
-    M_IN_KM = 1000
-    area_threshold = (float(M_IN_KM)**2) * area_threshold
+    area_threshold = (float(M_IN_KM) ** 2) * area_threshold
 
     # Get x, y coordinates of input cube
     x_coord, y_coord = get_spatial_coords(cube)
@@ -370,17 +817,14 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
             grid_areas = iris.analysis.cartography.area_weights(slc)
 
         # Store a list of the non-spatial coordinates for this slice
-        aux_coords = [(coord, []) for coord in
-                      get_non_spatial_coords(slc)]
+        aux_coords = [(coord, []) for coord in get_non_spatial_coords(slc)]
 
         # Find and label cells
         # Call connected object labelling function based on
         # scipy.ndimage.measurements.label
         cell_label_array, _ = connected_object_labelling(
-            slc.data,
-            threshold=threshold,
-            min_size=1,
-            connectivity=connectivity)
+            slc.data, threshold=threshold, min_size=1, connectivity=connectivity
+        )
 
         # Get a list of unique cell labels
         cell_labels = np.unique(cell_label_array)
@@ -397,8 +841,7 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
 
             # There should not be any masked data present in cells!
             if np.ma.is_masked(cell_values):
-                raise ValueError("Masked data found in cell {0:d}"
-                                 .format(cell_label))
+                raise ValueError("Masked data found in cell {0:d}".format(cell_label))
 
             # If cell area is less than area_threshold, discard it
             # (by setting its label to the background value)
@@ -409,14 +852,16 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
 
             # Estimate cell centre position
             # TODO Is there a better way of doing this? C.O.M?
-            cell_centre = (np.mean(cell_x, dtype=np.float64),
-                           np.mean(cell_y, dtype=np.float64))
+            cell_centre = (
+                np.mean(cell_x, dtype=np.float64),
+                np.mean(cell_y, dtype=np.float64),
+            )
             # Area-weighted mean value in cell
-            cell_mean = (np.sum((cell_grid_areas * cell_values),
-                                dtype=np.float64)
-                         / cell_area)
+            cell_mean = (
+                np.sum((cell_grid_areas * cell_values), dtype=np.float64) / cell_area
+            )
             # Convert cell area from m^2 to km^2...
-            cell_area /= (float(M_IN_KM)**2)
+            cell_area /= float(M_IN_KM) ** 2
             # ...and then cell effective radius in km
             cell_radius = np.sqrt(cell_area / np.pi)
 
@@ -427,7 +872,8 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
                 units=cube.units,
                 attributes=cube.attributes,
                 cell_methods=cube.cell_methods,
-                aux_coords_and_dims=aux_coords)
+                aux_coords_and_dims=aux_coords,
+            )
 
             # Set up x, y coordinates describing the grid points in the cell...
             cell_x_coord = iris.coords.AuxCoord(
@@ -437,7 +883,8 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
                 units=x_coord.units,
                 bounds=None,
                 attributes=x_coord.attributes,
-                coord_system=x_coord.coord_system)
+                coord_system=x_coord.coord_system,
+            )
             cell_y_coord = iris.coords.AuxCoord(
                 cell_y,
                 standard_name=y_coord.standard_name,
@@ -445,34 +892,33 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
                 units=y_coord.units,
                 bounds=None,
                 attributes=y_coord.attributes,
-                coord_system=y_coord.coord_system)
+                coord_system=y_coord.coord_system,
+            )
             # ...and add them to the cell cube
             cell_cube.add_aux_coord(cell_x_coord, 0)
             cell_cube.add_aux_coord(cell_y_coord, 0)
 
             # Set up a coordinate describing the areas of grid cells in
             # the cell object...
-            cell_grid_area_coord = iris.coords.AuxCoord(cell_grid_areas,
-                                                        long_name="grid_areas",
-                                                        units="m2")
-            #...and add it to the cell cube
+            cell_grid_area_coord = iris.coords.AuxCoord(
+                cell_grid_areas, long_name="grid_areas", units="m2"
+            )
+            # ...and add it to the cell cube
             cell_cube.add_aux_coord(cell_grid_area_coord, 0)
 
             # Finally add some attriubtes to the cube that describe some
             # useful information about the cell
-            #cell_cube.attributes["label"] = int(cell_label)
+            # cell_cube.attributes["label"] = int(cell_label)
             cell_cube.attributes["centre"] = cell_centre
             cell_cube.attributes["area_in_km2"] = cell_area
             cell_cube.attributes["effective_radius_in_km"] = cell_radius
             cell_cube.attributes["mean_value"] = cell_mean
-            #cell_cube.attributes["indices"] = cell_indices
+            # cell_cube.attributes["indices"] = cell_indices
             cells.append(cell_cube)
 
     return cells
 
 
-
-
 # ignore
 
 # def cell_statistics(LAND_SEA_SPLIT: bool):
diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml
index d31c1937a..54a5d8f03 100644
--- a/src/CSET/recipes/cell_statistics.yaml
+++ b/src/CSET/recipes/cell_statistics.yaml
@@ -5,21 +5,21 @@ description: |
 
 steps:
 
-  - operator: cell_statistics.caller_thing
+  - operator: read.read_cubes
+    file_paths: $INPUT_PATHS
+    model_names: $MODEL_NAMES
+    constraint:
+      operator: constraints.generate_var_constraint
+      varname: $VARNAME
 
-    # "effective_radius_in_km" or "mean_value"
-    cell_attribute: $CELL_ATTRIBUTE
+  - operator: regrid.regrid_onto_xyspacing
+    xspacing: 0.5
+    yspacing: 0.5
+    method: Linear
 
-    uk_ctrl_um:
-      operator: read.read_cubes
-      file_paths:
-        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp"
-        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp"
-    uk_expt_lfric:
-      operator: read.read_cubes
-      file_paths:
-        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc"
-        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc"
+  - operator: cell_statistics.caller_thing
+    cell_attribute: $CELL_ATTRIBUTE
+    time_grouping: $TIME_GROUPING
 
   - operator: write.write_cube_to_nc
     overwrite: True

From c299cfbd5eeb7a1cd0c9d0f35d1856bf5703cdb2 Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Mon, 22 Sep 2025 15:27:34 +0100
Subject: [PATCH 03/12] - i think the data creation is now running in cset -
 need to plug into the histogram plotting, possibly with a new histo plot just
 for this data

---
 src/CSET/operators/cell_statistics.py | 231 +++++++++++++++++---------
 1 file changed, 155 insertions(+), 76 deletions(-)

diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index 6449de083..564527574 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -1,9 +1,11 @@
 """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE."""
-
+import collections
 import datetime
 import inspect
 import itertools
+import pickle
 from copy import deepcopy
+from pathlib import Path
 from typing import Callable
 
 import cftime
@@ -448,6 +450,52 @@ def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_n
     return cubelist
 
 
+def RES_extract_overlapping(cubelist, coord_name):
+    '''
+    Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \
+    the specified coordinate is the same across all cubes.
+
+    Arguments:
+
+    * **cubelist** - an input :class:`iris.cube.CubeList`.
+    * **coord_name** - a string specifying the name of the coordinate \
+                       over which to perform the extraction.
+
+    Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \
+    to coord_name is the same for all cubes.
+    '''
+    # Build a list of all Cell instances for this coordinate by
+    # looping through all cubes in the supplied cubelist
+    all_cells = []
+    for cube in cubelist:
+        for cell in cube.coord(coord_name).cells():
+            all_cells.append(cell)
+
+    # Work out which coordinate Cell instances are common across
+    # all cubes in the cubelist...
+    cell_counts = collections.Counter(all_cells)
+    # unique_cells = cell_counts.keys()
+    unique_cells = list(cell_counts.keys())
+    unique_cell_counts = list(cell_counts.values())
+    num_cubes = len(cubelist)
+    common_cells = [unique_cells[i] for i, count in
+                    enumerate(unique_cell_counts) if count == num_cubes]
+    # ...and use these to subset the cubes in the cubelist
+    constraint = iris.Constraint(
+        coord_values={coord_name: lambda cell: cell in common_cells})
+
+    cubelist = iris.cube.CubeList([cube.extract(constraint)
+                                   for cube in cubelist])
+    return cubelist
+
+
+def ensure_bounds(cubes, coord_name):
+    for cube in cubes:
+        coord = cube.coord(coord_name)
+        if not coord.bounds is not None:
+            coord.guess_bounds()
+
+
 def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
     """
     Create a histogram from each given cube, for every threshold and cell attribute.
@@ -465,9 +513,25 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
         "forecast_period", "hour" or "time"
 
     """
+
+    # DEV/DEBUGGING - REMOVE
+    pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl")
+    # normal use - save pickle for faster subsequent runs
+    if cubes and not pkl_filename.exists():
+        with open(pkl_filename, 'wb') as pkl_file:
+            pickle.dump(cubes, pkl_file)
+            print("now load that pickle")
+            exit(0)
+    # we'll be calling this instead of the yaml processing for a dev speed up
+    if not cubes and pkl_filename.exists():
+        with open(pkl_filename, 'rb') as pkl_file:
+            cubes = pickle.load(pkl_file)
+
     if not isinstance(cubes, CubeList):
         cubes = CubeList([cubes])
 
+
+
     # For now, thresholds are hard coded.
     thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
 
@@ -495,91 +559,106 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
     # bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12))
     # bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0)
 
-    # model_histograms = {}
-    for cube in cubes:
-        for threshold in thresholds:
-            # todo: check var_name has been removed by this point, as RES removed it to help with merge
-            hist_cubes = something_like_cell_attribute_histogram(
+    for threshold in thresholds:
+        # todo: check var_name has been removed by this point, as RES removed it to help with merge
+        # hist_cubes = something_like_cell_attribute_histogram(
+        #     cube,
+        #     attribute=cell_attribute,
+        #     bin_edges=get_bin_edges(cell_attribute),
+        #     threshold=threshold) for cube in cubes]
+
+        hist_cubes = CubeList([])
+        for cube in cubes:
+            hist_cubes.append(something_like_cell_attribute_histogram(
                 cube,
                 attribute=cell_attribute,
                 bin_edges=get_bin_edges(cell_attribute),
                 threshold=threshold,
-            )
+            ))
+
+        # todo: res does a deep copy at this point for some reason - seems uneceeary? double check.
+
+        # todo: RES calls this here, but we get a cube, not a cubelist so we can't.
+        # RES used it's own version because Iris' was slow at the time. todo: is it fast now?
+        # hist_cubes = RES_extract_overlapping(hist_cubes, "forecast_period")
+        ensure_bounds(hist_cubes, "forecast_period")
+        hist_cubes = hist_cubes.extract_overlapping("forecast_period")
+
+        # Now we take the cell statistics and aggregate by "time_grouping".
+
+        # We should deep copy the cubes here becuase there is cube wrangling below,
+        # and it's possible that we'll be using the same cubes across iterations in this loop.
+        # (as extract doesn't always create a new cube).
+        hist_cubes = deepcopy(hist_cubes)
+
+        # Res comment for this bit is: preparation of data for averaging and plotting
+        # It removes the other time coords, other than that named by time_grouping.
+        hist_cubes, times = extract_unique(hist_cubes, time_grouping)
+
+        # Sum cell statistic histograms at each time in parallel
+        # input_params = [
+        #     (hist_cubes, time, iris.analysis.SUM, None) for time in times
+        # ]
+        # result_list = [
+        #     aggregate_at_time(input_param) for input_param in input_params
+        # ]
+        result_list = []
+        for time in times:
+            input_param = (hist_cubes, time, iris.analysis.SUM, None)
+            result = aggregate_at_time(input_param)
+            result_list.append(result)
+        cubes_group = iris.cube.CubeList(itertools.chain.from_iterable(result_list))
+        cubes_group = cubes_group.merge()
+
+        # If the number of cases at each time is the same, the
+        # above merge results in a scalar coordinate representing
+        # the number of cases. Replace this scalar coordinate with
+        # an auxillary coordinate that has the same length as the
+        # time coordinate
+        cubes_group = repeat_scalar_coord_along_dim_coord(
+            cubes_group, "num_cases", time_grouping
+        )
 
-            # todo: res does a deep copy at this point for some reason - seems uneceeary? double check.
-
-            # todo: RES calls this here, but we get a cube, not a cubelist so we can't.
-            # RES used it's own version because Iris' was slow at the time. todo: is it fast now?
-            # hist_cubes = cube_utils.extract_overlapping(cubes_group, "forecast_period")
-            hist_cubes = hist_cubes.extract_overlapping("forecast_period")
-
-            # Now we take the cell statistics and aggregate by "time_grouping".
-
-            # We should deep copy the cubes here becuase there is cube wrangling below,
-            # and it's possible that we'll be using the same cubes across iterations in this loop.
-            # (as extract doesn't always create a new cube).
-            hist_cubes = deepcopy(hist_cubes)
-
-            # Res comment for this bit is: preparation of data for averaging and plotting
-            hist_cubes, times = extract_unique(hist_cubes, time_grouping)
-
-            # Sum cell statistic histograms at each time in parallel
-            input_params = [
-                (hist_cubes, time, iris.analysis.SUM, None) for time in times
-            ]
-            result_list = [
-                aggregate_at_time(input_param) for input_param in input_params
-            ]
-            cubes_group = iris.cube.CubeList(itertools.chain.from_iterable(result_list))
-            cubes_group = cubes_group.merge()
-
-            # If the number of cases at each time is the same, the
-            # above merge results in a scalar coordinate representing
-            # the number of cases. Replace this scalar coordinate with
-            # an auxillary coordinate that has the same length as the
-            # time coordinate
-            cubes_group = repeat_scalar_coord_along_dim_coord(
-                cubes_group, "num_cases", time_grouping
+        # At this point, RES extracts every time point into a separate cube list and plots it.
+        for time in times:
+            # Extract histogram at this time
+            time_constraint = iris.Constraint(
+                coord_values={time_grouping: lambda cell: cell.point in time.points}
             )
-
-            # At this point, RES extracts every time point into a separate cube list and plots it.
-            for time in times:
-                # Extract histogram at this time
-                time_constraint = iris.Constraint(
-                    coord_values={time_grouping: lambda cell: cell.point in time.points}
-                )
-                cubes_at_time = cubes_group.extract(time_constraint)
-
-                # todo: RES creates a plot title here.
-                # Perhaps we should add a plot title attribute to each cube here?
-                if time_grouping == "forecast_time":
-                    title = "T+{0:.1f}".format(time.points[0])
-                elif time_grouping == "hour":
-                    title = "{0:.1f}Z".format(time.points[0])
-                elif time_grouping == "time":
-                    time_unit = time.units
-                    datetime = time_unit.num2date(time.points[0])
-                    title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime)
-
-                # todo: RES has some analysis of the number of cases used to construct the histogram at this point.
-
-                # todo: RES plots each cube here.
-                for cube in cubes_at_time:
-                    # todo: Normalise histogram?
-                    # if y_axis == "relative_frequency":
-                    #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
-
-                    print(f'adding "{title}" plot for "{cube.name()}"')
-
-            # Sum all histograms. This creates the data for the "all" time point.
-            for cube in cubes_group:
-                cube = cube.collapsed(time_grouping, iris.analysis.SUM)
-
+            cubes_at_time = cubes_group.extract(time_constraint)
+
+            # todo: RES creates a plot title here.
+            # Perhaps we should add a plot title attribute to each cube here?
+            if time_grouping == "forecast_period":
+                title = "T+{0:.1f}".format(time.points[0])
+            elif time_grouping == "hour":
+                title = "{0:.1f}Z".format(time.points[0])
+            elif time_grouping == "time":
+                time_unit = time.units
+                datetime = time_unit.num2date(time.points[0])
+                title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime)
+            else:
+                raise ValueError(f"Unknown time grouping '{time_grouping}'")
+
+            # todo: RES has some analysis of the number of cases used to construct the histogram at this point.
+
+            # todo: RES plots each cube here.
+            for cube in cubes_at_time:
                 # todo: Normalise histogram?
                 # if y_axis == "relative_frequency":
                 #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
 
-                print(f'adding "all" plot for collapsed "{cube.name()}"')
+                print(f'adding "{title}" plot for "{cube.name()}"')
+
+        # Sum all histograms. This creates the data for the "all" time point.
+        for cube in cubes_group:
+            cube = cube.collapsed(time_grouping, iris.analysis.SUM)
+
+            # todo: Normalise histogram?
+            # if y_axis == "relative_frequency":
+            #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
+
+            print(f'adding "all" plot for collapsed "{cube.name()}"')
 
     # return cubes to plot. todo: in what exact arrangement?
     return None

From 5b53d8fdf72506b7db5d2100d638385f2e8cc416 Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Mon, 22 Sep 2025 15:28:21 +0100
Subject: [PATCH 04/12] -squash this

---
 src/CSET/operators/cell_statistics.py | 39 ++++++++++++++-------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index 564527574..0bf849e1f 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -1,4 +1,5 @@
 """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE."""
+
 import collections
 import datetime
 import inspect
@@ -451,7 +452,7 @@ def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_n
 
 
 def RES_extract_overlapping(cubelist, coord_name):
-    '''
+    """
     Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \
     the specified coordinate is the same across all cubes.
 
@@ -463,7 +464,7 @@ def RES_extract_overlapping(cubelist, coord_name):
 
     Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \
     to coord_name is the same for all cubes.
-    '''
+    """
     # Build a list of all Cell instances for this coordinate by
     # looping through all cubes in the supplied cubelist
     all_cells = []
@@ -478,14 +479,17 @@ def RES_extract_overlapping(cubelist, coord_name):
     unique_cells = list(cell_counts.keys())
     unique_cell_counts = list(cell_counts.values())
     num_cubes = len(cubelist)
-    common_cells = [unique_cells[i] for i, count in
-                    enumerate(unique_cell_counts) if count == num_cubes]
+    common_cells = [
+        unique_cells[i]
+        for i, count in enumerate(unique_cell_counts)
+        if count == num_cubes
+    ]
     # ...and use these to subset the cubes in the cubelist
     constraint = iris.Constraint(
-        coord_values={coord_name: lambda cell: cell in common_cells})
+        coord_values={coord_name: lambda cell: cell in common_cells}
+    )
 
-    cubelist = iris.cube.CubeList([cube.extract(constraint)
-                                   for cube in cubelist])
+    cubelist = iris.cube.CubeList([cube.extract(constraint) for cube in cubelist])
     return cubelist
 
 
@@ -513,25 +517,22 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
         "forecast_period", "hour" or "time"
 
     """
-
     # DEV/DEBUGGING - REMOVE
     pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl")
     # normal use - save pickle for faster subsequent runs
     if cubes and not pkl_filename.exists():
-        with open(pkl_filename, 'wb') as pkl_file:
+        with open(pkl_filename, "wb") as pkl_file:
             pickle.dump(cubes, pkl_file)
             print("now load that pickle")
             exit(0)
     # we'll be calling this instead of the yaml processing for a dev speed up
     if not cubes and pkl_filename.exists():
-        with open(pkl_filename, 'rb') as pkl_file:
+        with open(pkl_filename, "rb") as pkl_file:
             cubes = pickle.load(pkl_file)
 
     if not isinstance(cubes, CubeList):
         cubes = CubeList([cubes])
 
-
-
     # For now, thresholds are hard coded.
     thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
 
@@ -569,12 +570,14 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
 
         hist_cubes = CubeList([])
         for cube in cubes:
-            hist_cubes.append(something_like_cell_attribute_histogram(
-                cube,
-                attribute=cell_attribute,
-                bin_edges=get_bin_edges(cell_attribute),
-                threshold=threshold,
-            ))
+            hist_cubes.append(
+                something_like_cell_attribute_histogram(
+                    cube,
+                    attribute=cell_attribute,
+                    bin_edges=get_bin_edges(cell_attribute),
+                    threshold=threshold,
+                )
+            )
 
         # todo: res does a deep copy at this point for some reason - seems uneceeary? double check.
 

From 4a6d816bfbd8197746bb8e45e2aa5d7240d2d2c0 Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Tue, 23 Sep 2025 10:44:19 +0100
Subject: [PATCH 05/12] the data is now bundled into a data structure which is
 passed to a new plot function, which probably needs to call the existing
 histogram plot function more than once - we'll see.

---
 src/CSET/operators/cell_statistics.py | 86 +++++++++++++--------------
 src/CSET/operators/plot.py            | 27 +++++++++
 src/CSET/recipes/cell_statistics.yaml |  8 ++-
 3 files changed, 75 insertions(+), 46 deletions(-)

diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index 0bf849e1f..33c72c398 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -4,9 +4,7 @@
 import datetime
 import inspect
 import itertools
-import pickle
 from copy import deepcopy
-from pathlib import Path
 from typing import Callable
 
 import cftime
@@ -517,18 +515,18 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
         "forecast_period", "hour" or "time"
 
     """
-    # DEV/DEBUGGING - REMOVE
-    pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl")
-    # normal use - save pickle for faster subsequent runs
-    if cubes and not pkl_filename.exists():
-        with open(pkl_filename, "wb") as pkl_file:
-            pickle.dump(cubes, pkl_file)
-            print("now load that pickle")
-            exit(0)
-    # we'll be calling this instead of the yaml processing for a dev speed up
-    if not cubes and pkl_filename.exists():
-        with open(pkl_filename, "rb") as pkl_file:
-            cubes = pickle.load(pkl_file)
+    # # DEV/DEBUGGING - REMOVE
+    # pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl")
+    # # normal use - save pickle for faster subsequent runs
+    # if cubes and not pkl_filename.exists():
+    #     with open(pkl_filename, "wb") as pkl_file:
+    #         pickle.dump(cubes, pkl_file)
+    #         print("now load that pickle")
+    #         exit(0)
+    # # we'll be calling this instead of the yaml processing for a dev speed up
+    # if not cubes and pkl_filename.exists():
+    #     with open(pkl_filename, "rb") as pkl_file:
+    #         cubes = pickle.load(pkl_file)
 
     if not isinstance(cubes, CubeList):
         cubes = CubeList([cubes])
@@ -560,7 +558,11 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
     # bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12))
     # bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0)
 
+    result = {}
+
     for threshold in thresholds:
+        result[f"threshold {threshold}"] = threshold_result = {}
+
         # todo: check var_name has been removed by this point, as RES removed it to help with merge
         # hist_cubes = something_like_cell_attribute_histogram(
         #     cube,
@@ -598,51 +600,48 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
         # It removes the other time coords, other than that named by time_grouping.
         hist_cubes, times = extract_unique(hist_cubes, time_grouping)
 
-        # Sum cell statistic histograms at each time in parallel
-        # input_params = [
-        #     (hist_cubes, time, iris.analysis.SUM, None) for time in times
-        # ]
-        # result_list = [
-        #     aggregate_at_time(input_param) for input_param in input_params
-        # ]
-        result_list = []
+        # Sum cell statistic histograms at each time. todo: parallelise
+        summed_cubes = []
         for time in times:
             input_param = (hist_cubes, time, iris.analysis.SUM, None)
-            result = aggregate_at_time(input_param)
-            result_list.append(result)
-        cubes_group = iris.cube.CubeList(itertools.chain.from_iterable(result_list))
-        cubes_group = cubes_group.merge()
+            summed_cube = aggregate_at_time(input_param)
+            summed_cubes.append(summed_cube)
+        summed_cubes = iris.cube.CubeList(itertools.chain.from_iterable(summed_cubes))
+        summed_cubes = summed_cubes.merge()
 
         # If the number of cases at each time is the same, the
         # above merge results in a scalar coordinate representing
         # the number of cases. Replace this scalar coordinate with
         # an auxillary coordinate that has the same length as the
         # time coordinate
-        cubes_group = repeat_scalar_coord_along_dim_coord(
-            cubes_group, "num_cases", time_grouping
+        summed_cubes = repeat_scalar_coord_along_dim_coord(
+            summed_cubes, "num_cases", time_grouping
         )
 
         # At this point, RES extracts every time point into a separate cube list and plots it.
         for time in times:
-            # Extract histogram at this time
-            time_constraint = iris.Constraint(
-                coord_values={time_grouping: lambda cell: cell.point in time.points}
-            )
-            cubes_at_time = cubes_group.extract(time_constraint)
+            assert len(time.points) == 1
+            time_point = time.points[0]
 
-            # todo: RES creates a plot title here.
-            # Perhaps we should add a plot title attribute to each cube here?
             if time_grouping == "forecast_period":
-                title = "T+{0:.1f}".format(time.points[0])
+                time_title = "T+{0:.1f}".format(time_point)
             elif time_grouping == "hour":
-                title = "{0:.1f}Z".format(time.points[0])
+                time_title = "{0:.1f}Z".format(time_point)
             elif time_grouping == "time":
                 time_unit = time.units
-                datetime = time_unit.num2date(time.points[0])
-                title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime)
+                datetime = time_unit.num2date(time_point)
+                time_title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime)
             else:
                 raise ValueError(f"Unknown time grouping '{time_grouping}'")
 
+            threshold_result[time_title] = {}
+
+            # Extract histogram at this time.
+            time_constraint = iris.Constraint(
+                coord_values={time_grouping: lambda cell: cell.point in time.points}
+            )
+            cubes_at_time = summed_cubes.extract(time_constraint)
+
             # todo: RES has some analysis of the number of cases used to construct the histogram at this point.
 
             # todo: RES plots each cube here.
@@ -651,20 +650,21 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
                 # if y_axis == "relative_frequency":
                 #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
 
-                print(f'adding "{title}" plot for "{cube.name()}"')
+                threshold_result[time_title][cube.attributes["model_name"]] = cube
 
         # Sum all histograms. This creates the data for the "all" time point.
-        for cube in cubes_group:
+        threshold_result["all"] = {}
+        for cube in summed_cubes:
             cube = cube.collapsed(time_grouping, iris.analysis.SUM)
 
             # todo: Normalise histogram?
             # if y_axis == "relative_frequency":
             #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
 
-            print(f'adding "all" plot for collapsed "{cube.name()}"')
+            threshold_result["all"][cube.attributes["model_name"]] = cube
 
     # return cubes to plot. todo: in what exact arrangement?
-    return None
+    return result
 
 
 def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0):
diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py
index 1eef7ff00..03e9e22a2 100644
--- a/src/CSET/operators/plot.py
+++ b/src/CSET/operators/plot.py
@@ -2208,3 +2208,30 @@ def plot_histogram_series(
     _make_plot_html_page(complete_plot_index)
 
     return cubes
+
+
+def plot_cell_stats_histograms(data: dict, cell_attribute: str, time_grouping: str):
+    """
+
+    Parameters
+    ----------
+    data:
+        A dict mapping thresholds to histogram data for each time point:
+            data[<threshold>][<time_point>][<model_name>] -> Cube
+
+        E.g, when time_grouping is 'forecast_period', we see a key like:
+            data['threshold 0.5']['T+0.5']['uk_ctrl_um']
+
+        The cube is one dimensional, with a dim coord of the given cell_attribute.
+
+    cell_attribute:
+        'effective_radius_in_km' or 'mean_value'
+
+    time_grouping:
+        'forecast_period', 'hour' or 'time'
+
+    Returns
+    -------
+
+    """
+    pass
diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml
index 54a5d8f03..e9f656cca 100644
--- a/src/CSET/recipes/cell_statistics.yaml
+++ b/src/CSET/recipes/cell_statistics.yaml
@@ -21,7 +21,9 @@ steps:
     cell_attribute: $CELL_ATTRIBUTE
     time_grouping: $TIME_GROUPING
 
-  - operator: write.write_cube_to_nc
-    overwrite: True
+#  - operator: write.write_cube_to_nc
+#    overwrite: True
 
-  - operator: plot.plot_histogram_series
+  - operator: plot.plot_cell_stats_histograms
+    cell_attribute: $CELL_ATTRIBUTE
+    time_grouping: $TIME_GROUPING

From d0ae5abe73479f2e5a90b042eedf6b20f3368b5b Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Wed, 24 Sep 2025 10:53:26 +0100
Subject: [PATCH 06/12] - some histograms are being created - one var not
 working - lots of skipped plots due to no data or nan in the min/max - needs
 a bespoke plot and html to bring it all together sensibly

---
 delme/run_cell_statistics.py          | 49 +++++++++++++++
 src/CSET/operators/cell_statistics.py | 48 +++++++++------
 src/CSET/operators/plot.py            | 86 +++++++++++++++++++++++++--
 src/CSET/recipes/cell_statistics.yaml |  1 +
 4 files changed, 161 insertions(+), 23 deletions(-)
 create mode 100644 delme/run_cell_statistics.py

diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py
new file mode 100644
index 000000000..e380caaf6
--- /dev/null
+++ b/delme/run_cell_statistics.py
@@ -0,0 +1,49 @@
+from pathlib import Path
+
+from CSET._common import parse_recipe
+from CSET.operators import execute_recipe
+
+cell_attributes =['effective_radius_in_km', 'mean_value']
+
+recipe_fpath = Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml"
+
+# model_data = [
+#     ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp'),
+#     ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp'),
+#     ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc'),
+#     ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc'),
+# ]
+
+model_data = [
+    ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiag*.pp'),
+    ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric*.nc'),
+]
+
+
+vars = [
+    'surface_microphysical_rainfall_amount',  # m01s04i201
+    'surface_microphysical_rainfall_rate',  # m01s04i203 todo: seems to have missing data, needs investigation
+]
+
+time_groupings = ['forecast_period', 'hour']
+
+
+# # DEBUG
+# import CSET.operators.cell_statistics as cell_statistics
+# cell_statistics.caller_thing(cubes=None, cell_attribute='effective_radius_in_km', time_grouping='forecast_period')
+# exit(0)
+
+
+for cell_attribute in cell_attributes:
+    for var in vars:
+        for time_grouping in time_groupings:
+            recipe_variables = {
+                'INPUT_PATHS': [i[1] for i in model_data],
+                'MODEL_NAMES': [i[0] for i in model_data],
+                'VARNAME': var,
+                'CELL_ATTRIBUTE': cell_attribute,
+                'TIME_GROUPING': time_grouping,
+            }
+
+            recipe = parse_recipe(recipe_fpath, recipe_variables)
+            execute_recipe(recipe, Path(__file__).parent)
diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index 33c72c398..5ee3cf474 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -515,18 +515,15 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
         "forecast_period", "hour" or "time"
 
     """
-    # # DEV/DEBUGGING - REMOVE
-    # pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl")
-    # # normal use - save pickle for faster subsequent runs
-    # if cubes and not pkl_filename.exists():
-    #     with open(pkl_filename, "wb") as pkl_file:
-    #         pickle.dump(cubes, pkl_file)
-    #         print("now load that pickle")
-    #         exit(0)
-    # # we'll be calling this instead of the yaml processing for a dev speed up
-    # if not cubes and pkl_filename.exists():
-    #     with open(pkl_filename, "rb") as pkl_file:
-    #         cubes = pickle.load(pkl_file)
+
+    # todo: paramterise y_axis?
+    #   set to "frequency" for cell statistic histograms that are \
+    #   plain number counts in bins, or "relative_frequency" for \
+    #   histograms showing the overall proportion of data values in \
+    #   each bin (normalised to 100%). The default is \
+    #   "relative_frequency".
+    y_axis = "relative_frequency"
+
 
     if not isinstance(cubes, CubeList):
         cubes = CubeList([cubes])
@@ -646,9 +643,9 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
 
             # todo: RES plots each cube here.
             for cube in cubes_at_time:
-                # todo: Normalise histogram?
-                # if y_axis == "relative_frequency":
-                #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
+                # Normalise histogram?
+                if y_axis == "relative_frequency":
+                    cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
 
                 threshold_result[time_title][cube.attributes["model_name"]] = cube
 
@@ -657,14 +654,27 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
         for cube in summed_cubes:
             cube = cube.collapsed(time_grouping, iris.analysis.SUM)
 
-            # todo: Normalise histogram?
-            # if y_axis == "relative_frequency":
-            #     cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
+            # Normalise histogram?
+            if y_axis == "relative_frequency":
+                cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
 
             threshold_result["all"][cube.attributes["model_name"]] = cube
 
+
+    # For simpler code, the results were formed as:
+    #   data[<threshold>][<time_point>][<model_name>] -> Cube
+    # But the plot needs the model to be above the time points, so rearrange as:
+    #   data[<threshold>][<model_name>][<time_point>] -> Cube
+    reformed_result = {}
+    for threshold, threshold_result in result.items():
+        reformed_result[threshold] = {}
+        for time_title, model_cubes in threshold_result.items():
+            for model_name, cube in model_cubes.items():
+                reformed_result[threshold].setdefault(model_name, {})
+                reformed_result[threshold][model_name][time_title] = cube
+
     # return cubes to plot. todo: in what exact arrangement?
-    return result
+    return reformed_result
 
 
 def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0):
diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py
index 03e9e22a2..8201e1a89 100644
--- a/src/CSET/operators/plot.py
+++ b/src/CSET/operators/plot.py
@@ -23,6 +23,7 @@
 import math
 import os
 import sys
+from pathlib import Path
 from typing import Literal
 
 import cartopy.crs as ccrs
@@ -2210,17 +2211,26 @@ def plot_histogram_series(
     return cubes
 
 
-def plot_cell_stats_histograms(data: dict, cell_attribute: str, time_grouping: str):
+def check_for_nan(cubes):
+    # The plot function was crashing when cube.data had nan in the min/max.
+    for cube in cubes:
+        # if np.nan in [cube.data.min, cube.data.max]:
+        if np.isnan(cube.data.min()) or np.isnan(cube.data.max()):
+            return True
+    return False
+
+
+def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, time_grouping: str):
     """
 
     Parameters
     ----------
     data:
         A dict mapping thresholds to histogram data for each time point:
-            data[<threshold>][<time_point>][<model_name>] -> Cube
+            data[<threshold>][<model_name>][<time_point>] -> Cube
 
         E.g, when time_grouping is 'forecast_period', we see a key like:
-            data['threshold 0.5']['T+0.5']['uk_ctrl_um']
+            data['threshold 0.5']['uk_ctrl_um']['T+0.5']
 
         The cube is one dimensional, with a dim coord of the given cell_attribute.
 
@@ -2234,4 +2244,72 @@ def plot_cell_stats_histograms(data: dict, cell_attribute: str, time_grouping: s
     -------
 
     """
-    pass
+
+    # at each threshold, we're going to combine model's each time points into a single cube
+    sequence_coord = time_grouping
+
+    for threshold, models in data.items():
+
+        models_merged = iris.cube.CubeList()  # for each model: data at every time, which we will merge for a time sequence plot
+        models_all = iris.cube.CubeList()  # for each model: aggregation of all times, created by the operator
+
+        for model_name, times in models.items():
+            models_all.append(times['all'])
+            del times['all']
+
+            # Put the time points together into a single cube.
+            # Except for the time_grouping coord, all other time coords will have been removed.
+            # Todo: why bother splitting them up in the first place?
+            #       was that just for the RES plotting, can we simplify the operator?
+            cubes = iris.cube.CubeList(times.values())
+            cubes = cubes.merge()
+            assert len(cubes) == 1
+            models_merged.extend(cubes)
+
+        if not models_merged:
+            logging.warning(f'no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot')
+            continue
+
+        if not models_all:
+            logging.warning(f'no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot')
+            continue
+
+        root_folder = Path('/data/scratch/byron.blay/cset/cell_stats/out')
+        root_folder = root_folder / f'{varname}/{cell_attribute}/{time_grouping}/{threshold}'
+
+        orig_folder = os.getcwd()
+
+        # The plot function was crashing when cube.data had nan in the min/max.
+        if check_for_nan(models_merged):
+            logging.warning(
+                f'Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot.')
+        else:
+            folder = root_folder / 'series'
+            folder.mkdir(parents=True, exist_ok=True)
+            os.chdir(folder)
+            filename = folder / 'image.png'
+            plot_histogram_series(
+                cubes=models_merged,
+                filename=str(filename),
+                sequence_coordinate=sequence_coord,
+                stamp_coordinate=None,  # is this useful?
+                single_plot=False,
+            )
+
+        if check_for_nan(models_all):
+            logging.warning(
+                f'Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot.')
+        else:
+            folder = root_folder / 'all'
+            folder.mkdir(parents=True, exist_ok=True)
+            os.chdir(folder)
+            filename = folder / 'image.png'
+            plot_histogram_series(
+                cubes=models_all,
+                filename=str(filename),
+                sequence_coordinate=sequence_coord,  # todo: we don't need a time series for this plot!
+                stamp_coordinate=None,  # is this useful?
+                single_plot=False,
+            )
+
+        os.chdir(orig_folder)
diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml
index e9f656cca..d24dbc58e 100644
--- a/src/CSET/recipes/cell_statistics.yaml
+++ b/src/CSET/recipes/cell_statistics.yaml
@@ -25,5 +25,6 @@ steps:
 #    overwrite: True
 
   - operator: plot.plot_cell_stats_histograms
+    varname: $VARNAME
     cell_attribute: $CELL_ATTRIBUTE
     time_grouping: $TIME_GROUPING

From 9ffb0daac55ecfafc689ebc0c64e06cf6d16ec90 Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Thu, 25 Sep 2025 10:37:39 +0100
Subject: [PATCH 07/12] fixed surface_microphysical_rainfall_rate by filtering
 cell - needs polish something's messed around with code formatting

---
 delme/run_cell_statistics.py          |  32 ++++---
 src/CSET/operators/cell_statistics.py | 117 +++++++++++++-------------
 src/CSET/operators/plot.py            |  42 +++++----
 3 files changed, 105 insertions(+), 86 deletions(-)

diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py
index e380caaf6..6f564d2c2 100644
--- a/delme/run_cell_statistics.py
+++ b/delme/run_cell_statistics.py
@@ -3,9 +3,11 @@
 from CSET._common import parse_recipe
 from CSET.operators import execute_recipe
 
-cell_attributes =['effective_radius_in_km', 'mean_value']
+cell_attributes = ["effective_radius_in_km", "mean_value"]
 
-recipe_fpath = Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml"
+recipe_fpath = (
+    Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml"
+)
 
 # model_data = [
 #     ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp'),
@@ -15,17 +17,23 @@
 # ]
 
 model_data = [
-    ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiag*.pp'),
-    ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric*.nc'),
+    (
+        "uk_ctrl_um",
+        "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiag*.pp",
+    ),
+    (
+        "uk_expt_lfric",
+        "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric*.nc",
+    ),
 ]
 
 
 vars = [
-    'surface_microphysical_rainfall_amount',  # m01s04i201
-    'surface_microphysical_rainfall_rate',  # m01s04i203 todo: seems to have missing data, needs investigation
+    "surface_microphysical_rainfall_amount",  # m01s04i201
+    "surface_microphysical_rainfall_rate",  # m01s04i203 todo: seems to have missing data, needs investigation
 ]
 
-time_groupings = ['forecast_period', 'hour']
+time_groupings = ["forecast_period", "hour"]
 
 
 # # DEBUG
@@ -38,11 +46,11 @@
     for var in vars:
         for time_grouping in time_groupings:
             recipe_variables = {
-                'INPUT_PATHS': [i[1] for i in model_data],
-                'MODEL_NAMES': [i[0] for i in model_data],
-                'VARNAME': var,
-                'CELL_ATTRIBUTE': cell_attribute,
-                'TIME_GROUPING': time_grouping,
+                "INPUT_PATHS": [i[1] for i in model_data],
+                "MODEL_NAMES": [i[0] for i in model_data],
+                "VARNAME": var,
+                "CELL_ATTRIBUTE": cell_attribute,
+                "TIME_GROUPING": time_grouping,
             }
 
             recipe = parse_recipe(recipe_fpath, recipe_variables)
diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index 5ee3cf474..e77c18890 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -448,47 +448,47 @@ def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_n
 
     return cubelist
 
-
-def RES_extract_overlapping(cubelist, coord_name):
-    """
-    Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \
-    the specified coordinate is the same across all cubes.
-
-    Arguments:
-
-    * **cubelist** - an input :class:`iris.cube.CubeList`.
-    * **coord_name** - a string specifying the name of the coordinate \
-                       over which to perform the extraction.
-
-    Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \
-    to coord_name is the same for all cubes.
-    """
-    # Build a list of all Cell instances for this coordinate by
-    # looping through all cubes in the supplied cubelist
-    all_cells = []
-    for cube in cubelist:
-        for cell in cube.coord(coord_name).cells():
-            all_cells.append(cell)
-
-    # Work out which coordinate Cell instances are common across
-    # all cubes in the cubelist...
-    cell_counts = collections.Counter(all_cells)
-    # unique_cells = cell_counts.keys()
-    unique_cells = list(cell_counts.keys())
-    unique_cell_counts = list(cell_counts.values())
-    num_cubes = len(cubelist)
-    common_cells = [
-        unique_cells[i]
-        for i, count in enumerate(unique_cell_counts)
-        if count == num_cubes
-    ]
-    # ...and use these to subset the cubes in the cubelist
-    constraint = iris.Constraint(
-        coord_values={coord_name: lambda cell: cell in common_cells}
-    )
-
-    cubelist = iris.cube.CubeList([cube.extract(constraint) for cube in cubelist])
-    return cubelist
+# todo: no longer needed? we can use the iris version now (double check this is ok)
+# def RES_extract_overlapping(cubelist, coord_name):
+#     """
+#     Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \
+#     the specified coordinate is the same across all cubes.
+#
+#     Arguments:
+#
+#     * **cubelist** - an input :class:`iris.cube.CubeList`.
+#     * **coord_name** - a string specifying the name of the coordinate \
+#                        over which to perform the extraction.
+#
+#     Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \
+#     to coord_name is the same for all cubes.
+#     """
+#     # Build a list of all Cell instances for this coordinate by
+#     # looping through all cubes in the supplied cubelist
+#     all_cells = []
+#     for cube in cubelist:
+#         for cell in cube.coord(coord_name).cells():
+#             all_cells.append(cell)
+#
+#     # Work out which coordinate Cell instances are common across
+#     # all cubes in the cubelist...
+#     cell_counts = collections.Counter(all_cells)
+#     # unique_cells = cell_counts.keys()
+#     unique_cells = list(cell_counts.keys())
+#     unique_cell_counts = list(cell_counts.values())
+#     num_cubes = len(cubelist)
+#     common_cells = [
+#         unique_cells[i]
+#         for i, count in enumerate(unique_cell_counts)
+#         if count == num_cubes
+#     ]
+#     # ...and use these to subset the cubes in the cubelist
+#     constraint = iris.Constraint(
+#         coord_values={coord_name: lambda cell: cell in common_cells}
+#     )
+#
+#     cubelist = iris.cube.CubeList([cube.extract(constraint) for cube in cubelist])
+#     return cubelist
 
 
 def ensure_bounds(cubes, coord_name):
@@ -515,7 +515,6 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
         "forecast_period", "hour" or "time"
 
     """
-
     # todo: paramterise y_axis?
     #   set to "frequency" for cell statistic histograms that are \
     #   plain number counts in bins, or "relative_frequency" for \
@@ -524,10 +523,19 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
     #   "relative_frequency".
     y_axis = "relative_frequency"
 
-
     if not isinstance(cubes, CubeList):
         cubes = CubeList([cubes])
 
+    # todo: put this cell method filtering into the yaml
+    # we saw lfric data with "point" or "mean".
+    def check_cell_methods(cube):
+        """discard "means", we want "points" or empty cell methods"""
+        for cm in cube.cell_methods:
+            if cm.method != "point":
+                return False
+        return True
+    cubes = CubeList([c for c in cubes if check_cell_methods(c)])
+
     # For now, thresholds are hard coded.
     thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
 
@@ -569,21 +577,13 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
 
         hist_cubes = CubeList([])
         for cube in cubes:
-            hist_cubes.append(
-                something_like_cell_attribute_histogram(
-                    cube,
-                    attribute=cell_attribute,
-                    bin_edges=get_bin_edges(cell_attribute),
-                    threshold=threshold,
-                )
-            )
+            hist_cube = something_like_cell_attribute_histogram(
+                cube, attribute=cell_attribute, bin_edges=get_bin_edges(cell_attribute), threshold=threshold)
+            hist_cubes.append(hist_cube)
 
-        # todo: res does a deep copy at this point for some reason - seems uneceeary? double check.
-
-        # todo: RES calls this here, but we get a cube, not a cubelist so we can't.
         # RES used it's own version because Iris' was slow at the time. todo: is it fast now?
         # hist_cubes = RES_extract_overlapping(hist_cubes, "forecast_period")
-        ensure_bounds(hist_cubes, "forecast_period")
+        ensure_bounds(hist_cubes, "forecast_period")  # iris fails if only one has bounds
         hist_cubes = hist_cubes.extract_overlapping("forecast_period")
 
         # Now we take the cell statistics and aggregate by "time_grouping".
@@ -645,7 +645,9 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
             for cube in cubes_at_time:
                 # Normalise histogram?
                 if y_axis == "relative_frequency":
-                    cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
+                    cube.data = (100.0 * cube.data) / np.sum(
+                        cube.data, dtype=np.float64
+                    )
 
                 threshold_result[time_title][cube.attributes["model_name"]] = cube
 
@@ -656,11 +658,10 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
 
             # Normalise histogram?
             if y_axis == "relative_frequency":
-                cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64))
+                cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
 
             threshold_result["all"][cube.attributes["model_name"]] = cube
 
-
     # For simpler code, the results were formed as:
     #   data[<threshold>][<time_point>][<model_name>] -> Cube
     # But the plot needs the model to be above the time points, so rearrange as:
diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py
index 8201e1a89..4b06a8e11 100644
--- a/src/CSET/operators/plot.py
+++ b/src/CSET/operators/plot.py
@@ -2220,7 +2220,9 @@ def check_for_nan(cubes):
     return False
 
 
-def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, time_grouping: str):
+def plot_cell_stats_histograms(
+    data: dict, varname: str, cell_attribute: str, time_grouping: str
+):
     """
 
     Parameters
@@ -2244,18 +2246,18 @@ def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, ti
     -------
 
     """
-
     # at each threshold, we're going to combine model's each time points into a single cube
     sequence_coord = time_grouping
 
     for threshold, models in data.items():
-
         models_merged = iris.cube.CubeList()  # for each model: data at every time, which we will merge for a time sequence plot
-        models_all = iris.cube.CubeList()  # for each model: aggregation of all times, created by the operator
+        models_all = (
+            iris.cube.CubeList()
+        )  # for each model: aggregation of all times, created by the operator
 
         for model_name, times in models.items():
-            models_all.append(times['all'])
-            del times['all']
+            models_all.append(times["all"])
+            del times["all"]
 
             # Put the time points together into a single cube.
             # Except for the time_grouping coord, all other time coords will have been removed.
@@ -2267,27 +2269,34 @@ def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, ti
             models_merged.extend(cubes)
 
         if not models_merged:
-            logging.warning(f'no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot')
+            logging.warning(
+                f"no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot"
+            )
             continue
 
         if not models_all:
-            logging.warning(f'no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot')
+            logging.warning(
+                f"no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot"
+            )
             continue
 
-        root_folder = Path('/data/scratch/byron.blay/cset/cell_stats/out')
-        root_folder = root_folder / f'{varname}/{cell_attribute}/{time_grouping}/{threshold}'
+        root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out")
+        root_folder = (
+            root_folder / f"{varname}/{cell_attribute}/{time_grouping}/{threshold}"
+        )
 
         orig_folder = os.getcwd()
 
         # The plot function was crashing when cube.data had nan in the min/max.
         if check_for_nan(models_merged):
             logging.warning(
-                f'Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot.')
+                f"Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot."
+            )
         else:
-            folder = root_folder / 'series'
+            folder = root_folder / "series"
             folder.mkdir(parents=True, exist_ok=True)
             os.chdir(folder)
-            filename = folder / 'image.png'
+            filename = folder / "image.png"
             plot_histogram_series(
                 cubes=models_merged,
                 filename=str(filename),
@@ -2298,12 +2307,13 @@ def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, ti
 
         if check_for_nan(models_all):
             logging.warning(
-                f'Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot.')
+                f"Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot."
+            )
         else:
-            folder = root_folder / 'all'
+            folder = root_folder / "all"
             folder.mkdir(parents=True, exist_ok=True)
             os.chdir(folder)
-            filename = folder / 'image.png'
+            filename = folder / "image.png"
             plot_histogram_series(
                 cubes=models_all,
                 filename=str(filename),

From edd7efb3e4b74b146e0a1f2488a9e242b046e5b5 Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Thu, 25 Sep 2025 11:28:43 +0100
Subject: [PATCH 08/12] i think log y axis is already taken care of in the
 plotting

---
 src/CSET/operators/cell_statistics.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index e77c18890..b3e77d442 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -643,11 +643,9 @@ def check_cell_methods(cube):
 
             # todo: RES plots each cube here.
             for cube in cubes_at_time:
-                # Normalise histogram?
-                if y_axis == "relative_frequency":
-                    cube.data = (100.0 * cube.data) / np.sum(
-                        cube.data, dtype=np.float64
-                    )
+                # todo: Normalise histogram? I think this is done in the plotting, actually, unlike RES.
+                # if y_axis == "relative_frequency":
+                #     cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
 
                 threshold_result[time_title][cube.attributes["model_name"]] = cube
 
@@ -656,9 +654,9 @@ def check_cell_methods(cube):
         for cube in summed_cubes:
             cube = cube.collapsed(time_grouping, iris.analysis.SUM)
 
-            # Normalise histogram?
-            if y_axis == "relative_frequency":
-                cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
+            # todo: Normalise histogram? I think this is done in the plotting, actually, unlike RES.
+            # if y_axis == "relative_frequency":
+            #     cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
 
             threshold_result["all"][cube.attributes["model_name"]] = cube
 

From 448d6f1a0fe6899a2d08203513ffd574746a3009 Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Thu, 25 Sep 2025 15:52:47 +0100
Subject: [PATCH 09/12] wip

---
 delme/run_cell_statistics.py          |  9 +++++++--
 src/CSET/operators/cell_statistics.py | 14 +++++++++++++-
 src/CSET/operators/plot.py            | 10 ++++++++++
 src/CSET/recipes/cell_statistics.yaml |  1 +
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py
index 6f564d2c2..1e63e2f4a 100644
--- a/delme/run_cell_statistics.py
+++ b/delme/run_cell_statistics.py
@@ -42,9 +42,14 @@
 # exit(0)
 
 
-for cell_attribute in cell_attributes:
-    for var in vars:
+# todo: it's inefficient to keep reloading the data for each var/cell_attribute/time_grouping
+#  it would seem to be better to send the list of vars to the operator and plot functions, just once.
+for var in vars:
+    for cell_attribute in cell_attributes:
         for time_grouping in time_groupings:
+
+            print(f"\nrunning recipe for {var} {cell_attribute} {time_grouping}\n" )
+
             recipe_variables = {
                 "INPUT_PATHS": [i[1] for i in model_data],
                 "MODEL_NAMES": [i[0] for i in model_data],
diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index b3e77d442..e3a069504 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -4,6 +4,7 @@
 import datetime
 import inspect
 import itertools
+import logging
 from copy import deepcopy
 from typing import Callable
 
@@ -521,11 +522,16 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
     #   histograms showing the overall proportion of data values in \
     #   each bin (normalised to 100%). The default is \
     #   "relative_frequency".
-    y_axis = "relative_frequency"
+    # y_axis = "relative_frequency"
+
+    if cubes in (None, [], CubeList([])):
+        logging.warning(f'no cubes received for {cell_attribute} {time_grouping}')
+        return None
 
     if not isinstance(cubes, CubeList):
         cubes = CubeList([cubes])
 
+    # todo: is this correct? it caused more empty cube lists...
     # todo: put this cell method filtering into the yaml
     # we saw lfric data with "point" or "mean".
     def check_cell_methods(cube):
@@ -536,6 +542,10 @@ def check_cell_methods(cube):
         return True
     cubes = CubeList([c for c in cubes if check_cell_methods(c)])
 
+    if len(cubes) == 0:
+        logging.warning(f'no cubes for {cell_attribute} {time_grouping} after cell method filtering')
+        return None
+
     # For now, thresholds are hard coded.
     thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
 
@@ -660,6 +670,8 @@ def check_cell_methods(cube):
 
             threshold_result["all"][cube.attributes["model_name"]] = cube
 
+    # todo: perhaps we should use metadata to organisethe results, and let the plotter work out what to plot.
+    #   that would let us stick with the cubelist convention...
     # For simpler code, the results were formed as:
     #   data[<threshold>][<time_point>][<model_name>] -> Cube
     # But the plot needs the model to be above the time points, so rearrange as:
diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py
index 4b06a8e11..7fccba314 100644
--- a/src/CSET/operators/plot.py
+++ b/src/CSET/operators/plot.py
@@ -2231,6 +2231,10 @@ def plot_cell_stats_histograms(
         A dict mapping thresholds to histogram data for each time point:
             data[<threshold>][<model_name>][<time_point>] -> Cube
 
+            NOTE: IT SHOULD PROBABLY BE A CUBE/CUBELIST
+                but that would seem to require considerable inefficiency/repetition in the cells stats
+                so we need to think about this a bit more
+
         E.g, when time_grouping is 'forecast_period', we see a key like:
             data['threshold 0.5']['uk_ctrl_um']['T+0.5']
 
@@ -2246,6 +2250,9 @@ def plot_cell_stats_histograms(
     -------
 
     """
+    if data is None:
+        return None
+
     # at each threshold, we're going to combine model's each time points into a single cube
     sequence_coord = time_grouping
 
@@ -2323,3 +2330,6 @@ def plot_cell_stats_histograms(
             )
 
         os.chdir(orig_folder)
+
+    # todo: by convention we should return the cubes, but it's not cubes
+    #   the cells stats produces a lot of stuff so it's currently passed as a dict
diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml
index d24dbc58e..d0e75b04c 100644
--- a/src/CSET/recipes/cell_statistics.yaml
+++ b/src/CSET/recipes/cell_statistics.yaml
@@ -5,6 +5,7 @@ description: |
 
 steps:
 
+  # todo: ideally we'd load just once and have a list of variable names to extract
   - operator: read.read_cubes
     file_paths: $INPUT_PATHS
     model_names: $MODEL_NAMES

From 374ae1be73802eb1a48da72e7c638ab4fb97d17e Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Fri, 26 Sep 2025 20:12:12 +0100
Subject: [PATCH 10/12] First functioning draft of a new cell stats plots page
 with controls for both threshold and time point. We need to  - move the index
 into the subfolder so it doesn't get overwritten  - ? move the cell attribute
 and time grouping from the recipe runner into the code so it's all on the one
 page together

---
 delme/run_cell_statistics.py                  |  22 +-
 .../operators/_cell_stats_plots_template.html | 165 ++++++++++++++
 src/CSET/operators/cell_statistics.py         |  49 +++--
 src/CSET/operators/plot.py                    | 201 +++++++++++-------
 src/CSET/operators/read.py                    |   9 +
 src/CSET/operators/write.py                   |  12 ++
 src/CSET/recipes/cell_statistics.yaml         |   7 +-
 7 files changed, 357 insertions(+), 108 deletions(-)
 create mode 100644 src/CSET/operators/_cell_stats_plots_template.html

diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py
index 1e63e2f4a..f93e20c4b 100644
--- a/delme/run_cell_statistics.py
+++ b/delme/run_cell_statistics.py
@@ -3,7 +3,6 @@
 from CSET._common import parse_recipe
 from CSET.operators import execute_recipe
 
-cell_attributes = ["effective_radius_in_km", "mean_value"]
 
 recipe_fpath = (
     Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml"
@@ -27,13 +26,20 @@
     ),
 ]
 
+# # todo: restore this
+# cell_attributes = ["effective_radius_in_km", "mean_value"]
+#
+# vars = [
+#     "surface_microphysical_rainfall_amount",  # m01s04i201
+#     "surface_microphysical_rainfall_rate",  # m01s04i203 todo: seems to have missing data, needs investigation
+# ]
+#
+# time_groupings = ["forecast_period", "hour"]
 
-vars = [
-    "surface_microphysical_rainfall_amount",  # m01s04i201
-    "surface_microphysical_rainfall_rate",  # m01s04i203 todo: seems to have missing data, needs investigation
-]
-
-time_groupings = ["forecast_period", "hour"]
+# faster dev
+cell_attributes = ["effective_radius_in_km"]
+vars = ["surface_microphysical_rainfall_rate"]
+time_groupings = ["forecast_period"]
 
 
 # # DEBUG
@@ -48,7 +54,7 @@
     for cell_attribute in cell_attributes:
         for time_grouping in time_groupings:
 
-            print(f"\nrunning recipe for {var} {cell_attribute} {time_grouping}\n" )
+            print(f"\nrunning recipe for {var}, {cell_attribute}, {time_grouping}\n" )
 
             recipe_variables = {
                 "INPUT_PATHS": [i[1] for i in model_data],
diff --git a/src/CSET/operators/_cell_stats_plots_template.html b/src/CSET/operators/_cell_stats_plots_template.html
new file mode 100644
index 000000000..0de7bad91
--- /dev/null
+++ b/src/CSET/operators/_cell_stats_plots_template.html
@@ -0,0 +1,165 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>{{title}}</title>
+  
+    <style>
+    </style>
+  
+    <script type="module">
+
+        function set_threshold_options(threshold_select, threshold_keys) {
+            if (threshold_keys.length <= 0) {
+                return;
+            }
+            // const threshold_select = document.getElementById("threshold_select");
+
+            threshold_keys.forEach(key => {
+                const option = document.createElement("option");
+                option.value = key;
+                option.text = key;
+                threshold_select.appendChild(option);
+            });
+
+            threshold_select.selectedIndex = 0;
+            // threshold_select.value = threshold_keys[0];
+        }
+
+        function set_time_options(time_select, time_keys) {
+            if (time_keys.length <= 0) {
+                return;
+            }
+            // const time_select = document.getElementById("time_select");
+
+            // what value is currently selected? (it'll be "" if there's no options yet)
+            let keep_selected_value = time_select.value;
+
+            // remove existing options
+            time_select.innerHTML = "";
+
+            // add given options
+            time_keys.forEach(key => {
+                const option = document.createElement("option");
+                option.value = key;
+                option.text = key;
+                time_select.appendChild(option);
+            });
+
+            // select previous or first item
+            if (keep_selected_value !== "") {
+                time_select.value = keep_selected_value;
+            } else {
+                time_select.selectedIndex = 0;
+            }
+        }
+        
+        function display_sequence_controls(plot_urls) {
+            // plot_urls[threshold][time_title] = filename
+            
+            // threshold options
+            // const threshold_keys = Object.keys(plot_urls);
+            // const threshold_select = document.getElementById("threshold_select");
+            // if (threshold_keys.length <= 0) {
+            //     return;
+            // }
+            // threshold_keys.forEach(key => {
+            //     const option = document.createElement("option");
+            //     option.value = key;
+            //     option.text = key;
+            //     threshold_select.appendChild(option);
+            // });
+            // threshold_select.value = threshold_keys[0];
+            const threshold_select = document.getElementById("threshold_select");
+            const threshold_keys = Object.keys(plot_urls);
+            set_threshold_options(threshold_select, threshold_keys);
+            // threshold_select.value = threshold_keys[0];
+            // threshold_select.selectedIndex = 0;
+
+
+            // time options
+            // const time_select = document.getElementById("time_select");
+            // const time_keys = Object.keys(plot_urls[threshold_keys[0]]);
+            // if (time_keys.length <= 0) {
+            //     return;
+            // }
+            // time_keys.forEach(key => {
+            //     const option = document.createElement("option");
+            //     option.value = key;
+            //     option.text = key;
+            //     time_select.appendChild(option);
+            // });
+            // time_select.value = time_keys[0];
+            const time_select = document.getElementById("time_select");
+            const time_keys = Object.keys(plot_urls[threshold_keys[0]]);
+            set_time_options(time_select, time_keys);
+            // time_select.value = time_keys[0];
+
+
+            // set the initial image
+            const plot_img = document.getElementById("plot");
+            plot_img.src = plot_urls[threshold_keys[0]][time_keys[0]];
+
+
+            // selection events
+            threshold_select.addEventListener("change", () => {
+                const selected_threshold = threshold_select.value;
+                const time_keys = Object.keys(plot_urls[selected_threshold]);
+                set_time_options(time_select, time_keys);
+
+                const selected_time = time_select.value;
+                plot_img.src = plot_urls[selected_threshold][selected_time];
+            });
+
+            time_select.addEventListener("change", () => {
+                const selected_threshold = threshold_select.value;
+                const selected_time = time_select.value;
+                plot_img.src = plot_urls[selected_threshold][selected_time];
+            });
+
+
+        }
+
+        const plot_urls = {{plot_urls}}  // plot_urls[threshold][time_title] = filename
+        display_sequence_controls(plot_urls);
+
+        // preload images
+        // plot_urls.forEach(threshold_key => {
+        //     plot_urls[threshold_key].forEach(time_key => {
+        //         let img = new Image();
+        //         img.src = plot_urls[threshold_key][time_key];
+        //     });
+        // })
+        Object.keys(plot_urls).forEach(threshold_key => {
+            Object.keys(plot_urls[threshold_key]).forEach(time_key => {
+                let img = new Image();
+                img.src = plot_urls[threshold_key][time_key];
+            });
+        });
+
+
+    </script>
+  
+</head>
+
+<body>
+    <main>
+        
+        <fieldset id="controls">
+            
+            <label for="threshold_select">Threshold:</label>
+            <select id="threshold_select"></select>
+
+            <label for="time_select">Time:</label>
+            <select id="time_select"></select>
+
+        </fieldset>
+
+        <img id="plot" src="" alt="plot">
+
+    </main>
+</body>
+
+</html>
\ No newline at end of file
diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index e3a069504..97afa4b17 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -516,6 +516,11 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
         "forecast_period", "hour" or "time"
 
     """
+    # todo: inorder to get everything on one page,
+    #   i think we might ahve to calculate all cell attributes in here,
+    #   and all time groupings, so that we can pass one big lump to the page plotter.
+
+
     # todo: paramterise y_axis?
     #   set to "frequency" for cell statistic histograms that are \
     #   plain number counts in bins, or "relative_frequency" for \
@@ -543,7 +548,7 @@ def check_cell_methods(cube):
     cubes = CubeList([c for c in cubes if check_cell_methods(c)])
 
     if len(cubes) == 0:
-        logging.warning(f'no cubes for {cell_attribute} {time_grouping} after cell method filtering')
+        logging.warning(f'no cubes for {cell_attribute}, {time_grouping} after cell method filtering')
         return None
 
     # For now, thresholds are hard coded.
@@ -576,7 +581,7 @@ def check_cell_methods(cube):
     result = {}
 
     for threshold in thresholds:
-        result[f"threshold {threshold}"] = threshold_result = {}
+        result[f"threshold {threshold}"] = time_data = {}
 
         # todo: check var_name has been removed by this point, as RES removed it to help with merge
         # hist_cubes = something_like_cell_attribute_histogram(
@@ -585,6 +590,7 @@ def check_cell_methods(cube):
         #     bin_edges=get_bin_edges(cell_attribute),
         #     threshold=threshold) for cube in cubes]
 
+        # todo: can't we just let mpl make the histogram at plot time?
         hist_cubes = CubeList([])
         for cube in cubes:
             hist_cube = something_like_cell_attribute_histogram(
@@ -641,7 +647,7 @@ def check_cell_methods(cube):
             else:
                 raise ValueError(f"Unknown time grouping '{time_grouping}'")
 
-            threshold_result[time_title] = {}
+            time_data[time_title] = {}
 
             # Extract histogram at this time.
             time_constraint = iris.Constraint(
@@ -657,10 +663,10 @@ def check_cell_methods(cube):
                 # if y_axis == "relative_frequency":
                 #     cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
 
-                threshold_result[time_title][cube.attributes["model_name"]] = cube
+                time_data[time_title][cube.attributes["model_name"]] = cube
 
         # Sum all histograms. This creates the data for the "all" time point.
-        threshold_result["all"] = {}
+        time_data["all"] = {}
         for cube in summed_cubes:
             cube = cube.collapsed(time_grouping, iris.analysis.SUM)
 
@@ -668,24 +674,25 @@ def check_cell_methods(cube):
             # if y_axis == "relative_frequency":
             #     cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
 
-            threshold_result["all"][cube.attributes["model_name"]] = cube
-
-    # todo: perhaps we should use metadata to organisethe results, and let the plotter work out what to plot.
-    #   that would let us stick with the cubelist convention...
-    # For simpler code, the results were formed as:
-    #   data[<threshold>][<time_point>][<model_name>] -> Cube
-    # But the plot needs the model to be above the time points, so rearrange as:
-    #   data[<threshold>][<model_name>][<time_point>] -> Cube
-    reformed_result = {}
-    for threshold, threshold_result in result.items():
-        reformed_result[threshold] = {}
-        for time_title, model_cubes in threshold_result.items():
-            for model_name, cube in model_cubes.items():
-                reformed_result[threshold].setdefault(model_name, {})
-                reformed_result[threshold][model_name][time_title] = cube
+            time_data["all"][cube.attributes["model_name"]] = cube
+
+    # don't bother rearranging thie dict now that we've got our own plotter
+    # # todo: perhaps we should use metadata to organisethe results, and let the plotter work out what to plot.
+    # #   that would let us stick with the cubelist convention...
+    # # For simpler code, the results were formed as:
+    # #   data[<threshold>][<time_point>][<model_name>] -> Cube
+    # # But the plot needs the model to be above the time points, so rearrange as:
+    # #   data[<threshold>][<model_name>][<time_point>] -> Cube
+    # reformed_result = {}
+    # for threshold, threshold_result in result.items():
+    #     reformed_result[threshold] = {}
+    #     for time_title, model_cubes in threshold_result.items():
+    #         for model_name, cube in model_cubes.items():
+    #             reformed_result[threshold].setdefault(model_name, {})
+    #             reformed_result[threshold][model_name][time_title] = cube
 
     # return cubes to plot. todo: in what exact arrangement?
-    return reformed_result
+    return result
 
 
 def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0):
diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py
index 7fccba314..4a7401e7e 100644
--- a/src/CSET/operators/plot.py
+++ b/src/CSET/operators/plot.py
@@ -2220,6 +2220,17 @@ def check_for_nan(cubes):
     return False
 
 
+def make_cell_stats_page(html_fpath, title, plot_urls):
+    template_file = Path(__file__).parent / "_cell_stats_plots_template.html"
+    variables = {
+        'title': '',
+        'plot_urls': plot_urls,
+    }
+    html = render_file(str(template_file), **variables)
+    with open(html_fpath, "wt", encoding="UTF-8") as fp:
+        fp.write(html)
+
+
 def plot_cell_stats_histograms(
     data: dict, varname: str, cell_attribute: str, time_grouping: str
 ):
@@ -2253,83 +2264,119 @@ def plot_cell_stats_histograms(
     if data is None:
         return None
 
-    # at each threshold, we're going to combine model's each time points into a single cube
-    sequence_coord = time_grouping
-
-    for threshold, models in data.items():
-        models_merged = iris.cube.CubeList()  # for each model: data at every time, which we will merge for a time sequence plot
-        models_all = (
-            iris.cube.CubeList()
-        )  # for each model: aggregation of all times, created by the operator
-
-        for model_name, times in models.items():
-            models_all.append(times["all"])
-            del times["all"]
-
-            # Put the time points together into a single cube.
-            # Except for the time_grouping coord, all other time coords will have been removed.
-            # Todo: why bother splitting them up in the first place?
-            #       was that just for the RES plotting, can we simplify the operator?
-            cubes = iris.cube.CubeList(times.values())
-            cubes = cubes.merge()
-            assert len(cubes) == 1
-            models_merged.extend(cubes)
-
-        if not models_merged:
-            logging.warning(
-                f"no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot"
-            )
-            continue
-
-        if not models_all:
-            logging.warning(
-                f"no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot"
-            )
-            continue
-
-        root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out")
-        root_folder = (
-            root_folder / f"{varname}/{cell_attribute}/{time_grouping}/{threshold}"
-        )
-
-        orig_folder = os.getcwd()
-
-        # The plot function was crashing when cube.data had nan in the min/max.
-        if check_for_nan(models_merged):
-            logging.warning(
-                f"Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot."
-            )
-        else:
-            folder = root_folder / "series"
-            folder.mkdir(parents=True, exist_ok=True)
-            os.chdir(folder)
-            filename = folder / "image.png"
-            plot_histogram_series(
-                cubes=models_merged,
-                filename=str(filename),
-                sequence_coordinate=sequence_coord,
-                stamp_coordinate=None,  # is this useful?
-                single_plot=False,
-            )
-
-        if check_for_nan(models_all):
-            logging.warning(
-                f"Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot."
-            )
-        else:
-            folder = root_folder / "all"
-            folder.mkdir(parents=True, exist_ok=True)
-            os.chdir(folder)
-            filename = folder / "image.png"
-            plot_histogram_series(
-                cubes=models_all,
-                filename=str(filename),
-                sequence_coordinate=sequence_coord,  # todo: we don't need a time series for this plot!
-                stamp_coordinate=None,  # is this useful?
-                single_plot=False,
-            )
+    # # at each threshold, we're going to combine each model's time points into a single cube
+    # # todo: can't the operator do that?
+    # sequence_coord = time_grouping
+    #
+    #
+    # for threshold, models in data.items():
+    #
+    #     # for each model: data at every time, which we will merge for a time sequence plot
+    #     models_merged = iris.cube.CubeList()
+    #
+    #     # for each model: aggregation of all times, created by the operator
+    #     models_all = (iris.cube.CubeList())
+    #
+    #     for model_name, times in models.items():
+    #         models_all.append(times["all"])
+    #         del times["all"]
+    #
+    #         # Put the time points together into a single cube.
+    #         # Except for the time_grouping coord, all other time coords will have been removed.
+    #         # Todo: why bother splitting them up in the first place?
+    #         #       was that just for the RES plotting, can we simplify the operator?
+    #         cubes = iris.cube.CubeList(times.values())
+    #         cubes = cubes.merge()
+    #         assert len(cubes) == 1
+    #         models_merged.extend(cubes)
+    #
+    #     if not models_merged:
+    #         logging.warning(
+    #             f"no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot"
+    #         )
+    #         continue
+    #
+    #     if not models_all:
+    #         logging.warning(
+    #             f"no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot"
+    #         )
+    #         continue
+    #
+    #     root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out")
+    #     root_folder = (
+    #         root_folder / f"{varname}/{cell_attribute}/{time_grouping}/{threshold}"
+    #     )
+    #
+    #     orig_folder = os.getcwd()
+    #
+    #     # The plot function was crashing when cube.data had nan in the min/max.
+    #     if check_for_nan(models_merged):
+    #         logging.warning(
+    #             f"Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot."
+    #         )
+    #     else:
+    #         folder = root_folder / "series"
+    #         folder.mkdir(parents=True, exist_ok=True)
+    #         os.chdir(folder)
+    #         filename = folder / "image.png"
+    #         plot_histogram_series(
+    #             cubes=models_merged,
+    #             filename=str(filename),
+    #             sequence_coordinate=sequence_coord,
+    #             stamp_coordinate=None,  # is this useful?
+    #             single_plot=False,
+    #         )
+    #
+    #     if check_for_nan(models_all):
+    #         logging.warning(
+    #             f"Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot."
+    #         )
+    #     else:
+    #         folder = root_folder / "all"
+    #         folder.mkdir(parents=True, exist_ok=True)
+    #         os.chdir(folder)
+    #         filename = folder / "image.png"
+    #         plot_histogram_series(
+    #             cubes=models_all,
+    #             filename=str(filename),
+    #             sequence_coordinate=sequence_coord,  # todo: we don't need a time series for this plot!
+    #             stamp_coordinate=None,  # is this useful?
+    #             single_plot=False,
+    #         )
+    #
+    #     os.chdir(orig_folder)
+
+    plot_urls = {}
+
+    # todo: parameterise this
+    root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out2")
+
+    for threshold, time_data in data.items():
+        plot_urls[threshold] = {}
+
+        # plot every time point, including the 'all' time point
+        for time_title, model_data in time_data.items():
+            filename = root_folder / varname / cell_attribute / threshold / time_grouping / f'{time_title}.png'
+
+            # # make a new plot for this combination of threshold & time
+            # plt.figure(figsize=(10, 10))
+            # plt.title(f'{varname}, {cell_attribute}, threshold {threshold}, at {time_grouping} {time_title}')
+            # for model_name, cube in model_data.items():
+            #     plt.plot(cube.coord(cell_attribute).points, cube.data)
+            #
+            # filename.parent.mkdir(parents=True, exist_ok=True)
+            # plt.savefig(filename)
+            # plt.close()
+
+            plot_urls[threshold][time_title] = str(filename)
+
+    make_cell_stats_page(
+        root_folder / 'index.html',
+        title=f'{varname}, {cell_attribute}, {time_grouping}',
+        plot_urls=json.dumps(plot_urls),
+    )
 
-        os.chdir(orig_folder)
 
     # todo: by convention we should return the cubes, but it's not cubes
-    #   the cells stats produces a lot of stuff so it's currently passed as a dict
+    #   the cells stats produces a lot of stuff and it's currently passed as a dict
+    return None
diff --git a/src/CSET/operators/read.py b/src/CSET/operators/read.py
index 862396d68..cad4e8e22 100644
--- a/src/CSET/operators/read.py
+++ b/src/CSET/operators/read.py
@@ -996,3 +996,12 @@ def _lfric_forecast_period_standard_name_callback(cube: iris.cube.Cube):
             coord.standard_name = "forecast_period"
     except iris.exceptions.CoordinateNotFoundError:
         pass
+
+
+def read_pickle(filename):
+    """Helper to read the cell stats output, until it returns cubes."""
+    import pickle
+
+    with open(filename, "rb") as f:
+        data = pickle.load(f)
+    return data
diff --git a/src/CSET/operators/write.py b/src/CSET/operators/write.py
index e67afd2c0..e334812d5 100644
--- a/src/CSET/operators/write.py
+++ b/src/CSET/operators/write.py
@@ -66,3 +66,15 @@ def write_cube_to_nc(
     # Save the file as nc compliant (iris should handle this)
     iris.save(cube, filename, zlib=True)
     return cube
+
+
+def write_pickle(data, filename):
+    """Helper to save the cell stats output, until it returns cubes."""
+    import pickle
+    if not data:
+        return
+
+    Path(filename).parent.mkdir(exist_ok=True, parents=True)
+    with open(filename, "wb") as f:
+        pickle.dump(data, f)
+    pass
diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml
index d0e75b04c..edd28a6e0 100644
--- a/src/CSET/recipes/cell_statistics.yaml
+++ b/src/CSET/recipes/cell_statistics.yaml
@@ -22,8 +22,11 @@ steps:
     cell_attribute: $CELL_ATTRIBUTE
     time_grouping: $TIME_GROUPING
 
-#  - operator: write.write_cube_to_nc
-#    overwrite: True
+#  these two are useful when working on the plotting
+#  - operator: write.write_pickle
+#    filename: /data/scratch/byron.blay/cset/cell_stats/out2/$VARNAME__$CELL_ATTRIBUTE__$TIME_GROUPING.pkl
+#  - operator: read.read_pickle
+#    filename: /data/scratch/byron.blay/cset/cell_stats/out2/$VARNAME__$CELL_ATTRIBUTE__$TIME_GROUPING.pkl
 
   - operator: plot.plot_cell_stats_histograms
     varname: $VARNAME

From 33a450d5e1e4eda85f12f63e38c367353131de4b Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Mon, 29 Sep 2025 20:55:54 +0100
Subject: [PATCH 11/12] more controls on the page

---
 delme/run_cell_statistics.py                  |  41 ++---
 .../operators/_cell_stats_plots_template.html | 140 +++++++--------
 src/CSET/operators/cell_statistics.py         |  36 ++--
 src/CSET/operators/plot.py                    | 165 +++++-------------
 src/CSET/operators/write.py                   |   5 +-
 src/CSET/recipes/cell_statistics.yaml         |  18 +-
 6 files changed, 165 insertions(+), 240 deletions(-)

diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py
index f93e20c4b..89b368a71 100644
--- a/delme/run_cell_statistics.py
+++ b/delme/run_cell_statistics.py
@@ -26,20 +26,17 @@
     ),
 ]
 
-# # todo: restore this
+vars = [
+    "surface_microphysical_rainfall_rate",  # m01s04i203
+    "surface_microphysical_rainfall_amount",  # m01s04i201
+]
 # cell_attributes = ["effective_radius_in_km", "mean_value"]
-#
-# vars = [
-#     "surface_microphysical_rainfall_amount",  # m01s04i201
-#     "surface_microphysical_rainfall_rate",  # m01s04i203 todo: seems to have missing data, needs investigation
-# ]
-#
 # time_groupings = ["forecast_period", "hour"]
 
-# faster dev
-cell_attributes = ["effective_radius_in_km"]
-vars = ["surface_microphysical_rainfall_rate"]
-time_groupings = ["forecast_period"]
+# # faster dev
+# cell_attributes = ["effective_radius_in_km"]
+# vars = ["surface_microphysical_rainfall_rate"]
+# time_groupings = ["forecast_period"]
 
 
 # # DEBUG
@@ -51,18 +48,14 @@
 # todo: it's inefficient to keep reloading the data for each var/cell_attribute/time_grouping
 #  it would seem to be better to send the list of vars to the operator and plot functions, just once.
 for var in vars:
-    for cell_attribute in cell_attributes:
-        for time_grouping in time_groupings:
-
-            print(f"\nrunning recipe for {var}, {cell_attribute}, {time_grouping}\n" )
+    print(f"\nrunning recipe for {var}\n" )
 
-            recipe_variables = {
-                "INPUT_PATHS": [i[1] for i in model_data],
-                "MODEL_NAMES": [i[0] for i in model_data],
-                "VARNAME": var,
-                "CELL_ATTRIBUTE": cell_attribute,
-                "TIME_GROUPING": time_grouping,
-            }
+    recipe_variables = {
+        "INPUT_PATHS": [i[1] for i in model_data],
+        "MODEL_NAMES": [i[0] for i in model_data],
+        "VARNAME": var,
+        "OUTPUT_FOLDER": "/data/scratch/byron.blay/cset/cell_stats/out3",
+    }
 
-            recipe = parse_recipe(recipe_fpath, recipe_variables)
-            execute_recipe(recipe, Path(__file__).parent)
+    recipe = parse_recipe(recipe_fpath, recipe_variables)
+    execute_recipe(recipe, Path(__file__).parent)
diff --git a/src/CSET/operators/_cell_stats_plots_template.html b/src/CSET/operators/_cell_stats_plots_template.html
index 0de7bad91..d975bfe88 100644
--- a/src/CSET/operators/_cell_stats_plots_template.html
+++ b/src/CSET/operators/_cell_stats_plots_template.html
@@ -11,112 +11,96 @@
   
     <script type="module">
 
-        function set_threshold_options(threshold_select, threshold_keys) {
-            if (threshold_keys.length <= 0) {
+        function set_select_options(element, keys) {
+            if (keys.length <= 0) {
                 return;
             }
-            // const threshold_select = document.getElementById("threshold_select");
-
-            threshold_keys.forEach(key => {
-                const option = document.createElement("option");
-                option.value = key;
-                option.text = key;
-                threshold_select.appendChild(option);
-            });
-
-            threshold_select.selectedIndex = 0;
-            // threshold_select.value = threshold_keys[0];
-        }
-
-        function set_time_options(time_select, time_keys) {
-            if (time_keys.length <= 0) {
-                return;
-            }
-            // const time_select = document.getElementById("time_select");
-
-            // what value is currently selected? (it'll be "" if there's no options yet)
-            let keep_selected_value = time_select.value;
 
             // remove existing options
-            time_select.innerHTML = "";
+            let keep_selected = element.value;
+            element.innerHTML = "";
 
             // add given options
-            time_keys.forEach(key => {
+            keys.forEach(key => {
                 const option = document.createElement("option");
                 option.value = key;
                 option.text = key;
-                time_select.appendChild(option);
+                element.appendChild(option);
             });
 
-            // select previous or first item
-            if (keep_selected_value !== "") {
-                time_select.value = keep_selected_value;
+            // select same or first item
+            if (keep_selected !== "" && keys.includes(keep_selected)) {
+                element.value = keep_selected;
             } else {
-                time_select.selectedIndex = 0;
+                element.selectedIndex = 0;
             }
         }
-        
+
         function display_sequence_controls(plot_urls) {
-            // plot_urls[threshold][time_title] = filename
+            // plot_urls[cell_attribute][time_grouping][threshold][time_point] = filename
             
+            // cell_attribute options
+            const cell_attribute_select = document.getElementById("cell_attribute_select");
+            const cell_attribute_keys = Object.keys(plot_urls);
+            set_select_options(cell_attribute_select, cell_attribute_keys);
+
+            // time_grouping options
+            const time_grouping_select = document.getElementById("time_grouping_select");
+            const time_grouping_keys = Object.keys(plot_urls[cell_attribute_keys[0]]);
+            set_select_options(time_grouping_select, time_grouping_keys);
+
             // threshold options
-            // const threshold_keys = Object.keys(plot_urls);
-            // const threshold_select = document.getElementById("threshold_select");
-            // if (threshold_keys.length <= 0) {
-            //     return;
-            // }
-            // threshold_keys.forEach(key => {
-            //     const option = document.createElement("option");
-            //     option.value = key;
-            //     option.text = key;
-            //     threshold_select.appendChild(option);
-            // });
-            // threshold_select.value = threshold_keys[0];
             const threshold_select = document.getElementById("threshold_select");
-            const threshold_keys = Object.keys(plot_urls);
-            set_threshold_options(threshold_select, threshold_keys);
-            // threshold_select.value = threshold_keys[0];
-            // threshold_select.selectedIndex = 0;
-
+            const threshold_keys = Object.keys(plot_urls[cell_attribute_keys[0]][time_grouping_keys[0]]);
+            set_select_options(threshold_select, threshold_keys);
 
             // time options
-            // const time_select = document.getElementById("time_select");
-            // const time_keys = Object.keys(plot_urls[threshold_keys[0]]);
-            // if (time_keys.length <= 0) {
-            //     return;
-            // }
-            // time_keys.forEach(key => {
-            //     const option = document.createElement("option");
-            //     option.value = key;
-            //     option.text = key;
-            //     time_select.appendChild(option);
-            // });
-            // time_select.value = time_keys[0];
             const time_select = document.getElementById("time_select");
-            const time_keys = Object.keys(plot_urls[threshold_keys[0]]);
-            set_time_options(time_select, time_keys);
-            // time_select.value = time_keys[0];
+            const time_keys = Object.keys(plot_urls[cell_attribute_keys[0]][time_grouping_keys[0]][threshold_keys[0]]);
+            set_select_options(time_select, time_keys);
 
 
             // set the initial image
             const plot_img = document.getElementById("plot");
-            plot_img.src = plot_urls[threshold_keys[0]][time_keys[0]];
+            plot_img.src = plot_urls[cell_attribute_keys[0]][time_grouping_keys[0]][threshold_keys[0]][time_keys[0]];
 
 
             // selection events
+            cell_attribute_select.addEventListener("change", () => {
+
+                // update the time_group options for the selected cell_attribute
+                // Note: possible overkill because they will probably all be the same
+                const time_grouping_keys = Object.keys(plot_urls[cell_attribute_select.value]);
+                set_select_options(time_grouping_select, time_grouping_keys);
+
+                const threshold_keys = Object.keys(plot_urls[cell_attribute_select.value][time_grouping_select.value]);
+                set_select_options(threshold_select, threshold_keys);
+
+                const time_keys = Object.keys(plot_urls[cell_attribute_select.value][time_grouping_select.value][threshold_select.value]);
+                set_select_options(time_select, time_keys);
+
+                plot_img.src = plot_urls[cell_attribute_select.value][time_grouping_select.value][threshold_select.value][time_select.value];
+            });
+
+            time_grouping_select.addEventListener("change", () => {
+                const threshold_keys = Object.keys(plot_urls[cell_attribute_select.value][time_grouping_select.value]);
+                set_select_options(threshold_select, threshold_keys);
+
+                const time_keys = Object.keys(plot_urls[cell_attribute_select.value][time_grouping_select.value][threshold_select.value]);
+                set_select_options(time_select, time_keys);
+
+                plot_img.src = plot_urls[cell_attribute_select.value][time_grouping_select.value][threshold_select.value][time_select.value];
+            });
+
             threshold_select.addEventListener("change", () => {
-                const selected_threshold = threshold_select.value;
-                const time_keys = Object.keys(plot_urls[selected_threshold]);
-                set_time_options(time_select, time_keys);
+                const time_keys = Object.keys(plot_urls[cell_attribute_select.value][time_grouping_select.value][threshold_select.value]);
+                set_select_options(time_select, time_keys);
 
-                const selected_time = time_select.value;
-                plot_img.src = plot_urls[selected_threshold][selected_time];
+                plot_img.src = plot_urls[cell_attribute_select.value][time_grouping_select.value][threshold_select.value][time_select.value];
             });
 
             time_select.addEventListener("change", () => {
-                const selected_threshold = threshold_select.value;
-                const selected_time = time_select.value;
-                plot_img.src = plot_urls[selected_threshold][selected_time];
+                plot_img.src = plot_urls[cell_attribute_select.value][time_grouping_select.value][threshold_select.value][time_select.value];
             });
 
 
@@ -148,11 +132,17 @@
     <main>
         
         <fieldset id="controls">
-            
+
+            <label for="cell_attribute_select">Cell Attribute:</label>
+            <select id="cell_attribute_select"></select>
+
+            <label for="time_grouping_select">Time grouping:</label>
+            <select id="time_grouping_select"></select>
+
             <label for="threshold_select">Threshold:</label>
             <select id="threshold_select"></select>
 
-            <label for="time_select">Time:</label>
+            <label for="time_select">Time Point:</label>
             <select id="time_select"></select>
 
         </fieldset>
diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index 97afa4b17..69205f581 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -499,7 +499,7 @@ def ensure_bounds(cubes, coord_name):
             coord.guess_bounds()
 
 
-def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
+def caller_thing(cubes: CubeList):
     """
     Create a histogram from each given cube, for every threshold and cell attribute.
 
@@ -515,10 +515,15 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
     time_grouping: str
         "forecast_period", "hour" or "time"
 
+    Returns a nested dictionary:
+        result[cell_attribute][time_grouping][threshold][time_point][model_name] = cube
+
     """
-    # todo: inorder to get everything on one page,
-    #   i think we might ahve to calculate all cell attributes in here,
-    #   and all time groupings, so that we can pass one big lump to the page plotter.
+    # We loop through these, producing a result per model for each combo.
+    # todo: we might not want to produce data for all cell attributes or time groupings, so we could paramterise.
+    cell_attributes = ["effective_radius_in_km", "mean_value"]
+    time_groupings = ["forecast_period", "hour"]  # todo: WE NEED "time" here too! should slot straight in?
+    thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]  # todo: parametrise this?
 
 
     # todo: paramterise y_axis?
@@ -530,7 +535,7 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
     # y_axis = "relative_frequency"
 
     if cubes in (None, [], CubeList([])):
-        logging.warning(f'no cubes received for {cell_attribute} {time_grouping}')
+        logging.warning(f'no cubes received')
         return None
 
     if not isinstance(cubes, CubeList):
@@ -542,18 +547,15 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str):
     def check_cell_methods(cube):
         """discard "means", we want "points" or empty cell methods"""
         for cm in cube.cell_methods:
-            if cm.method != "point":
+            if cm.method == "mean":
                 return False
         return True
     cubes = CubeList([c for c in cubes if check_cell_methods(c)])
 
     if len(cubes) == 0:
-        logging.warning(f'no cubes for {cell_attribute}, {time_grouping} after cell method filtering')
+        logging.warning(f'no cubes after cell method filtering')
         return None
 
-    # For now, thresholds are hard coded.
-    thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
-
     # Get a list of the common forecast periods and hours
     # todo: For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5
     # todo: For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z
@@ -578,10 +580,20 @@ def check_cell_methods(cube):
     # bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12))
     # bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0)
 
+    # this will be a nested dict:
+    #  result[cell_attribute][time_grouping][threshold][time_point][model_name] = cube
     result = {}
 
-    for threshold in thresholds:
-        result[f"threshold {threshold}"] = time_data = {}
+
+    combos = itertools.product(cell_attributes, time_groupings, thresholds)
+    for cell_attribute, time_grouping, threshold in combos:
+        print(f'cell_attribute: {cell_attribute}, time_grouping: {time_grouping}, threshold: {threshold}')
+
+        # create the top three levels of nested dictionaries
+        result.setdefault(cell_attribute, {})
+        result[cell_attribute].setdefault(time_grouping, {})
+        result[cell_attribute][time_grouping].setdefault(str(threshold), {})
+        time_data = result[cell_attribute][time_grouping][str(threshold)]  # helper var
 
         # todo: check var_name has been removed by this point, as RES removed it to help with merge
         # hist_cubes = something_like_cell_attribute_histogram(
diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py
index 4a7401e7e..a848e1850 100644
--- a/src/CSET/operators/plot.py
+++ b/src/CSET/operators/plot.py
@@ -2231,32 +2231,23 @@ def make_cell_stats_page(html_fpath, title, plot_urls):
         fp.write(html)
 
 
-def plot_cell_stats_histograms(
-    data: dict, varname: str, cell_attribute: str, time_grouping: str
-):
+def plot_cell_stats_histograms(data: dict, varname: str, output_folder: str):
     """
 
     Parameters
     ----------
     data:
         A dict mapping thresholds to histogram data for each time point:
-            data[<threshold>][<model_name>][<time_point>] -> Cube
+            data[cell_attribute][time_grouping][threshold][time_point][model_name] = cube
 
             NOTE: IT SHOULD PROBABLY BE A CUBE/CUBELIST
                 but that would seem to require considerable inefficiency/repetition in the cells stats
                 so we need to think about this a bit more
 
-        E.g, when time_grouping is 'forecast_period', we see a key like:
-            data['threshold 0.5']['uk_ctrl_um']['T+0.5']
+        E.g, data['effective_radius_in_km']['forecast_period']['0.5']['T+0.5']['uk_ctrl_um']
 
         The cube is one dimensional, with a dim coord of the given cell_attribute.
 
-    cell_attribute:
-        'effective_radius_in_km' or 'mean_value'
-
-    time_grouping:
-        'forecast_period', 'hour' or 'time'
-
     Returns
     -------
 
@@ -2264,118 +2255,52 @@ def plot_cell_stats_histograms(
     if data is None:
         return None
 
-    # # at each threshold, we're going to combine each model's time points into a single cube
-    # # todo: can't the operator do that?
-    # sequence_coord = time_grouping
-    #
-    #
-    # for threshold, models in data.items():
-    #
-    #     # for each model: data at every time, which we will merge for a time sequence plot
-    #     models_merged = iris.cube.CubeList()
-    #
-    #     # for each model: aggregation of all times, created by the operator
-    #     models_all = (iris.cube.CubeList())
-    #
-    #     for model_name, times in models.items():
-    #         models_all.append(times["all"])
-    #         del times["all"]
-    #
-    #         # Put the time points together into a single cube.
-    #         # Except for the time_grouping coord, all other time coords will have been removed.
-    #         # Todo: why bother splitting them up in the first place?
-    #         #       was that just for the RES plotting, can we simplify the operator?
-    #         cubes = iris.cube.CubeList(times.values())
-    #         cubes = cubes.merge()
-    #         assert len(cubes) == 1
-    #         models_merged.extend(cubes)
-    #
-    #     if not models_merged:
-    #         logging.warning(
-    #             f"no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot"
-    #         )
-    #         continue
-    #
-    #     if not models_all:
-    #         logging.warning(
-    #             f"no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot"
-    #         )
-    #         continue
-    #
-    #     root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out")
-    #     root_folder = (
-    #         root_folder / f"{varname}/{cell_attribute}/{time_grouping}/{threshold}"
-    #     )
-    #
-    #     orig_folder = os.getcwd()
-    #
-    #     # The plot function was crashing when cube.data had nan in the min/max.
-    #     if check_for_nan(models_merged):
-    #         logging.warning(
-    #             f"Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot."
-    #         )
-    #     else:
-    #         folder = root_folder / "series"
-    #         folder.mkdir(parents=True, exist_ok=True)
-    #         os.chdir(folder)
-    #         filename = folder / "image.png"
-    #         plot_histogram_series(
-    #             cubes=models_merged,
-    #             filename=str(filename),
-    #             sequence_coordinate=sequence_coord,
-    #             stamp_coordinate=None,  # is this useful?
-    #             single_plot=False,
-    #         )
-    #
-    #     if check_for_nan(models_all):
-    #         logging.warning(
-    #             f"Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot."
-    #         )
-    #     else:
-    #         folder = root_folder / "all"
-    #         folder.mkdir(parents=True, exist_ok=True)
-    #         os.chdir(folder)
-    #         filename = folder / "image.png"
-    #         plot_histogram_series(
-    #             cubes=models_all,
-    #             filename=str(filename),
-    #             sequence_coordinate=sequence_coord,  # todo: we don't need a time series for this plot!
-    #             stamp_coordinate=None,  # is this useful?
-    #             single_plot=False,
-    #         )
-    #
-    #     os.chdir(orig_folder)
-
     plot_urls = {}
 
     # todo: parameterise this
-    root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out2")
-
-    for threshold, time_data in data.items():
-        plot_urls[threshold] = {}
-
-        # plot every time point, including the 'all' time point
-        for time_title, model_data in time_data.items():
-            filename = root_folder / varname / cell_attribute / threshold / time_grouping / f'{time_title}.png'
-
-            # # make a new plot for this combination of threshold & time
-            # plt.figure(figsize=(10, 10))
-            # plt.title(f'{varname}, {cell_attribute}, threshold {threshold}, at {time_grouping} {time_title}')
-            # for model_name, cube in model_data.items():
-            #     plt.plot(cube.coord(cell_attribute).points, cube.data)
-            #
-            # filename.parent.mkdir(parents=True, exist_ok=True)
-            # plt.savefig(filename)
-            # plt.close()
-
-            plot_urls[threshold][time_title] = str(filename)
-
-    make_cell_stats_page(
-        root_folder / 'index.html',
-        title=f'{varname}, {cell_attribute}, {time_grouping}',
-        plot_urls=json.dumps(plot_urls),
-    )
+    root_folder = Path(output_folder) / varname
 
+    # for threshold, time_data in data.items():
+    #     plot_urls[threshold] = {}
+    #
+    #     # plot every time point, including the 'all' time point
+    #     for time_title, model_data in time_data.items():
+    #         filename = root_folder / varname / cell_attribute / threshold / time_grouping / f'{time_title}.png'
+    #
+    #         # # make a new plot for this combination of threshold & time
+    #         # plt.figure(figsize=(10, 10))
+    #         # plt.title(f'{varname}, {cell_attribute}, threshold {threshold}, at {time_grouping} {time_title}')
+    #         # for model_name, cube in model_data.items():
+    #         #     plt.plot(cube.coord(cell_attribute).points, cube.data)
+    #         #
+    #         # filename.parent.mkdir(parents=True, exist_ok=True)
+    #         # plt.savefig(filename)
+    #         # plt.close()
+    #
+    #         plot_urls[threshold][time_title] = str(filename)
+
+    for cell_attribute, time_groupings in data.items():
+        plot_urls[cell_attribute] = {}
+        for time_grouping, thresholds in time_groupings.items():
+            plot_urls[cell_attribute][time_grouping] = {}
+            for threshold, time_points in thresholds.items():
+                plot_urls[cell_attribute][time_grouping][threshold] = {}
+                (root_folder / varname / cell_attribute / time_grouping / threshold).mkdir(parents=True, exist_ok=True)
+                for time_point, models in time_points.items():
+                    filename = root_folder / varname / cell_attribute / time_grouping / threshold / f'{time_point}.png'
+                    plot_urls[cell_attribute][time_grouping][threshold][time_point] = str(filename)
+
+                    # make a new plot for this combination of cell_attribute, time_grouping, threshold & time_point
+                    plt.figure(figsize=(10, 10))
+                    plt.title(f'{varname}, {cell_attribute}, {time_grouping}, threshold {threshold}, at {time_point}')
+                    for model_name, cube in models.items():
+                        plt.plot(cube.coord(cell_attribute).points, cube.data)
+
+                    filename.parent.mkdir(parents=True, exist_ok=True)
+                    plt.savefig(filename)
+                    plt.close()
+
+    make_cell_stats_page(root_folder / 'index.html', title=f'{varname}', plot_urls=json.dumps(plot_urls))
 
     # todo: by convention we should return the cubes, but it's not cubes
     #   the cells stats produces a lot of stuff and it's currently passed as a dict
diff --git a/src/CSET/operators/write.py b/src/CSET/operators/write.py
index e334812d5..9a9cdc00c 100644
--- a/src/CSET/operators/write.py
+++ b/src/CSET/operators/write.py
@@ -72,9 +72,10 @@ def write_pickle(data, filename):
     """Helper to save the cell stats output, until it returns cubes."""
     import pickle
     if not data:
-        return
+        return None
 
     Path(filename).parent.mkdir(exist_ok=True, parents=True)
     with open(filename, "wb") as f:
         pickle.dump(data, f)
-    pass
+
+    return data
diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml
index edd28a6e0..508df8e9d 100644
--- a/src/CSET/recipes/cell_statistics.yaml
+++ b/src/CSET/recipes/cell_statistics.yaml
@@ -13,22 +13,26 @@ steps:
       operator: constraints.generate_var_constraint
       varname: $VARNAME
 
+  # todo: check what the grid resolution should be
   - operator: regrid.regrid_onto_xyspacing
     xspacing: 0.5
     yspacing: 0.5
     method: Linear
 
+#  these two are useful when working on the cell stats
+  - operator: write.write_pickle
+    filename: $OUTPUT_FOLDER/$VARNAME.pkl
+#  - operator: read.read_pickle
+#    filename: $OUTPUT_FOLDER/$VARNAME.pkl
+
   - operator: cell_statistics.caller_thing
-    cell_attribute: $CELL_ATTRIBUTE
-    time_grouping: $TIME_GROUPING
 
 #  these two are useful when working on the plotting
-#  - operator: write.write_pickle
-#    filename: /data/scratch/byron.blay/cset/cell_stats/out2/$VARNAME__$CELL_ATTRIBUTE__$TIME_GROUPING.pkl
+  - operator: write.write_pickle
+    filename: $OUTPUT_FOLDER/$VARNAME_stats.pkl
 #  - operator: read.read_pickle
-#    filename: /data/scratch/byron.blay/cset/cell_stats/out2/$VARNAME__$CELL_ATTRIBUTE__$TIME_GROUPING.pkl
+#    filename: $OUTPUT_FOLDER/$VARNAME_stats.pkl
 
   - operator: plot.plot_cell_stats_histograms
     varname: $VARNAME
-    cell_attribute: $CELL_ATTRIBUTE
-    time_grouping: $TIME_GROUPING
+    output_folder: $OUTPUT_FOLDER

From 2c1c2f344e6101a6665eba6f855d16f29019e6b0 Mon Sep 17 00:00:00 2001
From: bblay <byronblay@googlemail.com>
Date: Tue, 30 Sep 2025 08:26:40 +0100
Subject: [PATCH 12/12] tidy

---
 delme/run_cell_statistics.py                  |  23 +-
 .../operators/_cell_stats_plots_template.html |  31 ++-
 src/CSET/operators/cell_statistics.py         | 226 ++----------------
 src/CSET/operators/plot.py                    |  19 --
 src/CSET/recipes/cell_statistics.yaml         |   4 +-
 5 files changed, 40 insertions(+), 263 deletions(-)

diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py
index 89b368a71..6e052b14f 100644
--- a/delme/run_cell_statistics.py
+++ b/delme/run_cell_statistics.py
@@ -8,13 +8,6 @@
     Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml"
 )
 
-# model_data = [
-#     ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp'),
-#     ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp'),
-#     ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc'),
-#     ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc'),
-# ]
-
 model_data = [
     (
         "uk_ctrl_um",
@@ -30,20 +23,6 @@
     "surface_microphysical_rainfall_rate",  # m01s04i203
     "surface_microphysical_rainfall_amount",  # m01s04i201
 ]
-# cell_attributes = ["effective_radius_in_km", "mean_value"]
-# time_groupings = ["forecast_period", "hour"]
-
-# # faster dev
-# cell_attributes = ["effective_radius_in_km"]
-# vars = ["surface_microphysical_rainfall_rate"]
-# time_groupings = ["forecast_period"]
-
-
-# # DEBUG
-# import CSET.operators.cell_statistics as cell_statistics
-# cell_statistics.caller_thing(cubes=None, cell_attribute='effective_radius_in_km', time_grouping='forecast_period')
-# exit(0)
-
 
 # todo: it's inefficient to keep reloading the data for each var/cell_attribute/time_grouping
 #  it would seem to be better to send the list of vars to the operator and plot functions, just once.
@@ -54,7 +33,7 @@
         "INPUT_PATHS": [i[1] for i in model_data],
         "MODEL_NAMES": [i[0] for i in model_data],
         "VARNAME": var,
-        "OUTPUT_FOLDER": "/data/scratch/byron.blay/cset/cell_stats/out3",
+        "OUTPUT_FOLDER": "/data/scratch/byron.blay/cset/cell_stats/out4",
     }
 
     recipe = parse_recipe(recipe_fpath, recipe_variables)
diff --git a/src/CSET/operators/_cell_stats_plots_template.html b/src/CSET/operators/_cell_stats_plots_template.html
index d975bfe88..380ff0dfb 100644
--- a/src/CSET/operators/_cell_stats_plots_template.html
+++ b/src/CSET/operators/_cell_stats_plots_template.html
@@ -7,6 +7,29 @@
     <title>{{title}}</title>
   
     <style>
+        body {
+            display: flex;
+            flex-wrap: wrap;
+            justify-content: space-around;
+            margin: 0;
+            font-family: sans-serif;
+        }
+
+        input:invalid {
+            box-shadow: 0 0 2px 2px red;
+        }
+
+        main {
+            flex: 30em;
+            height: 100vh;
+        }
+
+        main>img {
+            width: 100%;
+            height: calc(100% - 3em);
+            object-fit: contain;
+        }
+
     </style>
   
     <script type="module">
@@ -103,19 +126,11 @@
                 plot_img.src = plot_urls[cell_attribute_select.value][time_grouping_select.value][threshold_select.value][time_select.value];
             });
 
-
         }
 
         const plot_urls = {{plot_urls}}  // plot_urls[threshold][time_title] = filename
         display_sequence_controls(plot_urls);
 
-        // preload images
-        // plot_urls.forEach(threshold_key => {
-        //     plot_urls[threshold_key].forEach(time_key => {
-        //         let img = new Image();
-        //         img.src = plot_urls[threshold_key][time_key];
-        //     });
-        // })
         Object.keys(plot_urls).forEach(threshold_key => {
             Object.keys(plot_urls[threshold_key]).forEach(time_key => {
                 let img = new Image();
diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py
index 69205f581..da610b5c1 100644
--- a/src/CSET/operators/cell_statistics.py
+++ b/src/CSET/operators/cell_statistics.py
@@ -35,14 +35,6 @@
 # PLOT_PARAM="$CYLC_TASK_PARAM_cellplots"
 
 
-# is this going to be the cli usage?
-# export $INPUT_DIR=$SCRATCH/RES_RETRIEVAL_CACHE/test_quick
-# export OUTPUT_DIR=$SCRATCH/temp/cset
-# export RECIPE=cell_statistics.yaml
-# cset -v bake --input-dir=$INPUT_DIR --output-dir=$OUTPUT_DIR --recipe=$RECIPE
-#   --LAND_SEA_SPLIT=False
-
-
 def var_filter(cubes, long_name=None, stash=None):
     result = []
     for cube in cubes:
@@ -54,36 +46,6 @@ def var_filter(cubes, long_name=None, stash=None):
     return iris.cube.CubeList(result)
 
 
-def get_cell_stat_vars(cubes):
-    vars = {}
-
-    # Large-scale rainfall rate
-    vars["rainfall rate"] = var_filter(
-        cubes, long_name="surface_microphysical_rainfall_rate", stash="m01s04i203"
-    )
-
-    # 1-hourly mean precipitation rate
-    vars["rainfall amount"] = var_filter(
-        cubes, long_name="surface_microphysical_rainfall_amount", stash="m01s04i201"
-    )
-    # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s04i202"))  # snow
-    # todo: RES has a switch for the following
-    # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s05i201"))
-    # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s05i202"))
-
-    # ensure at least one cube found for each variable
-    for k, v in vars.items():
-        assert len(v) > 0, f"No cubes found for variable '{k}'"
-
-    # # strictly one cube per variable?
-    # assert len(vars["Large-scale rainfall rate"]) == 1
-    # assert len(vars["1-hourly mean precipitation rate"]) == 1
-    # vars["Large-scale rainfall rate"] = vars["Large-scale rainfall rate"][0]
-    # vars["1-hourly mean precipitation rate"] = vars["1-hourly mean precipitation rate"][0]
-
-    return vars
-
-
 def get_bin_edges(cell_attribute):
     # todo: we probably want to let the user specify the bins
     if cell_attribute == "effective_radius_in_km":
@@ -449,48 +411,6 @@ def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_n
 
     return cubelist
 
-# todo: no longer needed? we can use the iris version now (double check this is ok)
-# def RES_extract_overlapping(cubelist, coord_name):
-#     """
-#     Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \
-#     the specified coordinate is the same across all cubes.
-#
-#     Arguments:
-#
-#     * **cubelist** - an input :class:`iris.cube.CubeList`.
-#     * **coord_name** - a string specifying the name of the coordinate \
-#                        over which to perform the extraction.
-#
-#     Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \
-#     to coord_name is the same for all cubes.
-#     """
-#     # Build a list of all Cell instances for this coordinate by
-#     # looping through all cubes in the supplied cubelist
-#     all_cells = []
-#     for cube in cubelist:
-#         for cell in cube.coord(coord_name).cells():
-#             all_cells.append(cell)
-#
-#     # Work out which coordinate Cell instances are common across
-#     # all cubes in the cubelist...
-#     cell_counts = collections.Counter(all_cells)
-#     # unique_cells = cell_counts.keys()
-#     unique_cells = list(cell_counts.keys())
-#     unique_cell_counts = list(cell_counts.values())
-#     num_cubes = len(cubelist)
-#     common_cells = [
-#         unique_cells[i]
-#         for i, count in enumerate(unique_cell_counts)
-#         if count == num_cubes
-#     ]
-#     # ...and use these to subset the cubes in the cubelist
-#     constraint = iris.Constraint(
-#         coord_values={coord_name: lambda cell: cell in common_cells}
-#     )
-#
-#     cubelist = iris.cube.CubeList([cube.extract(constraint) for cube in cubelist])
-#     return cubelist
-
 
 def ensure_bounds(cubes, coord_name):
     for cube in cubes:
@@ -522,17 +442,16 @@ def caller_thing(cubes: CubeList):
     # We loop through these, producing a result per model for each combo.
     # todo: we might not want to produce data for all cell attributes or time groupings, so we could paramterise.
     cell_attributes = ["effective_radius_in_km", "mean_value"]
-    time_groupings = ["forecast_period", "hour"]  # todo: WE NEED "time" here too! should slot straight in?
+    time_groupings = ["forecast_period", "hour", "time"]
     thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]  # todo: parametrise this?
 
-
     # todo: paramterise y_axis?
     #   set to "frequency" for cell statistic histograms that are \
     #   plain number counts in bins, or "relative_frequency" for \
     #   histograms showing the overall proportion of data values in \
     #   each bin (normalised to 100%). The default is \
     #   "relative_frequency".
-    # y_axis = "relative_frequency"
+    y_axis = "relative_frequency"
 
     if cubes in (None, [], CubeList([])):
         logging.warning(f'no cubes received')
@@ -556,9 +475,10 @@ def check_cell_methods(cube):
         logging.warning(f'no cubes after cell method filtering')
         return None
 
-    # Get a list of the common forecast periods and hours
-    # todo: For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5
-    # todo: For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z
+    # Get a list of the common forecast periods and hours (i.e time groupings).
+    # For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5
+    # For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z
+    # todo: What about time?
     forecast_periods = set(cubes[0].coord("forecast_period").points)
     hours = set(cubes[0].coord("time").points)  # todo: check it's actually hourly?
     for cube in cubes[1:]:
@@ -571,20 +491,10 @@ def check_cell_methods(cube):
 
     # what about obs? GPM & UK_RADAR_2KM? ignore for now.
 
-    # different bins for the two cell attributes
-    # bin_edges = {}
-    #
-    # bin_edges['effective_radius_in_km'] = 10**(np.arange(0.0, 3.12, 0.12))
-    # bin_edges['effective_radius_in_km'] = np.insert(bin_edges['effective_radius_in_km'], 0, 0)
-    #
-    # bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12))
-    # bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0)
-
     # this will be a nested dict:
     #  result[cell_attribute][time_grouping][threshold][time_point][model_name] = cube
     result = {}
 
-
     combos = itertools.product(cell_attributes, time_groupings, thresholds)
     for cell_attribute, time_grouping, threshold in combos:
         print(f'cell_attribute: {cell_attribute}, time_grouping: {time_grouping}, threshold: {threshold}')
@@ -596,13 +506,8 @@ def check_cell_methods(cube):
         time_data = result[cell_attribute][time_grouping][str(threshold)]  # helper var
 
         # todo: check var_name has been removed by this point, as RES removed it to help with merge
-        # hist_cubes = something_like_cell_attribute_histogram(
-        #     cube,
-        #     attribute=cell_attribute,
-        #     bin_edges=get_bin_edges(cell_attribute),
-        #     threshold=threshold) for cube in cubes]
 
-        # todo: can't we just let mpl make the histogram at plot time?
+        # todo: can't we just let mpl make the histograms at plot time?
         hist_cubes = CubeList([])
         for cube in cubes:
             hist_cube = something_like_cell_attribute_histogram(
@@ -625,7 +530,7 @@ def check_cell_methods(cube):
         # It removes the other time coords, other than that named by time_grouping.
         hist_cubes, times = extract_unique(hist_cubes, time_grouping)
 
-        # Sum cell statistic histograms at each time. todo: parallelise
+        # Sum cell statistic histograms at each time. todo: parallelise?
         summed_cubes = []
         for time in times:
             input_param = (hist_cubes, time, iris.analysis.SUM, None)
@@ -671,9 +576,9 @@ def check_cell_methods(cube):
 
             # todo: RES plots each cube here.
             for cube in cubes_at_time:
-                # todo: Normalise histogram? I think this is done in the plotting, actually, unlike RES.
-                # if y_axis == "relative_frequency":
-                #     cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
+                # todo: Normalise histogram? Again, should this be in the plotting?
+                if y_axis == "relative_frequency":
+                    cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
 
                 time_data[time_title][cube.attributes["model_name"]] = cube
 
@@ -682,27 +587,12 @@ def check_cell_methods(cube):
         for cube in summed_cubes:
             cube = cube.collapsed(time_grouping, iris.analysis.SUM)
 
-            # todo: Normalise histogram? I think this is done in the plotting, actually, unlike RES.
-            # if y_axis == "relative_frequency":
-            #     cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
+            # todo: Normalise histogram?
+            if y_axis == "relative_frequency":
+                cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)
 
             time_data["all"][cube.attributes["model_name"]] = cube
 
-    # don't bother rearranging thie dict now that we've got our own plotter
-    # # todo: perhaps we should use metadata to organisethe results, and let the plotter work out what to plot.
-    # #   that would let us stick with the cubelist convention...
-    # # For simpler code, the results were formed as:
-    # #   data[<threshold>][<time_point>][<model_name>] -> Cube
-    # # But the plot needs the model to be above the time points, so rearrange as:
-    # #   data[<threshold>][<model_name>][<time_point>] -> Cube
-    # reformed_result = {}
-    # for threshold, threshold_result in result.items():
-    #     reformed_result[threshold] = {}
-    #     for time_title, model_cubes in threshold_result.items():
-    #         for model_name, cube in model_cubes.items():
-    #             reformed_result[threshold].setdefault(model_name, {})
-    #             reformed_result[threshold][model_name][time_title] = cube
-
     # return cubes to plot. todo: in what exact arrangement?
     return result
 
@@ -1039,91 +929,3 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1):
             cells.append(cell_cube)
 
     return cells
-
-
-# ignore
-
-# def cell_statistics(LAND_SEA_SPLIT: bool):
-
-
-# cell_attribute:
-#   effective_radius_in_km
-#   mean_value
-
-
-# RAIN_RATE = diagnostics.Field(
-#     long_name="Large-scale rainfall rate",
-#     short_name="rain_rate",
-#     stash=[diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 203))],
-#     units="mm h-1",
-#     unit_conversion_factor=3600.0,
-#     thresholds=[0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] if USE_LFRIC_CODE else None,
-#     regrid_method="two_stage"
-#     )
-
-
-# PRECIP_1HR_MEAN = diagnostics.Field(
-#     long_name="1-hourly mean precipitation rate",
-#     short_name="1hr_mean_precip",
-#     stash=PRECIP_1HR_MEAN_STASH,
-#     stash_operator=diagnostics.STASH_OPERATORS["ADD"],
-#     mean_period=1,
-#     use_running_accumulations=not USE_LFRIC_CODE,
-#     units="mm h-1",
-#     observations=["GPM", "UK_radar_2km", "Darwin_radar_rain_2.5km"],
-#     thresholds=[0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0],
-#     regrid_method="two_stage"
-#     )
-
-# PRECIP_1HR_MEAN_STASH = [
-#         diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 201)),
-#         diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 202))]
-# if not USE_LFRIC_CODE:
-#     PRECIP_1HR_MEAN_STASH.extend([
-#         diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 5, 201), is_optional=True),
-#         diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 5, 202), is_optional=True),
-#     ])
-
-
-# choose the thresholds according to the variable
-# todo: should these be [optionally] parameterised?
-
-
-# make a cube list of cell_attribute_histogram() for each cube
-
-
-# return for plotting
-# return some_cube
-
-
-"""
-
-category: Diagnostics
-title: cell statistics plot
-description: |
-  TBD.
-
-steps:
-
-  - operator: cell_statistics.caller_thing
-
-    # "effective_radius_in_km" or "mean_value"
-    cell_attribute: $CELL_ATTRIBUTE
-
-    uk_ctrl_um:
-      operator: read.read_cubes
-      file_paths:
-        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp"
-        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp"
-    uk_expt_lfric:
-      operator: read.read_cubes
-      file_paths:
-        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc"
-        - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc"
-
-  - operator: write.write_cube_to_nc
-    overwrite: True
-
-  - operator: plot.plot_histogram_series
-
-"""
diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py
index a848e1850..1ad482b84 100644
--- a/src/CSET/operators/plot.py
+++ b/src/CSET/operators/plot.py
@@ -2260,25 +2260,6 @@ def plot_cell_stats_histograms(data: dict, varname: str, output_folder: str):
     # todo: parameterise this
     root_folder = Path(output_folder) / varname
 
-    # for threshold, time_data in data.items():
-    #     plot_urls[threshold] = {}
-    #
-    #     # plot every time point, including the 'all' time point
-    #     for time_title, model_data in time_data.items():
-    #         filename = root_folder / varname / cell_attribute / threshold / time_grouping / f'{time_title}.png'
-    #
-    #         # # make a new plot for this combination of threshold & time
-    #         # plt.figure(figsize=(10, 10))
-    #         # plt.title(f'{varname}, {cell_attribute}, threshold {threshold}, at {time_grouping} {time_title}')
-    #         # for model_name, cube in model_data.items():
-    #         #     plt.plot(cube.coord(cell_attribute).points, cube.data)
-    #         #
-    #         # filename.parent.mkdir(parents=True, exist_ok=True)
-    #         # plt.savefig(filename)
-    #         # plt.close()
-    #
-    #         plot_urls[threshold][time_title] = str(filename)
-
     for cell_attribute, time_groupings in data.items():
         plot_urls[cell_attribute] = {}
         for time_grouping, thresholds in time_groupings.items():
diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml
index 508df8e9d..8c927a788 100644
--- a/src/CSET/recipes/cell_statistics.yaml
+++ b/src/CSET/recipes/cell_statistics.yaml
@@ -19,7 +19,7 @@ steps:
     yspacing: 0.5
     method: Linear
 
-#  these two are useful when working on the cell stats
+#  dev speedup
   - operator: write.write_pickle
     filename: $OUTPUT_FOLDER/$VARNAME.pkl
 #  - operator: read.read_pickle
@@ -27,7 +27,7 @@ steps:
 
   - operator: cell_statistics.caller_thing
 
-#  these two are useful when working on the plotting
+#  dev speedup
   - operator: write.write_pickle
     filename: $OUTPUT_FOLDER/$VARNAME_stats.pkl
 #  - operator: read.read_pickle