From 1f4ff3906ad698047222720fbb7426882f5463df Mon Sep 17 00:00:00 2001 From: bblay Date: Thu, 18 Sep 2025 08:36:28 +0100 Subject: [PATCH 01/12] well the model data is getting through but it's not using the yaml properly and now i need to, for regridding --- src/CSET/operators/__init__.py | 2 + src/CSET/operators/cell_statistics.py | 561 ++++++++++++++++++++++++++ src/CSET/recipes/cell_statistics.yaml | 27 ++ 3 files changed, 590 insertions(+) create mode 100644 src/CSET/operators/cell_statistics.py create mode 100644 src/CSET/recipes/cell_statistics.yaml diff --git a/src/CSET/operators/__init__.py b/src/CSET/operators/__init__.py index 8f68de590..0dc5ae949 100644 --- a/src/CSET/operators/__init__.py +++ b/src/CSET/operators/__init__.py @@ -28,6 +28,7 @@ from CSET.operators import ( ageofair, aggregate, + cell_statistics, collapse, constraints, convection, @@ -46,6 +47,7 @@ __all__ = [ "ageofair", "aggregate", + "cell_statistics", "collapse", "constraints", "convection", diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py new file mode 100644 index 000000000..965317832 --- /dev/null +++ b/src/CSET/operators/cell_statistics.py @@ -0,0 +1,561 @@ +"""OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE.""" + +import iris.analysis +import iris.coords +import iris.cube +import iris.util +import numpy as np +from scipy import ndimage + + +# some things from the res job +# +# DATES="20200801T0600Z 20200801T1800Z 20200802T0600Z" +# LAND_SEA_SPLIT="False" +# NUM_PROC="8" +# INPUT_DIR="/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_quick" +# PLOT_DIR="/data/users/byron.blay/RES/test_full/diags" +# RECIPE="generic_basic_qq_plot.yaml" +# SHORT_NAMES="ral2 ral3" +# SITE="metoffice" +# Y_AXIS="frequency" +# PLOT_PARAM="$CYLC_TASK_PARAM_cellplots" + + +# is this going to be the cli usage? +# export $INPUT_DIR=$SCRATCH/RES_RETRIEVAL_CACHE/test_quick +# export OUTPUT_DIR=$SCRATCH/temp/cset +# export RECIPE=cell_statistics.yaml +# cset -v bake --input-dir=$INPUT_DIR --output-dir=$OUTPUT_DIR --recipe=$RECIPE +# --LAND_SEA_SPLIT=False + + +def var_filter(cubes, long_name=None, stash=None): + """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE.""" + result = [] + for cube in cubes: + if long_name and cube.long_name == long_name: + result.append(cube) + elif stash and "STASH" in cube.attributes and cube.attributes["STASH"] == stash: + result.append(cube) + + return iris.cube.CubeList(result) + + +def get_cell_stat_vars(cubes): + """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE.""" + vars = {} + + # Large-scale rainfall rate + vars["rainfall rate"] = var_filter( + cubes, long_name="surface_microphysical_rainfall_rate", stash="m01s04i203" + ) + + # 1-hourly mean precipitation rate + vars["rainfall amount"] = var_filter( + cubes, long_name="surface_microphysical_rainfall_amount", stash="m01s04i201" + ) + # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s04i202")) # snow + # todo: RES has a switch for the following + # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s05i201")) + # vars["1-hourly mean precipitation rate"].extend(var_filter(cubes, stash="m01s05i202")) + + # ensure at least one cube found for each variable + for k, v in vars.items(): + assert len(v) > 0, f"No cubes found for variable '{k}'" + + # # strictly one cube per variable? + # assert len(vars["Large-scale rainfall rate"]) == 1 + # assert len(vars["1-hourly mean precipitation rate"]) == 1 + # vars["Large-scale rainfall rate"] = vars["Large-scale rainfall rate"][0] + # vars["1-hourly mean precipitation rate"] = vars["1-hourly mean precipitation rate"][0] + + return vars + + +def caller_thing(cell_attribute, **kwargs): + """ + Create a histogram from the given cube, for every threshold and cell attribute. + + The cube will be "1-hourly mean precipitation rate" or "Large-scale rainfall rate". + + + Also for: + cell attribute + effective_radius_in_km, mean_value + + """ + + # todo: remove var_name? impedes merge + + # For each model, extract the two variables we're interested in. + model_data = {} + for model, cubes in kwargs.items(): + model_data[model] = get_cell_stat_vars(cubes) + + # For now, thresholds and cell attributes are hard coded. + thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] + + # todo: what about different models? how are they loaded and labelled? + + # # For now, cell attributes are hard coded. + # cell_attributes = ["effective_radius_in_km", "mean_value"] + # + # # For now, hard-code to these two, which is what we see in the res output. + # # todo: there is also "time" in the res code + # time_groupings = ["forecast_period", "hour"] + + # We need a list of times for each time grouping + # For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5 + # For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z + # todo: we should presumably get this from the cube. + # assert False + + # todo: we need a land-sea split input? + # for now, hard-code as false (this is what the res output had it set to) + # land_sea_split = False + + # what about obs? GPM & UK_RADAR_2KM? ignore for now. + + # different bins for the two cell attributes + bin_edges = {} + + bin_edges['effective_radius_in_km'] = 10**(np.arange(0.0, 3.12, 0.12)) + bin_edges['effective_radius_in_km'] = np.insert(bin_edges['effective_radius_in_km'], 0, 0) + + bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12)) + bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0) + + + + model_histograms = {} + for model, data in model_data.keys(): + for threshold in thresholds: + something_like_cell_attribute_histogram( + cube, + attribute=cell_attribute, + bin_edges=bin_edges[cell_attribute], + threshold=threshold, + ) + + + # return a cube for each model + return model_histograms + + +def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0): + hist_cube = iris.cube.CubeList() + coords = get_non_spatial_coords(cube) + + # Express histogram bins as an Iris coordinate + bin_centres = 0.5 * (bin_edges[1:] + bin_edges[:-1]) + bins_as_coord = iris.coords.DimCoord( + bin_centres, long_name=attribute, units=cube.units, + coord_system=None, bounds=np.column_stack((bin_edges[0:-1], bin_edges[1:]))) + + hist_cubes = iris.cube.CubeList() + + for slc in cube.slices_over(coords): + # Identify connected cells in this spatial slice + cells = find_cells(slc, threshold=threshold) + + if cells: + # Extract values of the desired cell attribute + cell_attributes = [cell.attributes[attribute] for cell in cells] + + # Construct a histogram of the desired cell attribute + hist, _ = np.histogram(cell_attributes) + else: + # Assign zeros to all bins + hist = np.zeros(bin_centres.size).astype(np.int64) + + # Get a list of the non-spatial coordinates for this slice + aux_coords = [(coord, []) for coord in get_non_spatial_coords(slc)] + + # Construct a cube to hold the cell statistic histogram for this slice + hist_slc = iris.cube.Cube( + hist, + long_name=("{0:s} cell {1:s} histogram".format(slc.name(), attribute)), + units="no_unit", + attributes=slc.attributes, + cell_methods=slc.cell_methods, + dim_coords_and_dims=[(bins_as_coord, 0)], + aux_coords_and_dims=aux_coords, + ) + + hist_cubes.append(hist_slc) + + hist_cube = hist_cubes.merge_cube() + return hist_cube + + +def get_spatial_coords(cube): + '''Returns the x, y coordinates of an input :class:`iris.cube.Cube`.''' + # Usual names for spatial coordinates + X_COORD_NAMES = ["longitude", "grid_longitude", + "projection_x_coordinate", "x"] + Y_COORD_NAMES = ["latitude", "grid_latitude", + "projection_y_coordinate", "y"] + + # Get a list of coordinate names for the cube + coord_names = [coord.name() for coord in cube.coords()] + + # Check which x-coordinate we have, if any + x_coords = [coord for coord in coord_names if coord in X_COORD_NAMES] + if len(x_coords) != 1: + raise ValueError("Could not identify a unique x-coordinate in cube") + x_coord = cube.coord(x_coords[0]) + + # Check which y-coordinate we have, if any + y_coords = [coord for coord in coord_names if coord in Y_COORD_NAMES] + if len(y_coords) != 1: + raise ValueError("Could not identify a unique y-coordinate in cube") + y_coord = cube.coord(y_coords[0]) + + return [x_coord, y_coord] + + +def get_non_spatial_coords(cube): + """ + Returns a list of the non-spatial coordinates of an input \ + :class:`iris.cube.Cube`. + """ + # Get a list of the cube coordinates + coords = cube.coords() + # Get the spatial coordinates of the cube + x_coord, y_coord = get_spatial_coords(cube) + # Remove the spatial coordinates from the list of coordinates + coords.remove(x_coord) + coords.remove(y_coord) + return coords + + +def guess_bounds(cube): + ''' + Takes an input :class:`iris.cube.Cube`, guesses bounds on the x, y \ + coordinates, then returns the cube. Such bounds are often required by \ + regridding algorithms. + ''' + # Loop over spatial coordinates + for axis in ["x", "y"]: + coord = cube.coord(axis=axis) + # Check that this is not a variable resolution grid + # TODO: Does bounds really not work with variable resolution? AVD + # seem to think it won't... + try: + _ = iris.util.regular_step(coord) + except: + raise ValueError("Cannot guess bounds for a variable " + "resolution grid") + # Guess bounds if there aren't any + if coord.bounds is None: + coord.guess_bounds() + return cube + + +def connected_object_labelling(data, threshold=0.0, min_size=1, connectivity=1): + ''' + Finds connected objects in an input array and assigns them unique labels. + + Arguments: + + * **data** - a :class:`numpy.ndarray` array in which to label objects. + + Keyword arguments: + + * **threshold** - if supplied, only regions where the input data exceeds \ + the threshold will be considered when searching for \ + connected objects. + * **min_size** - minimum size in grids points for connected objects. Must \ + be an integer >= 1. + * **connectivity** - given a particular grid point, all grid points up to \ + a squared distance of connectivity away are considered \ + neighbours. Connectivity may range from 1 (only direct \ + neighbours are considered) to :attr:`data.ndim`. + + Returns: + + * **label_array** - an integer array where each unique object in the input \ + array has a unique label in the returned array. + * **num_objects** - the number of objects found. + ''' + # Apply the supplied threshold to the data to generate a binary array + binary_data = data.copy() + if np.ma.is_masked(binary_data): + # If the data is masked, replace any masked values with (threshold - 1) + # thus guaranteeing they will be below the threshold and thus set to 0 + # below + binary_data = np.ma.filled(binary_data, fill_value=(threshold - 1)) + # Set values above and below the threshold to 1 and 0, respectively + indices_above = (binary_data > threshold) + indices_below = (binary_data <= threshold) + binary_data[indices_above] = 1 + binary_data[indices_below] = 0 + + # Construct a structuring element that defines how the neighbours of + # a grid point are assigned + structure_element = ndimage.morphology.generate_binary_structure( + data.ndim, connectivity) + + # Label distinct (connected) objects in the binary array + label_array, num_objects = ndimage.measurements.label( + binary_data, + structure=structure_element) + + # Throw away any objects smaller than min_size + if min_size < 1: + raise ValueError('"min_size" must be 1 or greater') + elif min_size > 1: + labels = np.unique(label_array) + # Discard the background (which will be labelled as 0) + labels = labels[(labels > 0)] + # Loop over distinct objects + for label in labels: + # Find the indices of the grid points comprising this object + indices = np.where(label_array == label) + # If this object is smaller than min_size, set it as background + if indices[0].size < min_size: + label_array[indices] = 0 + num_objects -= 1 + + return label_array, num_objects + + +def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1): + """ + Finds connected objects (i.e. cells) in spatial slices of a given \ + :class:`iris.cube.Cube`. + + Arguments: + + * **cube** - an input :class:`iris.cube.Cube` object. + + Keyword arguments: + + * **threshold** - if supplied, only regions where the input data exceeds \ + the threshold will be considered when identifying cells. + * **area_threshold** - minimum area in km^2 that cells must have. + * **connectivity** - given a particular grid point, all grid points up to a \ + squared distance of connectivity away are considered \ + neighbours. Connectivity may range from 1 (only \ + direct neighbours are considered) to \ + :attr:`cube.data.ndim`. + + Returns: + + * **cells** - a :class:`iris.cube.CubeList` of \ + :class:`iris.cube.Cube` objects, each one corresponding to \ + an identified cell. + """ + + # Convert input area threshold from km^2 to m^2 + M_IN_KM = 1000 + area_threshold = (float(M_IN_KM)**2) * area_threshold + + # Get x, y coordinates of input cube + x_coord, y_coord = get_spatial_coords(cube) + x, y = iris.analysis.cartography.get_xy_grids(cube) + + # Guess x, y coordinate bounds + cube = guess_bounds(cube) + + # Loop over 2D spatial slices of the input cube and find cells in each + # slice + grid_areas = None + cells = iris.cube.CubeList() + coords = get_non_spatial_coords(cube) + for slc in cube.slices_over(coords): + if grid_areas is None: + # Area of grid cells, in m^2 + grid_areas = iris.analysis.cartography.area_weights(slc) + + # Store a list of the non-spatial coordinates for this slice + aux_coords = [(coord, []) for coord in + get_non_spatial_coords(slc)] + + # Find and label cells + # Call connected object labelling function based on + # scipy.ndimage.measurements.label + cell_label_array, _ = connected_object_labelling( + slc.data, + threshold=threshold, + min_size=1, + connectivity=connectivity) + + # Get a list of unique cell labels + cell_labels = np.unique(cell_label_array) + # Discard background (which has a label of 0) + cell_labels = cell_labels[(cell_labels > 0)] + # Loop over cell and store their properties + for cell_label in cell_labels: + # Find the indices of the grid points comprising this cell + cell_indices = np.where(cell_label_array == cell_label) + cell_x = x[cell_indices] + cell_y = y[cell_indices] + cell_values = slc.data[cell_indices] + cell_grid_areas = grid_areas[cell_indices] + + # There should not be any masked data present in cells! + if np.ma.is_masked(cell_values): + raise ValueError("Masked data found in cell {0:d}" + .format(cell_label)) + + # If cell area is less than area_threshold, discard it + # (by setting its label to the background value) + cell_area = np.sum(cell_grid_areas, dtype=np.float64) + if cell_area < area_threshold: + cell_label_array[cell_indices] = 0 + continue + + # Estimate cell centre position + # TODO Is there a better way of doing this? C.O.M? + cell_centre = (np.mean(cell_x, dtype=np.float64), + np.mean(cell_y, dtype=np.float64)) + # Area-weighted mean value in cell + cell_mean = (np.sum((cell_grid_areas * cell_values), + dtype=np.float64) + / cell_area) + # Convert cell area from m^2 to km^2... + cell_area /= (float(M_IN_KM)**2) + # ...and then cell effective radius in km + cell_radius = np.sqrt(cell_area / np.pi) + + # Create an Iris cube to store this cell + cell_cube = iris.cube.Cube( + cell_values, + long_name="{:s} cell".format(cube.name()), + units=cube.units, + attributes=cube.attributes, + cell_methods=cube.cell_methods, + aux_coords_and_dims=aux_coords) + + # Set up x, y coordinates describing the grid points in the cell... + cell_x_coord = iris.coords.AuxCoord( + cell_x, + standard_name=x_coord.standard_name, + long_name=x_coord.long_name, + units=x_coord.units, + bounds=None, + attributes=x_coord.attributes, + coord_system=x_coord.coord_system) + cell_y_coord = iris.coords.AuxCoord( + cell_y, + standard_name=y_coord.standard_name, + long_name=y_coord.long_name, + units=y_coord.units, + bounds=None, + attributes=y_coord.attributes, + coord_system=y_coord.coord_system) + # ...and add them to the cell cube + cell_cube.add_aux_coord(cell_x_coord, 0) + cell_cube.add_aux_coord(cell_y_coord, 0) + + # Set up a coordinate describing the areas of grid cells in + # the cell object... + cell_grid_area_coord = iris.coords.AuxCoord(cell_grid_areas, + long_name="grid_areas", + units="m2") + #...and add it to the cell cube + cell_cube.add_aux_coord(cell_grid_area_coord, 0) + + # Finally add some attriubtes to the cube that describe some + # useful information about the cell + #cell_cube.attributes["label"] = int(cell_label) + cell_cube.attributes["centre"] = cell_centre + cell_cube.attributes["area_in_km2"] = cell_area + cell_cube.attributes["effective_radius_in_km"] = cell_radius + cell_cube.attributes["mean_value"] = cell_mean + #cell_cube.attributes["indices"] = cell_indices + cells.append(cell_cube) + + return cells + + + + +# ignore + +# def cell_statistics(LAND_SEA_SPLIT: bool): + + +# cell_attribute: +# effective_radius_in_km +# mean_value + + +# RAIN_RATE = diagnostics.Field( +# long_name="Large-scale rainfall rate", +# short_name="rain_rate", +# stash=[diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 203))], +# units="mm h-1", +# unit_conversion_factor=3600.0, +# thresholds=[0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] if USE_LFRIC_CODE else None, +# regrid_method="two_stage" +# ) + + +# PRECIP_1HR_MEAN = diagnostics.Field( +# long_name="1-hourly mean precipitation rate", +# short_name="1hr_mean_precip", +# stash=PRECIP_1HR_MEAN_STASH, +# stash_operator=diagnostics.STASH_OPERATORS["ADD"], +# mean_period=1, +# use_running_accumulations=not USE_LFRIC_CODE, +# units="mm h-1", +# observations=["GPM", "UK_radar_2km", "Darwin_radar_rain_2.5km"], +# thresholds=[0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0], +# regrid_method="two_stage" +# ) + +# PRECIP_1HR_MEAN_STASH = [ +# diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 201)), +# diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 4, 202))] +# if not USE_LFRIC_CODE: +# PRECIP_1HR_MEAN_STASH.extend([ +# diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 5, 201), is_optional=True), +# diagnostics.STASHRequest(iris.fileformats.pp.STASH(1, 5, 202), is_optional=True), +# ]) + + +# choose the thresholds according to the variable +# todo: should these be [optionally] parameterised? + + +# make a cube list of cell_attribute_histogram() for each cube + + +# return for plotting +# return some_cube + + +""" + +category: Diagnostics +title: cell statistics plot +description: | + TBD. + +steps: + + - operator: cell_statistics.caller_thing + + # "effective_radius_in_km" or "mean_value" + cell_attribute: $CELL_ATTRIBUTE + + uk_ctrl_um: + operator: read.read_cubes + file_paths: + - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp" + - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp" + uk_expt_lfric: + operator: read.read_cubes + file_paths: + - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc" + - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc" + + - operator: write.write_cube_to_nc + overwrite: True + + - operator: plot.plot_histogram_series + +""" diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml new file mode 100644 index 000000000..d31c1937a --- /dev/null +++ b/src/CSET/recipes/cell_statistics.yaml @@ -0,0 +1,27 @@ +category: Diagnostics +title: cell statistics plot +description: | + TBD. + +steps: + + - operator: cell_statistics.caller_thing + + # "effective_radius_in_km" or "mean_value" + cell_attribute: $CELL_ATTRIBUTE + + uk_ctrl_um: + operator: read.read_cubes + file_paths: + - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp" + - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp" + uk_expt_lfric: + operator: read.read_cubes + file_paths: + - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc" + - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc" + + - operator: write.write_cube_to_nc + overwrite: True + + - operator: plot.plot_histogram_series From 53b547ef55c1ba6197479bd341c38ea369287e48 Mon Sep 17 00:00:00 2001 From: bblay Date: Mon, 22 Sep 2025 10:59:06 +0100 Subject: [PATCH 02/12] - add regrid to yaml - brought more code in from res - think i need to remove the outer cubes loop --- src/CSET/operators/cell_statistics.py | 640 ++++++++++++++++++++++---- src/CSET/recipes/cell_statistics.yaml | 26 +- 2 files changed, 556 insertions(+), 110 deletions(-) diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index 965317832..6449de083 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -1,12 +1,23 @@ """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE.""" +import datetime +import inspect +import itertools +from copy import deepcopy +from typing import Callable + +import cftime import iris.analysis import iris.coords import iris.cube import iris.util import numpy as np +from iris.cube import CubeList from scipy import ndimage +M_IN_KM = 1000 +HOUR_IN_SECONDS = 3600 + # some things from the res job # @@ -31,7 +42,6 @@ def var_filter(cubes, long_name=None, stash=None): - """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE.""" result = [] for cube in cubes: if long_name and cube.long_name == long_name: @@ -43,7 +53,6 @@ def var_filter(cubes, long_name=None, stash=None): def get_cell_stat_vars(cubes): - """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE.""" vars = {} # Large-scale rainfall rate @@ -73,87 +82,523 @@ def get_cell_stat_vars(cubes): return vars -def caller_thing(cell_attribute, **kwargs): +def get_bin_edges(cell_attribute): + # todo: we probably want to let the user specify the bins + if cell_attribute == "effective_radius_in_km": + bin_edges = 10 ** (np.arange(0.0, 3.12, 0.12)) + bin_edges = np.insert(bin_edges, 0, 0) + elif cell_attribute == "mean_value": + bin_edges = 10 ** (np.arange(-1, 2.7, 0.12)) + bin_edges = np.insert(bin_edges, 0, 0) + else: + raise ValueError(f"Unknown cell attribute '{cell_attribute}'") + + return bin_edges + + +def identify_unique_times(cubelist, time_coord_name): """ - Create a histogram from the given cube, for every threshold and cell attribute. + Given a :class:`iris.cube.CubeList`, this finds the set of unique times \ + which occur across all cubes in the cubelist. - The cube will be "1-hourly mean precipitation rate" or "Large-scale rainfall rate". + Arguments: + * **cubelist** - a :class:`iris.cube.CubeList` of :class:`iris.cube.Cube` \ + objects. + * **time_coord_name** - the name of the time coordinate to select, \ + typically "time", "forecast_period" or "hour". - Also for: - cell attribute - effective_radius_in_km, mean_value + Returns + ------- + * **time_coord** - an :class:`iris.coords.Coord` instance containing the \ + unique times that occur across the cubes in the \ + input cubelist. + """ + times = [] + time_unit = None + # Loop over cubes + for cube in cubelist: + # Extract the desired time coordinate from the cube + time_coord = cube.coord(time_coord_name) + + # Get the units for the specifed time coordinate + if time_unit is None: + time_unit = time_coord.units + + # Store the time coordinate points + times.extend(time_coord.points) + + # Construct a list of unique times... + times = sorted(list(set(times))) + # ...and store them in a new time coordinate + time_coord = iris.coords.DimCoord(times, units=time_unit) + time_coord.rename(time_coord_name) + + return time_coord + + +def add_categorised_coord( + cube: iris.cube.Cube, + name: str, + from_coord: iris.coords.DimCoord | iris.coords.AuxCoord | str, + category_function: Callable, + units: str = "1", +) -> None: + """Add a new coordinate to a cube, by categorising an existing one. + + Make a new :class:`iris.coords.AuxCoord` from mapped values, and add + it to the cube. + + Parameters + ---------- + cube : + The cube containing 'from_coord'. The new coord will be added into it. + name : + Name of the created coordinate. + from_coord : + Coordinate in 'cube', or the name of one. + category_function : + Function(coordinate, value), returning a category value for a coordinate + point-value. If ``value`` has a type hint :obj:`cftime.datetime`, the + coordinate points are translated to :obj:`cftime.datetime` s before + calling ``category_function``. + units : + Units of the category value, typically 'no_unit' or '1'. + """ + # Interpret coord, if given as a name + coord = cube.coord(from_coord) if isinstance(from_coord, str) else from_coord + + if len(cube.coords(name)) > 0: + msg = 'A coordinate "%s" already exists in the cube.' % name + raise ValueError(msg) + + # Translate the coordinate points to cftime datetimes if requested. + value_param = list(inspect.signature(category_function).parameters.values())[1] + if issubclass(value_param.annotation, cftime.datetime): + points = coord.units.num2date(coord.points, only_use_cftime_datetimes=True) + else: + points = coord.points + + # Construct new coordinate by mapping values, using numpy.vectorize to + # support multi-dimensional coords. + # Test whether the result contains strings. If it does we must manually + # force the dtype because of a numpy bug (see numpy #3270 on GitHub). + result = category_function(coord, points.ravel()[0]) + if isinstance(result, str): + str_vectorised_fn = np.vectorize(category_function, otypes=[object]) + + def vectorised_fn(*args): + # Use a common type for string arrays (N.B. limited to 64 chars). + return str_vectorised_fn(*args).astype("|U64") + + else: + vectorised_fn = np.vectorize(category_function) + new_coord = iris.coords.AuxCoord( + vectorised_fn(coord, points), + units=units, + attributes=coord.attributes.copy(), + ) + new_coord.rename(name) + # Add into the cube + cube.add_aux_coord(new_coord, cube.coord_dims(coord)) + + +def hour_from_time(coord, point): + """ + Category function to calculate the hour given a time, for use in \ + :func:`iris.coord_categorisation.add_categorised_coord`. """ + time = coord.units.num2date(point) + day_start = datetime.datetime(time.year, time.month, time.day) + seconds_since_day_start = (time - day_start).total_seconds() + hours_since_day_start = seconds_since_day_start / float(HOUR_IN_SECONDS) + return hours_since_day_start - # todo: remove var_name? impedes merge - # For each model, extract the two variables we're interested in. - model_data = {} - for model, cubes in kwargs.items(): - model_data[model] = get_cell_stat_vars(cubes) +def remove_duplicates(cubelist): + """ + Removes any duplicate :class:`iris.cube.Cube` objects from an \ + :class:`iris.cube.CubeList`. + """ + # Nothing to do if the cubelist is empty + if not cubelist: + return cubelist + # Build up a list of indices of the cubes to remove because they are + # duplicated + indices_to_remove = [] + for i in range(len(cubelist) - 1): + cube_i = cubelist[i] + for j in range(i + 1, len(cubelist)): + cube_j = cubelist[j] + if cube_i == cube_j: + if j not in indices_to_remove: + indices_to_remove.append(j) + # Only keep unique cubes + cubelist = iris.cube.CubeList( + [cube for index, cube in enumerate(cubelist) if index not in indices_to_remove] + ) + return cubelist - # For now, thresholds and cell attributes are hard coded. - thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] - # todo: what about different models? how are they loaded and labelled? +def extract_unique(cubes, group): + # Preparation of data for averaging and plotting + if group == "time": + # Identify a unique set of times + times = identify_unique_times(cubes, group) - # # For now, cell attributes are hard coded. - # cell_attributes = ["effective_radius_in_km", "mean_value"] - # - # # For now, hard-code to these two, which is what we see in the res output. - # # todo: there is also "time" in the res code - # time_groupings = ["forecast_period", "hour"] + # Now extract data at these times + time_constraint = iris.Constraint( + coord_values={group: lambda cell: cell.point in times.points} + ) + cubes = cubes.extract(time_constraint) + + # Remove other time coordinates to allow a cube merge + # later + for cube in cubes: + cube.remove_coord("forecast_reference_time") + cube.remove_coord("forecast_period") + elif group == "forecast_period": + # Identify a unique set of lead times + times = identify_unique_times(cubes, group) + + # Remove other time coordinates to allow a cube + # merge later + for cube in cubes: + cube.remove_coord("forecast_reference_time") + cube.remove_coord("time") + elif group == "hour": + # Categorise the time coordinate of each cube into + # hours + for cube in cubes: + add_categorised_coord( + cube, "hour", cube.coord("time"), hour_from_time, units="hour" + ) - # We need a list of times for each time grouping - # For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5 - # For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z - # todo: we should presumably get this from the cube. - # assert False + # Identify a unique set of times of day + times = identify_unique_times(cubes, group) - # todo: we need a land-sea split input? - # for now, hard-code as false (this is what the res output had it set to) - # land_sea_split = False + # Now extract data at these times + time_constraint = iris.Constraint( + coord_values={group: lambda cell: cell.point in times.points} + ) + cubes = cubes.extract(time_constraint) - # what about obs? GPM & UK_RADAR_2KM? ignore for now. + # Remove other time coordinates to allow a cube + # merge later + for cube in cubes: + cube.remove_coord("forecast_reference_time") + cube.remove_coord("time") + cube.remove_coord("forecast_period") - # different bins for the two cell attributes - bin_edges = {} + # Remove any duplicate cubes to allow a successful merge + # Note this is typcially because we have the same set of + # observations associated with more than one model + cubes = remove_duplicates(cubes) - bin_edges['effective_radius_in_km'] = 10**(np.arange(0.0, 3.12, 0.12)) - bin_edges['effective_radius_in_km'] = np.insert(bin_edges['effective_radius_in_km'], 0, 0) + return cubes, times - bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12)) - bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0) +def remove_cell_method(cube, cell_method): + """ + Removes the supplied :class:`iris.coords.CellMethod` from an input + :class:`iris.cube.Cube`, then returns the cube. + """ + cell_methods = [cm for cm in cube.cell_methods if cm != cell_method] + cube.cell_methods = () + for cm in cell_methods: + cube.add_cell_method(cm) + return cube + + +def aggregate_at_time(input_params): + """ + Extracts data valid at a given time from each cube in a list of cubes, \ + then performs an aggregation operation (e.g. mean) across this data. + + Arguments (passed in as a tuple to allow parallelisation): + + * **input_params** - a four-element tuple consisting of: + + * **cubes** - a :class:`iris.cube.CubeList` holding the \ + :class:`iris.cube.Cube` objects to process. + * **time_coord** - the time at which the aggregation should be performed, \ + supplied as an :class:`iris.coords.Coord` object. + * **aggregator** - the aggregator to use, which can be any from \ + :mod:`iris.analysis`. + * **percentile** - the value of the percentile rank at which to extract \ + values, if the chosen aggregator is \ + :class:`iris.analysis.PERCENTILE`. For other \ + aggregators this not used. + + Returns + ------- + * **aggregated_cubes** - an :class:`iris.cube.CubeList` of \ + :class:`iris.cube.Cube` objects holding the \ + aggregated data. + """ + # Unpack input parameters tuple + cubes = input_params[0] + time_coord = input_params[1] + aggregator = input_params[2] + # TODO: Can we improve the handling of this with keyword arguments? + percentile = input_params[3] + + # Check the supplied time coordinate to make sure it corresponds to a + # single time only + if len(time_coord.points) != 1: + raise ValueError("Time coordinate should specify a single time only") + + # Remove any duplicate cubes in the input cubelist otherwise this + # will break the aggregation + cubes = remove_duplicates(cubes) + + # Name of the supplied time coordinate + time_coord_name = time_coord.name() + + # Extract cubes matching the time specified by the supplied time coordinate. + # Even though the source coord might have floats for its points, + # the cell here will have cftime objects, such as DatetimeGregorian, + # so we can't just compare against the time coord's points. + time_constraint = iris.Constraint( + coord_values={time_coord_name: lambda cell: cell.point in time_coord.cells()} + ) + cubes_at_time = cubes.extract(time_constraint) + + # Add a temporary "number" coordinate to uniquely label the different + # data points at this time. + # An example of when there can be multiple data points at the time of + # interest is if the time coordinate represents the hour of day. + number = 0 + numbered_cubes = iris.cube.CubeList() + for cube in cubes_at_time: + for slc in cube.slices_over(time_coord_name): + number_coord = iris.coords.AuxCoord(number, long_name="number") + slc.add_aux_coord(number_coord) + numbered_cubes.append(slc) + number += 1 + cubes_at_time = numbered_cubes + + # Merge + cubes_at_time = cubes_at_time.merge() + + # For each cube in the cubelist, aggregate over all cases at this time + # using the supplied aggregator + aggregated_cubes = iris.cube.CubeList() + for cube in cubes_at_time: + # If there was only a single data point at this time, then "number" + # will be a scalar coordinate. If so, make it a dimension coordinate + # to allow collapsing below + if not cube.coord_dims("number"): + cube = iris.util.new_axis(cube, scalar_coord="number") + + # Store the total number of data points found at this time + num_cases = cube.coord("number").points.size + num_cases_coord = iris.coords.AuxCoord(num_cases, long_name="num_cases") + cube.add_aux_coord(num_cases_coord) + + # Do aggregation across the temporary "number" coordinate + if isinstance(aggregator, type(iris.analysis.PERCENTILE)): + cube = cube.collapsed("number", aggregator, percent=percentile) + else: + cube = cube.collapsed("number", aggregator) + + # Now remove the "number" coordinate... + cube.remove_coord("number") + # ...and associated cell method + cell_method = iris.coords.CellMethod(aggregator.name(), coords="number") + cube = remove_cell_method(cube, cell_method) + + aggregated_cubes.append(cube) + + return aggregated_cubes + + +def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_name): + """ + For each :class:`iris.cube.Cube` in a given :class:`iris.cube.CubeList`, \ + this extends (by repetition) a specified scalar coordinate along the \ + dimension corresponding to a specified dimension coordinate. + """ + for cube in cubelist: + scalar_coord = cube.coord(scalar_coord_name) + # Check the coordinate referenced by scalar_coord_name is indeed + # a scalar coordinate. Otherwise nothing to do. + if scalar_coord.points.size == 1: + # Get the data value held by the scalar coordinate... + scalar_coord_val = scalar_coord.points[0] + # ...then remove it from the cube + cube.remove_coord(scalar_coord) + # Extract the dimension coordinate matching dim_coord_name + dim_coord = cube.coord(dim_coord_name) + # Get the dimension spanned by this dimension coordinate... + dim = cube.coord_dims(dim_coord_name)[0] + # ...and its length + dim_size = dim_coord.points.size + # Construct an auxillary coordinate by replicating the data + # value from the scalar coordinate to match the size of the + # specified dimension coordinate + scalar_coord = iris.coords.AuxCoord( + np.repeat(scalar_coord_val, dim_size), long_name=scalar_coord_name + ) + + # Add the new auxillary coordinate to the cube + cube.add_aux_coord(scalar_coord, dim) + + return cubelist + + +def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): + """ + Create a histogram from each given cube, for every threshold and cell attribute. + + The cube will be "1-hourly mean precipitation rate" or "Large-scale rainfall rate". + Arguments + --------- + cubes: iris.cube.CubeList + Cube(s) to filter. Will be "1-hourly mean precipitation rate" or "Large-scale rainfall rate". + Assumed to be one cube per model, with the first being the control. + cell_attribute: str + "effective_radius_in_km" or "mean_value" + time_grouping: str + "forecast_period", "hour" or "time" - model_histograms = {} - for model, data in model_data.keys(): + """ + if not isinstance(cubes, CubeList): + cubes = CubeList([cubes]) + + # For now, thresholds are hard coded. + thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] + + # Get a list of the common forecast periods and hours + # todo: For forecast time, this must contain "all", plus all the T+0.5 .. T+71.5 + # todo: For hourly, this must contain "all", plus all the 0.5Z .. 23.5Z + forecast_periods = set(cubes[0].coord("forecast_period").points) + hours = set(cubes[0].coord("time").points) # todo: check it's actually hourly? + for cube in cubes[1:]: + forecast_periods.intersection_update(set(cube.coord("forecast_period").points)) + hours.intersection_update(set(cube.coord("time").points)) + + # todo: we need a land-sea split input? + # for now, hard-code as false (this is what the res output had it set to) + land_sea_split = False + + # what about obs? GPM & UK_RADAR_2KM? ignore for now. + + # different bins for the two cell attributes + # bin_edges = {} + # + # bin_edges['effective_radius_in_km'] = 10**(np.arange(0.0, 3.12, 0.12)) + # bin_edges['effective_radius_in_km'] = np.insert(bin_edges['effective_radius_in_km'], 0, 0) + # + # bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12)) + # bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0) + + # model_histograms = {} + for cube in cubes: for threshold in thresholds: - something_like_cell_attribute_histogram( + # todo: check var_name has been removed by this point, as RES removed it to help with merge + hist_cubes = something_like_cell_attribute_histogram( cube, attribute=cell_attribute, - bin_edges=bin_edges[cell_attribute], + bin_edges=get_bin_edges(cell_attribute), threshold=threshold, ) + # todo: res does a deep copy at this point for some reason - seems uneceeary? double check. + + # todo: RES calls this here, but we get a cube, not a cubelist so we can't. + # RES used it's own version because Iris' was slow at the time. todo: is it fast now? + # hist_cubes = cube_utils.extract_overlapping(cubes_group, "forecast_period") + hist_cubes = hist_cubes.extract_overlapping("forecast_period") + + # Now we take the cell statistics and aggregate by "time_grouping". + + # We should deep copy the cubes here becuase there is cube wrangling below, + # and it's possible that we'll be using the same cubes across iterations in this loop. + # (as extract doesn't always create a new cube). + hist_cubes = deepcopy(hist_cubes) + + # Res comment for this bit is: preparation of data for averaging and plotting + hist_cubes, times = extract_unique(hist_cubes, time_grouping) + + # Sum cell statistic histograms at each time in parallel + input_params = [ + (hist_cubes, time, iris.analysis.SUM, None) for time in times + ] + result_list = [ + aggregate_at_time(input_param) for input_param in input_params + ] + cubes_group = iris.cube.CubeList(itertools.chain.from_iterable(result_list)) + cubes_group = cubes_group.merge() + + # If the number of cases at each time is the same, the + # above merge results in a scalar coordinate representing + # the number of cases. Replace this scalar coordinate with + # an auxillary coordinate that has the same length as the + # time coordinate + cubes_group = repeat_scalar_coord_along_dim_coord( + cubes_group, "num_cases", time_grouping + ) - # return a cube for each model - return model_histograms + # At this point, RES extracts every time point into a separate cube list and plots it. + for time in times: + # Extract histogram at this time + time_constraint = iris.Constraint( + coord_values={time_grouping: lambda cell: cell.point in time.points} + ) + cubes_at_time = cubes_group.extract(time_constraint) + # todo: RES creates a plot title here. + # Perhaps we should add a plot title attribute to each cube here? + if time_grouping == "forecast_time": + title = "T+{0:.1f}".format(time.points[0]) + elif time_grouping == "hour": + title = "{0:.1f}Z".format(time.points[0]) + elif time_grouping == "time": + time_unit = time.units + datetime = time_unit.num2date(time.points[0]) + title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime) -def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0): - hist_cube = iris.cube.CubeList() - coords = get_non_spatial_coords(cube) + # todo: RES has some analysis of the number of cases used to construct the histogram at this point. + + # todo: RES plots each cube here. + for cube in cubes_at_time: + # todo: Normalise histogram? + # if y_axis == "relative_frequency": + # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) + + print(f'adding "{title}" plot for "{cube.name()}"') + + # Sum all histograms. This creates the data for the "all" time point. + for cube in cubes_group: + cube = cube.collapsed(time_grouping, iris.analysis.SUM) + + # todo: Normalise histogram? + # if y_axis == "relative_frequency": + # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) + + print(f'adding "all" plot for collapsed "{cube.name()}"') + + # return cubes to plot. todo: in what exact arrangement? + return None + +def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0): # Express histogram bins as an Iris coordinate bin_centres = 0.5 * (bin_edges[1:] + bin_edges[:-1]) bins_as_coord = iris.coords.DimCoord( - bin_centres, long_name=attribute, units=cube.units, - coord_system=None, bounds=np.column_stack((bin_edges[0:-1], bin_edges[1:]))) + bin_centres, + long_name=attribute, + units=cube.units, + coord_system=None, + bounds=np.column_stack((bin_edges[0:-1], bin_edges[1:])), + ) + data_min, data_max = None, None hist_cubes = iris.cube.CubeList() + coords = get_non_spatial_coords(cube) for slc in cube.slices_over(coords): # Identify connected cells in this spatial slice @@ -163,8 +608,14 @@ def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshol # Extract values of the desired cell attribute cell_attributes = [cell.attributes[attribute] for cell in cells] + # Store the minimum/maximum values of the cell attribute + if data_min is None or np.min(cell_attributes) < data_min: + data_min = np.min(cell_attributes) + if data_max is None or np.max(cell_attributes) > data_max: + data_max = np.max(cell_attributes) + # Construct a histogram of the desired cell attribute - hist, _ = np.histogram(cell_attributes) + hist, _ = np.histogram(cell_attributes, bin_edges) else: # Assign zeros to all bins hist = np.zeros(bin_centres.size).astype(np.int64) @@ -190,12 +641,10 @@ def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshol def get_spatial_coords(cube): - '''Returns the x, y coordinates of an input :class:`iris.cube.Cube`.''' + """Returns the x, y coordinates of an input :class:`iris.cube.Cube`.""" # Usual names for spatial coordinates - X_COORD_NAMES = ["longitude", "grid_longitude", - "projection_x_coordinate", "x"] - Y_COORD_NAMES = ["latitude", "grid_latitude", - "projection_y_coordinate", "y"] + X_COORD_NAMES = ["longitude", "grid_longitude", "projection_x_coordinate", "x"] + Y_COORD_NAMES = ["latitude", "grid_latitude", "projection_y_coordinate", "y"] # Get a list of coordinate names for the cube coord_names = [coord.name() for coord in cube.coords()] @@ -231,11 +680,11 @@ def get_non_spatial_coords(cube): def guess_bounds(cube): - ''' + """ Takes an input :class:`iris.cube.Cube`, guesses bounds on the x, y \ coordinates, then returns the cube. Such bounds are often required by \ regridding algorithms. - ''' + """ # Loop over spatial coordinates for axis in ["x", "y"]: coord = cube.coord(axis=axis) @@ -245,8 +694,7 @@ def guess_bounds(cube): try: _ = iris.util.regular_step(coord) except: - raise ValueError("Cannot guess bounds for a variable " - "resolution grid") + raise ValueError("Cannot guess bounds for a variable resolution grid") # Guess bounds if there aren't any if coord.bounds is None: coord.guess_bounds() @@ -254,7 +702,7 @@ def guess_bounds(cube): def connected_object_labelling(data, threshold=0.0, min_size=1, connectivity=1): - ''' + """ Finds connected objects in an input array and assigns them unique labels. Arguments: @@ -273,12 +721,12 @@ def connected_object_labelling(data, threshold=0.0, min_size=1, connectivity=1): neighbours. Connectivity may range from 1 (only direct \ neighbours are considered) to :attr:`data.ndim`. - Returns: - + Returns + ------- * **label_array** - an integer array where each unique object in the input \ array has a unique label in the returned array. * **num_objects** - the number of objects found. - ''' + """ # Apply the supplied threshold to the data to generate a binary array binary_data = data.copy() if np.ma.is_masked(binary_data): @@ -287,20 +735,21 @@ def connected_object_labelling(data, threshold=0.0, min_size=1, connectivity=1): # below binary_data = np.ma.filled(binary_data, fill_value=(threshold - 1)) # Set values above and below the threshold to 1 and 0, respectively - indices_above = (binary_data > threshold) - indices_below = (binary_data <= threshold) + indices_above = binary_data > threshold + indices_below = binary_data <= threshold binary_data[indices_above] = 1 binary_data[indices_below] = 0 # Construct a structuring element that defines how the neighbours of # a grid point are assigned structure_element = ndimage.morphology.generate_binary_structure( - data.ndim, connectivity) + data.ndim, connectivity + ) # Label distinct (connected) objects in the binary array label_array, num_objects = ndimage.measurements.label( - binary_data, - structure=structure_element) + binary_data, structure=structure_element + ) # Throw away any objects smaller than min_size if min_size < 1: @@ -341,16 +790,14 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1): direct neighbours are considered) to \ :attr:`cube.data.ndim`. - Returns: - + Returns + ------- * **cells** - a :class:`iris.cube.CubeList` of \ :class:`iris.cube.Cube` objects, each one corresponding to \ an identified cell. """ - # Convert input area threshold from km^2 to m^2 - M_IN_KM = 1000 - area_threshold = (float(M_IN_KM)**2) * area_threshold + area_threshold = (float(M_IN_KM) ** 2) * area_threshold # Get x, y coordinates of input cube x_coord, y_coord = get_spatial_coords(cube) @@ -370,17 +817,14 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1): grid_areas = iris.analysis.cartography.area_weights(slc) # Store a list of the non-spatial coordinates for this slice - aux_coords = [(coord, []) for coord in - get_non_spatial_coords(slc)] + aux_coords = [(coord, []) for coord in get_non_spatial_coords(slc)] # Find and label cells # Call connected object labelling function based on # scipy.ndimage.measurements.label cell_label_array, _ = connected_object_labelling( - slc.data, - threshold=threshold, - min_size=1, - connectivity=connectivity) + slc.data, threshold=threshold, min_size=1, connectivity=connectivity + ) # Get a list of unique cell labels cell_labels = np.unique(cell_label_array) @@ -397,8 +841,7 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1): # There should not be any masked data present in cells! if np.ma.is_masked(cell_values): - raise ValueError("Masked data found in cell {0:d}" - .format(cell_label)) + raise ValueError("Masked data found in cell {0:d}".format(cell_label)) # If cell area is less than area_threshold, discard it # (by setting its label to the background value) @@ -409,14 +852,16 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1): # Estimate cell centre position # TODO Is there a better way of doing this? C.O.M? - cell_centre = (np.mean(cell_x, dtype=np.float64), - np.mean(cell_y, dtype=np.float64)) + cell_centre = ( + np.mean(cell_x, dtype=np.float64), + np.mean(cell_y, dtype=np.float64), + ) # Area-weighted mean value in cell - cell_mean = (np.sum((cell_grid_areas * cell_values), - dtype=np.float64) - / cell_area) + cell_mean = ( + np.sum((cell_grid_areas * cell_values), dtype=np.float64) / cell_area + ) # Convert cell area from m^2 to km^2... - cell_area /= (float(M_IN_KM)**2) + cell_area /= float(M_IN_KM) ** 2 # ...and then cell effective radius in km cell_radius = np.sqrt(cell_area / np.pi) @@ -427,7 +872,8 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1): units=cube.units, attributes=cube.attributes, cell_methods=cube.cell_methods, - aux_coords_and_dims=aux_coords) + aux_coords_and_dims=aux_coords, + ) # Set up x, y coordinates describing the grid points in the cell... cell_x_coord = iris.coords.AuxCoord( @@ -437,7 +883,8 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1): units=x_coord.units, bounds=None, attributes=x_coord.attributes, - coord_system=x_coord.coord_system) + coord_system=x_coord.coord_system, + ) cell_y_coord = iris.coords.AuxCoord( cell_y, standard_name=y_coord.standard_name, @@ -445,34 +892,33 @@ def find_cells(cube, threshold=0.0, area_threshold=0.0, connectivity=1): units=y_coord.units, bounds=None, attributes=y_coord.attributes, - coord_system=y_coord.coord_system) + coord_system=y_coord.coord_system, + ) # ...and add them to the cell cube cell_cube.add_aux_coord(cell_x_coord, 0) cell_cube.add_aux_coord(cell_y_coord, 0) # Set up a coordinate describing the areas of grid cells in # the cell object... - cell_grid_area_coord = iris.coords.AuxCoord(cell_grid_areas, - long_name="grid_areas", - units="m2") - #...and add it to the cell cube + cell_grid_area_coord = iris.coords.AuxCoord( + cell_grid_areas, long_name="grid_areas", units="m2" + ) + # ...and add it to the cell cube cell_cube.add_aux_coord(cell_grid_area_coord, 0) # Finally add some attriubtes to the cube that describe some # useful information about the cell - #cell_cube.attributes["label"] = int(cell_label) + # cell_cube.attributes["label"] = int(cell_label) cell_cube.attributes["centre"] = cell_centre cell_cube.attributes["area_in_km2"] = cell_area cell_cube.attributes["effective_radius_in_km"] = cell_radius cell_cube.attributes["mean_value"] = cell_mean - #cell_cube.attributes["indices"] = cell_indices + # cell_cube.attributes["indices"] = cell_indices cells.append(cell_cube) return cells - - # ignore # def cell_statistics(LAND_SEA_SPLIT: bool): diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml index d31c1937a..54a5d8f03 100644 --- a/src/CSET/recipes/cell_statistics.yaml +++ b/src/CSET/recipes/cell_statistics.yaml @@ -5,21 +5,21 @@ description: | steps: - - operator: cell_statistics.caller_thing + - operator: read.read_cubes + file_paths: $INPUT_PATHS + model_names: $MODEL_NAMES + constraint: + operator: constraints.generate_var_constraint + varname: $VARNAME - # "effective_radius_in_km" or "mean_value" - cell_attribute: $CELL_ATTRIBUTE + - operator: regrid.regrid_onto_xyspacing + xspacing: 0.5 + yspacing: 0.5 + method: Linear - uk_ctrl_um: - operator: read.read_cubes - file_paths: - - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp" - - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp" - uk_expt_lfric: - operator: read.read_cubes - file_paths: - - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc" - - "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc" + - operator: cell_statistics.caller_thing + cell_attribute: $CELL_ATTRIBUTE + time_grouping: $TIME_GROUPING - operator: write.write_cube_to_nc overwrite: True From c299cfbd5eeb7a1cd0c9d0f35d1856bf5703cdb2 Mon Sep 17 00:00:00 2001 From: bblay Date: Mon, 22 Sep 2025 15:27:34 +0100 Subject: [PATCH 03/12] - i think the data creation is now running in cset - need to plug into the histogram plotting, possibly with a new histo plot just for this data --- src/CSET/operators/cell_statistics.py | 231 +++++++++++++++++--------- 1 file changed, 155 insertions(+), 76 deletions(-) diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index 6449de083..564527574 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -1,9 +1,11 @@ """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE.""" - +import collections import datetime import inspect import itertools +import pickle from copy import deepcopy +from pathlib import Path from typing import Callable import cftime @@ -448,6 +450,52 @@ def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_n return cubelist +def RES_extract_overlapping(cubelist, coord_name): + ''' + Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \ + the specified coordinate is the same across all cubes. + + Arguments: + + * **cubelist** - an input :class:`iris.cube.CubeList`. + * **coord_name** - a string specifying the name of the coordinate \ + over which to perform the extraction. + + Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \ + to coord_name is the same for all cubes. + ''' + # Build a list of all Cell instances for this coordinate by + # looping through all cubes in the supplied cubelist + all_cells = [] + for cube in cubelist: + for cell in cube.coord(coord_name).cells(): + all_cells.append(cell) + + # Work out which coordinate Cell instances are common across + # all cubes in the cubelist... + cell_counts = collections.Counter(all_cells) + # unique_cells = cell_counts.keys() + unique_cells = list(cell_counts.keys()) + unique_cell_counts = list(cell_counts.values()) + num_cubes = len(cubelist) + common_cells = [unique_cells[i] for i, count in + enumerate(unique_cell_counts) if count == num_cubes] + # ...and use these to subset the cubes in the cubelist + constraint = iris.Constraint( + coord_values={coord_name: lambda cell: cell in common_cells}) + + cubelist = iris.cube.CubeList([cube.extract(constraint) + for cube in cubelist]) + return cubelist + + +def ensure_bounds(cubes, coord_name): + for cube in cubes: + coord = cube.coord(coord_name) + if not coord.bounds is not None: + coord.guess_bounds() + + def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): """ Create a histogram from each given cube, for every threshold and cell attribute. @@ -465,9 +513,25 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): "forecast_period", "hour" or "time" """ + + # DEV/DEBUGGING - REMOVE + pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl") + # normal use - save pickle for faster subsequent runs + if cubes and not pkl_filename.exists(): + with open(pkl_filename, 'wb') as pkl_file: + pickle.dump(cubes, pkl_file) + print("now load that pickle") + exit(0) + # we'll be calling this instead of the yaml processing for a dev speed up + if not cubes and pkl_filename.exists(): + with open(pkl_filename, 'rb') as pkl_file: + cubes = pickle.load(pkl_file) + if not isinstance(cubes, CubeList): cubes = CubeList([cubes]) + + # For now, thresholds are hard coded. thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] @@ -495,91 +559,106 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): # bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12)) # bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0) - # model_histograms = {} - for cube in cubes: - for threshold in thresholds: - # todo: check var_name has been removed by this point, as RES removed it to help with merge - hist_cubes = something_like_cell_attribute_histogram( + for threshold in thresholds: + # todo: check var_name has been removed by this point, as RES removed it to help with merge + # hist_cubes = something_like_cell_attribute_histogram( + # cube, + # attribute=cell_attribute, + # bin_edges=get_bin_edges(cell_attribute), + # threshold=threshold) for cube in cubes] + + hist_cubes = CubeList([]) + for cube in cubes: + hist_cubes.append(something_like_cell_attribute_histogram( cube, attribute=cell_attribute, bin_edges=get_bin_edges(cell_attribute), threshold=threshold, - ) + )) + + # todo: res does a deep copy at this point for some reason - seems uneceeary? double check. + + # todo: RES calls this here, but we get a cube, not a cubelist so we can't. + # RES used it's own version because Iris' was slow at the time. todo: is it fast now? + # hist_cubes = RES_extract_overlapping(hist_cubes, "forecast_period") + ensure_bounds(hist_cubes, "forecast_period") + hist_cubes = hist_cubes.extract_overlapping("forecast_period") + + # Now we take the cell statistics and aggregate by "time_grouping". + + # We should deep copy the cubes here becuase there is cube wrangling below, + # and it's possible that we'll be using the same cubes across iterations in this loop. + # (as extract doesn't always create a new cube). + hist_cubes = deepcopy(hist_cubes) + + # Res comment for this bit is: preparation of data for averaging and plotting + # It removes the other time coords, other than that named by time_grouping. + hist_cubes, times = extract_unique(hist_cubes, time_grouping) + + # Sum cell statistic histograms at each time in parallel + # input_params = [ + # (hist_cubes, time, iris.analysis.SUM, None) for time in times + # ] + # result_list = [ + # aggregate_at_time(input_param) for input_param in input_params + # ] + result_list = [] + for time in times: + input_param = (hist_cubes, time, iris.analysis.SUM, None) + result = aggregate_at_time(input_param) + result_list.append(result) + cubes_group = iris.cube.CubeList(itertools.chain.from_iterable(result_list)) + cubes_group = cubes_group.merge() + + # If the number of cases at each time is the same, the + # above merge results in a scalar coordinate representing + # the number of cases. Replace this scalar coordinate with + # an auxillary coordinate that has the same length as the + # time coordinate + cubes_group = repeat_scalar_coord_along_dim_coord( + cubes_group, "num_cases", time_grouping + ) - # todo: res does a deep copy at this point for some reason - seems uneceeary? double check. - - # todo: RES calls this here, but we get a cube, not a cubelist so we can't. - # RES used it's own version because Iris' was slow at the time. todo: is it fast now? - # hist_cubes = cube_utils.extract_overlapping(cubes_group, "forecast_period") - hist_cubes = hist_cubes.extract_overlapping("forecast_period") - - # Now we take the cell statistics and aggregate by "time_grouping". - - # We should deep copy the cubes here becuase there is cube wrangling below, - # and it's possible that we'll be using the same cubes across iterations in this loop. - # (as extract doesn't always create a new cube). - hist_cubes = deepcopy(hist_cubes) - - # Res comment for this bit is: preparation of data for averaging and plotting - hist_cubes, times = extract_unique(hist_cubes, time_grouping) - - # Sum cell statistic histograms at each time in parallel - input_params = [ - (hist_cubes, time, iris.analysis.SUM, None) for time in times - ] - result_list = [ - aggregate_at_time(input_param) for input_param in input_params - ] - cubes_group = iris.cube.CubeList(itertools.chain.from_iterable(result_list)) - cubes_group = cubes_group.merge() - - # If the number of cases at each time is the same, the - # above merge results in a scalar coordinate representing - # the number of cases. Replace this scalar coordinate with - # an auxillary coordinate that has the same length as the - # time coordinate - cubes_group = repeat_scalar_coord_along_dim_coord( - cubes_group, "num_cases", time_grouping + # At this point, RES extracts every time point into a separate cube list and plots it. + for time in times: + # Extract histogram at this time + time_constraint = iris.Constraint( + coord_values={time_grouping: lambda cell: cell.point in time.points} ) - - # At this point, RES extracts every time point into a separate cube list and plots it. - for time in times: - # Extract histogram at this time - time_constraint = iris.Constraint( - coord_values={time_grouping: lambda cell: cell.point in time.points} - ) - cubes_at_time = cubes_group.extract(time_constraint) - - # todo: RES creates a plot title here. - # Perhaps we should add a plot title attribute to each cube here? - if time_grouping == "forecast_time": - title = "T+{0:.1f}".format(time.points[0]) - elif time_grouping == "hour": - title = "{0:.1f}Z".format(time.points[0]) - elif time_grouping == "time": - time_unit = time.units - datetime = time_unit.num2date(time.points[0]) - title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime) - - # todo: RES has some analysis of the number of cases used to construct the histogram at this point. - - # todo: RES plots each cube here. - for cube in cubes_at_time: - # todo: Normalise histogram? - # if y_axis == "relative_frequency": - # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) - - print(f'adding "{title}" plot for "{cube.name()}"') - - # Sum all histograms. This creates the data for the "all" time point. - for cube in cubes_group: - cube = cube.collapsed(time_grouping, iris.analysis.SUM) - + cubes_at_time = cubes_group.extract(time_constraint) + + # todo: RES creates a plot title here. + # Perhaps we should add a plot title attribute to each cube here? + if time_grouping == "forecast_period": + title = "T+{0:.1f}".format(time.points[0]) + elif time_grouping == "hour": + title = "{0:.1f}Z".format(time.points[0]) + elif time_grouping == "time": + time_unit = time.units + datetime = time_unit.num2date(time.points[0]) + title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime) + else: + raise ValueError(f"Unknown time grouping '{time_grouping}'") + + # todo: RES has some analysis of the number of cases used to construct the histogram at this point. + + # todo: RES plots each cube here. + for cube in cubes_at_time: # todo: Normalise histogram? # if y_axis == "relative_frequency": # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) - print(f'adding "all" plot for collapsed "{cube.name()}"') + print(f'adding "{title}" plot for "{cube.name()}"') + + # Sum all histograms. This creates the data for the "all" time point. + for cube in cubes_group: + cube = cube.collapsed(time_grouping, iris.analysis.SUM) + + # todo: Normalise histogram? + # if y_axis == "relative_frequency": + # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) + + print(f'adding "all" plot for collapsed "{cube.name()}"') # return cubes to plot. todo: in what exact arrangement? return None From 5b53d8fdf72506b7db5d2100d638385f2e8cc416 Mon Sep 17 00:00:00 2001 From: bblay Date: Mon, 22 Sep 2025 15:28:21 +0100 Subject: [PATCH 04/12] -squash this --- src/CSET/operators/cell_statistics.py | 39 ++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index 564527574..0bf849e1f 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -1,4 +1,5 @@ """OMG YOU CAN'T COMMIT WITHOUT DOCSTRINGS, EVEN IN EXPLORATORY CODE.""" + import collections import datetime import inspect @@ -451,7 +452,7 @@ def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_n def RES_extract_overlapping(cubelist, coord_name): - ''' + """ Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \ the specified coordinate is the same across all cubes. @@ -463,7 +464,7 @@ def RES_extract_overlapping(cubelist, coord_name): Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \ to coord_name is the same for all cubes. - ''' + """ # Build a list of all Cell instances for this coordinate by # looping through all cubes in the supplied cubelist all_cells = [] @@ -478,14 +479,17 @@ def RES_extract_overlapping(cubelist, coord_name): unique_cells = list(cell_counts.keys()) unique_cell_counts = list(cell_counts.values()) num_cubes = len(cubelist) - common_cells = [unique_cells[i] for i, count in - enumerate(unique_cell_counts) if count == num_cubes] + common_cells = [ + unique_cells[i] + for i, count in enumerate(unique_cell_counts) + if count == num_cubes + ] # ...and use these to subset the cubes in the cubelist constraint = iris.Constraint( - coord_values={coord_name: lambda cell: cell in common_cells}) + coord_values={coord_name: lambda cell: cell in common_cells} + ) - cubelist = iris.cube.CubeList([cube.extract(constraint) - for cube in cubelist]) + cubelist = iris.cube.CubeList([cube.extract(constraint) for cube in cubelist]) return cubelist @@ -513,25 +517,22 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): "forecast_period", "hour" or "time" """ - # DEV/DEBUGGING - REMOVE pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl") # normal use - save pickle for faster subsequent runs if cubes and not pkl_filename.exists(): - with open(pkl_filename, 'wb') as pkl_file: + with open(pkl_filename, "wb") as pkl_file: pickle.dump(cubes, pkl_file) print("now load that pickle") exit(0) # we'll be calling this instead of the yaml processing for a dev speed up if not cubes and pkl_filename.exists(): - with open(pkl_filename, 'rb') as pkl_file: + with open(pkl_filename, "rb") as pkl_file: cubes = pickle.load(pkl_file) if not isinstance(cubes, CubeList): cubes = CubeList([cubes]) - - # For now, thresholds are hard coded. thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] @@ -569,12 +570,14 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): hist_cubes = CubeList([]) for cube in cubes: - hist_cubes.append(something_like_cell_attribute_histogram( - cube, - attribute=cell_attribute, - bin_edges=get_bin_edges(cell_attribute), - threshold=threshold, - )) + hist_cubes.append( + something_like_cell_attribute_histogram( + cube, + attribute=cell_attribute, + bin_edges=get_bin_edges(cell_attribute), + threshold=threshold, + ) + ) # todo: res does a deep copy at this point for some reason - seems uneceeary? double check. From 4a6d816bfbd8197746bb8e45e2aa5d7240d2d2c0 Mon Sep 17 00:00:00 2001 From: bblay Date: Tue, 23 Sep 2025 10:44:19 +0100 Subject: [PATCH 05/12] the data is now bundled into a data structure which is passed to a new plot function, which probably needs to call the existing histogram plot function more than once - we'll see. --- src/CSET/operators/cell_statistics.py | 86 +++++++++++++-------------- src/CSET/operators/plot.py | 27 +++++++++ src/CSET/recipes/cell_statistics.yaml | 8 ++- 3 files changed, 75 insertions(+), 46 deletions(-) diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index 0bf849e1f..33c72c398 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -4,9 +4,7 @@ import datetime import inspect import itertools -import pickle from copy import deepcopy -from pathlib import Path from typing import Callable import cftime @@ -517,18 +515,18 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): "forecast_period", "hour" or "time" """ - # DEV/DEBUGGING - REMOVE - pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl") - # normal use - save pickle for faster subsequent runs - if cubes and not pkl_filename.exists(): - with open(pkl_filename, "wb") as pkl_file: - pickle.dump(cubes, pkl_file) - print("now load that pickle") - exit(0) - # we'll be calling this instead of the yaml processing for a dev speed up - if not cubes and pkl_filename.exists(): - with open(pkl_filename, "rb") as pkl_file: - cubes = pickle.load(pkl_file) + # # DEV/DEBUGGING - REMOVE + # pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl") + # # normal use - save pickle for faster subsequent runs + # if cubes and not pkl_filename.exists(): + # with open(pkl_filename, "wb") as pkl_file: + # pickle.dump(cubes, pkl_file) + # print("now load that pickle") + # exit(0) + # # we'll be calling this instead of the yaml processing for a dev speed up + # if not cubes and pkl_filename.exists(): + # with open(pkl_filename, "rb") as pkl_file: + # cubes = pickle.load(pkl_file) if not isinstance(cubes, CubeList): cubes = CubeList([cubes]) @@ -560,7 +558,11 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): # bin_edges['mean_value'] = 10**(np.arange(-1, 2.7, 0.12)) # bin_edges['mean_value'] = np.insert(bin_edges['mean_value'], 0, 0) + result = {} + for threshold in thresholds: + result[f"threshold {threshold}"] = threshold_result = {} + # todo: check var_name has been removed by this point, as RES removed it to help with merge # hist_cubes = something_like_cell_attribute_histogram( # cube, @@ -598,51 +600,48 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): # It removes the other time coords, other than that named by time_grouping. hist_cubes, times = extract_unique(hist_cubes, time_grouping) - # Sum cell statistic histograms at each time in parallel - # input_params = [ - # (hist_cubes, time, iris.analysis.SUM, None) for time in times - # ] - # result_list = [ - # aggregate_at_time(input_param) for input_param in input_params - # ] - result_list = [] + # Sum cell statistic histograms at each time. todo: parallelise + summed_cubes = [] for time in times: input_param = (hist_cubes, time, iris.analysis.SUM, None) - result = aggregate_at_time(input_param) - result_list.append(result) - cubes_group = iris.cube.CubeList(itertools.chain.from_iterable(result_list)) - cubes_group = cubes_group.merge() + summed_cube = aggregate_at_time(input_param) + summed_cubes.append(summed_cube) + summed_cubes = iris.cube.CubeList(itertools.chain.from_iterable(summed_cubes)) + summed_cubes = summed_cubes.merge() # If the number of cases at each time is the same, the # above merge results in a scalar coordinate representing # the number of cases. Replace this scalar coordinate with # an auxillary coordinate that has the same length as the # time coordinate - cubes_group = repeat_scalar_coord_along_dim_coord( - cubes_group, "num_cases", time_grouping + summed_cubes = repeat_scalar_coord_along_dim_coord( + summed_cubes, "num_cases", time_grouping ) # At this point, RES extracts every time point into a separate cube list and plots it. for time in times: - # Extract histogram at this time - time_constraint = iris.Constraint( - coord_values={time_grouping: lambda cell: cell.point in time.points} - ) - cubes_at_time = cubes_group.extract(time_constraint) + assert len(time.points) == 1 + time_point = time.points[0] - # todo: RES creates a plot title here. - # Perhaps we should add a plot title attribute to each cube here? if time_grouping == "forecast_period": - title = "T+{0:.1f}".format(time.points[0]) + time_title = "T+{0:.1f}".format(time_point) elif time_grouping == "hour": - title = "{0:.1f}Z".format(time.points[0]) + time_title = "{0:.1f}Z".format(time_point) elif time_grouping == "time": time_unit = time.units - datetime = time_unit.num2date(time.points[0]) - title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime) + datetime = time_unit.num2date(time_point) + time_title = "{0:%Y/%m/%d} {1:%H%M}Z".format(datetime, datetime) else: raise ValueError(f"Unknown time grouping '{time_grouping}'") + threshold_result[time_title] = {} + + # Extract histogram at this time. + time_constraint = iris.Constraint( + coord_values={time_grouping: lambda cell: cell.point in time.points} + ) + cubes_at_time = summed_cubes.extract(time_constraint) + # todo: RES has some analysis of the number of cases used to construct the histogram at this point. # todo: RES plots each cube here. @@ -651,20 +650,21 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): # if y_axis == "relative_frequency": # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) - print(f'adding "{title}" plot for "{cube.name()}"') + threshold_result[time_title][cube.attributes["model_name"]] = cube # Sum all histograms. This creates the data for the "all" time point. - for cube in cubes_group: + threshold_result["all"] = {} + for cube in summed_cubes: cube = cube.collapsed(time_grouping, iris.analysis.SUM) # todo: Normalise histogram? # if y_axis == "relative_frequency": # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) - print(f'adding "all" plot for collapsed "{cube.name()}"') + threshold_result["all"][cube.attributes["model_name"]] = cube # return cubes to plot. todo: in what exact arrangement? - return None + return result def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0): diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 1eef7ff00..03e9e22a2 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -2208,3 +2208,30 @@ def plot_histogram_series( _make_plot_html_page(complete_plot_index) return cubes + + +def plot_cell_stats_histograms(data: dict, cell_attribute: str, time_grouping: str): + """ + + Parameters + ---------- + data: + A dict mapping thresholds to histogram data for each time point: + data[][][] -> Cube + + E.g, when time_grouping is 'forecast_period', we see a key like: + data['threshold 0.5']['T+0.5']['uk_ctrl_um'] + + The cube is one dimensional, with a dim coord of the given cell_attribute. + + cell_attribute: + 'effective_radius_in_km' or 'mean_value' + + time_grouping: + 'forecast_period', 'hour' or 'time' + + Returns + ------- + + """ + pass diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml index 54a5d8f03..e9f656cca 100644 --- a/src/CSET/recipes/cell_statistics.yaml +++ b/src/CSET/recipes/cell_statistics.yaml @@ -21,7 +21,9 @@ steps: cell_attribute: $CELL_ATTRIBUTE time_grouping: $TIME_GROUPING - - operator: write.write_cube_to_nc - overwrite: True +# - operator: write.write_cube_to_nc +# overwrite: True - - operator: plot.plot_histogram_series + - operator: plot.plot_cell_stats_histograms + cell_attribute: $CELL_ATTRIBUTE + time_grouping: $TIME_GROUPING From d0ae5abe73479f2e5a90b042eedf6b20f3368b5b Mon Sep 17 00:00:00 2001 From: bblay Date: Wed, 24 Sep 2025 10:53:26 +0100 Subject: [PATCH 06/12] - some histograms are being created - one var not working - lots of skipped plots due to no data or nan in the min/max - needs a bespoke plot and html to bring it all together sensibly --- delme/run_cell_statistics.py | 49 +++++++++++++++ src/CSET/operators/cell_statistics.py | 48 +++++++++------ src/CSET/operators/plot.py | 86 +++++++++++++++++++++++++-- src/CSET/recipes/cell_statistics.yaml | 1 + 4 files changed, 161 insertions(+), 23 deletions(-) create mode 100644 delme/run_cell_statistics.py diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py new file mode 100644 index 000000000..e380caaf6 --- /dev/null +++ b/delme/run_cell_statistics.py @@ -0,0 +1,49 @@ +from pathlib import Path + +from CSET._common import parse_recipe +from CSET.operators import execute_recipe + +cell_attributes =['effective_radius_in_km', 'mean_value'] + +recipe_fpath = Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml" + +# model_data = [ +# ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp'), +# ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb012.pp'), +# ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_000_006.nc'), +# ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric_slam_timeproc_006_012.nc'), +# ] + +model_data = [ + ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiag*.pp'), + ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric*.nc'), +] + + +vars = [ + 'surface_microphysical_rainfall_amount', # m01s04i201 + 'surface_microphysical_rainfall_rate', # m01s04i203 todo: seems to have missing data, needs investigation +] + +time_groupings = ['forecast_period', 'hour'] + + +# # DEBUG +# import CSET.operators.cell_statistics as cell_statistics +# cell_statistics.caller_thing(cubes=None, cell_attribute='effective_radius_in_km', time_grouping='forecast_period') +# exit(0) + + +for cell_attribute in cell_attributes: + for var in vars: + for time_grouping in time_groupings: + recipe_variables = { + 'INPUT_PATHS': [i[1] for i in model_data], + 'MODEL_NAMES': [i[0] for i in model_data], + 'VARNAME': var, + 'CELL_ATTRIBUTE': cell_attribute, + 'TIME_GROUPING': time_grouping, + } + + recipe = parse_recipe(recipe_fpath, recipe_variables) + execute_recipe(recipe, Path(__file__).parent) diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index 33c72c398..5ee3cf474 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -515,18 +515,15 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): "forecast_period", "hour" or "time" """ - # # DEV/DEBUGGING - REMOVE - # pkl_filename = Path("/home/users/byron.blay/git/CSET/delme/debug_cubes.pkl") - # # normal use - save pickle for faster subsequent runs - # if cubes and not pkl_filename.exists(): - # with open(pkl_filename, "wb") as pkl_file: - # pickle.dump(cubes, pkl_file) - # print("now load that pickle") - # exit(0) - # # we'll be calling this instead of the yaml processing for a dev speed up - # if not cubes and pkl_filename.exists(): - # with open(pkl_filename, "rb") as pkl_file: - # cubes = pickle.load(pkl_file) + + # todo: paramterise y_axis? + # set to "frequency" for cell statistic histograms that are \ + # plain number counts in bins, or "relative_frequency" for \ + # histograms showing the overall proportion of data values in \ + # each bin (normalised to 100%). The default is \ + # "relative_frequency". + y_axis = "relative_frequency" + if not isinstance(cubes, CubeList): cubes = CubeList([cubes]) @@ -646,9 +643,9 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): # todo: RES plots each cube here. for cube in cubes_at_time: - # todo: Normalise histogram? - # if y_axis == "relative_frequency": - # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) + # Normalise histogram? + if y_axis == "relative_frequency": + cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) threshold_result[time_title][cube.attributes["model_name"]] = cube @@ -657,14 +654,27 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): for cube in summed_cubes: cube = cube.collapsed(time_grouping, iris.analysis.SUM) - # todo: Normalise histogram? - # if y_axis == "relative_frequency": - # cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) + # Normalise histogram? + if y_axis == "relative_frequency": + cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) threshold_result["all"][cube.attributes["model_name"]] = cube + + # For simpler code, the results were formed as: + # data[][][] -> Cube + # But the plot needs the model to be above the time points, so rearrange as: + # data[][][] -> Cube + reformed_result = {} + for threshold, threshold_result in result.items(): + reformed_result[threshold] = {} + for time_title, model_cubes in threshold_result.items(): + for model_name, cube in model_cubes.items(): + reformed_result[threshold].setdefault(model_name, {}) + reformed_result[threshold][model_name][time_title] = cube + # return cubes to plot. todo: in what exact arrangement? - return result + return reformed_result def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0): diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 03e9e22a2..8201e1a89 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -23,6 +23,7 @@ import math import os import sys +from pathlib import Path from typing import Literal import cartopy.crs as ccrs @@ -2210,17 +2211,26 @@ def plot_histogram_series( return cubes -def plot_cell_stats_histograms(data: dict, cell_attribute: str, time_grouping: str): +def check_for_nan(cubes): + # The plot function was crashing when cube.data had nan in the min/max. + for cube in cubes: + # if np.nan in [cube.data.min, cube.data.max]: + if np.isnan(cube.data.min()) or np.isnan(cube.data.max()): + return True + return False + + +def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, time_grouping: str): """ Parameters ---------- data: A dict mapping thresholds to histogram data for each time point: - data[][][] -> Cube + data[][][] -> Cube E.g, when time_grouping is 'forecast_period', we see a key like: - data['threshold 0.5']['T+0.5']['uk_ctrl_um'] + data['threshold 0.5']['uk_ctrl_um']['T+0.5'] The cube is one dimensional, with a dim coord of the given cell_attribute. @@ -2234,4 +2244,72 @@ def plot_cell_stats_histograms(data: dict, cell_attribute: str, time_grouping: s ------- """ - pass + + # at each threshold, we're going to combine model's each time points into a single cube + sequence_coord = time_grouping + + for threshold, models in data.items(): + + models_merged = iris.cube.CubeList() # for each model: data at every time, which we will merge for a time sequence plot + models_all = iris.cube.CubeList() # for each model: aggregation of all times, created by the operator + + for model_name, times in models.items(): + models_all.append(times['all']) + del times['all'] + + # Put the time points together into a single cube. + # Except for the time_grouping coord, all other time coords will have been removed. + # Todo: why bother splitting them up in the first place? + # was that just for the RES plotting, can we simplify the operator? + cubes = iris.cube.CubeList(times.values()) + cubes = cubes.merge() + assert len(cubes) == 1 + models_merged.extend(cubes) + + if not models_merged: + logging.warning(f'no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot') + continue + + if not models_all: + logging.warning(f'no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot') + continue + + root_folder = Path('/data/scratch/byron.blay/cset/cell_stats/out') + root_folder = root_folder / f'{varname}/{cell_attribute}/{time_grouping}/{threshold}' + + orig_folder = os.getcwd() + + # The plot function was crashing when cube.data had nan in the min/max. + if check_for_nan(models_merged): + logging.warning( + f'Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot.') + else: + folder = root_folder / 'series' + folder.mkdir(parents=True, exist_ok=True) + os.chdir(folder) + filename = folder / 'image.png' + plot_histogram_series( + cubes=models_merged, + filename=str(filename), + sequence_coordinate=sequence_coord, + stamp_coordinate=None, # is this useful? + single_plot=False, + ) + + if check_for_nan(models_all): + logging.warning( + f'Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot.') + else: + folder = root_folder / 'all' + folder.mkdir(parents=True, exist_ok=True) + os.chdir(folder) + filename = folder / 'image.png' + plot_histogram_series( + cubes=models_all, + filename=str(filename), + sequence_coordinate=sequence_coord, # todo: we don't need a time series for this plot! + stamp_coordinate=None, # is this useful? + single_plot=False, + ) + + os.chdir(orig_folder) diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml index e9f656cca..d24dbc58e 100644 --- a/src/CSET/recipes/cell_statistics.yaml +++ b/src/CSET/recipes/cell_statistics.yaml @@ -25,5 +25,6 @@ steps: # overwrite: True - operator: plot.plot_cell_stats_histograms + varname: $VARNAME cell_attribute: $CELL_ATTRIBUTE time_grouping: $TIME_GROUPING From 9ffb0daac55ecfafc689ebc0c64e06cf6d16ec90 Mon Sep 17 00:00:00 2001 From: bblay Date: Thu, 25 Sep 2025 10:37:39 +0100 Subject: [PATCH 07/12] fixed surface_microphysical_rainfall_rate by filtering cell - needs polish something's messed around with code formatting --- delme/run_cell_statistics.py | 32 ++++--- src/CSET/operators/cell_statistics.py | 117 +++++++++++++------------- src/CSET/operators/plot.py | 42 +++++---- 3 files changed, 105 insertions(+), 86 deletions(-) diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py index e380caaf6..6f564d2c2 100644 --- a/delme/run_cell_statistics.py +++ b/delme/run_cell_statistics.py @@ -3,9 +3,11 @@ from CSET._common import parse_recipe from CSET.operators import execute_recipe -cell_attributes =['effective_radius_in_km', 'mean_value'] +cell_attributes = ["effective_radius_in_km", "mean_value"] -recipe_fpath = Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml" +recipe_fpath = ( + Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml" +) # model_data = [ # ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiagb000.pp'), @@ -15,17 +17,23 @@ # ] model_data = [ - ('uk_ctrl_um', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiag*.pp'), - ('uk_expt_lfric', '/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric*.nc'), + ( + "uk_ctrl_um", + "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_ctrl_um/20240121T0000Z_UK_672x672_1km_RAL3P3_pdiag*.pp", + ), + ( + "uk_expt_lfric", + "/data/scratch/byron.blay/RES_RETRIEVAL_CACHE/test_lfric/20240121T0000Z/uk_expt_lfric/20240121T0000Z_UK_672x672_1km_RAL3P3_lfric*.nc", + ), ] vars = [ - 'surface_microphysical_rainfall_amount', # m01s04i201 - 'surface_microphysical_rainfall_rate', # m01s04i203 todo: seems to have missing data, needs investigation + "surface_microphysical_rainfall_amount", # m01s04i201 + "surface_microphysical_rainfall_rate", # m01s04i203 todo: seems to have missing data, needs investigation ] -time_groupings = ['forecast_period', 'hour'] +time_groupings = ["forecast_period", "hour"] # # DEBUG @@ -38,11 +46,11 @@ for var in vars: for time_grouping in time_groupings: recipe_variables = { - 'INPUT_PATHS': [i[1] for i in model_data], - 'MODEL_NAMES': [i[0] for i in model_data], - 'VARNAME': var, - 'CELL_ATTRIBUTE': cell_attribute, - 'TIME_GROUPING': time_grouping, + "INPUT_PATHS": [i[1] for i in model_data], + "MODEL_NAMES": [i[0] for i in model_data], + "VARNAME": var, + "CELL_ATTRIBUTE": cell_attribute, + "TIME_GROUPING": time_grouping, } recipe = parse_recipe(recipe_fpath, recipe_variables) diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index 5ee3cf474..e77c18890 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -448,47 +448,47 @@ def repeat_scalar_coord_along_dim_coord(cubelist, scalar_coord_name, dim_coord_n return cubelist - -def RES_extract_overlapping(cubelist, coord_name): - """ - Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \ - the specified coordinate is the same across all cubes. - - Arguments: - - * **cubelist** - an input :class:`iris.cube.CubeList`. - * **coord_name** - a string specifying the name of the coordinate \ - over which to perform the extraction. - - Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \ - to coord_name is the same for all cubes. - """ - # Build a list of all Cell instances for this coordinate by - # looping through all cubes in the supplied cubelist - all_cells = [] - for cube in cubelist: - for cell in cube.coord(coord_name).cells(): - all_cells.append(cell) - - # Work out which coordinate Cell instances are common across - # all cubes in the cubelist... - cell_counts = collections.Counter(all_cells) - # unique_cells = cell_counts.keys() - unique_cells = list(cell_counts.keys()) - unique_cell_counts = list(cell_counts.values()) - num_cubes = len(cubelist) - common_cells = [ - unique_cells[i] - for i, count in enumerate(unique_cell_counts) - if count == num_cubes - ] - # ...and use these to subset the cubes in the cubelist - constraint = iris.Constraint( - coord_values={coord_name: lambda cell: cell in common_cells} - ) - - cubelist = iris.cube.CubeList([cube.extract(constraint) for cube in cubelist]) - return cubelist +# todo: no longer needed? we can use the iris version now (double check this is ok) +# def RES_extract_overlapping(cubelist, coord_name): +# """ +# Extracts regions from cubes in a :class:`iris.cube.CubeList` such that \ +# the specified coordinate is the same across all cubes. +# +# Arguments: +# +# * **cubelist** - an input :class:`iris.cube.CubeList`. +# * **coord_name** - a string specifying the name of the coordinate \ +# over which to perform the extraction. +# +# Returns a :class:`iris.cube.CubeList` where the coordinate corresponding \ +# to coord_name is the same for all cubes. +# """ +# # Build a list of all Cell instances for this coordinate by +# # looping through all cubes in the supplied cubelist +# all_cells = [] +# for cube in cubelist: +# for cell in cube.coord(coord_name).cells(): +# all_cells.append(cell) +# +# # Work out which coordinate Cell instances are common across +# # all cubes in the cubelist... +# cell_counts = collections.Counter(all_cells) +# # unique_cells = cell_counts.keys() +# unique_cells = list(cell_counts.keys()) +# unique_cell_counts = list(cell_counts.values()) +# num_cubes = len(cubelist) +# common_cells = [ +# unique_cells[i] +# for i, count in enumerate(unique_cell_counts) +# if count == num_cubes +# ] +# # ...and use these to subset the cubes in the cubelist +# constraint = iris.Constraint( +# coord_values={coord_name: lambda cell: cell in common_cells} +# ) +# +# cubelist = iris.cube.CubeList([cube.extract(constraint) for cube in cubelist]) +# return cubelist def ensure_bounds(cubes, coord_name): @@ -515,7 +515,6 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): "forecast_period", "hour" or "time" """ - # todo: paramterise y_axis? # set to "frequency" for cell statistic histograms that are \ # plain number counts in bins, or "relative_frequency" for \ @@ -524,10 +523,19 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): # "relative_frequency". y_axis = "relative_frequency" - if not isinstance(cubes, CubeList): cubes = CubeList([cubes]) + # todo: put this cell method filtering into the yaml + # we saw lfric data with "point" or "mean". + def check_cell_methods(cube): + """discard "means", we want "points" or empty cell methods""" + for cm in cube.cell_methods: + if cm.method != "point": + return False + return True + cubes = CubeList([c for c in cubes if check_cell_methods(c)]) + # For now, thresholds are hard coded. thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] @@ -569,21 +577,13 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): hist_cubes = CubeList([]) for cube in cubes: - hist_cubes.append( - something_like_cell_attribute_histogram( - cube, - attribute=cell_attribute, - bin_edges=get_bin_edges(cell_attribute), - threshold=threshold, - ) - ) + hist_cube = something_like_cell_attribute_histogram( + cube, attribute=cell_attribute, bin_edges=get_bin_edges(cell_attribute), threshold=threshold) + hist_cubes.append(hist_cube) - # todo: res does a deep copy at this point for some reason - seems uneceeary? double check. - - # todo: RES calls this here, but we get a cube, not a cubelist so we can't. # RES used it's own version because Iris' was slow at the time. todo: is it fast now? # hist_cubes = RES_extract_overlapping(hist_cubes, "forecast_period") - ensure_bounds(hist_cubes, "forecast_period") + ensure_bounds(hist_cubes, "forecast_period") # iris fails if only one has bounds hist_cubes = hist_cubes.extract_overlapping("forecast_period") # Now we take the cell statistics and aggregate by "time_grouping". @@ -645,7 +645,9 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): for cube in cubes_at_time: # Normalise histogram? if y_axis == "relative_frequency": - cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) + cube.data = (100.0 * cube.data) / np.sum( + cube.data, dtype=np.float64 + ) threshold_result[time_title][cube.attributes["model_name"]] = cube @@ -656,11 +658,10 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): # Normalise histogram? if y_axis == "relative_frequency": - cube.data = ((100.0 * cube.data) / np.sum(cube.data, dtype=np.float64)) + cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64) threshold_result["all"][cube.attributes["model_name"]] = cube - # For simpler code, the results were formed as: # data[][][] -> Cube # But the plot needs the model to be above the time points, so rearrange as: diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 8201e1a89..4b06a8e11 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -2220,7 +2220,9 @@ def check_for_nan(cubes): return False -def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, time_grouping: str): +def plot_cell_stats_histograms( + data: dict, varname: str, cell_attribute: str, time_grouping: str +): """ Parameters @@ -2244,18 +2246,18 @@ def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, ti ------- """ - # at each threshold, we're going to combine model's each time points into a single cube sequence_coord = time_grouping for threshold, models in data.items(): - models_merged = iris.cube.CubeList() # for each model: data at every time, which we will merge for a time sequence plot - models_all = iris.cube.CubeList() # for each model: aggregation of all times, created by the operator + models_all = ( + iris.cube.CubeList() + ) # for each model: aggregation of all times, created by the operator for model_name, times in models.items(): - models_all.append(times['all']) - del times['all'] + models_all.append(times["all"]) + del times["all"] # Put the time points together into a single cube. # Except for the time_grouping coord, all other time coords will have been removed. @@ -2267,27 +2269,34 @@ def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, ti models_merged.extend(cubes) if not models_merged: - logging.warning(f'no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot') + logging.warning( + f"no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot" + ) continue if not models_all: - logging.warning(f'no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot') + logging.warning( + f"no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot" + ) continue - root_folder = Path('/data/scratch/byron.blay/cset/cell_stats/out') - root_folder = root_folder / f'{varname}/{cell_attribute}/{time_grouping}/{threshold}' + root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out") + root_folder = ( + root_folder / f"{varname}/{cell_attribute}/{time_grouping}/{threshold}" + ) orig_folder = os.getcwd() # The plot function was crashing when cube.data had nan in the min/max. if check_for_nan(models_merged): logging.warning( - f'Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot.') + f"Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot." + ) else: - folder = root_folder / 'series' + folder = root_folder / "series" folder.mkdir(parents=True, exist_ok=True) os.chdir(folder) - filename = folder / 'image.png' + filename = folder / "image.png" plot_histogram_series( cubes=models_merged, filename=str(filename), @@ -2298,12 +2307,13 @@ def plot_cell_stats_histograms(data: dict, varname: str, cell_attribute: str, ti if check_for_nan(models_all): logging.warning( - f'Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot.') + f"Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot." + ) else: - folder = root_folder / 'all' + folder = root_folder / "all" folder.mkdir(parents=True, exist_ok=True) os.chdir(folder) - filename = folder / 'image.png' + filename = folder / "image.png" plot_histogram_series( cubes=models_all, filename=str(filename), From edd7efb3e4b74b146e0a1f2488a9e242b046e5b5 Mon Sep 17 00:00:00 2001 From: bblay Date: Thu, 25 Sep 2025 11:28:43 +0100 Subject: [PATCH 08/12] i think log y axis is already taken care of in the plotting --- src/CSET/operators/cell_statistics.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index e77c18890..b3e77d442 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -643,11 +643,9 @@ def check_cell_methods(cube): # todo: RES plots each cube here. for cube in cubes_at_time: - # Normalise histogram? - if y_axis == "relative_frequency": - cube.data = (100.0 * cube.data) / np.sum( - cube.data, dtype=np.float64 - ) + # todo: Normalise histogram? I think this is done in the plotting, actually, unlike RES. + # if y_axis == "relative_frequency": + # cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64) threshold_result[time_title][cube.attributes["model_name"]] = cube @@ -656,9 +654,9 @@ def check_cell_methods(cube): for cube in summed_cubes: cube = cube.collapsed(time_grouping, iris.analysis.SUM) - # Normalise histogram? - if y_axis == "relative_frequency": - cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64) + # todo: Normalise histogram? I think this is done in the plotting, actually, unlike RES. + # if y_axis == "relative_frequency": + # cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64) threshold_result["all"][cube.attributes["model_name"]] = cube From 448d6f1a0fe6899a2d08203513ffd574746a3009 Mon Sep 17 00:00:00 2001 From: bblay Date: Thu, 25 Sep 2025 15:52:47 +0100 Subject: [PATCH 09/12] wip --- delme/run_cell_statistics.py | 9 +++++++-- src/CSET/operators/cell_statistics.py | 14 +++++++++++++- src/CSET/operators/plot.py | 10 ++++++++++ src/CSET/recipes/cell_statistics.yaml | 1 + 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py index 6f564d2c2..1e63e2f4a 100644 --- a/delme/run_cell_statistics.py +++ b/delme/run_cell_statistics.py @@ -42,9 +42,14 @@ # exit(0) -for cell_attribute in cell_attributes: - for var in vars: +# todo: it's inefficient to keep reloading the data for each var/cell_attribute/time_grouping +# it would seem to be better to send the list of vars to the operator and plot functions, just once. +for var in vars: + for cell_attribute in cell_attributes: for time_grouping in time_groupings: + + print(f"\nrunning recipe for {var} {cell_attribute} {time_grouping}\n" ) + recipe_variables = { "INPUT_PATHS": [i[1] for i in model_data], "MODEL_NAMES": [i[0] for i in model_data], diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index b3e77d442..e3a069504 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -4,6 +4,7 @@ import datetime import inspect import itertools +import logging from copy import deepcopy from typing import Callable @@ -521,11 +522,16 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): # histograms showing the overall proportion of data values in \ # each bin (normalised to 100%). The default is \ # "relative_frequency". - y_axis = "relative_frequency" + # y_axis = "relative_frequency" + + if cubes in (None, [], CubeList([])): + logging.warning(f'no cubes received for {cell_attribute} {time_grouping}') + return None if not isinstance(cubes, CubeList): cubes = CubeList([cubes]) + # todo: is this correct? it caused more empty cube lists... # todo: put this cell method filtering into the yaml # we saw lfric data with "point" or "mean". def check_cell_methods(cube): @@ -536,6 +542,10 @@ def check_cell_methods(cube): return True cubes = CubeList([c for c in cubes if check_cell_methods(c)]) + if len(cubes) == 0: + logging.warning(f'no cubes for {cell_attribute} {time_grouping} after cell method filtering') + return None + # For now, thresholds are hard coded. thresholds = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0] @@ -660,6 +670,8 @@ def check_cell_methods(cube): threshold_result["all"][cube.attributes["model_name"]] = cube + # todo: perhaps we should use metadata to organisethe results, and let the plotter work out what to plot. + # that would let us stick with the cubelist convention... # For simpler code, the results were formed as: # data[][][] -> Cube # But the plot needs the model to be above the time points, so rearrange as: diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 4b06a8e11..7fccba314 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -2231,6 +2231,10 @@ def plot_cell_stats_histograms( A dict mapping thresholds to histogram data for each time point: data[][][] -> Cube + NOTE: IT SHOULD PROBABLY BE A CUBE/CUBELIST + but that would seem to require considerable inefficiency/repetition in the cells stats + so we need to think about this a bit more + E.g, when time_grouping is 'forecast_period', we see a key like: data['threshold 0.5']['uk_ctrl_um']['T+0.5'] @@ -2246,6 +2250,9 @@ def plot_cell_stats_histograms( ------- """ + if data is None: + return None + # at each threshold, we're going to combine model's each time points into a single cube sequence_coord = time_grouping @@ -2323,3 +2330,6 @@ def plot_cell_stats_histograms( ) os.chdir(orig_folder) + + # todo: by convention we should return the cubes, but it's not cubes + # the cells stats produces a lot of stuff so it's currently passed as a dict diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml index d24dbc58e..d0e75b04c 100644 --- a/src/CSET/recipes/cell_statistics.yaml +++ b/src/CSET/recipes/cell_statistics.yaml @@ -5,6 +5,7 @@ description: | steps: + # todo: ideally we'd load just once and have a list of variable names to extract - operator: read.read_cubes file_paths: $INPUT_PATHS model_names: $MODEL_NAMES From 374ae1be73802eb1a48da72e7c638ab4fb97d17e Mon Sep 17 00:00:00 2001 From: bblay Date: Fri, 26 Sep 2025 20:12:12 +0100 Subject: [PATCH 10/12] First functioning draft of a new cell stats plots page with controls for both threshold and time point. We need to - move the index into the subfolder so it doesn't get overwritten - ? move the cell attribute and time grouping from the recipe runner into the code so it's all on the one page together --- delme/run_cell_statistics.py | 22 +- .../operators/_cell_stats_plots_template.html | 165 ++++++++++++++ src/CSET/operators/cell_statistics.py | 49 +++-- src/CSET/operators/plot.py | 201 +++++++++++------- src/CSET/operators/read.py | 9 + src/CSET/operators/write.py | 12 ++ src/CSET/recipes/cell_statistics.yaml | 7 +- 7 files changed, 357 insertions(+), 108 deletions(-) create mode 100644 src/CSET/operators/_cell_stats_plots_template.html diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py index 1e63e2f4a..f93e20c4b 100644 --- a/delme/run_cell_statistics.py +++ b/delme/run_cell_statistics.py @@ -3,7 +3,6 @@ from CSET._common import parse_recipe from CSET.operators import execute_recipe -cell_attributes = ["effective_radius_in_km", "mean_value"] recipe_fpath = ( Path(__file__).parent.parent / "src" / "CSET" / "recipes" / "cell_statistics.yaml" @@ -27,13 +26,20 @@ ), ] +# # todo: restore this +# cell_attributes = ["effective_radius_in_km", "mean_value"] +# +# vars = [ +# "surface_microphysical_rainfall_amount", # m01s04i201 +# "surface_microphysical_rainfall_rate", # m01s04i203 todo: seems to have missing data, needs investigation +# ] +# +# time_groupings = ["forecast_period", "hour"] -vars = [ - "surface_microphysical_rainfall_amount", # m01s04i201 - "surface_microphysical_rainfall_rate", # m01s04i203 todo: seems to have missing data, needs investigation -] - -time_groupings = ["forecast_period", "hour"] +# faster dev +cell_attributes = ["effective_radius_in_km"] +vars = ["surface_microphysical_rainfall_rate"] +time_groupings = ["forecast_period"] # # DEBUG @@ -48,7 +54,7 @@ for cell_attribute in cell_attributes: for time_grouping in time_groupings: - print(f"\nrunning recipe for {var} {cell_attribute} {time_grouping}\n" ) + print(f"\nrunning recipe for {var}, {cell_attribute}, {time_grouping}\n" ) recipe_variables = { "INPUT_PATHS": [i[1] for i in model_data], diff --git a/src/CSET/operators/_cell_stats_plots_template.html b/src/CSET/operators/_cell_stats_plots_template.html new file mode 100644 index 000000000..0de7bad91 --- /dev/null +++ b/src/CSET/operators/_cell_stats_plots_template.html @@ -0,0 +1,165 @@ + + + + + + + {{title}} + + + + + + + + +
+ +
+ + + + + + + +
+ + plot + +
+ + + \ No newline at end of file diff --git a/src/CSET/operators/cell_statistics.py b/src/CSET/operators/cell_statistics.py index e3a069504..97afa4b17 100644 --- a/src/CSET/operators/cell_statistics.py +++ b/src/CSET/operators/cell_statistics.py @@ -516,6 +516,11 @@ def caller_thing(cubes: CubeList, cell_attribute: str, time_grouping: str): "forecast_period", "hour" or "time" """ + # todo: inorder to get everything on one page, + # i think we might ahve to calculate all cell attributes in here, + # and all time groupings, so that we can pass one big lump to the page plotter. + + # todo: paramterise y_axis? # set to "frequency" for cell statistic histograms that are \ # plain number counts in bins, or "relative_frequency" for \ @@ -543,7 +548,7 @@ def check_cell_methods(cube): cubes = CubeList([c for c in cubes if check_cell_methods(c)]) if len(cubes) == 0: - logging.warning(f'no cubes for {cell_attribute} {time_grouping} after cell method filtering') + logging.warning(f'no cubes for {cell_attribute}, {time_grouping} after cell method filtering') return None # For now, thresholds are hard coded. @@ -576,7 +581,7 @@ def check_cell_methods(cube): result = {} for threshold in thresholds: - result[f"threshold {threshold}"] = threshold_result = {} + result[f"threshold {threshold}"] = time_data = {} # todo: check var_name has been removed by this point, as RES removed it to help with merge # hist_cubes = something_like_cell_attribute_histogram( @@ -585,6 +590,7 @@ def check_cell_methods(cube): # bin_edges=get_bin_edges(cell_attribute), # threshold=threshold) for cube in cubes] + # todo: can't we just let mpl make the histogram at plot time? hist_cubes = CubeList([]) for cube in cubes: hist_cube = something_like_cell_attribute_histogram( @@ -641,7 +647,7 @@ def check_cell_methods(cube): else: raise ValueError(f"Unknown time grouping '{time_grouping}'") - threshold_result[time_title] = {} + time_data[time_title] = {} # Extract histogram at this time. time_constraint = iris.Constraint( @@ -657,10 +663,10 @@ def check_cell_methods(cube): # if y_axis == "relative_frequency": # cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64) - threshold_result[time_title][cube.attributes["model_name"]] = cube + time_data[time_title][cube.attributes["model_name"]] = cube # Sum all histograms. This creates the data for the "all" time point. - threshold_result["all"] = {} + time_data["all"] = {} for cube in summed_cubes: cube = cube.collapsed(time_grouping, iris.analysis.SUM) @@ -668,24 +674,25 @@ def check_cell_methods(cube): # if y_axis == "relative_frequency": # cube.data = (100.0 * cube.data) / np.sum(cube.data, dtype=np.float64) - threshold_result["all"][cube.attributes["model_name"]] = cube - - # todo: perhaps we should use metadata to organisethe results, and let the plotter work out what to plot. - # that would let us stick with the cubelist convention... - # For simpler code, the results were formed as: - # data[][][] -> Cube - # But the plot needs the model to be above the time points, so rearrange as: - # data[][][] -> Cube - reformed_result = {} - for threshold, threshold_result in result.items(): - reformed_result[threshold] = {} - for time_title, model_cubes in threshold_result.items(): - for model_name, cube in model_cubes.items(): - reformed_result[threshold].setdefault(model_name, {}) - reformed_result[threshold][model_name][time_title] = cube + time_data["all"][cube.attributes["model_name"]] = cube + + # don't bother rearranging thie dict now that we've got our own plotter + # # todo: perhaps we should use metadata to organisethe results, and let the plotter work out what to plot. + # # that would let us stick with the cubelist convention... + # # For simpler code, the results were formed as: + # # data[][][] -> Cube + # # But the plot needs the model to be above the time points, so rearrange as: + # # data[][][] -> Cube + # reformed_result = {} + # for threshold, threshold_result in result.items(): + # reformed_result[threshold] = {} + # for time_title, model_cubes in threshold_result.items(): + # for model_name, cube in model_cubes.items(): + # reformed_result[threshold].setdefault(model_name, {}) + # reformed_result[threshold][model_name][time_title] = cube # return cubes to plot. todo: in what exact arrangement? - return reformed_result + return result def something_like_cell_attribute_histogram(cube, attribute, bin_edges, threshold=0.0): diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 7fccba314..4a7401e7e 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -2220,6 +2220,17 @@ def check_for_nan(cubes): return False +def make_cell_stats_page(html_fpath, title, plot_urls): + template_file = Path(__file__).parent / "_cell_stats_plots_template.html" + variables = { + 'title': '', + 'plot_urls': plot_urls, + } + html = render_file(str(template_file), **variables) + with open(html_fpath, "wt", encoding="UTF-8") as fp: + fp.write(html) + + def plot_cell_stats_histograms( data: dict, varname: str, cell_attribute: str, time_grouping: str ): @@ -2253,83 +2264,119 @@ def plot_cell_stats_histograms( if data is None: return None - # at each threshold, we're going to combine model's each time points into a single cube - sequence_coord = time_grouping - - for threshold, models in data.items(): - models_merged = iris.cube.CubeList() # for each model: data at every time, which we will merge for a time sequence plot - models_all = ( - iris.cube.CubeList() - ) # for each model: aggregation of all times, created by the operator - - for model_name, times in models.items(): - models_all.append(times["all"]) - del times["all"] - - # Put the time points together into a single cube. - # Except for the time_grouping coord, all other time coords will have been removed. - # Todo: why bother splitting them up in the first place? - # was that just for the RES plotting, can we simplify the operator? - cubes = iris.cube.CubeList(times.values()) - cubes = cubes.merge() - assert len(cubes) == 1 - models_merged.extend(cubes) - - if not models_merged: - logging.warning( - f"no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot" - ) - continue - - if not models_all: - logging.warning( - f"no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot" - ) - continue - - root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out") - root_folder = ( - root_folder / f"{varname}/{cell_attribute}/{time_grouping}/{threshold}" - ) - - orig_folder = os.getcwd() - - # The plot function was crashing when cube.data had nan in the min/max. - if check_for_nan(models_merged): - logging.warning( - f"Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot." - ) - else: - folder = root_folder / "series" - folder.mkdir(parents=True, exist_ok=True) - os.chdir(folder) - filename = folder / "image.png" - plot_histogram_series( - cubes=models_merged, - filename=str(filename), - sequence_coordinate=sequence_coord, - stamp_coordinate=None, # is this useful? - single_plot=False, - ) - - if check_for_nan(models_all): - logging.warning( - f"Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot." - ) - else: - folder = root_folder / "all" - folder.mkdir(parents=True, exist_ok=True) - os.chdir(folder) - filename = folder / "image.png" - plot_histogram_series( - cubes=models_all, - filename=str(filename), - sequence_coordinate=sequence_coord, # todo: we don't need a time series for this plot! - stamp_coordinate=None, # is this useful? - single_plot=False, - ) + # # at each threshold, we're going to combine each model's time points into a single cube + # # todo: can't the operator do that? + # sequence_coord = time_grouping + # + # + # for threshold, models in data.items(): + # + # # for each model: data at every time, which we will merge for a time sequence plot + # models_merged = iris.cube.CubeList() + # + # # for each model: aggregation of all times, created by the operator + # models_all = (iris.cube.CubeList()) + # + # for model_name, times in models.items(): + # models_all.append(times["all"]) + # del times["all"] + # + # # Put the time points together into a single cube. + # # Except for the time_grouping coord, all other time coords will have been removed. + # # Todo: why bother splitting them up in the first place? + # # was that just for the RES plotting, can we simplify the operator? + # cubes = iris.cube.CubeList(times.values()) + # cubes = cubes.merge() + # assert len(cubes) == 1 + # models_merged.extend(cubes) + # + # if not models_merged: + # logging.warning( + # f"no series data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot" + # ) + # continue + # + # if not models_all: + # logging.warning( + # f"no aggregated data found for {varname} {cell_attribute} {time_grouping} {threshold}, skipping plot" + # ) + # continue + # + # root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out") + # root_folder = ( + # root_folder / f"{varname}/{cell_attribute}/{time_grouping}/{threshold}" + # ) + # + # orig_folder = os.getcwd() + # + # # The plot function was crashing when cube.data had nan in the min/max. + # if check_for_nan(models_merged): + # logging.warning( + # f"Series cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot." + # ) + # else: + # folder = root_folder / "series" + # folder.mkdir(parents=True, exist_ok=True) + # os.chdir(folder) + # filename = folder / "image.png" + # plot_histogram_series( + # cubes=models_merged, + # filename=str(filename), + # sequence_coordinate=sequence_coord, + # stamp_coordinate=None, # is this useful? + # single_plot=False, + # ) + # + # if check_for_nan(models_all): + # logging.warning( + # f"Combnined cube for {varname}, {cell_attribute}, {time_grouping}, {threshold} has nan. Skipping plot." + # ) + # else: + # folder = root_folder / "all" + # folder.mkdir(parents=True, exist_ok=True) + # os.chdir(folder) + # filename = folder / "image.png" + # plot_histogram_series( + # cubes=models_all, + # filename=str(filename), + # sequence_coordinate=sequence_coord, # todo: we don't need a time series for this plot! + # stamp_coordinate=None, # is this useful? + # single_plot=False, + # ) + # + # os.chdir(orig_folder) + + plot_urls = {} + + # todo: parameterise this + root_folder = Path("/data/scratch/byron.blay/cset/cell_stats/out2") + + for threshold, time_data in data.items(): + plot_urls[threshold] = {} + + # plot every time point, including the 'all' time point + for time_title, model_data in time_data.items(): + filename = root_folder / varname / cell_attribute / threshold / time_grouping / f'{time_title}.png' + + # # make a new plot for this combination of threshold & time + # plt.figure(figsize=(10, 10)) + # plt.title(f'{varname}, {cell_attribute}, threshold {threshold}, at {time_grouping} {time_title}') + # for model_name, cube in model_data.items(): + # plt.plot(cube.coord(cell_attribute).points, cube.data) + # + # filename.parent.mkdir(parents=True, exist_ok=True) + # plt.savefig(filename) + # plt.close() + + plot_urls[threshold][time_title] = str(filename) + + make_cell_stats_page( + root_folder / 'index.html', + title=f'{varname}, {cell_attribute}, {time_grouping}', + plot_urls=json.dumps(plot_urls), + ) - os.chdir(orig_folder) # todo: by convention we should return the cubes, but it's not cubes - # the cells stats produces a lot of stuff so it's currently passed as a dict + # the cells stats produces a lot of stuff and it's currently passed as a dict + return None diff --git a/src/CSET/operators/read.py b/src/CSET/operators/read.py index 862396d68..cad4e8e22 100644 --- a/src/CSET/operators/read.py +++ b/src/CSET/operators/read.py @@ -996,3 +996,12 @@ def _lfric_forecast_period_standard_name_callback(cube: iris.cube.Cube): coord.standard_name = "forecast_period" except iris.exceptions.CoordinateNotFoundError: pass + + +def read_pickle(filename): + """Helper to read the cell stats output, until it returns cubes.""" + import pickle + + with open(filename, "rb") as f: + data = pickle.load(f) + return data diff --git a/src/CSET/operators/write.py b/src/CSET/operators/write.py index e67afd2c0..e334812d5 100644 --- a/src/CSET/operators/write.py +++ b/src/CSET/operators/write.py @@ -66,3 +66,15 @@ def write_cube_to_nc( # Save the file as nc compliant (iris should handle this) iris.save(cube, filename, zlib=True) return cube + + +def write_pickle(data, filename): + """Helper to save the cell stats output, until it returns cubes.""" + import pickle + if not data: + return + + Path(filename).parent.mkdir(exist_ok=True, parents=True) + with open(filename, "wb") as f: + pickle.dump(data, f) + pass diff --git a/src/CSET/recipes/cell_statistics.yaml b/src/CSET/recipes/cell_statistics.yaml index d0e75b04c..edd28a6e0 100644 --- a/src/CSET/recipes/cell_statistics.yaml +++ b/src/CSET/recipes/cell_statistics.yaml @@ -22,8 +22,11 @@ steps: cell_attribute: $CELL_ATTRIBUTE time_grouping: $TIME_GROUPING -# - operator: write.write_cube_to_nc -# overwrite: True +# these two are useful when working on the plotting +# - operator: write.write_pickle +# filename: /data/scratch/byron.blay/cset/cell_stats/out2/$VARNAME__$CELL_ATTRIBUTE__$TIME_GROUPING.pkl +# - operator: read.read_pickle +# filename: /data/scratch/byron.blay/cset/cell_stats/out2/$VARNAME__$CELL_ATTRIBUTE__$TIME_GROUPING.pkl - operator: plot.plot_cell_stats_histograms varname: $VARNAME From 33a450d5e1e4eda85f12f63e38c367353131de4b Mon Sep 17 00:00:00 2001 From: bblay Date: Mon, 29 Sep 2025 20:55:54 +0100 Subject: [PATCH 11/12] more controls on the page --- delme/run_cell_statistics.py | 41 ++--- .../operators/_cell_stats_plots_template.html | 140 +++++++-------- src/CSET/operators/cell_statistics.py | 36 ++-- src/CSET/operators/plot.py | 165 +++++------------- src/CSET/operators/write.py | 5 +- src/CSET/recipes/cell_statistics.yaml | 18 +- 6 files changed, 165 insertions(+), 240 deletions(-) diff --git a/delme/run_cell_statistics.py b/delme/run_cell_statistics.py index f93e20c4b..89b368a71 100644 --- a/delme/run_cell_statistics.py +++ b/delme/run_cell_statistics.py @@ -26,20 +26,17 @@ ), ] -# # todo: restore this +vars = [ + "surface_microphysical_rainfall_rate", # m01s04i203 + "surface_microphysical_rainfall_amount", # m01s04i201 +] # cell_attributes = ["effective_radius_in_km", "mean_value"] -# -# vars = [ -# "surface_microphysical_rainfall_amount", # m01s04i201 -# "surface_microphysical_rainfall_rate", # m01s04i203 todo: seems to have missing data, needs investigation -# ] -# # time_groupings = ["forecast_period", "hour"] -# faster dev -cell_attributes = ["effective_radius_in_km"] -vars = ["surface_microphysical_rainfall_rate"] -time_groupings = ["forecast_period"] +# # faster dev +# cell_attributes = ["effective_radius_in_km"] +# vars = ["surface_microphysical_rainfall_rate"] +# time_groupings = ["forecast_period"] # # DEBUG @@ -51,18 +48,14 @@ # todo: it's inefficient to keep reloading the data for each var/cell_attribute/time_grouping # it would seem to be better to send the list of vars to the operator and plot functions, just once. for var in vars: - for cell_attribute in cell_attributes: - for time_grouping in time_groupings: - - print(f"\nrunning recipe for {var}, {cell_attribute}, {time_grouping}\n" ) + print(f"\nrunning recipe for {var}\n" ) - recipe_variables = { - "INPUT_PATHS": [i[1] for i in model_data], - "MODEL_NAMES": [i[0] for i in model_data], - "VARNAME": var, - "CELL_ATTRIBUTE": cell_attribute, - "TIME_GROUPING": time_grouping, - } + recipe_variables = { + "INPUT_PATHS": [i[1] for i in model_data], + "MODEL_NAMES": [i[0] for i in model_data], + "VARNAME": var, + "OUTPUT_FOLDER": "/data/scratch/byron.blay/cset/cell_stats/out3", + } - recipe = parse_recipe(recipe_fpath, recipe_variables) - execute_recipe(recipe, Path(__file__).parent) + recipe = parse_recipe(recipe_fpath, recipe_variables) + execute_recipe(recipe, Path(__file__).parent) diff --git a/src/CSET/operators/_cell_stats_plots_template.html b/src/CSET/operators/_cell_stats_plots_template.html index 0de7bad91..d975bfe88 100644 --- a/src/CSET/operators/_cell_stats_plots_template.html +++ b/src/CSET/operators/_cell_stats_plots_template.html @@ -11,112 +11,96 @@