From d7609b8092b07bb6e3c80b570fa578cefa03dde4 Mon Sep 17 00:00:00 2001 From: Peter Dudfield Date: Fri, 15 Aug 2025 09:24:23 +0100 Subject: [PATCH 1/3] update readme, add scale factor, add 15 min satellite data at different location --- README.md | 12 ++++++++++++ src/cloudcasting_app/data.py | 30 +++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 43b0418..8891ea7 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,18 @@ The repo associated with training the models run here is https://github.com/open The model checkpoints are hosted at: https://huggingface.co/openclimatefix/cloudcasting_uk +## Environment Variables + +The following environment variables are used in the app: + +- `SATELLITE_ZARR_PATH`: The path to the satellite data in Zarr format. +- `OUTPUT_PREDICTION_DIRECTORY`: The directory where results are saved. + +### Optional Environment Variables + +- `SATELLITE_SCALE_FACTOR`: The scale factor for the satellite data. Defaults to 1023. +- `SATELLITE_15_ZARR_PATH`: The path to the 15 minute satellite data in Zarr format. 
If +this is not set then the `SATELLITE_ZARR_PATH` is used but `.zarr` is replaced with `_15.zarr` ## Installation diff --git a/src/cloudcasting_app/data.py b/src/cloudcasting_app/data.py index 7f6a4c6..45f25cd 100644 --- a/src/cloudcasting_app/data.py +++ b/src/cloudcasting_app/data.py @@ -1,6 +1,7 @@ import logging import shutil import os +import yaml import fsspec import numpy as np @@ -83,7 +84,18 @@ def prepare_satellite_data(t0: pd.Timestamp): ds = ds.sel(variable=channel_order) # Scale the satellite data from 0-1 - ds = ds / 1023 + scale_factor = int(os.environ.get("SATELLITE_SCALE_FACTOR", 1023)) + logger.info( + f"Scaling satellite data by {scale_factor} to be between 0 and 1" + ) + ds = ds / scale_factor + + # make sure area attrs are yaml string + if "area" in ds.data.attrs and isinstance(ds.data.attrs["area"], dict): + logger.warning("Converting area attribute to YAML string, " + "we should do this in the satellite consumer.") + ds.data.attrs["area"] = yaml.dump(ds.data.attrs["area"]) + # Resave ds = ds.compute() @@ -107,8 +119,10 @@ def download_all_sat_data() -> bool: # Set variable to track whether the satellite download is successful sat_available = False + # get paths + sat_5_dl_path, sat_15_dl_path = get_satellite_source_paths() + # download 5 minute satellite data - sat_5_dl_path = os.environ["SATELLITE_ZARR_PATH"] fs, _ = fsspec.core.url_to_fs(sat_5_dl_path) if fs.exists(sat_5_dl_path): sat_available = True @@ -121,7 +135,6 @@ def download_all_sat_data() -> bool: logger.info("No 5-minute data available") # Also download 15-minute satellite if it exists - sat_15_dl_path = sat_5_dl_path.replace(".zarr", "_15.zarr") if fs.exists(sat_15_dl_path): sat_available = True logger.info("Downloading 15-minute satellite data") @@ -226,3 +239,14 @@ def get_input_data(ds: xr.Dataset, t0: pd.Timestamp): X = np.nan_to_num(X, nan=-1) return torch.Tensor(X) + + +def get_satellite_source_paths() -> (str | None, str | None): + """ Get the paths to the satellite data 
from environment variables""" + sat_source_path_5 = os.getenv("SATELLITE_ZARR_PATH", None) + sat_source_path_15 = os.getenv("SATELLITE_15_ZARR_PATH", None) + if sat_source_path_15 is None and sat_source_path_5 is not None: + sat_source_path_15 = sat_source_path_5.replace(".zarr", "_15.zarr") + logger.info( + f"Satellite source paths: 5-minute: {sat_source_path_5}, 15-minute: {sat_source_path_15}") + return sat_source_path_5, sat_source_path_15 From 3530801e9905f18d642d6fd08dfaec9f28bcd2e4 Mon Sep 17 00:00:00 2001 From: Peter Dudfield Date: Fri, 15 Aug 2025 11:29:30 +0100 Subject: [PATCH 2/3] move location of attrs to yaml --- src/cloudcasting_app/data.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cloudcasting_app/data.py b/src/cloudcasting_app/data.py index 45f25cd..0512826 100644 --- a/src/cloudcasting_app/data.py +++ b/src/cloudcasting_app/data.py @@ -80,6 +80,12 @@ def prepare_satellite_data(t0: pd.Timestamp): # Crop the input area to expected ds = crop_input_area(ds) + # make sure area attrs are yaml string + if "area" in ds.data.attrs and isinstance(ds.data.attrs["area"], dict): + logger.warning("Converting area attribute to YAML string, " + "we should do this in the satellite consumer.") + ds.data.attrs["area"] = yaml.dump(ds.data.attrs["area"]) + # Reorder channels ds = ds.sel(variable=channel_order) @@ -90,12 +96,6 @@ def prepare_satellite_data(t0: pd.Timestamp): ) ds = ds / scale_factor - # make sure area attrs are yaml string - if "area" in ds.data.attrs and isinstance(ds.data.attrs["area"], dict): - logger.warning("Converting area attribute to YAML string, " - "we should do this in the satellite consumer.") - ds.data.attrs["area"] = yaml.dump(ds.data.attrs["area"]) - # Resave ds = ds.compute() From 27a1ce40ce0cdba7be5f39eed220fad5001a175d Mon Sep 17 00:00:00 2001 From: Peter Dudfield Date: Fri, 15 Aug 2025 11:53:35 +0100 Subject: [PATCH 3/3] update --- src/cloudcasting_app/data.py | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/src/cloudcasting_app/data.py b/src/cloudcasting_app/data.py index 0512826..3a6f092 100644 --- a/src/cloudcasting_app/data.py +++ b/src/cloudcasting_app/data.py @@ -77,15 +77,15 @@ def prepare_satellite_data(t0: pd.Timestamp): # Load data the data for more preprocessing ds = xr.open_zarr(sat_path) - # Crop the input area to expected - ds = crop_input_area(ds) - # make sure area attrs are yaml string if "area" in ds.data.attrs and isinstance(ds.data.attrs["area"], dict): logger.warning("Converting area attribute to YAML string, " "we should do this in the satellite consumer.") ds.data.attrs["area"] = yaml.dump(ds.data.attrs["area"]) + # Crop the input area to expected + ds = crop_input_area(ds) + # Reorder channels ds = ds.sel(variable=channel_order)