From 3ceaf06ed645004b9c10219c1ff77e76348e05bf Mon Sep 17 00:00:00 2001 From: Aaron Spring Date: Mon, 23 Feb 2026 11:07:47 +0100 Subject: [PATCH 1/6] Add Global Carbon Budget to climate catalog - Add custom excel_url driver for reading Excel files from URLs - Uses simplecache for disk caching - Add GCB 2025 and GCB 2021 from ICOS-CP data portal Note: intake v2 has native Excel support via PandasExcel reader, but the YAML format is complex and not compatible with existing catalogs. The custom driver provides a simpler solution for YAML-based catalogs. --- catalogs/climate.yaml | 37 +++++++++++++++ remote_climate_data/__init__.py | 6 ++- remote_climate_data/excel_source.py | 73 +++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 2 deletions(-) create mode 100644 remote_climate_data/excel_source.py diff --git a/catalogs/climate.yaml b/catalogs/climate.yaml index bdf5a0e..9fc4ea5 100644 --- a/catalogs/climate.yaml +++ b/catalogs/climate.yaml @@ -2,7 +2,44 @@ plugins: source: - module: intake_xarray + - module: remote_climate_data sources: + Global_Carbon_Budget_2025: + description: Global Carbon Budget 2025 + metadata: + url: https://globalcarbonbudget.org/ + doi: + - https://doi.org/10.5194/essd-2025-659 + - https://doi.org/10.18160/gcp-2025 + plots: + over_time: + kind: line + driver: excel_url + args: + urlpath: simplecache::https://data.icos-cp.eu/licence_accept?ids=%5B%22UtUDiUg-PuYWkAiHoUNn83e0%22%5D + sheet_name: Global Carbon Budget + header: 21 + index_col: Year + skipfooter: 0 + + Global_Carbon_Budget_2021: + description: Global Carbon Budget 2021 + metadata: + url: https://www.globalcarbonproject.org/carbonbudget/ + doi: + - https://doi.org/10.5194/essd-2021-386 + - https://doi.org/10.18160/gcp-2021 + plots: + over_time: + kind: line + driver: excel_url + args: + urlpath: simplecache::https://data.icos-cp.eu/licence_accept?ids=%5B%220ST81nXCND5VfAQdOCSJDveT%22%5D + sheet_name: Global Carbon Budget + header: 20 + index_col: Year + skipfooter: 0 + 
NOAA_correlation: description: climate indices from psl.noaa.gov/data/correlation metadata: diff --git a/remote_climate_data/__init__.py b/remote_climate_data/__init__.py index 892fafb..3d22125 100644 --- a/remote_climate_data/__init__.py +++ b/remote_climate_data/__init__.py @@ -1,8 +1,10 @@ import intake +from remote_climate_data import excel_source # noqa: F401 + def cmor_cat(cat="../master.yml", keyword="cmor", depth=3): if isinstance(cat, str): cat = intake.open_catalog(cat) - assert isinstance(cat, intake.catalog.local.YAMLFileCatalog) # type: ignore[attr-defined] - return intake.Catalog.from_dict(cat.search(keyword, depth=depth).walk()) # type: ignore[call-arg, union-attr] + assert isinstance(cat, intake.catalog.local.YAMLFileCatalog) + return intake.Catalog.from_dict(cat.search(keyword, depth=depth).walk()) diff --git a/remote_climate_data/excel_source.py b/remote_climate_data/excel_source.py new file mode 100644 index 0000000..e86fbc2 --- /dev/null +++ b/remote_climate_data/excel_source.py @@ -0,0 +1,73 @@ +import io + +import intake +import intake.source + + +class ExcelSource(intake.source.base.DataSource): + """Intake source for reading Excel files from URLs. + + Supports simplecache:: prefix for caching. + """ + + container = "dataframe" + name = "excel_url" + version = "0.0.1" + + def __init__( + self, + urlpath, + sheet_name="Global Carbon Budget", + header=21, + index_col="Year", + skipfooter=4, + metadata=None, + ): + """Initialize Excel source. 
+ + Args: + urlpath: URL or simplecache path to Excel file + sheet_name: Name of sheet to read + header: Row number to use as header (0-indexed) + index_col: Column to use as index + skipfooter: Number of rows to skip at end + metadata: Additional metadata + """ + super().__init__(metadata=metadata) + self.urlpath = urlpath + self.sheet_name = sheet_name + self.header = header + self.index_col = index_col + self.skipfooter = skipfooter + + def _get_schema(self): + return intake.source.base.Schema( + datatypes={"_": "python"}, + shape=(None, None), + npartitions=1, + metadata=self.metadata, + ) + + def _load(self): + import fsspec + import pandas as pd + + fs, _token, paths = fsspec.get_fs_token_paths(self.urlpath) + path = paths[0] + with fs.open(path, "rb") as f: + return pd.read_excel( + io.BytesIO(f.read()), + sheet_name=self.sheet_name, + header=self.header, + index_col=self.index_col, + skipfooter=self.skipfooter, + ) + + def read(self): + return self._load() + + def _close(self): + pass + + +intake.source.register_driver("excel_url", ExcelSource) From d6184952135024811697e09d3477d3b87cf440c3 Mon Sep 17 00:00:00 2001 From: Aaron Spring Date: Mon, 23 Feb 2026 11:38:47 +0100 Subject: [PATCH 2/6] Remove plots metadata, add GCB usage example to README --- README.md | 15 ++++++++++++++- catalogs/climate.yaml | 6 ------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 6438c52..b1e1114 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,8 @@ atmosphere: climate: - NOAA_correlation - NOAA_correlation_xr + - Global_Carbon_Budget_2025 + - Global_Carbon_Budget_2021 shapefiles: - Countries @@ -94,6 +96,17 @@ cat = intake.open_catalog('https://raw.githubusercontent.com/aaronspring/remote_ cat.atmosphere.HadCRUT5.to_dask() ``` +### Plot Global Carbon Budget +```python +import hvplot.pandas +import intake +cat = intake.open_catalog('master.yaml') +gcb = cat.climate().Global_Carbon_Budget_2025.read() +gcb.hvplot(y=['fossil emissions 
excluding carbonation', 'land-use change emissions', + 'atmospheric growth', 'ocean sink', 'land sink'], + title='Global Carbon Budget 2025') +``` + To explore the whole catalog, you can try: ```python cat.walk() @@ -115,7 +128,7 @@ Make data access for climate data easy: - [`intake_xarray`](https://intake-xarray.readthedocs.io/en/latest/) for: - `nc` using [`netcdf4`](https://github.com/Unidata/netcdf4-python) [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/atmosphere.yaml#L64)] - `tif` using [`rioxarray`](https://github.com/corteva/rioxarray) [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/humans.yaml#L42)] -- [`intake_excel`](https://github.com/edjdavid/intake-excel) for Excel `xls` and `xlsx` [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/climate.yaml#L35)] +- [`intake_excel`](https://github.com/edjdavid/intake-excel) for Excel `xls` and `xlsx` [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/climate.yaml#L35)] (see [`excel_source.py`](remote_climate_data/excel_source.py) for a custom driver to use if the package is unavailable) - [`intake_geopandas`](https://github.com/intake/intake_geopandas) for shapefiles `shp` [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/shapefiles.yaml#L11)], GeoJSON `geo.json` [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/shapefiles.yaml#L57)], GeoParquet `parquet`, `PostGIS` databases, `Spatialite` databases - [`regionmask`](https://regionmask.readthedocs.io/) for aggregating over geoshapes diff --git a/catalogs/climate.yaml b/catalogs/climate.yaml index 9fc4ea5..aee1538 100644 --- a/catalogs/climate.yaml +++ b/catalogs/climate.yaml @@ -11,9 
+11,6 @@ sources: doi: - https://doi.org/10.5194/essd-2025-659 - https://doi.org/10.18160/gcp-2025 - plots: - over_time: - kind: line driver: excel_url args: urlpath: simplecache::https://data.icos-cp.eu/licence_accept?ids=%5B%22UtUDiUg-PuYWkAiHoUNn83e0%22%5D @@ -29,9 +26,6 @@ sources: doi: - https://doi.org/10.5194/essd-2021-386 - https://doi.org/10.18160/gcp-2021 - plots: - over_time: - kind: line driver: excel_url args: urlpath: simplecache::https://data.icos-cp.eu/licence_accept?ids=%5B%220ST81nXCND5VfAQdOCSJDveT%22%5D From 9d7c51740e2c05b7c9a9330e90093dbb96fd798b Mon Sep 17 00:00:00 2001 From: Aaron Spring <12237157+aaronspring@users.noreply.github.com> Date: Mon, 23 Feb 2026 11:40:42 +0100 Subject: [PATCH 3/6] Update README.md --- README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b1e1114..5185e77 100644 --- a/README.md +++ b/README.md @@ -96,18 +96,15 @@ cat = intake.open_catalog('https://raw.githubusercontent.com/aaronspring/remote_ cat.atmosphere.HadCRUT5.to_dask() ``` -### Plot Global Carbon Budget ```python import hvplot.pandas -import intake -cat = intake.open_catalog('master.yaml') gcb = cat.climate().Global_Carbon_Budget_2025.read() gcb.hvplot(y=['fossil emissions excluding carbonation', 'land-use change emissions', 'atmospheric growth', 'ocean sink', 'land sink'], - title='Global Carbon Budget 2025') + title='Global Carbon Budget 2025') ``` -To explore the whole catalog, you can try: +Explore the whole catalog: ```python cat.walk() ``` From ee85526cb836d428696eb55cdc3fe8970739802b Mon Sep 17 00:00:00 2001 From: Aaron Spring Date: Mon, 23 Feb 2026 11:42:03 +0100 Subject: [PATCH 4/6] Add repr output to usage examples --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 5185e77..f410c1a 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,11 @@ import intake cat = 
intake.open_catalog('https://raw.githubusercontent.com/aaronspring/remote_climate_data/master/master.yaml') cat.atmosphere.HadCRUT5.to_dask() ``` +``` +<xarray.Dataset> Size: 42MB +Dimensions: (time: 2028, latitude: 36, longitude: 72, bnds: 2) +... +``` ```python import hvplot.pandas @@ -103,6 +108,15 @@ gcb.hvplot(y=['fossil emissions excluding carbonation', 'land-use change emissio 'atmospheric growth', 'ocean sink', 'land sink'], title='Global Carbon Budget 2025') ``` +``` + fossil emissions excluding carbonation ... budget imbalance +Year +1959 2.416788 ... 1.168380 +... +2024 10.534546 ... -1.691863 + +[66 rows x 7 columns] +``` Explore the whole catalog: ```python From a16f99c2a8f13726fa9f273550aacbf1a427154b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 23 Feb 2026 10:44:02 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f410c1a..98ce1dd 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ gcb.hvplot(y=['fossil emissions excluding carbonation', 'land-use change emissio ``` ``` fossil emissions excluding carbonation ... budget imbalance -Year +Year 1959 2.416788 ... 1.168380 ... 2024 10.534546 ... 
-1.691863 From b215179c2957a4319ea09d4a5d983a8ebb297f19 Mon Sep 17 00:00:00 2001 From: Aaron Spring Date: Mon, 23 Feb 2026 11:54:44 +0100 Subject: [PATCH 6/6] Add repr output to usage examples --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 98ce1dd..baa90c1 100644 --- a/README.md +++ b/README.md @@ -107,14 +107,12 @@ gcb = cat.climate().Global_Carbon_Budget_2025.read() gcb.hvplot(y=['fossil emissions excluding carbonation', 'land-use change emissions', 'atmospheric growth', 'ocean sink', 'land sink'], title='Global Carbon Budget 2025') -``` -``` +gcb fossil emissions excluding carbonation ... budget imbalance Year 1959 2.416788 ... 1.168380 ... 2024 10.534546 ... -1.691863 - [66 rows x 7 columns] ```