diff --git a/README.md b/README.md index 6438c52..baa90c1 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,8 @@ atmosphere: climate: - NOAA_correlation - NOAA_correlation_xr + - Global_Carbon_Budget_2025 + - Global_Carbon_Budget_2021 shapefiles: - Countries @@ -93,8 +95,28 @@ import intake cat = intake.open_catalog('https://raw.githubusercontent.com/aaronspring/remote_climate_data/master/master.yaml') cat.atmosphere.HadCRUT5.to_dask() ``` +``` + Size: 42MB +Dimensions: (time: 2028, latitude: 36, longitude: 72, bnds: 2) +... +``` + +```python +import hvplot.pandas +gcb = cat.climate().Global_Carbon_Budget_2025.read() +gcb.hvplot(y=['fossil emissions excluding carbonation', 'land-use change emissions', + 'atmospheric growth', 'ocean sink', 'land sink'], + title='Global Carbon Budget 2025') +gcb + fossil emissions excluding carbonation ... budget imbalance +Year +1959 2.416788 ... 1.168380 +... +2024 10.534546 ... -1.691863 +[66 rows x 7 columns] +``` -To explore the whole catalog, you can try: +Explore the whole catalog: ```python cat.walk() ``` @@ -115,7 +137,7 @@ Make data access for climate data easy: - [`intake_xarray`](https://intake-xarray.readthedocs.io/en/latest/) for: - `nc` using [`netcdf4`](https://github.com/Unidata/netcdf4-python) [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/atmosphere.yaml#L64)] - `tif` using [`rioxarray`](https://github.com/corteva/rioxarray) [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/humans.yaml#L42)] -- [`intake_excel`](https://github.com/edjdavid/intake-excel) for Excel `xls` and `xlsx` [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/climate.yaml#L35)] +- [`intake_excel`](https://github.com/edjdavid/intake-excel) for Excel `xls` and `xlsx` 
[[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/climate.yaml#L35)] (see [`excel_source.py`](remote_climate_data/excel_source.py) for custom driver if package unavailable) - [`intake_geopandas`](https://github.com/intake/intake_geopandas) for shapefiles `shp` [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/shapefiles.yaml#L11)], GeoJSON `geo.json` [[example](https://github.com/aaronspring/remote_climate_data/blob/1209c5ebf5877b09b4403ea60da6d97b374b7b5c/catalogs/shapefiles.yaml#L57)], GeoParquet `parquet`, `PostGIS` databases, `Spatialite` databases - [`regionmask`](https://regionmask.readthedocs.io/) for aggregating over geoshapes diff --git a/catalogs/climate.yaml b/catalogs/climate.yaml index bdf5a0e..aee1538 100644 --- a/catalogs/climate.yaml +++ b/catalogs/climate.yaml @@ -2,7 +2,38 @@ plugins: source: - module: intake_xarray + - module: remote_climate_data sources: + Global_Carbon_Budget_2025: + description: Global Carbon Budget 2025 + metadata: + url: https://globalcarbonbudget.org/ + doi: + - https://doi.org/10.5194/essd-2025-659 + - https://doi.org/10.18160/gcp-2025 + driver: excel_url + args: + urlpath: simplecache::https://data.icos-cp.eu/licence_accept?ids=%5B%22UtUDiUg-PuYWkAiHoUNn83e0%22%5D + sheet_name: Global Carbon Budget + header: 21 + index_col: Year + skipfooter: 0 + + Global_Carbon_Budget_2021: + description: Global Carbon Budget 2021 + metadata: + url: https://www.globalcarbonproject.org/carbonbudget/ + doi: + - https://doi.org/10.5194/essd-2021-386 + - https://doi.org/10.18160/gcp-2021 + driver: excel_url + args: + urlpath: simplecache::https://data.icos-cp.eu/licence_accept?ids=%5B%220ST81nXCND5VfAQdOCSJDveT%22%5D + sheet_name: Global Carbon Budget + header: 20 + index_col: Year + skipfooter: 0 + NOAA_correlation: description: climate indices from psl.noaa.gov/data/correlation metadata: diff --git 
a/remote_climate_data/__init__.py b/remote_climate_data/__init__.py
index 892fafb..3d22125 100644
--- a/remote_climate_data/__init__.py
+++ b/remote_climate_data/__init__.py
@@ -1,8 +1,10 @@
 import intake
 
+from remote_climate_data import excel_source  # noqa: F401
+
 
 def cmor_cat(cat="../master.yml", keyword="cmor", depth=3):
     if isinstance(cat, str):
         cat = intake.open_catalog(cat)
-    assert isinstance(cat, intake.catalog.local.YAMLFileCatalog)  # type: ignore[attr-defined]
-    return intake.Catalog.from_dict(cat.search(keyword, depth=depth).walk())  # type: ignore[call-arg, union-attr]
+    assert isinstance(cat, intake.catalog.local.YAMLFileCatalog)
+    return intake.Catalog.from_dict(cat.search(keyword, depth=depth).walk())
diff --git a/remote_climate_data/excel_source.py b/remote_climate_data/excel_source.py
new file mode 100644
index 0000000..e86fbc2
--- /dev/null
+++ b/remote_climate_data/excel_source.py
@@ -0,0 +1,86 @@
+import io
+
+import intake
+import intake.source
+
+
+class ExcelSource(intake.source.base.DataSource):
+    """Intake source for reading one Excel sheet from a URL into pandas.
+
+    ``urlpath`` is opened through fsspec, so chained URLs such as
+    ``simplecache::https://...`` can be used for caching.
+    """
+
+    container = "dataframe"
+    name = "excel_url"
+    version = "0.0.1"
+
+    def __init__(
+        self,
+        urlpath,
+        sheet_name="Global Carbon Budget",
+        header=21,
+        index_col="Year",
+        skipfooter=4,
+        metadata=None,
+    ):
+        """Initialize Excel source.
+
+        Defaults target the Global Carbon Budget workbook; catalog entries
+        for other files should set them explicitly.
+
+        Args:
+            urlpath: URL or simplecache path to Excel file
+            sheet_name: Name of sheet to read
+            header: Row number to use as header (0-indexed)
+            index_col: Column to use as index
+            skipfooter: Number of rows to skip at end
+            metadata: Additional metadata
+        """
+        super().__init__(metadata=metadata)
+        self.urlpath = urlpath
+        self.sheet_name = sheet_name
+        self.header = header
+        self.index_col = index_col
+        self.skipfooter = skipfooter
+
+    def _get_schema(self):
+        # Shape and dtypes are unknown until the workbook is parsed.
+        # ``extra_metadata`` has to be a mapping because the base class
+        # merges it into ``self.metadata`` when loading the schema.
+        return intake.source.base.Schema(
+            datashape=None,
+            dtype=None,
+            shape=(None, None),
+            npartitions=1,
+            extra_metadata={},
+        )
+
+    def _load(self):
+        import fsspec
+        import pandas as pd
+
+        fs, _token, paths = fsspec.get_fs_token_paths(self.urlpath)
+        with fs.open(paths[0], "rb") as f:
+            # Read the whole file into memory before handing it to pandas.
+            return pd.read_excel(
+                io.BytesIO(f.read()),
+                sheet_name=self.sheet_name,
+                header=self.header,
+                index_col=self.index_col,
+                skipfooter=self.skipfooter,
+            )
+
+    def _get_partition(self, i):
+        # The schema declares a single partition: the whole sheet.
+        return self._load()
+
+    def read(self):
+        """Read the configured sheet with ``pandas.read_excel`` and return it."""
+        return self._load()
+
+    def _close(self):
+        pass
+
+
+intake.source.register_driver("excel_url", ExcelSource)