From a81bc61ce5591b657976b19ebb6ebaed19bf28be Mon Sep 17 00:00:00 2001 From: Alex Vermeulen Date: Sun, 21 Jun 2026 02:29:39 +0200 Subject: [PATCH] Index GRIB1 messages in scan_grib (edition-aware _split_file) _split_file read the total-length field from the GRIB2 indicator section only (64-bit length, low word at bytes 12-15), so GRIB1 messages (24-bit length at bytes 4-6, plus ECMWF's scaled-length extension) got a wrong length, desynced the scan, and were dropped. The decode path (GRIBCodec -> eccodes) already reads both editions, so only the splitter needed to be edition-aware. Also adds an EOF guard: trailing bytes after the last message could spin the seek(-4) "marker may straddle the boundary" branch forever (latent; it only surfaced once GRIB1 messages were parsed correctly). Refs #358 Co-Authored-By: Claude Opus 4.8 (1M context) --- kerchunk/grib2.py | 19 +++++++++++++++---- tests/test_grib.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/kerchunk/grib2.py b/kerchunk/grib2.py index 8346809d..ae2f5650 100644 --- a/kerchunk/grib2.py +++ b/kerchunk/grib2.py @@ -66,12 +66,23 @@ def _split_file(f: io.FileIO, skip=0): while f.tell() < size: logger.debug(f"extract part {part + 1}") head = f.read(1024) - if b"GRIB" not in head: - f.seek(-4, 1) + ind = head.find(b"GRIB") + if ind == -1: + if len(head) < 1024: # final partial read -> no more messages + break + f.seek(-4, 1) # 'GRIB' may straddle the 1024-byte boundary continue - ind = head.index(b"GRIB") start = f.tell() - len(head) + ind - part_size = int.from_bytes(head[ind + 12 : ind + 16], "big") + edition = head[ind + 7] + if edition == 1: + # GRIB1: 24-bit total length at bytes 4-6. ECMWF "large message" extension: + # if the top bit is set, the real length is (len & 0x7fffff) * 120 bytes. + part_size = int.from_bytes(head[ind + 4 : ind + 7], "big") + if part_size & 0x800000: + part_size = (part_size & 0x7FFFFF) * 120 + else: + # GRIB2: 64-bit total length at bytes 8-15. + part_size = int.from_bytes(head[ind + 8 : ind + 16], "big") f.seek(start) yield start, part_size, f.read(part_size) part += 1 diff --git a/tests/test_grib.py b/tests/test_grib.py index 2c5387fd..0b051628 100644 --- a/tests/test_grib.py +++ b/tests/test_grib.py @@ -451,3 +451,37 @@ def test_extract_methods_grib_parameter(zarr_tree_and_datatree_instance): # checking if level in each series data assert all(list(map(lambda data: "level" in data.keys(), grib_metadata))) + + +def test_scan_grib1(tmp_path): + # scan_grib must index GRIB1, not just GRIB2 (the decode path is edition-agnostic) + p = os.path.join(str(tmp_path), "sample.grib1") + gid = eccodes.codes_grib_new_from_samples("regular_ll_sfc_grib1") + assert eccodes.codes_get(gid, "editionNumber") == 1 + with open(p, "wb") as f: + eccodes.codes_write(gid, f) + eccodes.codes_release(gid) + + refs = scan_grib(p) + assert len(refs) >= 1 + ds = xr.open_dataset( + fsspec.filesystem("reference", fo=refs[0]).get_mapper(""), + engine="zarr", + backend_kwargs={"consolidated": False}, + ) + assert len(ds.data_vars) >= 1 + var = next(iter(ds.data_vars)) + assert np.isfinite(np.asarray(ds[var].values)).any() + + +def test_scan_grib1_and_grib2(tmp_path): + # a file holding both editions yields both messages + p = os.path.join(str(tmp_path), "mixed.grib") + g1 = eccodes.codes_grib_new_from_samples("regular_ll_sfc_grib1") + g2 = eccodes.codes_grib_new_from_samples("regular_ll_sfc_grib2") + with open(p, "wb") as f: + eccodes.codes_write(g1, f) + eccodes.codes_write(g2, f) + eccodes.codes_release(g1) + eccodes.codes_release(g2) + assert len(scan_grib(p)) >= 2