From a81bc61ce5591b657976b19ebb6ebaed19bf28be Mon Sep 17 00:00:00 2001
From: Alex Vermeulen <alex.vermeulen@icos-ri.eu>
Date: Sun, 21 Jun 2026 02:29:39 +0200
Subject: [PATCH] Index GRIB1 messages in scan_grib (edition-aware _split_file)

_split_file read the total-length field from the GRIB2 indicator section only
(64-bit length, low word at bytes 12-15), so GRIB1 messages (24-bit length at
bytes 4-6, plus ECMWF's scaled-length extension) got a wrong length, desynced
the scan, and were dropped. The decode path (GRIBCodec -> eccodes) already
reads both editions, so only the splitter needed to be edition-aware.

Also adds an EOF guard: trailing bytes after the last message could spin the
seek(-4) "marker may straddle the boundary" branch forever (latent; it only
surfaced once GRIB1 messages were parsed correctly).

Refs #358

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 kerchunk/grib2.py  | 19 +++++++++++++++----
 tests/test_grib.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/kerchunk/grib2.py b/kerchunk/grib2.py
index 8346809d..ae2f5650 100644
--- a/kerchunk/grib2.py
+++ b/kerchunk/grib2.py
@@ -66,12 +66,23 @@ def _split_file(f: io.FileIO, skip=0):
     while f.tell() < size:
         logger.debug(f"extract part {part + 1}")
         head = f.read(1024)
-        if b"GRIB" not in head:
-            f.seek(-4, 1)
+        ind = head.find(b"GRIB")
+        if ind == -1:
+            if len(head) < 1024:        # final partial read -> no more messages
+                break
+            f.seek(-4, 1)               # 'GRIB' may straddle the 1024-byte boundary
             continue
-        ind = head.index(b"GRIB")
         start = f.tell() - len(head) + ind
-        part_size = int.from_bytes(head[ind + 12 : ind + 16], "big")
+        edition = head[ind + 7]
+        if edition == 1:
+            # GRIB1: 24-bit total length at bytes 4-6. ECMWF "large message" extension:
+            # if the top bit is set, the real length is (len & 0x7fffff) * 120 bytes.
+            part_size = int.from_bytes(head[ind + 4 : ind + 7], "big")
+            if part_size & 0x800000:
+                part_size = (part_size & 0x7FFFFF) * 120
+        else:
+            # GRIB2: 64-bit total length at bytes 8-15.
+            part_size = int.from_bytes(head[ind + 8 : ind + 16], "big")
         f.seek(start)
         yield start, part_size, f.read(part_size)
         part += 1
diff --git a/tests/test_grib.py b/tests/test_grib.py
index 2c5387fd..0b051628 100644
--- a/tests/test_grib.py
+++ b/tests/test_grib.py
@@ -451,3 +451,37 @@ def test_extract_methods_grib_parameter(zarr_tree_and_datatree_instance):
 
     # checking if level in each series data
     assert all(list(map(lambda data: "level" in data.keys(), grib_metadata)))
+
+
+def test_scan_grib1(tmp_path):
+    # scan_grib must index GRIB1, not just GRIB2 (the decode path is edition-agnostic)
+    p = os.path.join(str(tmp_path), "sample.grib1")
+    gid = eccodes.codes_grib_new_from_samples("regular_ll_sfc_grib1")
+    assert eccodes.codes_get(gid, "editionNumber") == 1
+    with open(p, "wb") as f:
+        eccodes.codes_write(gid, f)
+    eccodes.codes_release(gid)
+
+    refs = scan_grib(p)
+    assert len(refs) >= 1
+    ds = xr.open_dataset(
+        fsspec.filesystem("reference", fo=refs[0]).get_mapper(""),
+        engine="zarr",
+        backend_kwargs={"consolidated": False},
+    )
+    assert len(ds.data_vars) >= 1
+    var = next(iter(ds.data_vars))
+    assert np.isfinite(np.asarray(ds[var].values)).any()
+
+
+def test_scan_grib1_and_grib2(tmp_path):
+    # a file holding both editions yields both messages
+    p = os.path.join(str(tmp_path), "mixed.grib")
+    g1 = eccodes.codes_grib_new_from_samples("regular_ll_sfc_grib1")
+    g2 = eccodes.codes_grib_new_from_samples("regular_ll_sfc_grib2")
+    with open(p, "wb") as f:
+        eccodes.codes_write(g1, f)
+        eccodes.codes_write(g2, f)
+    eccodes.codes_release(g1)
+    eccodes.codes_release(g2)
+    assert len(scan_grib(p)) >= 2