From 1c5861ea97d567f27f3b4965cd97c409802735e1 Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 3 Mar 2026 00:15:01 -0600 Subject: [PATCH 1/5] Better collection detection --- CHANGELOG.md | 6 ++ src/hdmf/backends/hdf5/h5tools.py | 10 +-- src/hdmf/build/objectmapper.py | 7 +- src/hdmf/common/table.py | 12 ++-- src/hdmf/container.py | 4 +- src/hdmf/data_utils.py | 6 +- src/hdmf/utils.py | 35 ++++++++-- src/hdmf/validate/validator.py | 9 ++- tests/unit/test_io_hdf5_h5tools.py | 32 +++++++++ tests/unit/utils_test/test_utils.py | 104 +++++++++++++++++++++++++++- 10 files changed, 200 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e05245609..393268c66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # HDMF Changelog +## HDMF 5.0.1 (Upcoming) + +### Enhancements +- Replaced `hasattr(data, "__len__")` checks with `ndim`-based collection detection via new `_is_collection` and `_get_length` helpers in `hdmf.utils`. This fixes compatibility with array libraries that follow the Python array API standard (e.g., zarr v3), which provide `ndim` and `shape` but not `__len__`. 
@h-mayorquin [#1414](https://github.com/hdmf-dev/hdmf/pull/1414) + + ## HDMF 5.0.0 (March 2, 2026) ### Changed diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index c69ba4210..2a539c06b 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -20,7 +20,7 @@ from ...data_utils import AbstractDataChunkIterator from ...spec import RefSpec, DtypeSpec, NamespaceCatalog from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset, is_zarr_array, - get_basic_array_info, generate_array_html_repr) + get_basic_array_info, generate_array_html_repr, _is_collection, _get_length) from ..utils import NamespaceToBuilderHelper, WriteStatusTracker ROOT_NAME = 'root' @@ -805,10 +805,12 @@ def get_type(cls, data): return H5_BINARY elif isinstance(data, Container): return H5_REF - elif not hasattr(data, '__len__'): + elif not _is_collection(data): + if isinstance(data, np.ndarray) and data.ndim == 0: + return data.dtype.type return type(data) else: - if len(data) == 0: + if _get_length(data) == 0: if hasattr(data, 'dtype'): return data.dtype else: @@ -1238,7 +1240,7 @@ def _filler(): dset = self.__setup_chunked_dset__(parent, name, data, options) self.__dci_queue.append(dataset=dset, data=data) # Write a regular in memory array (e.g., numpy array, list etc.) 
- elif hasattr(data, '__len__'): + elif _is_collection(data): dset = self.__list_fill__(parent, name, data, options) # Write a regular scalar dataset else: diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 389c9e834..affca571a 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -18,6 +18,7 @@ from ..container import AbstractContainer, Data from ..term_set import TermSetWrapper from ..data_utils import DataIO, AbstractDataChunkIterator +from ..utils import _is_collection, _get_length from ..query import ReferenceResolver from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec from ..spec.spec import BaseStorageSpec @@ -1024,19 +1025,19 @@ def __is_reftype(self, data): return False tmp = data - while hasattr(tmp, '__len__') and not isinstance(tmp, (AbstractContainer, str, bytes)): + while _is_collection(tmp) and not isinstance(tmp, AbstractContainer): tmptmp = None for t in tmp: # In case of a numeric array stop the iteration at the first element to avoid long-running loop if isinstance(t, (int, float, complex, bool)): break - if hasattr(t, '__len__') and len(t) > 0 and not isinstance(t, (AbstractContainer, str, bytes)): + if _is_collection(t) and _get_length(t) > 0 and not isinstance(t, AbstractContainer): tmptmp = tmp[0] break if tmptmp is not None: break else: - if len(tmp) == 0: + if _get_length(tmp) == 0: tmp = None else: tmp = tmp[0] diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index 9feeb04e9..e13b5704e 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -15,7 +15,7 @@ from . 
import register_class, EXP_NAMESPACE from ..container import Container, Data from ..data_utils import DataIO, AbstractDataChunkIterator -from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged +from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged, _is_collection, _get_length from ..term_set import TermSetWrapper @@ -294,7 +294,7 @@ def _validate_new_data(self, data): (hasattr(data, "data") and isinstance(data.data, AbstractDataChunkIterator))): if not np.issubdtype(data.dtype, np.integer): raise ValueError("ElementIdentifiers must contain integers") - elif hasattr(data, "__len__") and len(data): + elif _is_collection(data) and _get_length(data): self._validate_new_data_element(data[0]) def _validate_new_data_element(self, arg): @@ -1464,8 +1464,8 @@ def from_dataframe(cls, **kwargs): else: columns.append({'name': col_name, 'description': column_descriptions.get(col_name, 'no description')}) - if hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str): - lengths = [len(x) for x in df[col_name]] + if _is_collection(df[col_name].iloc[0]): + lengths = [_get_length(x) for x in df[col_name]] if not lengths[1:] == lengths[:-1]: columns[-1].update(index=True) @@ -1780,8 +1780,8 @@ def _validate_new_data_element(self, arg): def _uint_precision(elements): """ Calculate the uint precision needed to encode a set of elements """ n_elements = elements - if hasattr(elements, '__len__'): - n_elements = len(elements) + if _is_collection(elements): + n_elements = _get_length(elements) return np.dtype('uint%d' % (8 * max(1, int((2 ** np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))))).type diff --git a/src/hdmf/container.py b/src/hdmf/container.py index 762555112..2d1e1a172 100644 --- a/src/hdmf/container.py +++ b/src/hdmf/container.py @@ -11,7 +11,7 @@ from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator from .utils 
import (docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict, - get_basic_array_info, generate_array_html_repr) + get_basic_array_info, generate_array_html_repr, _is_collection) from .term_set import TermSet, TermSetWrapper @@ -626,7 +626,7 @@ def __repr__(self): template += "\nFields:\n" for k in sorted(self.fields): # sorted to enable tests v = self.fields[k] - if hasattr(v, '__len__'): + if _is_collection(v): if isinstance(v, (np.ndarray, list, tuple)) or v: template += " {}: {}\n".format(k, self.__smart_str(v, 1)) else: diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index 0f245eda7..bcba9b6e3 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -14,7 +14,7 @@ import h5py import numpy as np -from .utils import docval, getargs, popargs, docval_macro, get_data_shape +from .utils import docval, getargs, popargs, docval_macro, get_data_shape, _is_collection, _get_length def append_data(data, arg): from hdmf.backends.hdf5.h5_utils import HDMFDataset @@ -738,9 +738,9 @@ def maxshape(self): # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a # chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if # possible. 
Otherwise, use None to represent an unlimited size - if hasattr(self.data, '__len__') and self.iter_axis == 0: + if _is_collection(self.data) and self.iter_axis == 0: # special case of 1-D array - self.__maxshape[0] = len(self.data) + self.__maxshape[0] = _get_length(self.data) else: self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis] except AttributeError: # from self.data.shape diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index 578da41f2..d6fd45795 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -829,9 +829,10 @@ def get_data_shape(data, strict_no_data_load=False): def __get_shape_helper(local_data): shape = list() - if hasattr(local_data, '__len__'): - shape.append(len(local_data)) - if len(local_data): + if _is_collection(local_data): + length = _get_length(local_data) + shape.append(length) + if length: el = next(iter(local_data)) # If local_data is a list/tuple of Data, do not iterate into the objects if not isinstance(el, (str, bytes, Data)): @@ -849,12 +850,38 @@ def __get_shape_helper(local_data): return data.maxshape if isinstance(data, dict): return None - if hasattr(data, '__len__') and not isinstance(data, (str, bytes)): + if _is_collection(data): if not strict_no_data_load or isinstance(data, (list, tuple, set)): return __get_shape_helper(data) return None +def _is_collection(data): + """Check if data is a collection (array-like with elements) vs a scalar. + + Checks ndim first because the Python array API standard requires conforming + arrays to have ndim and shape but does not require __len__. This handles + array libraries like zarr v3 that follow the standard. Falls back to + __len__ for plain Python containers (list, tuple). Strings and bytes + are treated as scalars. + """ + if isinstance(data, (str, bytes)): + return False + if hasattr(data, "ndim"): + return data.ndim > 0 + return hasattr(data, "__len__") + + +def _get_length(data): + """Get the length of the first dimension of a collection. 
+ + Uses shape[0] for array-like objects and len() for plain containers. + """ + if hasattr(data, "shape") and data.shape is not None: + return data.shape[0] + return len(data) + + def pystr(s): """ Convert a string of characters to Python str object diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index c774a8bf1..d48d193c1 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -13,6 +13,7 @@ from ..spec import Spec, AttributeSpec, GroupSpec, DatasetSpec, RefSpec, LinkSpec from ..spec import SpecNamespace from ..spec.spec import BaseStorageSpec, DtypeHelper +from ..utils import _is_collection, _get_length from ..utils import docval, getargs, pystr, get_data_shape from ..query import ReferenceResolver @@ -198,7 +199,11 @@ def get_type(data, builder_dtype=None): # Numpy bool data elif isinstance(data, np.bool_): return 'bool', None - if not hasattr(data, '__len__'): + # Numpy 0-d structured array with compound dtype (ndim=0 but has named fields) + if isinstance(data, np.ndarray) and data.ndim == 0 and len(data.dtype) > 1: + if builder_dtype and isinstance(builder_dtype, list): + return _get_type_compound_dtype(data, builder_dtype) + if not _is_collection(data): if type(data) is float: # Python float is 64-bit return 'float64', None if type(data) is int: # Python int is 64-bit (or larger) @@ -214,7 +219,7 @@ def get_type(data, builder_dtype=None): return _get_type_from_dtype_attr(data, builder_dtype) # If all else has failed, try to determine the datatype from the first element of the array - if len(data) > 0: + if _get_length(data) > 0: return get_type(data[0], builder_dtype) raise EmptyArrayError() diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index c7e6ea531..21392e8e0 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -138,6 +138,21 @@ def test_write_dataset_scalar(self): self.assertTupleEqual(dset.shape, ()) 
self.assertEqual(dset[()], a) + def test_write_dataset_0d_ndarray(self): + """Regression: writing a 0-d ndarray as a scalar dataset should work. + + A user computing a scalar attribute (e.g. a sampling rate) with an + array-API-conforming library gets a 0-d array back. Converting to + numpy via np.asarray() produces a 0-d ndarray, which they then pass + to hdmf to store as an HDF5 dataset. Previously this crashed because + the __len__ heuristic misidentified 0-d ndarrays as collections. + """ + sampling_rate = np.asarray(30000.0) + self.io.write_dataset(self.f, DatasetBuilder('test_dataset', sampling_rate, attributes={})) + dset = self.f['test_dataset'] + self.assertTupleEqual(dset.shape, ()) + self.assertEqual(dset[()], 30000.0) + def test_write_dataset_string(self): a = 'test string' self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, attributes={})) @@ -3913,6 +3928,23 @@ def test_setup_empty_dset_create_exception(self): with self.assertRaisesRegex(Exception, "Could not create dataset foo in /"): HDF5IO.__setup_empty_dset__(self.f, 'foo', {'shape': (3, 3), 'dtype': 'float'}) + def test_get_type_0d_ndarray(self): + """Regression: get_type should handle 0-d ndarrays. + + The Python array API standard requires reductions (mean, sum, etc.) + to return 0-d arrays, not scalars. When results from array-API- + conforming libraries (zarr v3, cupy, etc.) are converted to numpy + via np.asarray(), the result is a 0-d ndarray. A 0-d ndarray has + __len__ defined but len() raises TypeError, which previously + crashed get_type. We use np.asarray(scalar) to produce a 0-d + ndarray without requiring any array library as a test dependency. 
+ """ + zero_d = np.asarray(3.14) + self.assertIsInstance(zero_d, np.ndarray) + self.assertEqual(zero_d.ndim, 0) + result = HDF5IO.get_type(zero_d) + self.assertIs(result, np.float64) + class H5DataIOTests(TestCase): diff --git a/tests/unit/utils_test/test_utils.py b/tests/unit/utils_test/test_utils.py index 98115d3d3..635c94ebc 100644 --- a/tests/unit/utils_test/test_utils.py +++ b/tests/unit/utils_test/test_utils.py @@ -5,7 +5,109 @@ from hdmf.container import Data from hdmf.data_utils import DataChunkIterator, DataIO from hdmf.testing import TestCase -from hdmf.utils import get_data_shape, to_uint_array, is_newer_version +from hdmf.utils import get_data_shape, to_uint_array, is_newer_version, _is_collection, _get_length + + +class TestIsCollection(TestCase): + """Tests for _is_collection helper that detects collections vs scalars.""" + + def test_numpy_1d_array(self): + self.assertTrue(_is_collection(np.array([1, 2, 3]))) + + def test_numpy_empty_array(self): + self.assertTrue(_is_collection(np.array([]))) + + def test_numpy_2d_array(self): + self.assertTrue(_is_collection(np.array([[1, 2], [3, 4]]))) + + def test_numpy_0d_array(self): + self.assertFalse(_is_collection(np.array(5))) + + def test_numpy_scalar(self): + self.assertFalse(_is_collection(np.float64(3.14))) + + def test_list(self): + self.assertTrue(_is_collection([1, 2, 3])) + + def test_empty_list(self): + self.assertTrue(_is_collection([])) + + def test_tuple(self): + self.assertTrue(_is_collection((1, 2, 3))) + + def test_set(self): + self.assertTrue(_is_collection({1, 2, 3})) + + def test_int(self): + self.assertFalse(_is_collection(42)) + + def test_float(self): + self.assertFalse(_is_collection(3.14)) + + def test_string(self): + self.assertFalse(_is_collection("hello")) + + def test_bytes(self): + self.assertFalse(_is_collection(b"hello")) + + def test_none(self): + self.assertFalse(_is_collection(None)) + + def test_bool(self): + self.assertFalse(_is_collection(True)) + + def 
test_ndim_without_len(self): + """Simulate zarr v3 array: has ndim and shape but no __len__.""" + class FakeArray: + ndim = 2 + shape = (10, 5) + self.assertTrue(_is_collection(FakeArray())) + + def test_ndim_zero_without_len(self): + """Simulate 0-d array-like: has ndim=0 but no __len__.""" + class FakeScalar: + ndim = 0 + shape = () + self.assertFalse(_is_collection(FakeScalar())) + + def test_numpy_0d_array_not_collection(self): + """Regression: numpy 0-d ndarrays have __len__ but len() raises TypeError. + + zarr v3 scalar indexing (e.g. zarr_array[0]) returns 0-d ndarrays. + The old hasattr(data, '__len__') check would incorrectly treat these + as collections, then crash on len(data). + """ + zero_d = np.array(5.0) + self.assertTrue(hasattr(zero_d, '__len__')) # confirms the old check would pass + with self.assertRaises(TypeError): + len(zero_d) # confirms len() crashes + self.assertFalse(_is_collection(zero_d)) # our helper handles it correctly + + + +class TestGetLength(TestCase): + """Tests for _get_length helper that gets first-dimension length.""" + + def test_list(self): + self.assertEqual(_get_length([1, 2, 3]), 3) + + def test_empty_list(self): + self.assertEqual(_get_length([]), 0) + + def test_tuple(self): + self.assertEqual(_get_length((1, 2)), 2) + + def test_numpy_array(self): + self.assertEqual(_get_length(np.array([1, 2, 3, 4])), 4) + + def test_numpy_2d_array(self): + self.assertEqual(_get_length(np.array([[1, 2], [3, 4], [5, 6]])), 3) + + def test_shape_without_len(self): + """Simulate zarr v3 array: has shape but no __len__.""" + class FakeArray: + shape = (10, 5) + self.assertEqual(_get_length(FakeArray()), 10) class TestGetDataShape(TestCase): From 4e4c6119314c54cef712c70000477fcf41210a66 Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 3 Mar 2026 00:42:53 -0600 Subject: [PATCH 2/5] fix ruff --- src/hdmf/common/table.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hdmf/common/table.py 
b/src/hdmf/common/table.py index e13b5704e..e170c8b6a 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -15,7 +15,8 @@ from . import register_class, EXP_NAMESPACE from ..container import Container, Data from ..data_utils import DataIO, AbstractDataChunkIterator -from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged, _is_collection, _get_length +from ..utils import (docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged, + _is_collection, _get_length) from ..term_set import TermSetWrapper From e2d9ed18122168971d903b1c3f512fc947c1009b Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 3 Mar 2026 03:31:12 -0600 Subject: [PATCH 3/5] Rename _get_length to _first_dim_size The function returns shape[0], not len(). The new name reflects the actual semantics: the size of the first dimension, which is what the array API standard exposes via .shape rather than __len__. Co-Authored-By: Claude Opus 4.6 --- hdmf_issue_get_length_semantics.md | 50 +++++++++++++++ hdmf_issue_is_collection.md | 8 +++ hdmf_issue_resolve_scalar.md | 7 +++ missing_tests_for_array_api_compat.md | 91 +++++++++++++++++++++++++++ src/hdmf/backends/hdf5/h5tools.py | 4 +- src/hdmf/build/objectmapper.py | 6 +- src/hdmf/common/table.py | 8 +-- src/hdmf/data_utils.py | 4 +- src/hdmf/utils.py | 9 +-- src/hdmf/validate/validator.py | 4 +- tests/unit/utils_test/test_utils.py | 16 ++--- 11 files changed, 182 insertions(+), 25 deletions(-) create mode 100644 hdmf_issue_get_length_semantics.md create mode 100644 hdmf_issue_is_collection.md create mode 100644 hdmf_issue_resolve_scalar.md create mode 100644 missing_tests_for_array_api_compat.md diff --git a/hdmf_issue_get_length_semantics.md b/hdmf_issue_get_length_semantics.md new file mode 100644 index 000000000..f878c97ec --- /dev/null +++ b/hdmf_issue_get_length_semantics.md @@ -0,0 +1,50 @@ +# Rename `_get_length` to reflect actual semantics: 
first-dimension size, not `len()` + +The `_get_length` utility was introduced in PR #1414 to replace direct `len()` calls that break with array-API-conforming libraries (e.g., zarr v3, which does not implement `__len__` on arrays). The function works correctly, but its name implies `len()` semantics when what hdmf actually needs in every call site is the size of the first dimension. + +## Current implementation + +```python +def _get_length(data): + if hasattr(data, "shape") and data.shape is not None: + return data.shape[0] + return len(data) +``` + +## What every call site actually does + +Every use of `_get_length` in the codebase falls into one of three patterns, all of which are asking "how many rows / elements along the first axis?": + +**Bounds checking / slice resolution** +- `table.py`: `arg.indices(_get_length(self.data))`, `arg >= _get_length(self.data)` +- `table.py`: `DynamicTableRegion.shape` returns `(_get_length(self.data), ...)` + +**Empty check** +- `table.py`: `_get_length(data) > 0` +- `validator.py`: `_get_length(data) > 0` +- `h5tools.py`: `_get_length(data) == 0` +- `objectmapper.py`: `_get_length(t) > 0`, `_get_length(tmp) == 0` + +**Sizing / iteration** +- `container.py`: `Data.__len__` returns `_get_length(self.__data)` +- `h5tools.py`: `_get_length(data) > dset.shape[0]`, `new_shape[0] = _get_length(data)` +- `objectmapper.py`: `data_shape = (_get_length(data),)` +- `data_utils.py`: `self.__maxshape[0] = _get_length(self.data)` + +None of these are asking for "the length of this object" in the Python `__len__` sense. They are all asking for `shape[0]` -- the number of elements along the first axis. + +## Why the name matters + +1. The array API standard deliberately excludes `__len__` from array objects. This was not an oversight: for multi-dimensional arrays, `len()` is ambiguous (does it mean total elements? first axis? something else?). The standard uses `.shape` and `.size` instead. + +2. 
Naming this `_get_length` suggests that hdmf is working around the missing `__len__` by reimplementing it. A name like `_num_rows` or `_first_dim_size` would make clear that we are using the right semantic: first-axis size via `.shape[0]`, with a `len()` fallback for plain containers that don't have `.shape`. + +3. Future contributors will see `_get_length` and think "this is just `len()` with extra steps" rather than understanding the intent. + +## Suggested rename + +`_get_length` -> `_num_rows` or `_first_dim_size` + +The fallback to `len()` for plain containers (lists, tuples, dicts) should stay since these don't have `.shape`, and for 1-d containers `len()` does give the first-axis size. But the name should reflect what we are actually measuring. + +This is a low-priority naming issue, not a bug. All existing call sites work correctly with the current implementation. diff --git a/hdmf_issue_is_collection.md b/hdmf_issue_is_collection.md new file mode 100644 index 000000000..ad2016a7f --- /dev/null +++ b/hdmf_issue_is_collection.md @@ -0,0 +1,8 @@ +# Replace `hasattr(data, "__len__")` with `ndim`-based collection detection + +Motivated by hdmf-dev/hdmf-zarr#325 (zarr v2 to v3 migration). + +hdmf uses `hasattr(data, "__len__")` across the library ([`utils.py`](https://github.com/hdmf-dev/hdmf/blob/a27f6061/src/hdmf/utils.py#L852), [`h5tools.py`](https://github.com/hdmf-dev/hdmf/blob/a27f6061/src/hdmf/backends/hdf5/h5tools.py#L808), [`validator.py`](https://github.com/hdmf-dev/hdmf/blob/a27f6061/src/hdmf/validate/validator.py#L201)) to decide whether data is a collection or a scalar. This has worked so far because numpy implements `__len__` and other array libraries (h5py, zarr v2) followed that convention. 
But `__len__` is not part of any array specification (see the [array API standard object definition](https://data-apis.org/array-api/draft/API_specification/array_object.html#attributes) for the required attributes and methods), so we cannot rely on future array libraries continuing to provide it. The check is also implicit about what it is actually trying to detect. The zarr v2 to v3 migration (hdmf-dev/hdmf-zarr#325) exposed this, as zarr v3 arrays no longer implement `__len__`. The `__len__` heuristic also has a bug in the current release: 0-d ndarrays have `__len__` defined but `len()` raises `TypeError`, so passing one to `HDF5IO.get_type` or `write_dataset` crashes. This already happens when converting results from array-API-conforming libraries (which return 0-d arrays from reductions like `mean` and `sum`) to numpy via `np.asarray()`. A regression test for this is included. + +This PR centralizes the "is this a collection?" logic into a single helper (`_is_collection`) that is both more general and more explicit: check `ndim > 0` first (which the array API standard guarantees for any conforming array), then fall back to `__len__` for plain Python containers. A companion `_get_length` helper safely gets first-dimension length via `shape[0]` or `len()`. This replaces all scattered checks with one clear, documented policy that covers both legacy types and array API standard conforming libraries, and should eliminate the need for monkey-patching in hdmf-zarr. +T \ No newline at end of file diff --git a/hdmf_issue_resolve_scalar.md b/hdmf_issue_resolve_scalar.md new file mode 100644 index 000000000..c3d0c77d4 --- /dev/null +++ b/hdmf_issue_resolve_scalar.md @@ -0,0 +1,7 @@ +# Handle 0-d ndarrays in scalar isinstance checks + +Chained after #1414 (`_is_collection`). Motivated by hdmf-dev/hdmf-zarr#325 (zarr v2 to v3 migration). 
+ +hdmf's `get_type` functions infer element dtype by recursively indexing with `data[0]` until reaching a scalar, then calling `type()` on it. With numpy and zarr v2, `data[0]` on a 1-d float array returns a numpy scalar (e.g., `numpy.float64`), which passes `isinstance(val, float)` and has no `__len__`, so all downstream checks work. With zarr v3 (following the Python array API standard), `data[0]` returns a 0-d ndarray instead. A 0-d ndarray fails `isinstance(val, (int, float, str, bool))` and `type()` returns `numpy.ndarray` rather than the element dtype. PR #1414 fixed the crash path (`__len__` heuristic), but `isinstance` checks in other parts of the codebase still silently take the wrong branch when they encounter a 0-d ndarray. + +This PR adds a `_unwrap_scalar` helper in `hdmf.utils` that converts 0-d ndarrays to numpy scalars via `.item()`, and applies it at the remaining `isinstance` checks that compare against Python scalar types. Together with #1414, this eliminates the need for the `__getitem__` monkey-patch in hdmf-zarr PR #325. diff --git a/missing_tests_for_array_api_compat.md b/missing_tests_for_array_api_compat.md new file mode 100644 index 000000000..00e575037 --- /dev/null +++ b/missing_tests_for_array_api_compat.md @@ -0,0 +1,91 @@ +# Missing tests for array-API compatibility in hdmf + +PR #1414 (collection detection) and PR #1415 (0-d ndarray handling) introduced `_is_collection`, `_get_length`, and `_unwrap_scalar` to replace duck-typing patterns that break with zarr v3. However, when tested against hdmf-zarr's full test suite, four additional call sites failed that were not covered by the original PRs. 
+ +## What failed and where + +All four failures came from hdmf-zarr tests that exercise read-back and export paths, where `data` is a `zarr.Array`: + +| File | Line | Call | Error | +|------|------|------|-------| +| `container.py` | 980 | `len(self.__data)` in `Data.__len__` | `TypeError: object of type 'Array' has no len()` | +| `h5tools.py` | 1403, 1405 | `len(data)` when sizing datasets during export | same | +| `objectmapper.py` | 977 | `len(data)` for compound dtype shape | same | +| `objectmapper.py` | 1069 | `list(row)` iterating compound dataset rows | `TypeError: iteration over a 0-d array` | + +These were fixed in commits `7ee7ed64` (table.py), and `ec5ece4e` (container.py, h5tools.py, objectmapper.py) on the `remove_duck_typing_for_type` branch. + +## Why these were missed + +The original PRs found call sites by searching for `hasattr(data, "__len__")` patterns. But these four sites used `len()` directly without a `hasattr` guard, so they were invisible to that search. They also don't fail in hdmf's own test suite because hdmf tests never pass zarr Arrays as data -- they use lists, numpy arrays, and h5py datasets, all of which have `__len__` and return scalars from indexing. + +## Tests that should be added + +hdmf currently has no unit tests that exercise its core APIs with array-like objects that lack `__len__` or that return 0-d ndarrays from scalar indexing. The following tests would catch regressions and cover the patterns we fixed: + +### 1. `Data.__len__` with a `__len__`-less array-like + +Test that `Data(name, data)` where `data` has `shape` but no `__len__` still returns the correct length via `shape[0]`. 
+ +```python +class ArrayWithoutLen: + """Minimal array-like that has shape and ndim but no __len__, like zarr v3.""" + def __init__(self, shape, dtype): + self._data = np.zeros(shape, dtype=dtype) + self.shape = shape + self.ndim = len(shape) + self.dtype = dtype + + def __getitem__(self, idx): + return self._data[idx] + +def test_data_len_without_dunder_len(): + arr = ArrayWithoutLen((5,), "f8") + d = Data(name="test", data=arr) + assert len(d) == 5 +``` + +### 2. `_is_collection` and `_get_length` with array-like without `__len__` + +```python +def test_is_collection_array_without_len(): + arr = ArrayWithoutLen((3,), "i4") + assert _is_collection(arr) is True + assert _get_length(arr) == 3 + +def test_is_collection_scalar_array(): + arr = ArrayWithoutLen((), "f8") + assert _is_collection(arr) is False +``` + +### 3. `_unwrap_scalar` in `ElementIdentifiers._validate_new_data_element` + +Test that `ElementIdentifiers` accepts data whose `__getitem__` returns 0-d ndarrays: + +```python +class ArrayReturning0dNdarray(ArrayWithoutLen): + """Array-like where __getitem__ returns 0-d ndarray (like zarr v3).""" + def __getitem__(self, idx): + val = self._data[idx] + if np.ndim(val) == 0 and not isinstance(val, np.ndarray): + return np.array(val) # wrap scalar as 0-d ndarray + return val + +def test_element_identifiers_with_0d_ndarray_elements(): + arr = ArrayReturning0dNdarray((3,), "i8") + arr._data[:] = [10, 20, 30] + # should not raise "ElementIdentifiers must contain integers" + eids = ElementIdentifiers(name="ids", data=arr) +``` + +### 4. `ObjectMapper` compound dtype handling with 0-d rows + +Test that building a compound dataset where each row is a 0-d ndarray (structured void) works. This is harder to unit-test in isolation because it goes through the build pipeline, but a minimal test could mock `container.data` as an array whose iteration yields 0-d structured ndarrays. + +### 5. 
`HDF5IO.__write_dataset` with `__len__`-less data + +Test that writing a dataset from a `__len__`-less array-like correctly determines data shape via `shape[0]`. + +## Broader pattern + +The root issue is that hdmf implicitly assumes all array-like data supports `len()` and returns scalars from `__getitem__`. The Python array API standard does not require either. A shared mock class (like `ArrayWithoutLen` above) that mimics array-API-only behavior could be reused across the test suite to systematically verify all data paths. diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 2a539c06b..7d14625fd 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -20,7 +20,7 @@ from ...data_utils import AbstractDataChunkIterator from ...spec import RefSpec, DtypeSpec, NamespaceCatalog from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset, is_zarr_array, - get_basic_array_info, generate_array_html_repr, _is_collection, _get_length) + get_basic_array_info, generate_array_html_repr, _is_collection, _first_dim_size) from ..utils import NamespaceToBuilderHelper, WriteStatusTracker ROOT_NAME = 'root' @@ -810,7 +810,7 @@ def get_type(cls, data): return data.dtype.type return type(data) else: - if _get_length(data) == 0: + if _first_dim_size(data) == 0: if hasattr(data, 'dtype'): return data.dtype else: diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index affca571a..699d1c0c9 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -18,7 +18,7 @@ from ..container import AbstractContainer, Data from ..term_set import TermSetWrapper from ..data_utils import DataIO, AbstractDataChunkIterator -from ..utils import _is_collection, _get_length +from ..utils import _is_collection, _first_dim_size from ..query import ReferenceResolver from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec from ..spec.spec 
import BaseStorageSpec @@ -1031,13 +1031,13 @@ def __is_reftype(self, data): # In case of a numeric array stop the iteration at the first element to avoid long-running loop if isinstance(t, (int, float, complex, bool)): break - if _is_collection(t) and _get_length(t) > 0 and not isinstance(t, AbstractContainer): + if _is_collection(t) and _first_dim_size(t) > 0 and not isinstance(t, AbstractContainer): tmptmp = tmp[0] break if tmptmp is not None: break else: - if _get_length(tmp) == 0: + if _first_dim_size(tmp) == 0: tmp = None else: tmp = tmp[0] diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index e170c8b6a..40c01519b 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -16,7 +16,7 @@ from ..container import Container, Data from ..data_utils import DataIO, AbstractDataChunkIterator from ..utils import (docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged, - _is_collection, _get_length) + _is_collection, _first_dim_size) from ..term_set import TermSetWrapper @@ -295,7 +295,7 @@ def _validate_new_data(self, data): (hasattr(data, "data") and isinstance(data.data, AbstractDataChunkIterator))): if not np.issubdtype(data.dtype, np.integer): raise ValueError("ElementIdentifiers must contain integers") - elif _is_collection(data) and _get_length(data): + elif _is_collection(data) and _first_dim_size(data): self._validate_new_data_element(data[0]) def _validate_new_data_element(self, arg): @@ -1466,7 +1466,7 @@ def from_dataframe(cls, **kwargs): columns.append({'name': col_name, 'description': column_descriptions.get(col_name, 'no description')}) if _is_collection(df[col_name].iloc[0]): - lengths = [_get_length(x) for x in df[col_name]] + lengths = [_first_dim_size(x) for x in df[col_name]] if not lengths[1:] == lengths[:-1]: columns[-1].update(index=True) @@ -1782,7 +1782,7 @@ def _uint_precision(elements): """ Calculate the uint precision needed to encode a set of elements """ n_elements = elements if 
_is_collection(elements): - n_elements = _get_length(elements) + n_elements = _first_dim_size(elements) return np.dtype('uint%d' % (8 * max(1, int((2 ** np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))))).type diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index bcba9b6e3..7e706a88d 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -14,7 +14,7 @@ import h5py import numpy as np -from .utils import docval, getargs, popargs, docval_macro, get_data_shape, _is_collection, _get_length +from .utils import docval, getargs, popargs, docval_macro, get_data_shape, _is_collection, _first_dim_size def append_data(data, arg): from hdmf.backends.hdf5.h5_utils import HDMFDataset @@ -740,7 +740,7 @@ def maxshape(self): # possible. Otherwise, use None to represent an unlimited size if _is_collection(self.data) and self.iter_axis == 0: # special case of 1-D array - self.__maxshape[0] = _get_length(self.data) + self.__maxshape[0] = _first_dim_size(self.data) else: self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis] except AttributeError: # from self.data.shape diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index d6fd45795..daa7ca423 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -830,7 +830,7 @@ def get_data_shape(data, strict_no_data_load=False): def __get_shape_helper(local_data): shape = list() if _is_collection(local_data): - length = _get_length(local_data) + length = _first_dim_size(local_data) shape.append(length) if length: el = next(iter(local_data)) @@ -872,10 +872,11 @@ def _is_collection(data): return hasattr(data, "__len__") -def _get_length(data): - """Get the length of the first dimension of a collection. +def _first_dim_size(data): + """Get the size of the first dimension of a collection. - Uses shape[0] for array-like objects and len() for plain containers. + Returns shape[0] for array-like objects (including those without __len__, + such as zarr v3 arrays) and len() for plain containers. 
""" if hasattr(data, "shape") and data.shape is not None: return data.shape[0] diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index d48d193c1..da3670cc9 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -13,7 +13,7 @@ from ..spec import Spec, AttributeSpec, GroupSpec, DatasetSpec, RefSpec, LinkSpec from ..spec import SpecNamespace from ..spec.spec import BaseStorageSpec, DtypeHelper -from ..utils import _is_collection, _get_length +from ..utils import _is_collection, _first_dim_size from ..utils import docval, getargs, pystr, get_data_shape from ..query import ReferenceResolver @@ -219,7 +219,7 @@ def get_type(data, builder_dtype=None): return _get_type_from_dtype_attr(data, builder_dtype) # If all else has failed, try to determine the datatype from the first element of the array - if _get_length(data) > 0: + if _first_dim_size(data) > 0: return get_type(data[0], builder_dtype) raise EmptyArrayError() diff --git a/tests/unit/utils_test/test_utils.py b/tests/unit/utils_test/test_utils.py index 635c94ebc..93147cc6e 100644 --- a/tests/unit/utils_test/test_utils.py +++ b/tests/unit/utils_test/test_utils.py @@ -5,7 +5,7 @@ from hdmf.container import Data from hdmf.data_utils import DataChunkIterator, DataIO from hdmf.testing import TestCase -from hdmf.utils import get_data_shape, to_uint_array, is_newer_version, _is_collection, _get_length +from hdmf.utils import get_data_shape, to_uint_array, is_newer_version, _is_collection, _first_dim_size class TestIsCollection(TestCase): @@ -86,28 +86,28 @@ def test_numpy_0d_array_not_collection(self): class TestGetLength(TestCase): - """Tests for _get_length helper that gets first-dimension length.""" + """Tests for _first_dim_size helper that gets first-dimension length.""" def test_list(self): - self.assertEqual(_get_length([1, 2, 3]), 3) + self.assertEqual(_first_dim_size([1, 2, 3]), 3) def test_empty_list(self): - self.assertEqual(_get_length([]), 0) + 
self.assertEqual(_first_dim_size([]), 0) def test_tuple(self): - self.assertEqual(_get_length((1, 2)), 2) + self.assertEqual(_first_dim_size((1, 2)), 2) def test_numpy_array(self): - self.assertEqual(_get_length(np.array([1, 2, 3, 4])), 4) + self.assertEqual(_first_dim_size(np.array([1, 2, 3, 4])), 4) def test_numpy_2d_array(self): - self.assertEqual(_get_length(np.array([[1, 2], [3, 4], [5, 6]])), 3) + self.assertEqual(_first_dim_size(np.array([[1, 2], [3, 4], [5, 6]])), 3) def test_shape_without_len(self): """Simulate zarr v3 array: has shape but no __len__.""" class FakeArray: shape = (10, 5) - self.assertEqual(_get_length(FakeArray()), 10) + self.assertEqual(_first_dim_size(FakeArray()), 10) class TestGetDataShape(TestCase): From d915d63cfef3b3ce741afc0fcbe27388d0056d08 Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 3 Mar 2026 03:31:21 -0600 Subject: [PATCH 4/5] Remove draft markdown files accidentally committed --- hdmf_issue_get_length_semantics.md | 50 --------------- hdmf_issue_is_collection.md | 8 --- hdmf_issue_resolve_scalar.md | 7 --- missing_tests_for_array_api_compat.md | 91 --------------------------- 4 files changed, 156 deletions(-) delete mode 100644 hdmf_issue_get_length_semantics.md delete mode 100644 hdmf_issue_is_collection.md delete mode 100644 hdmf_issue_resolve_scalar.md delete mode 100644 missing_tests_for_array_api_compat.md diff --git a/hdmf_issue_get_length_semantics.md b/hdmf_issue_get_length_semantics.md deleted file mode 100644 index f878c97ec..000000000 --- a/hdmf_issue_get_length_semantics.md +++ /dev/null @@ -1,50 +0,0 @@ -# Rename `_get_length` to reflect actual semantics: first-dimension size, not `len()` - -The `_get_length` utility was introduced in PR #1414 to replace direct `len()` calls that break with array-API-conforming libraries (e.g., zarr v3, which does not implement `__len__` on arrays). 
The function works correctly, but its name implies `len()` semantics when what hdmf actually needs in every call site is the size of the first dimension. - -## Current implementation - -```python -def _get_length(data): - if hasattr(data, "shape") and data.shape is not None: - return data.shape[0] - return len(data) -``` - -## What every call site actually does - -Every use of `_get_length` in the codebase falls into one of three patterns, all of which are asking "how many rows / elements along the first axis?": - -**Bounds checking / slice resolution** -- `table.py`: `arg.indices(_get_length(self.data))`, `arg >= _get_length(self.data)` -- `table.py`: `DynamicTableRegion.shape` returns `(_get_length(self.data), ...)` - -**Empty check** -- `table.py`: `_get_length(data) > 0` -- `validator.py`: `_get_length(data) > 0` -- `h5tools.py`: `_get_length(data) == 0` -- `objectmapper.py`: `_get_length(t) > 0`, `_get_length(tmp) == 0` - -**Sizing / iteration** -- `container.py`: `Data.__len__` returns `_get_length(self.__data)` -- `h5tools.py`: `_get_length(data) > dset.shape[0]`, `new_shape[0] = _get_length(data)` -- `objectmapper.py`: `data_shape = (_get_length(data),)` -- `data_utils.py`: `self.__maxshape[0] = _get_length(self.data)` - -None of these are asking for "the length of this object" in the Python `__len__` sense. They are all asking for `shape[0]` -- the number of elements along the first axis. - -## Why the name matters - -1. The array API standard deliberately excludes `__len__` from array objects. This was not an oversight: for multi-dimensional arrays, `len()` is ambiguous (does it mean total elements? first axis? something else?). The standard uses `.shape` and `.size` instead. - -2. Naming this `_get_length` suggests that hdmf is working around the missing `__len__` by reimplementing it. 
A name like `_num_rows` or `_first_dim_size` would make clear that we are using the right semantic: first-axis size via `.shape[0]`, with a `len()` fallback for plain containers that don't have `.shape`. - -3. Future contributors will see `_get_length` and think "this is just `len()` with extra steps" rather than understanding the intent. - -## Suggested rename - -`_get_length` -> `_num_rows` or `_first_dim_size` - -The fallback to `len()` for plain containers (lists, tuples, dicts) should stay since these don't have `.shape`, and for 1-d containers `len()` does give the first-axis size. But the name should reflect what we are actually measuring. - -This is a low-priority naming issue, not a bug. All existing call sites work correctly with the current implementation. diff --git a/hdmf_issue_is_collection.md b/hdmf_issue_is_collection.md deleted file mode 100644 index ad2016a7f..000000000 --- a/hdmf_issue_is_collection.md +++ /dev/null @@ -1,8 +0,0 @@ -# Replace `hasattr(data, "__len__")` with `ndim`-based collection detection - -Motivated by hdmf-dev/hdmf-zarr#325 (zarr v2 to v3 migration). - -hdmf uses `hasattr(data, "__len__")` across hte library ([`utils.py`](https://github.com/hdmf-dev/hdmf/blob/a27f6061/src/hdmf/utils.py#L852), [`h5tools.py`](https://github.com/hdmf-dev/hdmf/blob/a27f6061/src/hdmf/backends/hdf5/h5tools.py#L808), [`validator.py`](https://github.com/hdmf-dev/hdmf/blob/a27f6061/src/hdmf/validate/validator.py#L201)) to decide whether data is a collection or a scalar. This has worked so far because numpy implements `__len__` and other array libraries (h5py, zarr v2) followed that convention. But `__len__` is not part of any array specification (see the [array API standard object definition](https://data-apis.org/array-api/draft/API_specification/array_object.html#attributes) for the required attributes and methods), so we cannot rely on future array libraries continuing to provide it. 
The check is also implicit about what it is actually trying to detect. The zarr v2 to v3 migration (hdmf-dev/hdmf-zarr#325) exposed this, as zarr v3 arrays no longer implement `__len__`. The `__len__` heuristic also has a bug in the current release: 0-d ndarrays have `__len__` defined but `len()` raises `TypeError`, so passing one to `HDF5IO.get_type` or `write_dataset` crashes. This already happens when converting results from array-API-conforming libraries (which return 0-d arrays from reductions like `mean` and `sum`) to numpy via `np.asarray()`. A regression test for this is included. - -This PR centralizes the "is this a collection?" logic into a single helper (`_is_collection`) that is both more general and more explicit: check `ndim > 0` first (which the array API standard guarantees for any conforming array), then fall back to `__len__` for plain Python containers. A companion `_get_length` helper safely gets first-dimension length via `shape[0]` or `len()`. This replaces all scattered checks with one clear, documented policy that covers both legacy types and array API standard conforming libraries, and shoudl eliminate the need for monkey-patching in hdmf-zarr. -T \ No newline at end of file diff --git a/hdmf_issue_resolve_scalar.md b/hdmf_issue_resolve_scalar.md deleted file mode 100644 index c3d0c77d4..000000000 --- a/hdmf_issue_resolve_scalar.md +++ /dev/null @@ -1,7 +0,0 @@ -# Handle 0-d ndarrays in scalar isinstance checks - -Chained after #1414 (`_is_collection`). Motivated by hdmf-dev/hdmf-zarr#325 (zarr v2 to v3 migration). - -hdmf's `get_type` functions infer element dtype by recursively indexing with `data[0]` until reaching a scalar, then calling `type()` on it. With numpy and zarr v2, `data[0]` on a 1-d float array returns a numpy scalar (e.g., `numpy.float64`), which passes `isinstance(val, float)` and has no `__len__`, so all downstream checks work. 
With zarr v3 (following the Python array API standard), `data[0]` returns a 0-d ndarray instead. A 0-d ndarray fails `isinstance(val, (int, float, str, bool))` and `type()` returns `numpy.ndarray` rather than the element dtype. PR #1414 fixed the crash path (`__len__` heuristic), but `isinstance` checks in other parts of the codebase still silently take the wrong branch when they encounter a 0-d ndarray. - -This PR adds a `_unwrap_scalar` helper in `hdmf.utils` that converts 0-d ndarrays to numpy scalars via `.item()`, and applies it at the remaining `isinstance` checks that compare against Python scalar types. Together with #1414, this eliminates the need for the `__getitem__` monkey-patch in hdmf-zarr PR #325. diff --git a/missing_tests_for_array_api_compat.md b/missing_tests_for_array_api_compat.md deleted file mode 100644 index 00e575037..000000000 --- a/missing_tests_for_array_api_compat.md +++ /dev/null @@ -1,91 +0,0 @@ -# Missing tests for array-API compatibility in hdmf - -PR #1414 (collection detection) and PR #1415 (0-d ndarray handling) introduced `_is_collection`, `_get_length`, and `_unwrap_scalar` to replace duck-typing patterns that break with zarr v3. However, when tested against hdmf-zarr's full test suite, four additional call sites failed that were not covered by the original PRs. 
- -## What failed and where - -All four failures came from hdmf-zarr tests that exercise read-back and export paths, where `data` is a `zarr.Array`: - -| File | Line | Call | Error | -|------|------|------|-------| -| `container.py` | 980 | `len(self.__data)` in `Data.__len__` | `TypeError: object of type 'Array' has no len()` | -| `h5tools.py` | 1403, 1405 | `len(data)` when sizing datasets during export | same | -| `objectmapper.py` | 977 | `len(data)` for compound dtype shape | same | -| `objectmapper.py` | 1069 | `list(row)` iterating compound dataset rows | `TypeError: iteration over a 0-d array` | - -These were fixed in commits `7ee7ed64` (table.py), and `ec5ece4e` (container.py, h5tools.py, objectmapper.py) on the `remove_duck_typing_for_type` branch. - -## Why these were missed - -The original PRs found call sites by searching for `hasattr(data, "__len__")` patterns. But these four sites used `len()` directly without a `hasattr` guard, so they were invisible to that search. They also don't fail in hdmf's own test suite because hdmf tests never pass zarr Arrays as data -- they use lists, numpy arrays, and h5py datasets, all of which have `__len__` and return scalars from indexing. - -## Tests that should be added - -hdmf currently has no unit tests that exercise its core APIs with array-like objects that lack `__len__` or that return 0-d ndarrays from scalar indexing. The following tests would catch regressions and cover the patterns we fixed: - -### 1. `Data.__len__` with a `__len__`-less array-like - -Test that `Data(name, data)` where `data` has `shape` but no `__len__` still returns the correct length via `shape[0]`. 
- -```python -class ArrayWithoutLen: - """Minimal array-like that has shape and ndim but no __len__, like zarr v3.""" - def __init__(self, shape, dtype): - self._data = np.zeros(shape, dtype=dtype) - self.shape = shape - self.ndim = len(shape) - self.dtype = dtype - - def __getitem__(self, idx): - return self._data[idx] - -def test_data_len_without_dunder_len(): - arr = ArrayWithoutLen((5,), "f8") - d = Data(name="test", data=arr) - assert len(d) == 5 -``` - -### 2. `_is_collection` and `_get_length` with array-like without `__len__` - -```python -def test_is_collection_array_without_len(): - arr = ArrayWithoutLen((3,), "i4") - assert _is_collection(arr) is True - assert _get_length(arr) == 3 - -def test_is_collection_scalar_array(): - arr = ArrayWithoutLen((), "f8") - assert _is_collection(arr) is False -``` - -### 3. `_unwrap_scalar` in `ElementIdentifiers._validate_new_data_element` - -Test that `ElementIdentifiers` accepts data whose `__getitem__` returns 0-d ndarrays: - -```python -class ArrayReturning0dNdarray(ArrayWithoutLen): - """Array-like where __getitem__ returns 0-d ndarray (like zarr v3).""" - def __getitem__(self, idx): - val = self._data[idx] - if np.ndim(val) == 0 and not isinstance(val, np.ndarray): - return np.array(val) # wrap scalar as 0-d ndarray - return val - -def test_element_identifiers_with_0d_ndarray_elements(): - arr = ArrayReturning0dNdarray((3,), "i8") - arr._data[:] = [10, 20, 30] - # should not raise "ElementIdentifiers must contain integers" - eids = ElementIdentifiers(name="ids", data=arr) -``` - -### 4. `ObjectMapper` compound dtype handling with 0-d rows - -Test that building a compound dataset where each row is a 0-d ndarray (structured void) works. This is harder to unit-test in isolation because it goes through the build pipeline, but a minimal test could mock `container.data` as an array whose iteration yields 0-d structured ndarrays. - -### 5. 
`HDF5IO.__write_dataset` with `__len__`-less data - -Test that writing a dataset from a `__len__`-less array-like correctly determines data shape via `shape[0]`. - -## Broader pattern - -The root issue is that hdmf implicitly assumes all array-like data supports `len()` and returns scalars from `__getitem__`. The Python array API standard does not require either. A shared mock class (like `ArrayWithoutLen` above) that mimics array-API-only behavior could be reused across the test suite to systematically verify all data paths. From 3bf5c8f3c571672e344f67bcee1b6fba04ebbfe6 Mon Sep 17 00:00:00 2001 From: Heberto Mayorquin Date: Tue, 3 Mar 2026 03:41:01 -0600 Subject: [PATCH 5/5] Revert _first_dim_size rename back to _get_length After discussion, _get_length is the clearest name: it maps to what Python programmers understand as "how many items if I iterate this", which is what every call site needs. The implementation detail of using shape[0] vs len() belongs in the docstring, not the name. 
Co-Authored-By: Claude Opus 4.6 --- src/hdmf/backends/hdf5/h5tools.py | 4 ++-- src/hdmf/build/objectmapper.py | 6 +++--- src/hdmf/common/table.py | 8 ++++---- src/hdmf/data_utils.py | 4 ++-- src/hdmf/utils.py | 9 ++++----- src/hdmf/validate/validator.py | 4 ++-- tests/unit/utils_test/test_utils.py | 16 ++++++++-------- 7 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 7d14625fd..2a539c06b 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -20,7 +20,7 @@ from ...data_utils import AbstractDataChunkIterator from ...spec import RefSpec, DtypeSpec, NamespaceCatalog from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset, is_zarr_array, - get_basic_array_info, generate_array_html_repr, _is_collection, _first_dim_size) + get_basic_array_info, generate_array_html_repr, _is_collection, _get_length) from ..utils import NamespaceToBuilderHelper, WriteStatusTracker ROOT_NAME = 'root' @@ -810,7 +810,7 @@ def get_type(cls, data): return data.dtype.type return type(data) else: - if _first_dim_size(data) == 0: + if _get_length(data) == 0: if hasattr(data, 'dtype'): return data.dtype else: diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 699d1c0c9..affca571a 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -18,7 +18,7 @@ from ..container import AbstractContainer, Data from ..term_set import TermSetWrapper from ..data_utils import DataIO, AbstractDataChunkIterator -from ..utils import _is_collection, _first_dim_size +from ..utils import _is_collection, _get_length from ..query import ReferenceResolver from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec from ..spec.spec import BaseStorageSpec @@ -1031,13 +1031,13 @@ def __is_reftype(self, data): # In case of a numeric array stop the iteration at the first element to avoid 
long-running loop if isinstance(t, (int, float, complex, bool)): break - if _is_collection(t) and _first_dim_size(t) > 0 and not isinstance(t, AbstractContainer): + if _is_collection(t) and _get_length(t) > 0 and not isinstance(t, AbstractContainer): tmptmp = tmp[0] break if tmptmp is not None: break else: - if _first_dim_size(tmp) == 0: + if _get_length(tmp) == 0: tmp = None else: tmp = tmp[0] diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index 40c01519b..e170c8b6a 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -16,7 +16,7 @@ from ..container import Container, Data from ..data_utils import DataIO, AbstractDataChunkIterator from ..utils import (docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged, - _is_collection, _first_dim_size) + _is_collection, _get_length) from ..term_set import TermSetWrapper @@ -295,7 +295,7 @@ def _validate_new_data(self, data): (hasattr(data, "data") and isinstance(data.data, AbstractDataChunkIterator))): if not np.issubdtype(data.dtype, np.integer): raise ValueError("ElementIdentifiers must contain integers") - elif _is_collection(data) and _first_dim_size(data): + elif _is_collection(data) and _get_length(data): self._validate_new_data_element(data[0]) def _validate_new_data_element(self, arg): @@ -1466,7 +1466,7 @@ def from_dataframe(cls, **kwargs): columns.append({'name': col_name, 'description': column_descriptions.get(col_name, 'no description')}) if _is_collection(df[col_name].iloc[0]): - lengths = [_first_dim_size(x) for x in df[col_name]] + lengths = [_get_length(x) for x in df[col_name]] if not lengths[1:] == lengths[:-1]: columns[-1].update(index=True) @@ -1782,7 +1782,7 @@ def _uint_precision(elements): """ Calculate the uint precision needed to encode a set of elements """ n_elements = elements if _is_collection(elements): - n_elements = _first_dim_size(elements) + n_elements = _get_length(elements) return np.dtype('uint%d' % (8 * max(1, int((2 ** 
np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))))).type diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index 7e706a88d..bcba9b6e3 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -14,7 +14,7 @@ import h5py import numpy as np -from .utils import docval, getargs, popargs, docval_macro, get_data_shape, _is_collection, _first_dim_size +from .utils import docval, getargs, popargs, docval_macro, get_data_shape, _is_collection, _get_length def append_data(data, arg): from hdmf.backends.hdf5.h5_utils import HDMFDataset @@ -740,7 +740,7 @@ def maxshape(self): # possible. Otherwise, use None to represent an unlimited size if _is_collection(self.data) and self.iter_axis == 0: # special case of 1-D array - self.__maxshape[0] = _first_dim_size(self.data) + self.__maxshape[0] = _get_length(self.data) else: self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis] except AttributeError: # from self.data.shape diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index daa7ca423..d6fd45795 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -830,7 +830,7 @@ def get_data_shape(data, strict_no_data_load=False): def __get_shape_helper(local_data): shape = list() if _is_collection(local_data): - length = _first_dim_size(local_data) + length = _get_length(local_data) shape.append(length) if length: el = next(iter(local_data)) @@ -872,11 +872,10 @@ def _is_collection(data): return hasattr(data, "__len__") -def _first_dim_size(data): - """Get the size of the first dimension of a collection. +def _get_length(data): + """Get the length of the first dimension of a collection. - Returns shape[0] for array-like objects (including those without __len__, - such as zarr v3 arrays) and len() for plain containers. + Uses shape[0] for array-like objects and len() for plain containers. 
""" if hasattr(data, "shape") and data.shape is not None: return data.shape[0] diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index da3670cc9..d48d193c1 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -13,7 +13,7 @@ from ..spec import Spec, AttributeSpec, GroupSpec, DatasetSpec, RefSpec, LinkSpec from ..spec import SpecNamespace from ..spec.spec import BaseStorageSpec, DtypeHelper -from ..utils import _is_collection, _first_dim_size +from ..utils import _is_collection, _get_length from ..utils import docval, getargs, pystr, get_data_shape from ..query import ReferenceResolver @@ -219,7 +219,7 @@ def get_type(data, builder_dtype=None): return _get_type_from_dtype_attr(data, builder_dtype) # If all else has failed, try to determine the datatype from the first element of the array - if _first_dim_size(data) > 0: + if _get_length(data) > 0: return get_type(data[0], builder_dtype) raise EmptyArrayError() diff --git a/tests/unit/utils_test/test_utils.py b/tests/unit/utils_test/test_utils.py index 93147cc6e..635c94ebc 100644 --- a/tests/unit/utils_test/test_utils.py +++ b/tests/unit/utils_test/test_utils.py @@ -5,7 +5,7 @@ from hdmf.container import Data from hdmf.data_utils import DataChunkIterator, DataIO from hdmf.testing import TestCase -from hdmf.utils import get_data_shape, to_uint_array, is_newer_version, _is_collection, _first_dim_size +from hdmf.utils import get_data_shape, to_uint_array, is_newer_version, _is_collection, _get_length class TestIsCollection(TestCase): @@ -86,28 +86,28 @@ def test_numpy_0d_array_not_collection(self): class TestGetLength(TestCase): - """Tests for _first_dim_size helper that gets first-dimension length.""" + """Tests for _get_length helper that gets first-dimension length.""" def test_list(self): - self.assertEqual(_first_dim_size([1, 2, 3]), 3) + self.assertEqual(_get_length([1, 2, 3]), 3) def test_empty_list(self): - self.assertEqual(_first_dim_size([]), 0) + 
self.assertEqual(_get_length([]), 0) def test_tuple(self): - self.assertEqual(_first_dim_size((1, 2)), 2) + self.assertEqual(_get_length((1, 2)), 2) def test_numpy_array(self): - self.assertEqual(_first_dim_size(np.array([1, 2, 3, 4])), 4) + self.assertEqual(_get_length(np.array([1, 2, 3, 4])), 4) def test_numpy_2d_array(self): - self.assertEqual(_first_dim_size(np.array([[1, 2], [3, 4], [5, 6]])), 3) + self.assertEqual(_get_length(np.array([[1, 2], [3, 4], [5, 6]])), 3) def test_shape_without_len(self): """Simulate zarr v3 array: has shape but no __len__.""" class FakeArray: shape = (10, 5) - self.assertEqual(_first_dim_size(FakeArray()), 10) + self.assertEqual(_get_length(FakeArray()), 10) class TestGetDataShape(TestCase):