diff --git a/CHANGELOG.md b/CHANGELOG.md index e05245609..393268c66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # HDMF Changelog +## HDMF 5.0.1 (Upcoming) + +### Enhancements +- Replaced `hasattr(data, "__len__")` checks with `ndim`-based collection detection via new `_is_collection` and `_get_length` helpers in `hdmf.utils`. This fixes compatibility with array libraries that follow the Python array API standard (e.g., zarr v3), which provide `ndim` and `shape` but not `__len__`. @h-mayorquin [#1414](https://github.com/hdmf-dev/hdmf/pull/1414) + + ## HDMF 5.0.0 (March 2, 2026) ### Changed diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index c69ba4210..2a539c06b 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -20,7 +20,7 @@ from ...data_utils import AbstractDataChunkIterator from ...spec import RefSpec, DtypeSpec, NamespaceCatalog from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset, is_zarr_array, - get_basic_array_info, generate_array_html_repr) + get_basic_array_info, generate_array_html_repr, _is_collection, _get_length) from ..utils import NamespaceToBuilderHelper, WriteStatusTracker ROOT_NAME = 'root' @@ -805,10 +805,12 @@ def get_type(cls, data): return H5_BINARY elif isinstance(data, Container): return H5_REF - elif not hasattr(data, '__len__'): + elif not _is_collection(data): + if isinstance(data, np.ndarray) and data.ndim == 0: + return data.dtype.type return type(data) else: - if len(data) == 0: + if _get_length(data) == 0: if hasattr(data, 'dtype'): return data.dtype else: @@ -1238,7 +1240,7 @@ def _filler(): dset = self.__setup_chunked_dset__(parent, name, data, options) self.__dci_queue.append(dataset=dset, data=data) # Write a regular in memory array (e.g., numpy array, list etc.) 
- elif hasattr(data, '__len__'): + elif _is_collection(data): dset = self.__list_fill__(parent, name, data, options) # Write a regular scalar dataset else: diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 389c9e834..affca571a 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -18,6 +18,7 @@ from ..container import AbstractContainer, Data from ..term_set import TermSetWrapper from ..data_utils import DataIO, AbstractDataChunkIterator +from ..utils import _is_collection, _get_length from ..query import ReferenceResolver from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec from ..spec.spec import BaseStorageSpec @@ -1024,19 +1025,19 @@ def __is_reftype(self, data): return False tmp = data - while hasattr(tmp, '__len__') and not isinstance(tmp, (AbstractContainer, str, bytes)): + while _is_collection(tmp) and not isinstance(tmp, AbstractContainer): tmptmp = None for t in tmp: # In case of a numeric array stop the iteration at the first element to avoid long-running loop if isinstance(t, (int, float, complex, bool)): break - if hasattr(t, '__len__') and len(t) > 0 and not isinstance(t, (AbstractContainer, str, bytes)): + if _is_collection(t) and _get_length(t) > 0 and not isinstance(t, AbstractContainer): tmptmp = tmp[0] break if tmptmp is not None: break else: - if len(tmp) == 0: + if _get_length(tmp) == 0: tmp = None else: tmp = tmp[0] diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index 9feeb04e9..e170c8b6a 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -15,7 +15,8 @@ from . 
import register_class, EXP_NAMESPACE from ..container import Container, Data from ..data_utils import DataIO, AbstractDataChunkIterator -from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged +from ..utils import (docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged, + _is_collection, _get_length) from ..term_set import TermSetWrapper @@ -294,7 +295,7 @@ def _validate_new_data(self, data): (hasattr(data, "data") and isinstance(data.data, AbstractDataChunkIterator))): if not np.issubdtype(data.dtype, np.integer): raise ValueError("ElementIdentifiers must contain integers") - elif hasattr(data, "__len__") and len(data): + elif _is_collection(data) and _get_length(data): self._validate_new_data_element(data[0]) def _validate_new_data_element(self, arg): @@ -1464,8 +1465,8 @@ def from_dataframe(cls, **kwargs): else: columns.append({'name': col_name, 'description': column_descriptions.get(col_name, 'no description')}) - if hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str): - lengths = [len(x) for x in df[col_name]] + if _is_collection(df[col_name].iloc[0]): + lengths = [_get_length(x) for x in df[col_name]] if not lengths[1:] == lengths[:-1]: columns[-1].update(index=True) @@ -1780,8 +1781,8 @@ def _validate_new_data_element(self, arg): def _uint_precision(elements): """ Calculate the uint precision needed to encode a set of elements """ n_elements = elements - if hasattr(elements, '__len__'): - n_elements = len(elements) + if _is_collection(elements): + n_elements = _get_length(elements) return np.dtype('uint%d' % (8 * max(1, int((2 ** np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))))).type diff --git a/src/hdmf/container.py b/src/hdmf/container.py index 762555112..2d1e1a172 100644 --- a/src/hdmf/container.py +++ b/src/hdmf/container.py @@ -11,7 +11,7 @@ from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator from .utils 
import (docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict, - get_basic_array_info, generate_array_html_repr) + get_basic_array_info, generate_array_html_repr, _is_collection) from .term_set import TermSet, TermSetWrapper @@ -626,7 +626,7 @@ def __repr__(self): template += "\nFields:\n" for k in sorted(self.fields): # sorted to enable tests v = self.fields[k] - if hasattr(v, '__len__'): + if _is_collection(v): if isinstance(v, (np.ndarray, list, tuple)) or v: template += " {}: {}\n".format(k, self.__smart_str(v, 1)) else: diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index 0f245eda7..bcba9b6e3 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -14,7 +14,7 @@ import h5py import numpy as np -from .utils import docval, getargs, popargs, docval_macro, get_data_shape +from .utils import docval, getargs, popargs, docval_macro, get_data_shape, _is_collection, _get_length def append_data(data, arg): from hdmf.backends.hdf5.h5_utils import HDMFDataset @@ -738,9 +738,9 @@ def maxshape(self): # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a # chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if # possible. 
Otherwise, use None to represent an unlimited size - if hasattr(self.data, '__len__') and self.iter_axis == 0: + if _is_collection(self.data) and self.iter_axis == 0: # special case of 1-D array - self.__maxshape[0] = len(self.data) + self.__maxshape[0] = _get_length(self.data) else: self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis] except AttributeError: # from self.data.shape diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index 578da41f2..d6fd45795 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -829,9 +829,10 @@ def get_data_shape(data, strict_no_data_load=False): def __get_shape_helper(local_data): shape = list() - if hasattr(local_data, '__len__'): - shape.append(len(local_data)) - if len(local_data): + if _is_collection(local_data): + length = _get_length(local_data) + shape.append(length) + if length: el = next(iter(local_data)) # If local_data is a list/tuple of Data, do not iterate into the objects if not isinstance(el, (str, bytes, Data)): @@ -849,12 +850,38 @@ def __get_shape_helper(local_data): return data.maxshape if isinstance(data, dict): return None - if hasattr(data, '__len__') and not isinstance(data, (str, bytes)): + if _is_collection(data): if not strict_no_data_load or isinstance(data, (list, tuple, set)): return __get_shape_helper(data) return None +def _is_collection(data): + """Check if data is a collection (array-like with elements) vs a scalar. + + Checks ndim first because the Python array API standard requires conforming + arrays to have ndim and shape but does not require __len__. This handles + array libraries like zarr v3 that follow the standard. Falls back to + __len__ for plain Python containers (list, tuple). Strings and bytes + are treated as scalars. + """ + if isinstance(data, (str, bytes)): + return False + if hasattr(data, "ndim"): + return data.ndim > 0 + return hasattr(data, "__len__") + + +def _get_length(data): + """Get the length of the first dimension of a collection. 
+ + Uses shape[0] for array-like objects and len() for plain containers. + """ + if hasattr(data, "shape") and data.shape is not None: + return data.shape[0] + return len(data) + + def pystr(s): """ Convert a string of characters to Python str object diff --git a/src/hdmf/validate/validator.py b/src/hdmf/validate/validator.py index c774a8bf1..d48d193c1 100644 --- a/src/hdmf/validate/validator.py +++ b/src/hdmf/validate/validator.py @@ -13,6 +13,7 @@ from ..spec import Spec, AttributeSpec, GroupSpec, DatasetSpec, RefSpec, LinkSpec from ..spec import SpecNamespace from ..spec.spec import BaseStorageSpec, DtypeHelper +from ..utils import _is_collection, _get_length from ..utils import docval, getargs, pystr, get_data_shape from ..query import ReferenceResolver @@ -198,7 +199,11 @@ def get_type(data, builder_dtype=None): # Numpy bool data elif isinstance(data, np.bool_): return 'bool', None - if not hasattr(data, '__len__'): + # Numpy 0-d structured array with a multi-field compound dtype (ndim=0 but len(data.dtype) > 1) + if isinstance(data, np.ndarray) and data.ndim == 0 and len(data.dtype) > 1: + if builder_dtype and isinstance(builder_dtype, list): + return _get_type_compound_dtype(data, builder_dtype) + if not _is_collection(data): if type(data) is float: # Python float is 64-bit return 'float64', None if type(data) is int: # Python int is 64-bit (or larger) @@ -214,7 +219,7 @@ def get_type(data, builder_dtype=None): return _get_type_from_dtype_attr(data, builder_dtype) # If all else has failed, try to determine the datatype from the first element of the array - if len(data) > 0: + if _get_length(data) > 0: return get_type(data[0], builder_dtype) raise EmptyArrayError() diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index c7e6ea531..21392e8e0 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -138,6 +138,21 @@ def test_write_dataset_scalar(self): self.assertTupleEqual(dset.shape, ())
self.assertEqual(dset[()], a) + def test_write_dataset_0d_ndarray(self): + """Regression: writing a 0-d ndarray as a scalar dataset should work. + + A user computing a scalar attribute (e.g. a sampling rate) with an + array-API-conforming library gets a 0-d array back. Converting to + numpy via np.asarray() produces a 0-d ndarray, which they then pass + to hdmf to store as an HDF5 dataset. Previously this crashed because + the __len__ heuristic misidentified 0-d ndarrays as collections. + """ + sampling_rate = np.asarray(30000.0) + self.io.write_dataset(self.f, DatasetBuilder('test_dataset', sampling_rate, attributes={})) + dset = self.f['test_dataset'] + self.assertTupleEqual(dset.shape, ()) + self.assertEqual(dset[()], 30000.0) + def test_write_dataset_string(self): a = 'test string' self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, attributes={})) @@ -3913,6 +3928,23 @@ def test_setup_empty_dset_create_exception(self): with self.assertRaisesRegex(Exception, "Could not create dataset foo in /"): HDF5IO.__setup_empty_dset__(self.f, 'foo', {'shape': (3, 3), 'dtype': 'float'}) + def test_get_type_0d_ndarray(self): + """Regression: get_type should handle 0-d ndarrays. + + The Python array API standard requires reductions (mean, sum, etc.) + to return 0-d arrays, not scalars. When results from array-API- + conforming libraries (zarr v3, cupy, etc.) are converted to numpy + via np.asarray(), the result is a 0-d ndarray. A 0-d ndarray has + __len__ defined but len() raises TypeError, which previously + crashed get_type. We use np.asarray(scalar) to produce a 0-d + ndarray without requiring any array library as a test dependency. 
+ """ + zero_d = np.asarray(3.14) + self.assertIsInstance(zero_d, np.ndarray) + self.assertEqual(zero_d.ndim, 0) + result = HDF5IO.get_type(zero_d) + self.assertIs(result, np.float64) + class H5DataIOTests(TestCase): diff --git a/tests/unit/utils_test/test_utils.py b/tests/unit/utils_test/test_utils.py index 98115d3d3..635c94ebc 100644 --- a/tests/unit/utils_test/test_utils.py +++ b/tests/unit/utils_test/test_utils.py @@ -5,7 +5,109 @@ from hdmf.container import Data from hdmf.data_utils import DataChunkIterator, DataIO from hdmf.testing import TestCase -from hdmf.utils import get_data_shape, to_uint_array, is_newer_version +from hdmf.utils import get_data_shape, to_uint_array, is_newer_version, _is_collection, _get_length + + +class TestIsCollection(TestCase): + """Tests for _is_collection helper that detects collections vs scalars.""" + + def test_numpy_1d_array(self): + self.assertTrue(_is_collection(np.array([1, 2, 3]))) + + def test_numpy_empty_array(self): + self.assertTrue(_is_collection(np.array([]))) + + def test_numpy_2d_array(self): + self.assertTrue(_is_collection(np.array([[1, 2], [3, 4]]))) + + def test_numpy_0d_array(self): + self.assertFalse(_is_collection(np.array(5))) + + def test_numpy_scalar(self): + self.assertFalse(_is_collection(np.float64(3.14))) + + def test_list(self): + self.assertTrue(_is_collection([1, 2, 3])) + + def test_empty_list(self): + self.assertTrue(_is_collection([])) + + def test_tuple(self): + self.assertTrue(_is_collection((1, 2, 3))) + + def test_set(self): + self.assertTrue(_is_collection({1, 2, 3})) + + def test_int(self): + self.assertFalse(_is_collection(42)) + + def test_float(self): + self.assertFalse(_is_collection(3.14)) + + def test_string(self): + self.assertFalse(_is_collection("hello")) + + def test_bytes(self): + self.assertFalse(_is_collection(b"hello")) + + def test_none(self): + self.assertFalse(_is_collection(None)) + + def test_bool(self): + self.assertFalse(_is_collection(True)) + + def 
test_ndim_without_len(self): + """Simulate zarr v3 array: has ndim and shape but no __len__.""" + class FakeArray: + ndim = 2 + shape = (10, 5) + self.assertTrue(_is_collection(FakeArray())) + + def test_ndim_zero_without_len(self): + """Simulate 0-d array-like: has ndim=0 but no __len__.""" + class FakeScalar: + ndim = 0 + shape = () + self.assertFalse(_is_collection(FakeScalar())) + + def test_numpy_0d_array_not_collection(self): + """Regression: numpy 0-d ndarrays have __len__ but len() raises TypeError. + + zarr v3 scalar indexing (e.g. zarr_array[0]) returns 0-d ndarrays. + The old hasattr(data, '__len__') check would incorrectly treat these + as collections, then crash on len(data). + """ + zero_d = np.array(5.0) + self.assertTrue(hasattr(zero_d, '__len__')) # confirms the old check would pass + with self.assertRaises(TypeError): + len(zero_d) # confirms len() crashes + self.assertFalse(_is_collection(zero_d)) # our helper handles it correctly + + + +class TestGetLength(TestCase): + """Tests for _get_length helper that gets first-dimension length.""" + + def test_list(self): + self.assertEqual(_get_length([1, 2, 3]), 3) + + def test_empty_list(self): + self.assertEqual(_get_length([]), 0) + + def test_tuple(self): + self.assertEqual(_get_length((1, 2)), 2) + + def test_numpy_array(self): + self.assertEqual(_get_length(np.array([1, 2, 3, 4])), 4) + + def test_numpy_2d_array(self): + self.assertEqual(_get_length(np.array([[1, 2], [3, 4], [5, 6]])), 3) + + def test_shape_without_len(self): + """Simulate zarr v3 array: has shape but no __len__.""" + class FakeArray: + shape = (10, 5) + self.assertEqual(_get_length(FakeArray()), 10) class TestGetDataShape(TestCase):