Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# HDMF Changelog

## HDMF 5.0.1 (Upcoming)

### Enhancements
- Replaced `hasattr(data, "__len__")` checks with `ndim`-based collection detection via new `_is_collection` and `_get_length` helpers in `hdmf.utils`. This fixes compatibility with array libraries that follow the Python array API standard (e.g., zarr v3), which provide `ndim` and `shape` but not `__len__`. @h-mayorquin [#1414](https://github.com/hdmf-dev/hdmf/pull/1414)


## HDMF 5.0.0 (March 2, 2026)

### Changed
Expand Down
10 changes: 6 additions & 4 deletions src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from ...data_utils import AbstractDataChunkIterator
from ...spec import RefSpec, DtypeSpec, NamespaceCatalog
from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset, is_zarr_array,
get_basic_array_info, generate_array_html_repr)
get_basic_array_info, generate_array_html_repr, _is_collection, _get_length)
from ..utils import NamespaceToBuilderHelper, WriteStatusTracker

ROOT_NAME = 'root'
Expand Down Expand Up @@ -805,10 +805,12 @@ def get_type(cls, data):
return H5_BINARY
elif isinstance(data, Container):
return H5_REF
elif not hasattr(data, '__len__'):
elif not _is_collection(data):
if isinstance(data, np.ndarray) and data.ndim == 0:
return data.dtype.type
return type(data)
else:
if len(data) == 0:
if _get_length(data) == 0:
if hasattr(data, 'dtype'):
return data.dtype
else:
Expand Down Expand Up @@ -1238,7 +1240,7 @@ def _filler():
dset = self.__setup_chunked_dset__(parent, name, data, options)
self.__dci_queue.append(dataset=dset, data=data)
# Write a regular in memory array (e.g., numpy array, list etc.)
elif hasattr(data, '__len__'):
elif _is_collection(data):
dset = self.__list_fill__(parent, name, data, options)
# Write a regular scalar dataset
else:
Expand Down
7 changes: 4 additions & 3 deletions src/hdmf/build/objectmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ..container import AbstractContainer, Data
from ..term_set import TermSetWrapper
from ..data_utils import DataIO, AbstractDataChunkIterator
from ..utils import _is_collection, _get_length
from ..query import ReferenceResolver
from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec
from ..spec.spec import BaseStorageSpec
Expand Down Expand Up @@ -1024,19 +1025,19 @@ def __is_reftype(self, data):
return False

tmp = data
while hasattr(tmp, '__len__') and not isinstance(tmp, (AbstractContainer, str, bytes)):
while _is_collection(tmp) and not isinstance(tmp, AbstractContainer):
tmptmp = None
for t in tmp:
# In case of a numeric array stop the iteration at the first element to avoid long-running loop
if isinstance(t, (int, float, complex, bool)):
break
if hasattr(t, '__len__') and len(t) > 0 and not isinstance(t, (AbstractContainer, str, bytes)):
if _is_collection(t) and _get_length(t) > 0 and not isinstance(t, AbstractContainer):
tmptmp = tmp[0]
break
if tmptmp is not None:
break
else:
if len(tmp) == 0:
if _get_length(tmp) == 0:
tmp = None
else:
tmp = tmp[0]
Expand Down
13 changes: 7 additions & 6 deletions src/hdmf/common/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from . import register_class, EXP_NAMESPACE
from ..container import Container, Data
from ..data_utils import DataIO, AbstractDataChunkIterator
from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged
from ..utils import (docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged,
_is_collection, _get_length)
from ..term_set import TermSetWrapper


Expand Down Expand Up @@ -294,7 +295,7 @@ def _validate_new_data(self, data):
(hasattr(data, "data") and isinstance(data.data, AbstractDataChunkIterator))):
if not np.issubdtype(data.dtype, np.integer):
raise ValueError("ElementIdentifiers must contain integers")
elif hasattr(data, "__len__") and len(data):
elif _is_collection(data) and _get_length(data):
self._validate_new_data_element(data[0])

def _validate_new_data_element(self, arg):
Expand Down Expand Up @@ -1464,8 +1465,8 @@ def from_dataframe(cls, **kwargs):
else:
columns.append({'name': col_name,
'description': column_descriptions.get(col_name, 'no description')})
if hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str):
lengths = [len(x) for x in df[col_name]]
if _is_collection(df[col_name].iloc[0]):
lengths = [_get_length(x) for x in df[col_name]]
if not lengths[1:] == lengths[:-1]:
columns[-1].update(index=True)

Expand Down Expand Up @@ -1780,8 +1781,8 @@ def _validate_new_data_element(self, arg):
def _uint_precision(elements):
""" Calculate the uint precision needed to encode a set of elements """
n_elements = elements
if hasattr(elements, '__len__'):
n_elements = len(elements)
if _is_collection(elements):
n_elements = _get_length(elements)
return np.dtype('uint%d' % (8 * max(1, int((2 ** np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))))).type


Expand Down
4 changes: 2 additions & 2 deletions src/hdmf/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator
from .utils import (docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict,
get_basic_array_info, generate_array_html_repr)
get_basic_array_info, generate_array_html_repr, _is_collection)

from .term_set import TermSet, TermSetWrapper

Expand Down Expand Up @@ -626,7 +626,7 @@ def __repr__(self):
template += "\nFields:\n"
for k in sorted(self.fields): # sorted to enable tests
v = self.fields[k]
if hasattr(v, '__len__'):
if _is_collection(v):
if isinstance(v, (np.ndarray, list, tuple)) or v:
template += " {}: {}\n".format(k, self.__smart_str(v, 1))
else:
Expand Down
6 changes: 3 additions & 3 deletions src/hdmf/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import h5py
import numpy as np

from .utils import docval, getargs, popargs, docval_macro, get_data_shape
from .utils import docval, getargs, popargs, docval_macro, get_data_shape, _is_collection, _get_length

def append_data(data, arg):
from hdmf.backends.hdf5.h5_utils import HDMFDataset
Expand Down Expand Up @@ -738,9 +738,9 @@ def maxshape(self):
# Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a
# chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if
# possible. Otherwise, use None to represent an unlimited size
if hasattr(self.data, '__len__') and self.iter_axis == 0:
if _is_collection(self.data) and self.iter_axis == 0:
# special case of 1-D array
self.__maxshape[0] = len(self.data)
self.__maxshape[0] = _get_length(self.data)
else:
self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis]
except AttributeError: # from self.data.shape
Expand Down
35 changes: 31 additions & 4 deletions src/hdmf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,9 +829,10 @@ def get_data_shape(data, strict_no_data_load=False):

def __get_shape_helper(local_data):
shape = list()
if hasattr(local_data, '__len__'):
shape.append(len(local_data))
if len(local_data):
if _is_collection(local_data):
length = _get_length(local_data)
shape.append(length)
if length:
el = next(iter(local_data))
# If local_data is a list/tuple of Data, do not iterate into the objects
if not isinstance(el, (str, bytes, Data)):
Expand All @@ -849,12 +850,38 @@ def __get_shape_helper(local_data):
return data.maxshape
if isinstance(data, dict):
return None
if hasattr(data, '__len__') and not isinstance(data, (str, bytes)):
if _is_collection(data):
if not strict_no_data_load or isinstance(data, (list, tuple, set)):
return __get_shape_helper(data)
return None


def _is_collection(data):
"""Check if data is a collection (array-like with elements) vs a scalar.

Checks ndim first because the Python array API standard requires conforming
arrays to have ndim and shape but does not require __len__. This handles
array libraries like zarr v3 that follow the standard. Falls back to
__len__ for plain Python containers (list, tuple). Strings and bytes
are treated as scalars.
"""
if isinstance(data, (str, bytes)):
return False
if hasattr(data, "ndim"):
return data.ndim > 0
return hasattr(data, "__len__")


def _get_length(data):
"""Get the length of the first dimension of a collection.

Uses shape[0] for array-like objects and len() for plain containers.
"""
if hasattr(data, "shape") and data.shape is not None:
return data.shape[0]
return len(data)


def pystr(s):
"""
Convert a string of characters to Python str object
Expand Down
9 changes: 7 additions & 2 deletions src/hdmf/validate/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from ..spec import Spec, AttributeSpec, GroupSpec, DatasetSpec, RefSpec, LinkSpec
from ..spec import SpecNamespace
from ..spec.spec import BaseStorageSpec, DtypeHelper
from ..utils import _is_collection, _get_length
from ..utils import docval, getargs, pystr, get_data_shape
from ..query import ReferenceResolver

Expand Down Expand Up @@ -198,7 +199,11 @@ def get_type(data, builder_dtype=None):
# Numpy bool data
elif isinstance(data, np.bool_):
return 'bool', None
if not hasattr(data, '__len__'):
# Numpy 0-d structured array with compound dtype (ndim=0 but has named fields)
if isinstance(data, np.ndarray) and data.ndim == 0 and len(data.dtype) > 1:
if builder_dtype and isinstance(builder_dtype, list):
return _get_type_compound_dtype(data, builder_dtype)
if not _is_collection(data):
if type(data) is float: # Python float is 64-bit
return 'float64', None
if type(data) is int: # Python int is 64-bit (or larger)
Expand All @@ -214,7 +219,7 @@ def get_type(data, builder_dtype=None):
return _get_type_from_dtype_attr(data, builder_dtype)

# If all else has failed, try to determine the datatype from the first element of the array
if len(data) > 0:
if _get_length(data) > 0:
return get_type(data[0], builder_dtype)
raise EmptyArrayError()

Expand Down
32 changes: 32 additions & 0 deletions tests/unit/test_io_hdf5_h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,21 @@ def test_write_dataset_scalar(self):
self.assertTupleEqual(dset.shape, ())
self.assertEqual(dset[()], a)

def test_write_dataset_0d_ndarray(self):
"""Regression: writing a 0-d ndarray as a scalar dataset should work.

A user computing a scalar attribute (e.g. a sampling rate) with an
array-API-conforming library gets a 0-d array back. Converting to
numpy via np.asarray() produces a 0-d ndarray, which they then pass
to hdmf to store as an HDF5 dataset. Previously this crashed because
the __len__ heuristic misidentified 0-d ndarrays as collections.
"""
sampling_rate = np.asarray(30000.0)
self.io.write_dataset(self.f, DatasetBuilder('test_dataset', sampling_rate, attributes={}))
dset = self.f['test_dataset']
self.assertTupleEqual(dset.shape, ())
self.assertEqual(dset[()], 30000.0)

def test_write_dataset_string(self):
a = 'test string'
self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, attributes={}))
Expand Down Expand Up @@ -3913,6 +3928,23 @@ def test_setup_empty_dset_create_exception(self):
with self.assertRaisesRegex(Exception, "Could not create dataset foo in /"):
HDF5IO.__setup_empty_dset__(self.f, 'foo', {'shape': (3, 3), 'dtype': 'float'})

def test_get_type_0d_ndarray(self):
"""Regression: get_type should handle 0-d ndarrays.

The Python array API standard requires reductions (mean, sum, etc.)
to return 0-d arrays, not scalars. When results from array-API-
conforming libraries (zarr v3, cupy, etc.) are converted to numpy
via np.asarray(), the result is a 0-d ndarray. A 0-d ndarray has
__len__ defined but len() raises TypeError, which previously
crashed get_type. We use np.asarray(scalar) to produce a 0-d
ndarray without requiring any array library as a test dependency.
"""
zero_d = np.asarray(3.14)
self.assertIsInstance(zero_d, np.ndarray)
self.assertEqual(zero_d.ndim, 0)
result = HDF5IO.get_type(zero_d)
self.assertIs(result, np.float64)


class H5DataIOTests(TestCase):

Expand Down
104 changes: 103 additions & 1 deletion tests/unit/utils_test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,109 @@
from hdmf.container import Data
from hdmf.data_utils import DataChunkIterator, DataIO
from hdmf.testing import TestCase
from hdmf.utils import get_data_shape, to_uint_array, is_newer_version
from hdmf.utils import get_data_shape, to_uint_array, is_newer_version, _is_collection, _get_length


class TestIsCollection(TestCase):
    """Unit tests for the _is_collection scalar-vs-collection detector."""

    def test_numpy_1d_array(self):
        arr = np.array([1, 2, 3])
        self.assertTrue(_is_collection(arr))

    def test_numpy_empty_array(self):
        self.assertTrue(_is_collection(np.array([])))

    def test_numpy_2d_array(self):
        arr = np.array([[1, 2], [3, 4]])
        self.assertTrue(_is_collection(arr))

    def test_numpy_0d_array(self):
        self.assertFalse(_is_collection(np.array(5)))

    def test_numpy_scalar(self):
        self.assertFalse(_is_collection(np.float64(3.14)))

    def test_list(self):
        self.assertTrue(_is_collection([1, 2, 3]))

    def test_empty_list(self):
        self.assertTrue(_is_collection([]))

    def test_tuple(self):
        self.assertTrue(_is_collection((1, 2, 3)))

    def test_set(self):
        self.assertTrue(_is_collection({1, 2, 3}))

    def test_int(self):
        self.assertFalse(_is_collection(42))

    def test_float(self):
        self.assertFalse(_is_collection(3.14))

    def test_string(self):
        self.assertFalse(_is_collection("hello"))

    def test_bytes(self):
        self.assertFalse(_is_collection(b"hello"))

    def test_none(self):
        self.assertFalse(_is_collection(None))

    def test_bool(self):
        self.assertFalse(_is_collection(True))

    def test_ndim_without_len(self):
        """A zarr-v3-like array: ndim and shape present, __len__ absent."""
        class ArrayLike:
            ndim = 2
            shape = (10, 5)
        self.assertTrue(_is_collection(ArrayLike()))

    def test_ndim_zero_without_len(self):
        """A 0-d array-like object: ndim == 0 and no __len__."""
        class ScalarLike:
            ndim = 0
            shape = ()
        self.assertFalse(_is_collection(ScalarLike()))

    def test_numpy_0d_array_not_collection(self):
        """Regression: numpy 0-d ndarrays expose __len__ yet len() raises TypeError.

        zarr v3 scalar indexing (e.g. zarr_array[0]) yields 0-d ndarrays.
        The former hasattr(data, '__len__') check wrongly classified these
        as collections and then crashed on len(data).
        """
        scalar_array = np.array(5.0)
        self.assertTrue(hasattr(scalar_array, '__len__'))  # legacy check would match
        with self.assertRaises(TypeError):
            len(scalar_array)  # ...even though len() is unusable
        self.assertFalse(_is_collection(scalar_array))  # helper treats it as a scalar



class TestGetLength(TestCase):
    """Unit tests for the _get_length first-dimension-length helper."""

    def test_list(self):
        self.assertEqual(3, _get_length([1, 2, 3]))

    def test_empty_list(self):
        self.assertEqual(0, _get_length([]))

    def test_tuple(self):
        self.assertEqual(2, _get_length((1, 2)))

    def test_numpy_array(self):
        arr = np.array([1, 2, 3, 4])
        self.assertEqual(4, _get_length(arr))

    def test_numpy_2d_array(self):
        arr = np.array([[1, 2], [3, 4], [5, 6]])
        self.assertEqual(3, _get_length(arr))

    def test_shape_without_len(self):
        """A zarr-v3-like object exposing shape but not __len__."""
        class ShapeOnly:
            shape = (10, 5)
        self.assertEqual(10, _get_length(ShapeOnly()))


class TestGetDataShape(TestCase):
Expand Down
Loading