Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# HDMF Changelog

## HDMF 5.0.1 (Upcoming)

### Enhancements
- Replaced `hasattr(data, "__len__")` checks with `ndim`-based collection detection via new `_is_collection` and `_get_length` helpers in `hdmf.utils`. This fixes compatibility with array libraries that follow the Python array API standard (e.g., zarr v3), which provide `ndim` and `shape` but not `__len__`. @h-mayorquin [#1414](https://github.com/hdmf-dev/hdmf/pull/1414)


## HDMF 5.0.0 (March 2, 2026)

### Changed
Expand Down
10 changes: 6 additions & 4 deletions src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from ...data_utils import AbstractDataChunkIterator
from ...spec import RefSpec, DtypeSpec, NamespaceCatalog
from ...utils import (docval, getargs, popargs, get_data_shape, get_docval, StrDataset, is_zarr_array,
get_basic_array_info, generate_array_html_repr)
get_basic_array_info, generate_array_html_repr, _is_collection, _get_length)
from ..utils import NamespaceToBuilderHelper, WriteStatusTracker

ROOT_NAME = 'root'
Expand Down Expand Up @@ -805,10 +805,12 @@ def get_type(cls, data):
return H5_BINARY
elif isinstance(data, Container):
return H5_REF
elif not hasattr(data, '__len__'):
elif not _is_collection(data):
if isinstance(data, np.ndarray) and data.ndim == 0:
return data.dtype.type
return type(data)
else:
if len(data) == 0:
if _get_length(data) == 0:
if hasattr(data, 'dtype'):
return data.dtype
else:
Expand Down Expand Up @@ -1238,7 +1240,7 @@ def _filler():
dset = self.__setup_chunked_dset__(parent, name, data, options)
self.__dci_queue.append(dataset=dset, data=data)
# Write a regular in memory array (e.g., numpy array, list etc.)
elif hasattr(data, '__len__'):
elif _is_collection(data):
dset = self.__list_fill__(parent, name, data, options)
# Write a regular scalar dataset
else:
Expand Down
7 changes: 4 additions & 3 deletions src/hdmf/build/objectmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ..container import AbstractContainer, Data
from ..term_set import TermSetWrapper
from ..data_utils import DataIO, AbstractDataChunkIterator
from ..utils import _is_collection, _get_length
from ..query import ReferenceResolver
from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec
from ..spec.spec import BaseStorageSpec
Expand Down Expand Up @@ -1024,19 +1025,19 @@ def __is_reftype(self, data):
return False

tmp = data
while hasattr(tmp, '__len__') and not isinstance(tmp, (AbstractContainer, str, bytes)):
while _is_collection(tmp) and not isinstance(tmp, AbstractContainer):
tmptmp = None
for t in tmp:
# In case of a numeric array stop the iteration at the first element to avoid long-running loop
if isinstance(t, (int, float, complex, bool)):
break
if hasattr(t, '__len__') and len(t) > 0 and not isinstance(t, (AbstractContainer, str, bytes)):
if _is_collection(t) and _get_length(t) > 0 and not isinstance(t, AbstractContainer):
tmptmp = tmp[0]
break
if tmptmp is not None:
break
else:
if len(tmp) == 0:
if _get_length(tmp) == 0:
tmp = None
else:
tmp = tmp[0]
Expand Down
13 changes: 7 additions & 6 deletions src/hdmf/common/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from . import register_class, EXP_NAMESPACE
from ..container import Container, Data
from ..data_utils import DataIO, AbstractDataChunkIterator
from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged
from ..utils import (docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged,
_is_collection, _get_length)
from ..term_set import TermSetWrapper


Expand Down Expand Up @@ -294,7 +295,7 @@ def _validate_new_data(self, data):
(hasattr(data, "data") and isinstance(data.data, AbstractDataChunkIterator))):
if not np.issubdtype(data.dtype, np.integer):
raise ValueError("ElementIdentifiers must contain integers")
elif hasattr(data, "__len__") and len(data):
elif _is_collection(data) and _get_length(data):
self._validate_new_data_element(data[0])

def _validate_new_data_element(self, arg):
Expand Down Expand Up @@ -1464,8 +1465,8 @@ def from_dataframe(cls, **kwargs):
else:
columns.append({'name': col_name,
'description': column_descriptions.get(col_name, 'no description')})
if hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str):
lengths = [len(x) for x in df[col_name]]
if _is_collection(df[col_name].iloc[0]):
lengths = [_get_length(x) for x in df[col_name]]
if not lengths[1:] == lengths[:-1]:
columns[-1].update(index=True)

Expand Down Expand Up @@ -1780,8 +1781,8 @@ def _validate_new_data_element(self, arg):
def _uint_precision(elements):
""" Calculate the uint precision needed to encode a set of elements """
n_elements = elements
if hasattr(elements, '__len__'):
n_elements = len(elements)
if _is_collection(elements):
n_elements = _get_length(elements)
return np.dtype('uint%d' % (8 * max(1, int((2 ** np.ceil((np.ceil(np.log2(n_elements)) - 8) / 8)))))).type


Expand Down
4 changes: 2 additions & 2 deletions src/hdmf/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from .data_utils import DataIO, append_data, extend_data, AbstractDataChunkIterator
from .utils import (docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict,
get_basic_array_info, generate_array_html_repr)
get_basic_array_info, generate_array_html_repr, _is_collection)

from .term_set import TermSet, TermSetWrapper

Expand Down Expand Up @@ -626,7 +626,7 @@ def __repr__(self):
template += "\nFields:\n"
for k in sorted(self.fields): # sorted to enable tests
v = self.fields[k]
if hasattr(v, '__len__'):
if _is_collection(v):
if isinstance(v, (np.ndarray, list, tuple)) or v:
template += " {}: {}\n".format(k, self.__smart_str(v, 1))
else:
Expand Down
6 changes: 3 additions & 3 deletions src/hdmf/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import h5py
import numpy as np

from .utils import docval, getargs, popargs, docval_macro, get_data_shape
from .utils import docval, getargs, popargs, docval_macro, get_data_shape, _is_collection, _get_length

def append_data(data, arg):
from hdmf.backends.hdf5.h5_utils import HDMFDataset
Expand Down Expand Up @@ -738,9 +738,9 @@ def maxshape(self):
# Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a
# chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if
# possible. Otherwise, use None to represent an unlimited size
if hasattr(self.data, '__len__') and self.iter_axis == 0:
if _is_collection(self.data) and self.iter_axis == 0:
# special case of 1-D array
self.__maxshape[0] = len(self.data)
self.__maxshape[0] = _get_length(self.data)
else:
self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis]
except AttributeError: # from self.data.shape
Expand Down
35 changes: 31 additions & 4 deletions src/hdmf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,9 +829,10 @@ def get_data_shape(data, strict_no_data_load=False):

def __get_shape_helper(local_data):
shape = list()
if hasattr(local_data, '__len__'):
shape.append(len(local_data))
if len(local_data):
if _is_collection(local_data):
length = _get_length(local_data)
shape.append(length)
if length:
el = next(iter(local_data))
# If local_data is a list/tuple of Data, do not iterate into the objects
if not isinstance(el, (str, bytes, Data)):
Expand All @@ -849,12 +850,38 @@ def __get_shape_helper(local_data):
return data.maxshape
if isinstance(data, dict):
return None
if hasattr(data, '__len__') and not isinstance(data, (str, bytes)):
if _is_collection(data):
if not strict_no_data_load or isinstance(data, (list, tuple, set)):
return __get_shape_helper(data)
return None


def _is_collection(data):
"""Check if data is a collection (array-like with elements) vs a scalar.

Checks ndim first because the Python array API standard requires conforming
arrays to have ndim and shape but does not require __len__. This handles
array libraries like zarr v3 that follow the standard. Falls back to
__len__ for plain Python containers (list, tuple). Strings and bytes
are treated as scalars.
"""
if isinstance(data, (str, bytes)):
return False
if hasattr(data, "ndim"):
return data.ndim > 0
return hasattr(data, "__len__")


def _get_length(data):
"""Get the length of the first dimension of a collection.

Uses shape[0] for array-like objects and len() for plain containers.
"""
if hasattr(data, "shape") and data.shape is not None:
return data.shape[0]
return len(data)


def pystr(s):
"""
Convert a string of characters to Python str object
Expand Down
9 changes: 7 additions & 2 deletions src/hdmf/validate/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from ..spec import Spec, AttributeSpec, GroupSpec, DatasetSpec, RefSpec, LinkSpec
from ..spec import SpecNamespace
from ..spec.spec import BaseStorageSpec, DtypeHelper
from ..utils import _is_collection, _get_length
from ..utils import docval, getargs, pystr, get_data_shape
from ..query import ReferenceResolver

Expand Down Expand Up @@ -198,7 +199,11 @@ def get_type(data, builder_dtype=None):
# Numpy bool data
elif isinstance(data, np.bool_):
return 'bool', None
if not hasattr(data, '__len__'):
# Numpy 0-d structured array with compound dtype (ndim=0 but has named fields)
if isinstance(data, np.ndarray) and data.ndim == 0 and len(data.dtype) > 1:
if builder_dtype and isinstance(builder_dtype, list):
return _get_type_compound_dtype(data, builder_dtype)
if not _is_collection(data):
if type(data) is float: # Python float is 64-bit
return 'float64', None
if type(data) is int: # Python int is 64-bit (or larger)
Expand All @@ -214,7 +219,7 @@ def get_type(data, builder_dtype=None):
return _get_type_from_dtype_attr(data, builder_dtype)

# If all else has failed, try to determine the datatype from the first element of the array
if len(data) > 0:
if _get_length(data) > 0:
return get_type(data[0], builder_dtype)
raise EmptyArrayError()

Expand Down
32 changes: 32 additions & 0 deletions tests/unit/test_io_hdf5_h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,21 @@ def test_write_dataset_scalar(self):
self.assertTupleEqual(dset.shape, ())
self.assertEqual(dset[()], a)

def test_write_dataset_0d_ndarray(self):
"""Regression: writing a 0-d ndarray as a scalar dataset should work.

A user computing a scalar attribute (e.g. a sampling rate) with an
array-API-conforming library gets a 0-d array back. Converting to
numpy via np.asarray() produces a 0-d ndarray, which they then pass
to hdmf to store as an HDF5 dataset. Previously this crashed because
the __len__ heuristic misidentified 0-d ndarrays as collections.
"""
sampling_rate = np.asarray(30000.0)
self.io.write_dataset(self.f, DatasetBuilder('test_dataset', sampling_rate, attributes={}))
dset = self.f['test_dataset']
self.assertTupleEqual(dset.shape, ())
self.assertEqual(dset[()], 30000.0)

def test_write_dataset_string(self):
a = 'test string'
self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a, attributes={}))
Expand Down Expand Up @@ -3913,6 +3928,23 @@ def test_setup_empty_dset_create_exception(self):
with self.assertRaisesRegex(Exception, "Could not create dataset foo in /"):
HDF5IO.__setup_empty_dset__(self.f, 'foo', {'shape': (3, 3), 'dtype': 'float'})

def test_get_type_0d_ndarray(self):
"""Regression: get_type should handle 0-d ndarrays.

The Python array API standard requires reductions (mean, sum, etc.)
to return 0-d arrays, not scalars. When results from array-API-
conforming libraries (zarr v3, cupy, etc.) are converted to numpy
via np.asarray(), the result is a 0-d ndarray. A 0-d ndarray has
__len__ defined but len() raises TypeError, which previously
crashed get_type. We use np.asarray(scalar) to produce a 0-d
ndarray without requiring any array library as a test dependency.
"""
zero_d = np.asarray(3.14)
self.assertIsInstance(zero_d, np.ndarray)
self.assertEqual(zero_d.ndim, 0)
result = HDF5IO.get_type(zero_d)
self.assertIs(result, np.float64)


class H5DataIOTests(TestCase):

Expand Down
104 changes: 103 additions & 1 deletion tests/unit/utils_test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,109 @@
from hdmf.container import Data
from hdmf.data_utils import DataChunkIterator, DataIO
from hdmf.testing import TestCase
from hdmf.utils import get_data_shape, to_uint_array, is_newer_version
from hdmf.utils import get_data_shape, to_uint_array, is_newer_version, _is_collection, _get_length


class TestIsCollection(TestCase):
    """Unit tests for the _is_collection scalar-vs-collection detector."""

    def test_numpy_1d_array(self):
        arr = np.array([1, 2, 3])
        self.assertTrue(_is_collection(arr))

    def test_numpy_empty_array(self):
        self.assertTrue(_is_collection(np.array([])))

    def test_numpy_2d_array(self):
        arr = np.array([[1, 2], [3, 4]])
        self.assertTrue(_is_collection(arr))

    def test_numpy_0d_array(self):
        self.assertFalse(_is_collection(np.array(5)))

    def test_numpy_scalar(self):
        self.assertFalse(_is_collection(np.float64(3.14)))

    def test_list(self):
        self.assertTrue(_is_collection([1, 2, 3]))

    def test_empty_list(self):
        self.assertTrue(_is_collection([]))

    def test_tuple(self):
        self.assertTrue(_is_collection((1, 2, 3)))

    def test_set(self):
        self.assertTrue(_is_collection({1, 2, 3}))

    def test_int(self):
        self.assertFalse(_is_collection(42))

    def test_float(self):
        self.assertFalse(_is_collection(3.14))

    def test_string(self):
        self.assertFalse(_is_collection("hello"))

    def test_bytes(self):
        self.assertFalse(_is_collection(b"hello"))

    def test_none(self):
        self.assertFalse(_is_collection(None))

    def test_bool(self):
        self.assertFalse(_is_collection(True))

    def test_ndim_without_len(self):
        """A zarr-v3-like array: ndim and shape present, __len__ absent."""
        class ArrayLike:
            ndim = 2
            shape = (10, 5)
        self.assertTrue(_is_collection(ArrayLike()))

    def test_ndim_zero_without_len(self):
        """A 0-d array-like object: ndim == 0 and no __len__."""
        class ScalarLike:
            ndim = 0
            shape = ()
        self.assertFalse(_is_collection(ScalarLike()))

    def test_numpy_0d_array_not_collection(self):
        """Regression: numpy 0-d ndarrays expose __len__ yet len() raises TypeError.

        zarr v3 scalar indexing (e.g. zarr_array[0]) yields 0-d ndarrays.
        The former hasattr(data, '__len__') check wrongly classified these
        as collections and then crashed on len(data).
        """
        scalar_array = np.array(5.0)
        self.assertTrue(hasattr(scalar_array, '__len__'))  # legacy check would match
        with self.assertRaises(TypeError):
            len(scalar_array)  # ...even though len() is unusable
        self.assertFalse(_is_collection(scalar_array))  # helper treats it as a scalar



class TestGetLength(TestCase):
    """Unit tests for the _get_length first-dimension-length helper."""

    def test_list(self):
        self.assertEqual(3, _get_length([1, 2, 3]))

    def test_empty_list(self):
        self.assertEqual(0, _get_length([]))

    def test_tuple(self):
        self.assertEqual(2, _get_length((1, 2)))

    def test_numpy_array(self):
        arr = np.array([1, 2, 3, 4])
        self.assertEqual(4, _get_length(arr))

    def test_numpy_2d_array(self):
        arr = np.array([[1, 2], [3, 4], [5, 6]])
        self.assertEqual(3, _get_length(arr))

    def test_shape_without_len(self):
        """A zarr-v3-like object exposing shape but not __len__."""
        class ShapeOnly:
            shape = (10, 5)
        self.assertEqual(10, _get_length(ShapeOnly()))


class TestGetDataShape(TestCase):
Expand Down
Loading