1 change: 1 addition & 0 deletions .gitignore
@@ -16,3 +16,4 @@ sandbox/
venv/
venvs/
.DS_Store
uv.lock
5 changes: 5 additions & 0 deletions CLAUDE.md
@@ -31,6 +31,11 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
- Prefer specific exceptions over generic ones
- For CLI, use click library patterns
- Imports organized: stdlib, third-party, local (alphabetical within groups)
- **Imports must be at the top of the file** — do NOT place imports inside
functions or methods unless there is a concrete reason (a circular dependency,
or heavy transitive imports like `pynwb`/`h5py`/`nwbinspector` that would
slow down module load for unrelated code paths). When deferring an import
for this reason, add the comment `# Avoid heavy import by importing within function:`.

## Documentation
- Keep docstrings updated when changing function signatures
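To make the new guideline concrete, here is a minimal sketch of the deferred-import pattern it prescribes; the module and function below are hypothetical and not part of this diff:

from pathlib import Path  # stdlib import stays at the top as usual


def count_root_groups(filepath: Path) -> int:
    # Avoid heavy import by importing within function:
    import h5py

    with h5py.File(filepath, "r") as f:
        return len(f)  # number of top-level groups/datasets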
46 changes: 45 additions & 1 deletion dandi/files/bases.py
@@ -18,7 +18,7 @@
from dandischema.digests.dandietag import DandiETag
from dandischema.models import BareAsset, CommonModel
from dandischema.models import Dandiset as DandisetMeta
from dandischema.models import get_schema_version
from dandischema.models import StandardsType, get_schema_version, nwb_standard
from packaging.version import Version
from pydantic import ValidationError
from pydantic_core import ErrorDetails
@@ -41,6 +41,20 @@
Validator,
)

# True when the installed dandischema exposes the per-asset dataStandard field
# and related StandardsType enhancements (version, extensions). All these
# fields ship together starting with dandischema 0.12.2.
# TODO: remove this guard (and all branches that check it) once the minimum
# required dandischema version is >= 0.12.2.
_SCHEMA_BAREASSET_HAS_DATASTANDARD = "dataStandard" in BareAsset.model_fields
if not _SCHEMA_BAREASSET_HAS_DATASTANDARD and Version(
dandischema.__version__
) >= Version("0.12.2"):
raise RuntimeError(
f"dandischema {dandischema.__version__} should have "
f"'dataStandard' field on BareAsset"
)

lgr = dandi.get_logger()

# TODO -- should come from schema. This is just a simplistic example for now
@@ -504,6 +518,36 @@ def get_metadata(
else:
raise
metadata.path = self.path
if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
kwargs: dict[str, Any] = dict(nwb_standard)
# Avoid heavy import by importing within function:
from dandi.pynwb_utils import get_nwb_extensions, get_nwb_version

if nwb_ver := get_nwb_version(self.filepath):
kwargs["version"] = nwb_ver
try:
nwb_exts = get_nwb_extensions(self.filepath)
except Exception:
lgr.debug(
"Failed to extract NWB extensions from %s",
self.filepath,
exc_info=True,
)
nwb_exts = {}
if nwb_exts:
kwargs["extensions"] = [
StandardsType(name=name, version=ver).model_dump( # type: ignore[call-arg]
mode="json", exclude_none=True
)
for name, ver in sorted(nwb_exts.items())
]
nwb = StandardsType(**kwargs)
# TODO: use metadata.dataStandard directly once min dandischema >= 0.12.2
cur = getattr(metadata, "dataStandard", None)
if cur is None:
setattr(metadata, "dataStandard", [nwb])
elif nwb not in cur:
cur.append(nwb)
return metadata

# TODO: @validate_cache.memoize_path
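The module-level guard above is a feature-detection idiom: it probes the installed dandischema model for the optional field and uses the version comparison only as a sanity check, not as the primary switch. A minimal standalone sketch, assuming only pydantic v2's model_fields mapping on BareAsset:

from dandischema.models import BareAsset

# Detect the optional per-asset field by inspecting the model itself,
# so behavior tracks what is actually installed, not a version string.
_HAS_DATASTANDARD = "dataStandard" in BareAsset.model_fields

if _HAS_DATASTANDARD:
    print("dandischema exposes per-asset dataStandard")
else:
    print("older dandischema; dataStandard will not be populated")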
88 changes: 85 additions & 3 deletions dandi/files/bids.py
@@ -3,15 +3,32 @@
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime
import json
from pathlib import Path
from threading import Lock
import weakref

from dandischema.models import BareAsset
from dandischema.models import (
BareAsset,
StandardsType,
bids_standard,
ome_ngff_standard,
)

import dandi
from dandi.bids_validator_deno import bids_validate

from .bases import GenericAsset, LocalFileAsset, NWBAsset
from .bases import (
_SCHEMA_BAREASSET_HAS_DATASTANDARD,
GenericAsset,
LocalFileAsset,
NWBAsset,
)
Comment on lines +21 to +26

Check notice — Code scanning / CodeQL: Cyclic import (Note)
Import of module dandi.files.bases begins an import cycle.

Copilot Autofix (AI, 4 days ago):

In general, the clean way to break an import cycle is to avoid having two modules that directly depend on each other’s runtime definitions. For class hierarchies, this usually means either (1) moving shared abstractions into a third, lower-level module, or (2) converting some imports to type-only imports using typing.TYPE_CHECKING, so the runtime import no longer occurs.

Within dandi/files/bids.py, we can break the cycle without changing behavior by changing how we import from .bases. The key is to avoid importing GenericAsset (a base abstraction that should stay independent) at runtime, while ensuring that any references to it remain valid for static typing. Since Python 3.7+ supports postponed evaluation of annotations via from __future__ import annotations, we can safely import GenericAsset only under TYPE_CHECKING. At runtime, code in bids.py does not need the GenericAsset name unless it is used directly (e.g., for isinstance checks); if it is only used in type hints, moving it under TYPE_CHECKING removes the cycle.

Concretely, in dandi/files/bids.py we will:

  • Keep importing LocalFileAsset and NWBAsset from .bases at runtime.
  • Stop importing GenericAsset at runtime.
  • Add a TYPE_CHECKING guarded import for GenericAsset so type checkers still see it.

This involves:

  • Adding from typing import TYPE_CHECKING to the imports.
  • Modifying the .bases import block so that GenericAsset is no longer in the runtime import.
  • Adding a small if TYPE_CHECKING: block that imports GenericAsset from .bases for typing only.

No functionality changes for executed code, but the runtime import graph changes so that bids.py no longer pulls in GenericAsset during module initialization, thus breaking or at least relaxing the cycle.

Suggested changeset 1 — dandi/files/bids.py
Run the following command in your local git repository to apply this patch:
cat << 'EOF' | git apply
diff --git a/dandi/files/bids.py b/dandi/files/bids.py
--- a/dandi/files/bids.py
+++ b/dandi/files/bids.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 from threading import Lock
 import weakref
+from typing import TYPE_CHECKING
 
 from dandischema.models import (
     BareAsset,
@@ -20,11 +21,13 @@
 
 from .bases import (
     _SCHEMA_BAREASSET_HAS_DATASTANDARD,
-    GenericAsset,
     LocalFileAsset,
     NWBAsset,
 )
 
+if TYPE_CHECKING:
+    from .bases import GenericAsset
+
 if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
     from dandischema.models import hed_standard  # type: ignore[attr-defined]
 else:
EOF

if _SCHEMA_BAREASSET_HAS_DATASTANDARD:
from dandischema.models import hed_standard # type: ignore[attr-defined]
else:
hed_standard = None # type: ignore[assignment]
from .zarr import ZarrAsset
from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file
from ..metadata.core import add_common_metadata, prepare_metadata
@@ -23,10 +40,32 @@
ValidationResult,
)

lgr = dandi.get_logger()

BIDS_ASSET_ERRORS = ("BIDS.NON_BIDS_PATH_PLACEHOLDER",)
BIDS_DATASET_ERRORS = ("BIDS.MANDATORY_FILE_MISSING_PLACEHOLDER",)


def _add_standard(
Member comment: line 542 of bases appears to have similar logic

metadata: BareAsset,
standard_dict: dict,
version: str | None = None,
) -> None:
"""Add a data standard to asset metadata if the field is available."""
if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
return
kwargs = dict(standard_dict)
if version:
kwargs["version"] = version
standard = StandardsType(**kwargs)
# TODO: use metadata.dataStandard directly once min dandischema >= 0.12.2
cur = getattr(metadata, "dataStandard", None)
if cur is None:
setattr(metadata, "dataStandard", [standard])
elif standard not in cur:
cur.append(standard)


@dataclass
class BIDSDatasetDescriptionAsset(LocalFileAsset):
"""
@@ -192,7 +231,48 @@
assert self._dataset_errors is not None
return self._dataset_errors.copy()

# get_metadata(): inherit use of default metadata from LocalFileAsset
def get_metadata(
self,
digest: Digest | None = None,
ignore_errors: bool = True,
) -> BareAsset:
metadata = super().get_metadata(digest=digest, ignore_errors=ignore_errors)
try:
with open(self.filepath) as f:
desc = json.load(f)
except (OSError, json.JSONDecodeError) as e:
lgr.warning("Failed to read %s: %s", self.filepath, e)
_add_standard(metadata, bids_standard)
return metadata
_add_standard(metadata, bids_standard, version=desc.get("BIDSVersion"))
if hed_standard and (hed_version := desc.get("HEDVersion")):
# HEDVersion can be a string or list.
# List form: ["8.2.0", "sc:1.0.0"] where first element is base
# HED version and subsequent "prefix:version" entries are library
# schemas recorded as extensions.
if isinstance(hed_version, list):
version = hed_version[0] if hed_version else None
library_entries = hed_version[1:]
else:
version = hed_version
library_entries = []
kwargs: dict = dict(hed_standard)
if version:
kwargs["version"] = version
if library_entries:
extensions = []
for entry in library_entries:
# Format is "prefix:version" (e.g. "sc:1.0.0")
if ":" in str(entry):
lib_name, lib_ver = str(entry).split(":", 1)
else:
lib_name, lib_ver = str(entry), None
ext = StandardsType(name=lib_name, version=lib_ver) # type: ignore[call-arg]
extensions.append(ext.model_dump(mode="json", exclude_none=True))
if extensions:
kwargs["extensions"] = extensions
_add_standard(metadata, kwargs)
return metadata


@dataclass
@@ -312,6 +392,8 @@
add_common_metadata(metadata, self.filepath, start_time, end_time, digest)
metadata.path = self.path
metadata.encodingFormat = ZARR_MIME_TYPE
if Path(self.path).suffixes == [".ome", ".zarr"]:
_add_standard(metadata, ome_ngff_standard)
return metadata


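A worked example of the HEDVersion handling implemented above, using an illustrative input; this sketch parses into plain dicts rather than constructing StandardsType models:

# Illustrative input: base HED schema plus two library schemas.
hed_version = ["8.2.0", "sc:1.0.0", "lang:1.1.0"]

if isinstance(hed_version, list):
    version = hed_version[0] if hed_version else None
    library_entries = hed_version[1:]
else:
    version, library_entries = hed_version, []

extensions = []
for entry in library_entries:
    # "prefix:version" entries name HED library schemas
    lib_name, _, lib_ver = str(entry).partition(":")
    extensions.append({"name": lib_name, "version": lib_ver or None})

assert version == "8.2.0"
assert [e["name"] for e in extensions] == ["sc", "lang"]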
43 changes: 43 additions & 0 deletions dandi/pynwb_utils.py
@@ -152,6 +152,49 @@ def _sanitize(v: Any) -> str:
return None


# Namespaces bundled with NWB/HDMF core — not extensions
_NWB_CORE_NAMESPACES = frozenset({"core", "hdmf-common", "hdmf-experimental"})


def get_nwb_extensions(filepath: str | Path | Readable) -> dict[str, str]:
"""Return NWB extensions embedded in an HDF5 file.
Reads the ``specifications`` group of an NWB HDF5 file and returns a
mapping of extension namespace names to their latest embedded version,
excluding core NWB/HDMF namespaces.
Parameters
----------
filepath
Path to an NWB ``.nwb`` HDF5 file, or a :class:`Readable`.
Returns
-------
dict[str, str]
``{namespace_name: latest_version}`` for each non-core namespace
found in the file's ``specifications`` group. Empty dict if no
extensions are present or the group does not exist.
"""
extensions: dict[str, str] = {}
with open_readable(filepath) as fp, h5py.File(fp, "r") as h5file:
specs = h5file.get("specifications")
if specs is None:
return extensions
for name in specs:
if name in _NWB_CORE_NAMESPACES:
continue
ns_group = specs[name]
if not isinstance(ns_group, h5py.Group):
continue
try:
sorted_versions = sorted(ns_group, key=Version)
if sorted_versions:
extensions[name] = sorted_versions[-1]
except Exception:
lgr.debug("Failed to parse versions for NWB extension %s", name)
return extensions


def get_neurodata_types_to_modalities_map() -> dict[str, str]:
"""Return a dict to map neurodata types known to pynwb to "modalities"
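A hedged usage sketch for the new helper; the file name and the extension shown in the comment are placeholders, not outputs from a real file:

from dandi.pynwb_utils import get_nwb_extensions

# Core namespaces (core, hdmf-common, hdmf-experimental) are filtered
# out, so only true extensions appear in the result.
exts = get_nwb_extensions("sub-01.nwb")  # hypothetical path
for name, version in sorted(exts.items()):
    print(f"{name}: {version}")  # e.g. "ndx-events: 0.2.0" (illustrative)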
89 changes: 89 additions & 0 deletions dandi/tests/test_files.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import json
from operator import attrgetter
import os
from pathlib import Path
@@ -29,6 +30,7 @@
dandi_file,
find_dandi_files,
)
from ..files.bases import _SCHEMA_BAREASSET_HAS_DATASTANDARD

lgr = get_logger()

@@ -608,3 +610,90 @@ def test_validate_invalid_zarr3(path: str, expected_result_ids: set[str]) -> None:

result_ids = {r.id for r in zf.get_validation_errors()}
assert result_ids == expected_result_ids


@pytest.mark.ai_generated
class TestBIDSDatasetDescriptionDataStandard:
"""Tests for per-asset dataStandard population from dataset_description.json"""

@staticmethod
def _make_bids_dd(tmp_path: Path, content: dict) -> BIDSDatasetDescriptionAsset:
dd_path = tmp_path / "dataset_description.json"
dd_path.write_text(json.dumps(content))
return BIDSDatasetDescriptionAsset(
filepath=dd_path,
path="dataset_description.json",
dandiset_path=tmp_path,
)

@staticmethod
def _standard_names(metadata): # type: ignore[no-untyped-def]
if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
pytest.skip("dandischema too old, no dataStandard on BareAsset")
return [s.name for s in (metadata.dataStandard or [])]

def test_bids_always_set(self, tmp_path: Path) -> None:
asset = self._make_bids_dd(
tmp_path,
{"Name": "Test", "BIDSVersion": "1.9.0"},
)
names = self._standard_names(asset.get_metadata())
assert "Brain Imaging Data Structure (BIDS)" in names

def test_hed_detected_when_hedversion_present(self, tmp_path: Path) -> None:
asset = self._make_bids_dd(
tmp_path,
{"Name": "Test", "BIDSVersion": "1.9.0", "HEDVersion": "8.2.0"},
)
names = self._standard_names(asset.get_metadata())
assert "Hierarchical Event Descriptors (HED)" in names
assert "Brain Imaging Data Structure (BIDS)" in names

def test_hed_not_detected_when_hedversion_absent(self, tmp_path: Path) -> None:
asset = self._make_bids_dd(
tmp_path,
{"Name": "Test", "BIDSVersion": "1.9.0"},
)
names = self._standard_names(asset.get_metadata())
assert "Hierarchical Event Descriptors (HED)" not in names

def test_hed_detected_with_list_hedversion(self, tmp_path: Path) -> None:
"""HEDVersion can be a list of strings per BIDS spec."""
asset = self._make_bids_dd(
tmp_path,
{
"Name": "Test",
"BIDSVersion": "1.9.0",
"HEDVersion": ["8.2.0", "sc:1.0.0"],
},
)
names = self._standard_names(asset.get_metadata())
assert "Hierarchical Event Descriptors (HED)" in names

def test_hed_library_schemas_as_extensions(self, tmp_path: Path) -> None:
"""HED library schemas in list HEDVersion populate extensions."""
if not _SCHEMA_BAREASSET_HAS_DATASTANDARD:
pytest.skip("dandischema too old, no dataStandard on BareAsset")
asset = self._make_bids_dd(
tmp_path,
{
"Name": "Test",
"BIDSVersion": "1.9.0",
"HEDVersion": ["8.2.0", "sc:1.0.0", "lang:1.1.0"],
},
)
metadata = asset.get_metadata()
hed_standards = [
# TODO: use metadata.dataStandard directly once min dandischema >= 0.12.2
s
for s in (getattr(metadata, "dataStandard", None) or [])
if s.name == "Hierarchical Event Descriptors (HED)"
]
assert len(hed_standards) == 1
hed = hed_standards[0]
assert hed.version == "8.2.0"
assert hed.extensions is not None
ext_names = {e.name for e in hed.extensions}
assert ext_names == {"sc", "lang"}
ext_map = {e.name: e.version for e in hed.extensions}
assert ext_map == {"sc": "1.0.0", "lang": "1.1.0"}