diff --git a/python/lib/sift_py/yaml/_utils_test.py b/python/lib/sift_py/yaml/_utils_test.py
new file mode 100644
index 000000000..279fedc8b
--- /dev/null
+++ b/python/lib/sift_py/yaml/_utils_test.py
@@ -0,0 +1,109 @@
+"""Exercises the three YAML loader paths in :mod:`sift_py.yaml.utils`.
+
+The three paths, in priority order inside :func:`try_fast_yaml_load`:
+
+1. ``rapidyaml`` (``ryml``) -- C++ bindings, declared as a required dependency.
+2. pyyaml ``CSafeLoader`` -- libyaml-backed, available on platforms whose
+   pyyaml wheel ships libyaml (i.e. almost all of them).
+3. pyyaml ``safe_load`` -- pure-Python fallback used when neither libyaml nor
+   rapidyaml is installed.
+
+Each path should produce an identical dict for the shipped example ingest
+config, including correct resolution of anchors (``&foo`` / ``*foo``) and
+YAML 1.1 merge keys (``<<: *foo``) used heavily by that config.
+"""
+
+from pathlib import Path
+
+import pytest
+import yaml
+
+from sift_py.yaml import utils
+
+EXAMPLE_CONFIG = Path(__file__).resolve().parents[3] / "examples" / "telemetry_config.example.yml"
+
+
+def _force_pure_python_pyyaml(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Hide ``yaml.CSafeLoader`` so ``_pyyaml_load`` falls through to ``safe_load``."""
+    monkeypatch.delattr(yaml, "CSafeLoader", raising=False)
+
+
+def _assert_sift_ingest_shape(config: dict) -> None:
+    """Sanity-check that *config* is a well-formed Sift ingest YAML dict.
+
+    Focuses on fields that a miswired loader would get wrong: anchors resolving
+    to inline dicts, merge keys (``<<:``) being expanded rather than left as
+    literal ``"<<"`` keys, and numeric scalars surviving the round-trip as
+    ints rather than strings.
+    """
+    assert isinstance(config, dict)
+    assert isinstance(config.get("asset_name"), str)
+    assert isinstance(config["channels"], dict)
+    assert isinstance(config["flows"], list)
+    assert config["flows"], "example config should declare at least one flow"
+
+    for flow in config["flows"]:
+        assert isinstance(flow["name"], str)
+        for ch in flow.get("channels", []):
+            if not isinstance(ch, dict):
+                continue
+            # Merge keys must be inlined, not left as a literal "<<" key.
+            assert "<<" not in ch, f"unresolved merge key in flow channel: {ch!r}"
+            # Numeric fields (enum_types[].key, bit_field_elements[].index and
+            # .bit_count) must come back as int, not str.
+            for enum in ch.get("enum_types", []) or []:
+                assert isinstance(enum.get("key"), int), f"enum key not int: {enum!r}"
+            for bit in ch.get("bit_field_elements", []) or []:
+                assert isinstance(bit.get("index"), int), f"bit index not int: {bit!r}"
+                assert isinstance(bit.get("bit_count"), int), f"bit_count not int: {bit!r}"
+
+
+def test_rapidyaml_load_path():
+    """Primary path: rapidyaml produces a schema-valid ingest dict."""
+    pytest.importorskip("ryml")
+    result = utils._rapidyaml_load(EXAMPLE_CONFIG)
+    _assert_sift_ingest_shape(result)
+
+
+def test_pyyaml_csafeloader_path():
+    """Fallback path 1: pyyaml's libyaml-backed ``CSafeLoader``."""
+    if not hasattr(yaml, "CSafeLoader"):
+        pytest.skip("libyaml/CSafeLoader not installed in this environment")
+    result = utils._pyyaml_load(EXAMPLE_CONFIG)
+    _assert_sift_ingest_shape(result)
+
+
+def test_pyyaml_pure_python_path(monkeypatch: pytest.MonkeyPatch):
+    """Fallback path 2: pure-Python ``safe_load`` with ``CSafeLoader`` hidden."""
+    _force_pure_python_pyyaml(monkeypatch)
+    assert not hasattr(yaml, "CSafeLoader"), "monkeypatch failed to hide CSafeLoader"
+    result = utils._pyyaml_load(EXAMPLE_CONFIG)
+    _assert_sift_ingest_shape(result)
+
+
+def test_all_three_loaders_agree(monkeypatch: pytest.MonkeyPatch):
+    """rapidyaml, CSafeLoader, and pure-Python ``safe_load`` return the same dict.
+
+    Call order matters: ``via_csafe`` must be loaded before
+    ``_force_pure_python_pyyaml`` removes ``yaml.CSafeLoader``; otherwise both
+    pyyaml loads would take the pure-Python ``safe_load`` branch.
+    """
+    pytest.importorskip("ryml")
+    if not hasattr(yaml, "CSafeLoader"):
+        pytest.skip("libyaml/CSafeLoader not installed; cannot compare all three")
+
+    via_ryml = utils._rapidyaml_load(EXAMPLE_CONFIG)
+    via_csafe = utils._pyyaml_load(EXAMPLE_CONFIG)
+
+    _force_pure_python_pyyaml(monkeypatch)
+    via_safe = utils._pyyaml_load(EXAMPLE_CONFIG)
+
+    assert via_ryml == via_csafe
+    assert via_csafe == via_safe
+
+
+def test_try_fast_yaml_load_dispatches_to_rapidyaml_when_available():
+    """With ryml importable, ``try_fast_yaml_load`` output equals ``_rapidyaml_load`` output."""
+    pytest.importorskip("ryml")
+    assert utils._HAS_RYML, "rapidyaml declared as a required dep but not detected"
+    assert utils.try_fast_yaml_load(EXAMPLE_CONFIG) == utils._rapidyaml_load(EXAMPLE_CONFIG)
diff --git a/python/lib/sift_py/yaml/utils.py b/python/lib/sift_py/yaml/utils.py
index 4270046f2..99fafc5ec 100644
--- a/python/lib/sift_py/yaml/utils.py
+++ b/python/lib/sift_py/yaml/utils.py
@@ -1,8 +1,16 @@
+import json
 from pathlib import Path
 from typing import Any, Callable, Dict, Type, cast
 
 import yaml
 
+try:
+    import ryml
+except ImportError:
+    ryml = None  # type: ignore[assignment]
+
+_HAS_RYML = ryml is not None
+
 
 def _handle_subdir(path: Path, file_handler: Callable):
     """The file_handler callable must accept a Path object as its only argument."""
@@ -17,13 +25,58 @@ def _type_fqn(typ: Type) -> str:
     return f"{typ.__module__}.{typ.__name__}"
 
 
-def try_fast_yaml_load(path: Path) -> Dict[Any, Any]:
-    """
-    Try to load the YAML file using the CSafeLoader, which is faster than the pyyaml safe loader but not built into the wheel for earlier versions of python..
-    If the CSafeLoader is not available, use the pyyaml safe loader.
+def _rapidyaml_load(path: Path) -> Dict[Any, Any]:
+    """Parse YAML via rapidyaml, round-tripping through JSON for a plain dict.
+
+    rapidyaml (``ryml``) is the C++ ryml binding; it is materially faster than
+    pyyaml on Sift telemetry configs (~3-4x on the example files, higher on
+    large files). ``tree.resolve()`` is called before emit, which inlines both
+    anchors/aliases (``&x`` / ``*x``) and merge keys (``<<: *x``); combined
+    with ``emit_json``'s scalar type inference, the returned dict matches
+    pyyaml's safe-load semantics on every config we ship today.
+
+    The one semantic diff to watch for is YAML 1.1-isms that rapidyaml (YAML
+    1.2) does not treat as typed: e.g. ``yes``/``no`` stay strings, and
+    sexagesimal numbers stay strings. Existing Sift configs do not use these.
+
+    Only call this when ``_HAS_RYML`` is true; the ``assert`` below narrows the
+    type for static analysis and is an invariant the dispatcher upholds.
     """
+    assert ryml is not None, "rapidyaml is not installed; call _pyyaml_load instead"
+    with open(path, "rb") as f:
+        tree = ryml.parse_in_arena(f.read())
+    resolve = getattr(tree, "resolve", None)
+    if callable(resolve):
+        resolve()
+    return cast(Dict[Any, Any], json.loads(ryml.emit_json(tree)))
+
+
+def _pyyaml_load(path: Path) -> Dict[Any, Any]:
+    """Load UTF-8 YAML via pyyaml ``CSafeLoader`` when present, else pure-Python ``safe_load``."""
-    with open(path, "r") as f:
+    with open(path, "r", encoding="utf-8") as f:
         if hasattr(yaml, "CSafeLoader"):
             return cast(Dict[Any, Any], yaml.load(f.read(), Loader=yaml.CSafeLoader))
-        else:
-            return cast(Dict[Any, Any], yaml.safe_load(f.read()))
+        return cast(Dict[Any, Any], yaml.safe_load(f.read()))
+
+
+def try_fast_yaml_load(path: Path) -> Dict[Any, Any]:
+    """
+    Try to load the YAML file using the fastest available parser.
+
+    Order of preference:
+
+    1. ``rapidyaml`` (``ryml``) - C++ binding, preferred for speed; declared
+       as a package dependency, so it is used whenever the guarded import at
+       module load succeeded. See :func:`_rapidyaml_load` for caveats
+       (notably: YAML 1.1-only scalars such as ``yes``/``no`` stay strings).
+    2. ``pyyaml.CSafeLoader`` - libyaml-backed, bundled with most pyyaml
+       wheels but not every Python/platform combination.
+    3. ``pyyaml.safe_load`` - pure-Python fallback.
+
+    rapidyaml failures are not swallowed silently: if ``ryml`` is installed
+    but raises while parsing ``path``, the exception propagates so the
+    regression is visible rather than masked by the pyyaml fallback.
+    """
+    if _HAS_RYML:
+        return _rapidyaml_load(path)
+    return _pyyaml_load(path)
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 048210a60..287a2a0d4 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -26,6 +26,7 @@ keywords = ["sift", "sift-stack", "siftstack", "sift_py"]
 dependencies = [
     "grpcio~=1.13",
     "PyYAML~=6.0",
+    "rapidyaml~=0.11",
     "pandas>=2.0,<3.1",
     "protobuf>=5.0",
     "pydantic~=2.10",
@@ -319,6 +320,11 @@ ignore_errors = true
 [[tool.mypy.overrides]]
 module = "ruamel"
 ignore_missing_imports = true
 ignore_errors = true
+
+[[tool.mypy.overrides]]
+module = "ryml"
+ignore_missing_imports = true
+ignore_errors = true
 
 [[tool.mypy.overrides]]