diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d17d17..83f8623 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ All notable changes to this project are documented in this file. [#21](https://github.com/leroyvn/xarray-validate/pull/21) - Allow making `DatasetSchema`'s data variables optional, and allow unknown data variables [#22](https://github.com/leroyvn/xarray-validate/pull/22) +- Support pattern matching for coordinate and data variable keys + [#23](https://github.com/leroyvn/xarray-validate/pull/23) ## 0.0.4 — 2025-12-17 diff --git a/TODO.md b/TODO.md index fb60cd1..13a1840 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,8 @@ # TODO list -- DOCS: Add thorough YAML schema writing guide -- DatasetSchema: Allow defining optional variables or coordinates -- DatasetSchema: Allow regex-based variable or coordinate name matching -- AttrSchema: Support string input in the type field when deserializing -- AttrSchema: Add regex-based string validation for attributes -- AttrSchema: Add pint-based unit validation system +- [ ] DOCS: Add thorough YAML schema writing guide +- [x] DatasetSchema: Allow defining optional variables or coordinates +- [x] DatasetSchema: Allow regex-based variable or coordinate name matching +- [ ] AttrSchema: Support string input in the type field when deserializing +- [ ] AttrSchema: Add regex-based string validation for attributes +- [ ] AttrSchema: Add pint-based unit validation system diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 51dec27..22fe31d 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -122,6 +122,94 @@ errors will be collected and reported after running all subschemas. 
For example: ('dims', SchemaError('dimension mismatch in axis 0: got y, expected x')), ('dims', SchemaError('dimension mismatch in axis 1: got x, expected y'))]) +Pattern matching for coordinates and data variables +---------------------------------------------------- + +Coordinate and data variable keys in schemas support pattern matching, allowing +you to validate multiple similarly-named items with a single schema definition. +Two pattern types are supported: + +**Glob patterns** use wildcards (``*`` and ``?``) for simple matching: + +.. doctest:: + + >>> ds = xr.Dataset( + ... { + ... "x_0": xr.DataArray([1, 2, 3], dims="x"), + ... "x_1": xr.DataArray([4, 5, 6], dims="x"), + ... "x_2": xr.DataArray([7, 8, 9], dims="x"), + ... } + ... ) + >>> schema = DatasetSchema( + ... data_vars={ + ... "x_*": DataArraySchema(dtype=np.int64, dims=["x"], shape=(3,)) + ... } + ... ) + >>> schema.validate(ds) + +**Regex patterns** use regular expressions enclosed in curly braces for precise +matching: + +.. doctest:: + + >>> ds = xr.Dataset( + ... { + ... "x_0": xr.DataArray([1, 2, 3], dims="x"), + ... "x_1": xr.DataArray([4, 5, 6], dims="x"), + ... "x_foo": xr.DataArray([7, 8, 9], dims="x"), # Won't match + ... } + ... ) + >>> schema = DatasetSchema( + ... data_vars={ + ... "{x_\\d+}": DataArraySchema(dtype=np.int64, dims=["x"], shape=(3,)) + ... }, + ... allow_extra_keys=True, # Allow x_foo to exist + ... ) + >>> schema.validate(ds) + +Pattern matching also works with :class:`.CoordsSchema`: + +.. doctest:: + + >>> da = xr.DataArray( + ... np.ones((3, 3)), + ... dims=["x", "y"], + ... coords={ + ... "x": np.arange(3), + ... "x_label_0": ("x", np.array(["a", "b", "c"], dtype=object)), + ... "x_label_1": ("x", np.array(["d", "e", "f"], dtype=object)), + ... }, + ... ) + >>> schema = DataArraySchema( + ... coords=CoordsSchema( + ... { + ... "x": DataArraySchema(dtype=np.int64), + ... "x_label_*": DataArraySchema(dtype=object), + ... } + ... ) + ... 
"""Pattern matching support functions."""

import fnmatch
import re
from typing import Any, Dict, Mapping, Set, Tuple


def is_regex_pattern(key: str) -> bool:
    """Check if a key is a regex pattern (enclosed in curly braces)."""
    return key.startswith("{") and key.endswith("}")


def is_glob_pattern(key: str) -> bool:
    """Check if a key is a glob pattern (contains ``*`` or ``?``)."""
    return "*" in key or "?" in key


def is_pattern_key(key: str) -> bool:
    """Check if a key is any kind of pattern (glob or regex)."""
    return is_glob_pattern(key) or is_regex_pattern(key)


def pattern_to_regex(pattern: str) -> re.Pattern:
    r"""
    Convert a pattern key to a compiled regex.

    Supports two pattern types:

    - glob patterns: ``'x_*'`` matches ``x_0``, ``x_1``, ``x_foo``, etc.
    - regex patterns: ``'{x_\\d+}'`` matches ``x_0``, ``x_1``, but not ``x_foo``

    Any other string is compiled as an exact, end-anchored match.

    Parameters
    ----------
    pattern : str
        The pattern string (regex in curly braces or glob).

    Returns
    -------
    re.Pattern
        Compiled regex pattern. Regex-style keys are compiled verbatim (they
        are not anchored), so callers should use ``fullmatch()``.

    Raises
    ------
    re.error
        If the text enclosed in curly braces is not a valid regular
        expression. The message names the offending pattern key.
    """
    # The regex test comes first: a key such as "{x_.*}" also contains a glob
    # wildcard, but the curly braces mark it as a regular expression.
    if is_regex_pattern(pattern):
        regex_str = pattern[1:-1]  # strip the enclosing curly braces
        try:
            return re.compile(regex_str)
        except re.error as exc:
            # Same exception type, but tell the user which schema key is
            # broken instead of only reporting the low-level regex error.
            raise re.error(
                f"invalid regex in pattern key {pattern!r}: {exc.msg}",
                exc.pattern,
                exc.pos,
            ) from exc

    if is_glob_pattern(pattern):
        # fnmatch.translate() already anchors the expression at the end.
        return re.compile(fnmatch.translate(pattern))

    # Exact match. Anchor with \Z rather than $: "$" also matches just before
    # a trailing newline, so "name\n" would be accepted by ``match()``.
    return re.compile(re.escape(pattern) + r"\Z")


def separate_keys(
    schema_keys: Dict[str, Any],
) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, re.Pattern]]:
    """
    Separate schema keys into exact and pattern keys, and compile patterns.

    Parameters
    ----------
    schema_keys : dict
        Dictionary with string keys (exact or pattern) and schema values.

    Returns
    -------
    exact_keys : dict
        Dictionary with exact (non-pattern) keys.

    pattern_keys : dict
        Dictionary with pattern keys.

    compiled_patterns : dict
        Dictionary mapping pattern keys to compiled regex objects.
    """
    exact_keys: Dict[str, Any] = {}
    pattern_keys: Dict[str, Any] = {}
    # Single pass: each key is classified once; insertion order is preserved
    # within each of the two output dictionaries.
    for key, schema in schema_keys.items():
        target = pattern_keys if is_pattern_key(key) else exact_keys
        target[key] = schema

    compiled_patterns = {key: pattern_to_regex(key) for key in pattern_keys}
    return exact_keys, pattern_keys, compiled_patterns


def find_matched_keys(
    actual_keys: Mapping[str, Any],
    exact_keys: Dict[str, Any],
    compiled_patterns: Dict[str, re.Pattern],
) -> Set[str]:
    """
    Find all actual keys that match either exact or pattern keys.

    Parameters
    ----------
    actual_keys : mapping
        The actual keys to check (*e.g.* ``coords`` or ``data_vars``).

    exact_keys : dict
        Dictionary with exact (non-pattern) keys.

    compiled_patterns : dict
        Dictionary mapping pattern keys to compiled regex objects.

    Returns
    -------
    set
        Set of actual keys that match either exact or pattern keys.
    """
    # Only the compiled expressions are needed here, not the pattern keys.
    regexes = tuple(compiled_patterns.values())
    return {
        name
        for name in actual_keys
        # The cheap exact lookup short-circuits the regex scan.
        if name in exact_keys or any(regex.fullmatch(name) for regex in regexes)
    }
""" coords: Dict[str, DataArraySchema] = _attrs.field() @@ -78,8 +86,12 @@ def validate( ) -> None: # Inherit docstring + # Separate exact keys from pattern keys and compile patterns + exact_keys, pattern_keys, compiled_patterns = _match.separate_keys(self.coords) + if self.require_all_keys: - missing_keys = set(self.coords) - set(coords) + # Only check exact keys for require_all_keys + missing_keys = set(exact_keys) - set(coords) if missing_keys: error = SchemaError(f"coords has missing keys: {missing_keys}") if context: @@ -88,7 +100,11 @@ def validate( raise error if not self.allow_extra_keys: - extra_keys = set(coords) - set(self.coords) + # Check that all coordinates match either exact or pattern keys + matched_coords = _match.find_matched_keys( + coords, exact_keys, compiled_patterns + ) + extra_keys = set(coords) - matched_coords if extra_keys: error = SchemaError(f"coords has extra keys: {extra_keys}") if context: @@ -96,7 +112,8 @@ def validate( else: raise error - for key, da_schema in self.coords.items(): + # Validate coordinates matching exact keys + for key, da_schema in exact_keys.items(): if key not in coords: error = SchemaError(f"key {key} not in coords") if context: @@ -107,6 +124,16 @@ def validate( child_context = context.push(f"coords.{key}") if context else None da_schema.validate(coords[key], child_context) + # Validate coordinates matching pattern keys + for pattern_key, da_schema in pattern_keys.items(): + regex = compiled_patterns[pattern_key] + for coord_name in coords: + if regex.fullmatch(coord_name) and coord_name not in exact_keys: + child_context = ( + context.push(f"coords.{coord_name}") if context else None + ) + da_schema.validate(coords[coord_name], child_context) + @_attrs.define(on_setattr=[_attrs.setters.convert, _attrs.setters.validate]) class DataArraySchema(BaseSchema): diff --git a/src/xarray_validate/dataset.py b/src/xarray_validate/dataset.py index 90c4518..f62ad93 100644 --- a/src/xarray_validate/dataset.py +++ 
b/src/xarray_validate/dataset.py @@ -5,11 +5,13 @@ import attrs as _attrs import xarray as xr +from . import _match from .base import ( BaseSchema, SchemaError, ValidationContext, ValidationMode, + ValidationResult, ) from .components import AttrsSchema from .dataarray import CoordsSchema, DataArraySchema @@ -23,13 +25,20 @@ class DatasetSchema(BaseSchema): Parameters ---------- data_vars : dict, optional - Per-variable :class:`.DataArraySchema`\ s. + Per-variable :class:`.DataArraySchema`\ s. Keys can be either exact + variable names or patterns: + + - Exact match: ``'temperature'`` matches only 'temperature' + - Glob pattern: ``'x_*'`` matches x_0, x_1, x_foo, etc. + - Regex pattern: ``'{x_\\d+}'`` matches x_0, x_1, but not x_foo require_all_keys : bool, default: True Whether to require all data variables included in ``data_vars``. + Only applies to exact keys, not pattern keys. allow_extra_keys : bool, default: True Whether to allow data variables not included in ``data_vars`` dict. + Variables matching pattern keys are not considered "extra". coords : CoordsSchema, optional Coordinate validation schema. @@ -113,7 +122,7 @@ def validate( ds: xr.Dataset, context: ValidationContext | None = None, mode: Literal["eager", "lazy"] | None = None, - ) -> None: + ) -> ValidationResult | None: """ Validate an xarray.DataArray against this schema. 
@@ -142,8 +151,14 @@ def validate( context = ValidationContext(mode=mode) if self.data_vars is not None: + # Separate exact keys from pattern keys and compile patterns + exact_keys, pattern_keys, compiled_patterns = _match.separate_keys( + self.data_vars + ) + if self.require_all_keys: - missing_keys = set(self.data_vars.keys()) - set(ds.data_vars.keys()) + # Only check exact keys for require_all_keys + missing_keys = set(exact_keys.keys()) - set(ds.data_vars.keys()) if missing_keys: error = SchemaError(f"data_vars has missing keys: {missing_keys}") if context: @@ -152,7 +167,11 @@ def validate( raise error if not self.allow_extra_keys: - extra_keys = set(ds.data_vars.keys()) - set(self.data_vars.keys()) + # Check that all dataset variables match either exact or pattern keys + matched_vars = _match.find_matched_keys( + ds.data_vars, exact_keys, compiled_patterns + ) + extra_keys = set(ds.data_vars.keys()) - matched_vars if extra_keys: error = SchemaError(f"data_vars has extra keys: {extra_keys}") if context: @@ -160,11 +179,22 @@ def validate( else: raise error - for key, da_schema in self.data_vars.items(): + # Validate variables matching exact keys + for key, da_schema in exact_keys.items(): if da_schema is not None and key in ds.data_vars: data_var_context = context.push(f"data_vars.{key}") da_schema.validate(ds.data_vars[key], data_var_context) + # Validate variables matching pattern keys + for pattern_key, da_schema in pattern_keys.items(): + if da_schema is None: + continue + regex = compiled_patterns[pattern_key] + for var_name in ds.data_vars.keys(): + if regex.fullmatch(var_name) and var_name not in exact_keys: + data_var_context = context.push(f"data_vars.{var_name}") + da_schema.validate(ds.data_vars[var_name], data_var_context) + if self.coords is not None: # pragma: no cover coords_context = context.push("coords") self.coords.validate(ds.coords, coords_context) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index bbd96e3..57b01a7 100644 --- 
a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -228,3 +228,203 @@ def test_data_vars_extra_keys_error(): with pytest.raises(SchemaError, match="data_vars has extra keys"): ds_schema.validate(ds) + + +def test_glob_pattern_data_vars(): + """Test glob pattern matching for data variables.""" + ds = xr.Dataset( + { + "x_0": xr.DataArray([1.0, 2.0, 3.0], dims=["time"]), + "x_1": xr.DataArray([4.0, 5.0, 6.0], dims=["time"]), + "x_2": xr.DataArray([7.0, 8.0, 9.0], dims=["time"]), + "temp": xr.DataArray([10.0, 11.0, 12.0], dims=["time"]), + } + ) + + # Pattern key 'x_*' should match all variables starting with 'x_' + ds_schema = DatasetSchema( + data_vars={ + "x_*": DataArraySchema(dtype=np.float64, dims=["time"]), + "temp": DataArraySchema(dtype=np.float64, dims=["time"]), + } + ) + + # Should validate successfully + ds_schema.validate(ds) + + +def test_regex_pattern_data_vars(): + """Test regex pattern matching for data variables.""" + ds = xr.Dataset( + { + "x_0": xr.DataArray([1.0, 2.0, 3.0], dims=["time"]), + "x_1": xr.DataArray([4.0, 5.0, 6.0], dims=["time"]), + "x_foo": xr.DataArray([7.0, 8.0, 9.0], dims=["time"]), + "temp": xr.DataArray([10.0, 11.0, 12.0], dims=["time"]), + } + ) + + # Regex pattern '{x_\d+}' should match only x_0, x_1 but not x_foo + ds_schema = DatasetSchema( + data_vars={ + r"{x_\d+}": DataArraySchema(dtype=np.float64, dims=["time"]), + "temp": DataArraySchema(dtype=np.float64, dims=["time"]), + "x_foo": DataArraySchema(dtype=np.float64, dims=["time"]), + } + ) + + # Should validate successfully + ds_schema.validate(ds) + + +def test_glob_pattern_multiple_patterns(): + """Test multiple glob patterns in data_vars.""" + ds = xr.Dataset( + { + "x_0": xr.DataArray([1.0, 2.0], dims=["time"]), + "x_1": xr.DataArray([3.0, 4.0], dims=["time"]), + "y_0": xr.DataArray([5.0, 6.0], dims=["time"]), + "y_1": xr.DataArray([7.0, 8.0], dims=["time"]), + "z": xr.DataArray([9.0, 10.0], dims=["time"]), + } + ) + + ds_schema = DatasetSchema( + data_vars={ + 
"x_*": DataArraySchema(dtype=np.float64, dims=["time"]), + "y_*": DataArraySchema(dtype=np.float64, dims=["time"]), + "z": DataArraySchema(dtype=np.float64, dims=["time"]), + } + ) + + ds_schema.validate(ds) + + +def test_pattern_with_allow_extra_keys_false(): + """Test patterns with allow_extra_keys=False.""" + ds = xr.Dataset( + { + "x_0": xr.DataArray([1.0, 2.0], dims=["time"]), + "x_1": xr.DataArray([3.0, 4.0], dims=["time"]), + "temp": xr.DataArray([5.0, 6.0], dims=["time"]), + } + ) + + # 'x_*' matches x_0 and x_1, 'temp' is exact match + ds_schema = DatasetSchema( + data_vars={ + "x_*": DataArraySchema(dtype=np.float64), + "temp": DataArraySchema(dtype=np.float64), + }, + allow_extra_keys=False, + ) + + # Should validate successfully + ds_schema.validate(ds) + + # Add an extra variable that doesn't match any pattern + ds["extra"] = xr.DataArray([7.0, 8.0], dims=["time"]) + + # Should raise error for extra key + with pytest.raises(SchemaError, match="data_vars has extra keys"): + ds_schema.validate(ds) + + +def test_pattern_validation_error(): + """Test that validation errors are raised for variables matching patterns.""" + ds = xr.Dataset( + { + "x_0": xr.DataArray([1.0, 2.0], dims=["time"]), + "x_1": xr.DataArray([3, 4], dims=["time"]), # Wrong dtype + } + ) + + ds_schema = DatasetSchema( + data_vars={ + "x_*": DataArraySchema(dtype=np.float64), + } + ) + + # Should raise error because x_1 has wrong dtype + with pytest.raises(SchemaError, match="dtype"): + ds_schema.validate(ds) + + +def test_pattern_require_all_keys(): + """Test that require_all_keys only applies to exact keys, not patterns.""" + ds = xr.Dataset( + { + "temp": xr.DataArray([1.0, 2.0], dims=["time"]), + # No x_* variables present + } + ) + + # Pattern keys should be optional even with require_all_keys=True + ds_schema = DatasetSchema( + data_vars={ + "x_*": DataArraySchema(dtype=np.float64), + "temp": DataArraySchema(dtype=np.float64), + }, + require_all_keys=True, + ) + + # Should validate 
successfully (pattern keys are optional) + ds_schema.validate(ds) + + # But exact keys should still be required + ds_no_temp = xr.Dataset({}) + with pytest.raises(SchemaError, match="data_vars has missing keys"): + ds_schema.validate(ds_no_temp) + + +def test_pattern_exact_key_precedence(): + """Test that exact keys take precedence over pattern keys.""" + ds = xr.Dataset( + { + "x_special": xr.DataArray([1, 2], dims=["time"]), + "x_0": xr.DataArray([3.0, 4.0], dims=["time"]), + } + ) + + # 'x_special' has both an exact match and matches the pattern 'x_*' + # The exact key schema should be used + ds_schema = DatasetSchema( + data_vars={ + "x_special": DataArraySchema(dtype=np.int64), # Exact match: int64 + "x_*": DataArraySchema(dtype=np.float64), # Pattern: float64 + } + ) + + # Should validate successfully: x_special uses exact schema (int64) + # and x_0 uses pattern schema (float64) + ds_schema.validate(ds) + + +def test_regex_vs_glob_patterns(): + """Test that regex patterns are more restrictive than glob patterns.""" + ds = xr.Dataset( + { + "var_0": xr.DataArray([1.0, 2.0], dims=["time"]), + "var_1": xr.DataArray([3.0, 4.0], dims=["time"]), + "var_foo": xr.DataArray([5.0, 6.0], dims=["time"]), + } + ) + + # Glob pattern 'var_*' matches all + glob_schema = DatasetSchema( + data_vars={ + "var_*": DataArraySchema(dtype=np.float64), + } + ) + glob_schema.validate(ds) + + # Regex pattern '{var_\d+}' matches only var_0 and var_1 + # This should fail because var_foo doesn't match the pattern + regex_schema = DatasetSchema( + data_vars={ + r"{var_\d+}": DataArraySchema(dtype=np.float64), + }, + allow_extra_keys=False, + ) + with pytest.raises(SchemaError, match="data_vars has extra keys"): + regex_schema.validate(ds) diff --git a/tests/test_match.py b/tests/test_match.py new file mode 100644 index 0000000..a3d02f9 --- /dev/null +++ b/tests/test_match.py @@ -0,0 +1,294 @@ +"""Tests for pattern matching utilities.""" + +import re + +import pytest + +from 
xarray_validate._match import ( + find_matched_keys, + is_glob_pattern, + is_pattern_key, + is_regex_pattern, + pattern_to_regex, + separate_keys, +) + + +@pytest.mark.parametrize( + "s, expected", + [ + ("{x_\\d+}", True), + ("{[a-z]+_[0-9]{2,4}}", True), + ("x_\\d+", False), + ("{x_\\d+", False), + ("x_\\d+}", False), + ("", False), + ("simple_name", False), + ("{}", True), + ], + ids=[ + "simple", + "complex", + "no_curly", + "only_opening", + "only_closing", + "empty", + "plain_text", + "empty_braces", + ], +) +def test_is_regex_pattern(s, expected): + assert is_regex_pattern(s) is expected + + +@pytest.mark.parametrize( + "s, expected", + [ + ("x_*", True), + ("x_?", True), + ("x_*_?", True), + ("*_*_*", True), + ("simple_name", False), + ("", False), + ("{x_\\d+}", False), + ], + ids=[ + "asterisk", + "question_mark", + "both_wildcards", + "multiple_asterisks", + "plain_text", + "empty", + "regex_pattern", + ], +) +def test_is_glob_pattern(s, expected): + assert is_glob_pattern(s) is expected + + +@pytest.mark.parametrize( + "s, expected", + [ + ("x_*", True), + ("{x_\\d+}", True), + ("prefix_*_suffix", True), + ("x_?", True), + ("simple_name", False), + ("", False), + ], + ids=[ + "glob", + "regex", + "glob_with_asterisk", + "glob_with_question", + "plain", + "empty", + ], +) +def test_is_pattern_key(s, expected): + assert is_pattern_key(s) is expected + + +class TestPatternToRegex: + """Tests for pattern_to_regex() function.""" + + def test_regex_pattern_conversion(self): + """Test conversion of regex pattern.""" + pattern = pattern_to_regex("{x_\\d+}") + assert isinstance(pattern, re.Pattern) + assert pattern.match("x_0") + assert pattern.match("x_123") + assert not pattern.match("x_") + assert not pattern.match("x_foo") + + def test_glob_pattern_asterisk_conversion(self): + """Test conversion of glob pattern with asterisk.""" + pattern = pattern_to_regex("x_*") + assert isinstance(pattern, re.Pattern) + assert pattern.match("x_0") + assert 
pattern.match("x_foo") + assert pattern.match("x_bar_baz") + assert not pattern.match("y_0") + + def test_glob_pattern_question_conversion(self): + """Test conversion of glob pattern with question mark.""" + pattern = pattern_to_regex("x_?") + assert isinstance(pattern, re.Pattern) + assert pattern.match("x_0") + assert pattern.match("x_a") + assert not pattern.match("x_00") + assert not pattern.match("x_") + + def test_glob_pattern_mixed_conversion(self): + """Test conversion of glob pattern with mixed wildcards.""" + pattern = pattern_to_regex("prefix_*_?_suffix") + assert isinstance(pattern, re.Pattern) + assert pattern.match("prefix_foo_a_suffix") + assert pattern.match("prefix_bar_1_suffix") + assert not pattern.match("prefix_foo_suffix") + assert not pattern.match("prefix_foo_ab_suffix") + + def test_exact_match_conversion(self): + """Test conversion of plain text to exact match regex.""" + pattern = pattern_to_regex("exact_name") + assert isinstance(pattern, re.Pattern) + assert pattern.match("exact_name") + assert not pattern.match("exact_name_suffix") + assert not pattern.match("prefix_exact_name") + + def test_exact_match_with_special_chars(self): + """Test exact match with regex special characters.""" + pattern = pattern_to_regex("name.with.dots") + assert isinstance(pattern, re.Pattern) + assert pattern.match("name.with.dots") + assert not pattern.match("namexwithxdots") + + def test_complex_regex_pattern(self): + """Test complex regex pattern with fullmatch.""" + pattern = pattern_to_regex("{[a-z]+_[0-9]{2,4}}") + assert isinstance(pattern, re.Pattern) + # Use fullmatch to match entire string (as done in actual code) + assert pattern.fullmatch("foo_12") + assert pattern.fullmatch("bar_1234") + assert not pattern.fullmatch("foo_1") + assert not pattern.fullmatch("FOO_12") + assert not pattern.fullmatch("foo_12345") + + def test_regex_with_anchors(self): + """Test regex pattern with anchors.""" + pattern = pattern_to_regex("{^x_\\d+$}") + assert 
isinstance(pattern, re.Pattern) + assert pattern.match("x_123") + assert not pattern.match("prefix_x_123") + assert not pattern.match("x_123_suffix") + + def test_glob_empty_match(self): + """Test glob pattern that can match empty string.""" + pattern = pattern_to_regex("*") + assert isinstance(pattern, re.Pattern) + assert pattern.match("") + assert pattern.match("anything") + assert pattern.match("x_0_y_1") + + def test_regex_alternation(self): + """Test regex pattern with alternation.""" + pattern = pattern_to_regex("{(foo|bar)_\\d+}") + assert isinstance(pattern, re.Pattern) + assert pattern.match("foo_1") + assert pattern.match("bar_2") + assert not pattern.match("baz_3") + + +class TestSeparateKeys: + """Tests for separate_keys function.""" + + def test_all_exact_keys(self): + """Test separation when all keys are exact (non-pattern).""" + schema_keys = {"x": 1, "y": 2, "z": 3} + exact, pattern, compiled = separate_keys(schema_keys) + + assert exact == {"x": 1, "y": 2, "z": 3} + assert pattern == {} + assert compiled == {} + + def test_all_pattern_keys(self): + """Test separation when all keys are patterns.""" + schema_keys = {"x_*": 1, "{y_\\d+}": 2} + exact, pattern, compiled = separate_keys(schema_keys) + + assert exact == {} + assert pattern == {"x_*": 1, "{y_\\d+}": 2} + assert len(compiled) == 2 + assert "x_*" in compiled + assert "{y_\\d+}" in compiled + assert isinstance(compiled["x_*"], re.Pattern) + assert isinstance(compiled["{y_\\d+}"], re.Pattern) + + def test_mixed_keys(self): + """Test separation with both exact and pattern keys.""" + schema_keys = {"x": 1, "y_*": 2, "z": 3, "{w_\\d+}": 4} + exact, pattern, compiled = separate_keys(schema_keys) + + assert exact == {"x": 1, "z": 3} + assert pattern == {"y_*": 2, "{w_\\d+}": 4} + assert len(compiled) == 2 + assert "y_*" in compiled + assert "{w_\\d+}" in compiled + + def test_empty_dict(self): + """Test separation with empty dictionary.""" + schema_keys = {} + exact, pattern, compiled = 
separate_keys(schema_keys) + + assert exact == {} + assert pattern == {} + assert compiled == {} + + def test_compiled_patterns_functional(self): + """Test that compiled patterns work correctly.""" + schema_keys = {"x_*": 1, "{y_\\d+}": 2} + exact, pattern, compiled = separate_keys(schema_keys) + + # Test glob pattern + assert compiled["x_*"].fullmatch("x_foo") + assert compiled["x_*"].fullmatch("x_0") + assert not compiled["x_*"].fullmatch("y_foo") + + # Test regex pattern + assert compiled["{y_\\d+}"].fullmatch("y_0") + assert compiled["{y_\\d+}"].fullmatch("y_123") + assert not compiled["{y_\\d+}"].fullmatch("y_foo") + + +@pytest.mark.parametrize( + "actual_keys, exact_keys, pattern_specs, expected", + [ + # All exact matches + ({"x": 1, "y": 2, "z": 3}, {"x": 1, "y": 2}, {}, {"x", "y"}), + # All pattern matches + ({"x_0": 1, "x_1": 2, "y_foo": 3}, {}, {"x_*": "x_*"}, {"x_0", "x_1"}), + # Mixed matches + ( + {"x": 1, "y_0": 2, "y_1": 3, "z": 4}, + {"x": 1}, + {"y_*": "y_*"}, + {"x", "y_0", "y_1"}, + ), + # No matches + ({"a": 1, "b": 2}, {"x": 1}, {"y_*": "y_*"}, set()), + # Regex pattern matches + ( + {"x_0": 1, "x_1": 2, "x_foo": 3}, + {}, + {"{x_\\d+}": "{x_\\d+}"}, + {"x_0", "x_1"}, + ), + # Multiple patterns + ( + {"x_0": 1, "y_foo": 2, "z_1": 3}, + {}, + {"{x_\\d+}": "{x_\\d+}", "y_*": "y_*"}, + {"x_0", "y_foo"}, + ), + # Exact takes precedence + ({"x_0": 1, "x_1": 2}, {"x_0": 1}, {"x_*": "x_*"}, {"x_0", "x_1"}), + # Empty actual keys + ({}, {"x": 1}, {"y_*": "y_*"}, set()), + ], + ids=[ + "all_exact_matches", + "all_pattern_matches", + "mixed_matches", + "no_matches", + "regex_pattern_matches", + "multiple_patterns", + "exact_takes_precedence", + "empty_actual_keys", + ], +) +def test_find_matched_keys(actual_keys, exact_keys, pattern_specs, expected): + """Test finding keys that match either exact or pattern keys.""" + # Convert pattern specs to compiled patterns + compiled_patterns = {k: pattern_to_regex(v) for k, v in pattern_specs.items()} + matched = 
find_matched_keys(actual_keys, exact_keys, compiled_patterns) + assert matched == expected