From 9ec3c31992dcfd91367d19a29255bbf72eb798ef Mon Sep 17 00:00:00 2001 From: Daniel Chudnov Date: Sun, 12 Apr 2026 17:46:12 -0400 Subject: [PATCH] Add to_unicode, permissive, and recovery_mode flags to MARCReader (closes #78) pymarc-compatible kwargs for MARCReader: - to_unicode: accepted for compat, warns if False (mrrc always converts) - permissive: yields None for bad records instead of raising (pymarc behavior) - recovery_mode: exposes mrrc's RecoveryMode for salvaging partial data Updated migration guide, reading tutorial, and quickstart with error handling documentation. Bead: bd-y331 Co-Authored-By: Claude Opus 4.6 (1M context) --- .beads/issues.jsonl | 4 +- docs/getting-started/quickstart-python.md | 2 + docs/guides/migration-from-pymarc.md | 52 ++++++ docs/tutorials/python/reading-records.md | 27 +++ mrrc/__init__.py | 56 +++++- mrrc/_mrrc.pyi | 2 +- src-python/src/readers.rs | 28 ++- tests/python/test_marcreader_flags.py | 212 ++++++++++++++++++++++ 8 files changed, 366 insertions(+), 17 deletions(-) create mode 100644 tests/python/test_marcreader_flags.py diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 189e40c..ffaa93b 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -3,7 +3,7 @@ {"id":"bd-264q","title":"Add __str__ and __repr__ to Python Record wrapper","description":"The Python Record wrapper class in mrrc/__init__.py does not define __str__ or __repr__. Python's data model skips __getattr__ for dunder methods, so str(rec) returns the default '' instead of delegating to the Rust PyRecord's __str__ (which returns 'Record(type=a)'). Need to add explicit __str__ and __repr__ methods that delegate to self._inner. 
Reported in https://github.com/dchud/mrrc/issues/57 comment.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-03-04T21:06:29.006105Z","created_by":"dchud","updated_at":"2026-03-04T21:53:01.013120Z","closed_at":"2026-03-04T21:53:01.013107Z","close_reason":"Fixed in PR #60, merged","source_repo":".","compaction_level":0,"original_size":0} {"id":"bd-3kee","title":"Add Record.get(tag) alias for pymarc compatibility","description":"pymarc's Record.get(tag, default=None) returns the first field matching a tag, or a default value. mrrc has the equivalent as Record.get_field(tag) but lacks the .get() alias.\n\nAdd a Record.get() method to the Python wrapper in mrrc/__init__.py that delegates to get_field() with an optional default parameter:\n\n def get(self, tag: str, default=None):\n return self.get_field(tag) or default\n\nThis is a pure Python wrapper addition — no Rust changes needed. Update _mrrc.pyi stubs accordingly.\n\npymarc source: Record.get() delegates to Record.__getitem__ with a try/except KeyError fallback.","status":"closed","priority":3,"issue_type":"feature","created_at":"2026-03-04T21:16:19.327037Z","created_by":"dchud","updated_at":"2026-03-04T21:53:01.068246Z","closed_at":"2026-03-04T21:53:01.068233Z","close_reason":"Fixed in PR #60, merged","source_repo":".","compaction_level":0,"original_size":0} {"id":"bd-3peu","title":"Fix check.sh --quick: Python tests run against stale extension","status":"closed","priority":1,"issue_type":"bug","created_at":"2026-03-04T16:56:07.259492Z","created_by":"dchud","updated_at":"2026-03-04T17:35:12.613323Z","closed_at":"2026-03-04T17:35:12.613307Z","close_reason":"Merged in PR #56","source_repo":".","compaction_level":0,"original_size":0} -{"id":"bd-7dre","title":"Fix repeated control fields being dropped (e.g. 
multiple 007s)","status":"in_progress","priority":1,"issue_type":"bug","created_at":"2026-04-10T01:47:09.917469Z","created_by":"dchud","updated_at":"2026-04-10T01:48:10.983328Z","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":18,"issue_id":"bd-7dre","author":"dchud","text":"Elevated to GitHub issue #77: https://github.com/dchud/mrrc/issues/77","created_at":"2026-04-10T01:48:10Z"}]} +{"id":"bd-7dre","title":"Fix repeated control fields being dropped (e.g. multiple 007s)","status":"closed","priority":1,"issue_type":"bug","created_at":"2026-04-10T01:47:09.917469Z","created_by":"dchud","updated_at":"2026-04-12T21:18:06.004792Z","closed_at":"2026-04-12T21:18:06.004656Z","close_reason":"Fixed in PR #79, merged to main","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":18,"issue_id":"bd-7dre","author":"dchud","text":"Elevated to GitHub issue #77: https://github.com/dchud/mrrc/issues/77","created_at":"2026-04-10T01:48:10Z"}]} {"id":"bd-93qn","title":"record_to_dublin_core_xml not exported from mrrc Python wrapper","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-03-04T18:59:47.002714Z","created_by":"dchud","updated_at":"2026-03-04T19:45:29.894208Z","closed_at":"2026-03-04T19:45:29.894196Z","close_reason":"Fixed in PR #59, merged","source_repo":".","compaction_level":0,"original_size":0} {"id":"bd-b7mg","title":"record_to_json/xml fail on records from xml_to_record()","notes":"GitHub issue: https://github.com/dchud/mrrc/issues/57","status":"closed","priority":1,"issue_type":"bug","created_at":"2026-03-04T18:55:40.894350Z","created_by":"dchud","updated_at":"2026-03-04T19:45:29.002501Z","closed_at":"2026-03-04T19:45:29.002489Z","close_reason":"Fixed in PR #58, merged","source_repo":".","compaction_level":0,"original_size":0} {"id":"bd-qjjs","title":"Pymarc API compatibility: comprehensive gap analysis and implementation 
plan","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-03-22T20:43:50.852232Z","created_by":"dchud","updated_at":"2026-03-22T22:43:09.600381Z","closed_at":"2026-03-22T22:43:09.600049Z","close_reason":"Replaced by bd-14jh — clean consolidated version","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":6,"issue_id":"bd-qjjs","author":"dchud","text":"## Pymarc API compatibility gaps identified from rmarc bench_deathmatch.py\n\nAnalysis of https://github.com/matthewdeanmartin/rmarc/blob/main/bench/bench_deathmatch.py reveals 4 API differences where mrrc diverges from pymarc's public API.\n\n### Gap 1: Record convenience accessors are methods, pymarc uses properties\n\nIn pymarc, `title`, `author`, `isbn`, `issn`, `subjects`, `notes`, and `location` are all `@property` decorated — accessed as `record.title`, not `record.title()`.\n\nIn mrrc, these are all plain methods requiring parentheses: `record.title()`.\n\nThe benchmark script has a special mrrc branch: `rec.title()` vs `rec.title` for pymarc/rmarc.\n\n**Fix:** Convert these to `@property` in `mrrc/__init__.py`. The underlying Rust `_inner` methods stay as-is; only the Python wrapper changes. This is a breaking change for existing mrrc users calling `record.title()` — could add a deprecation shim that detects callable usage.\n\n### Gap 2: Missing `Field.value()` method\n\npymarc's `Field.value()` returns all subfield values space-joined (or `data` for control fields). mrrc's Field wrapper has no `value()` method. The benchmark uses `str(field)` for mrrc vs `field.value()` for pymarc.\n\n**Fix:** Add `def value(self) -> str` to both `Field` and `ControlField` in the Python wrapper. For `Field`, return space-joined subfield values (matching pymarc). 
For `ControlField`, return `self.value` (already an attribute, but the method form is what pymarc uses).\n\n### Gap 3: Missing `Record.as_json()` method\n\npymarc uses `record.as_json(**kwargs)` which serializes via `json.dumps(record.as_dict(), **kwargs)`. mrrc has `record.to_json()` instead.\n\n**Fix:** Add `as_json()` as an alias or new method on `Record`. Ideally also add `as_dict()` for full compat. Keep `to_json()` as well since it's already in our public API.\n\n### Gap 4: Missing `parse_xml_to_array()` top-level function\n\npymarc exports `parse_xml_to_array(xml_file, strict=False, normalize_form=None)` as a top-level convenience function. mrrc uses `xml_to_records()` instead, with different argument conventions (takes a string, not a file path/handle).\n\n**Fix:** Add `parse_xml_to_array()` as a top-level function in `mrrc/__init__.py` that wraps `xml_to_records()`. Accept file paths and file-like objects for full pymarc compat.\n\n---\n\nElevated to GitHub issue #65: https://github.com/dchud/mrrc/issues/65","created_at":"2026-03-22T20:44:08Z"},{"id":7,"issue_id":"bd-qjjs","author":"dchud","text":"## Additional pymarc API compatibility gaps (deep comparison)\n\nFull API surface comparison reveals more gaps beyond the 4 from the benchmark.\n\n### Gap 5: Record.as_marc() / as_marc21() — serialize to ISO 2709 bytes\npymarc has `record.as_marc()` and `record.as_marc21()` returning `bytes`. mrrc has no equivalent for serializing a single Record back to raw MARC binary. This is critical for round-trip workflows (read, modify, write back).\n\n### Gap 6: Record.as_dict()\npymarc has `record.as_dict()` returning a plain dict. `as_json()` is built on top of it. mrrc has neither.\n\n### Gap 7: Record.add_field() signature mismatch\npymarc: `add_field(self, *fields)` — accepts multiple fields. 
mrrc: `add_field(self, field)` — accepts exactly one.\n\n### Gap 8: Record.remove_field() signature mismatch + missing remove_fields()\npymarc: `remove_field(self, *fields)` — accepts multiple. mrrc: one at a time. pymarc also has `remove_fields(self, *tags)` for bulk removal by tag string — mrrc has no equivalent.\n\n### Gap 9: Record.add_ordered_field() / add_grouped_field() missing\npymarc has both for inserting fields in tag-sorted or tag-grouped positions.\n\n### Gap 10: Naming mismatches (no underscore in pymarc)\n- pymarc: `record.physicaldescription` → mrrc: `record.physical_description()`\n- pymarc: `record.uniformtitle` → mrrc: `record.uniform_title()`\n- pymarc: `record.addedentries` → mrrc: missing entirely\n\n### Gap 11: Record.pubyear return type\npymarc returns `Optional[str]` (e.g. \"2024\"). mrrc returns `Optional[int]`. This breaks code doing string operations on pubyear.\n\n### Gap 12: Field.value() (already noted in Gap 2 above)\n\n### Gap 13: Field.format_field() missing\npymarc `field.format_field()` returns a human-readable string. Used internally by Record property helpers like `title`. Different from `value()`.\n\n### Gap 14: Field.add_subfield() missing pos parameter\npymarc: `add_subfield(self, code, value, pos=None)`. mrrc only supports append.\n\n### Gap 15: Record.decode_marc() missing\npymarc has `decode_marc(self, marc, to_unicode=True, ...)` to populate a Record from raw bytes. Used by MARCReader internally but also called directly by users.\n\n### Gap 16: Field.as_marc() / as_marc21() missing\npymarc can serialize individual fields to ISO 2709 bytes.\n\n---\n\n### GIL considerations for as_json() and parse_xml_to_array()\n\nNeither needs GIL release. 
Both operate on data already in Python memory:\n- as_json() serializes a single already-parsed Record — too fast to justify GIL overhead\n- parse_xml_to_array() wraps xml_to_records() which takes a &str — no Python I/O during Rust parsing\n\nThe GIL 3-phase model is only needed for MARCReader-style iterators that interleave Python file I/O with Rust parsing across many iterations.","created_at":"2026-03-22T20:48:58Z"},{"id":8,"issue_id":"bd-qjjs","author":"dchud","text":"## Correction: parse_xml_to_array() SHOULD follow MARCReader's input dispatch pattern\n\npymarc's parse_xml_to_array() accepts file paths, open file handles, and StringIO objects. If we implement it to match, it should follow the same input-dispatch pattern MARCReader already uses:\n\n- **String path / pathlib.Path** → read file in Rust (RustFile backend, no GIL)\n- **File-like object** → read content via Python (GIL held), then parse in Rust\n- **String content** → parse directly in Rust (no GIL needed)\n\nThe XML *parsing* itself is always GIL-free (pure Rust on &str). The GIL concern is only about the file-reading step. This is exactly the MARCReader pattern from readers.rs — detect input type, use native I/O when possible, fall back to Python I/O with proper GIL management when given a Python file object.\n\nThis means parse_xml_to_array() needs a Rust-side implementation (or at minimum a Python wrapper that handles the dispatch), not just a thin alias over xml_to_records().","created_at":"2026-03-22T20:49:19Z"},{"id":9,"issue_id":"bd-qjjs","author":"dchud","text":"## Clarification: GIL avoidance is the priority\n\nThe goal is to avoid holding the GIL as much as possible, not to replicate the MARCReader pattern specifically. 
For parse_xml_to_array():\n\n- **String path / pathlib.Path** → read + parse entirely in Rust, zero GIL\n- **String content** → parse entirely in Rust, zero GIL\n- **Python file object** → only case where GIL is unavoidable (need it to call .read())\n\nThe MARCReader's 3-phase model is a solution to a problem we may not have here. If we can handle the common cases (file paths, string content) entirely GIL-free, that's the right approach. Only add GIL management for the Python file object fallback, and even then minimize the window.","created_at":"2026-03-22T20:50:54Z"},{"id":10,"issue_id":"bd-qjjs","author":"dchud","text":"## Documentation: show preferred GIL-free usage with explicit examples\n\nWhen implementing parse_xml_to_array(), update docs (intro guide, API reference, examples) to show the preferred path-based approach with clear comments about why, the same way we document passing paths to MARCReader() as preferred. E.g.:\n\n```python\n# Preferred: file path lets mrrc read + parse entirely in Rust (no GIL)\nrecords = mrrc.parse_xml_to_array(\"collection.xml\")\n\n# Also good: string content parsed entirely in Rust (no GIL)\nrecords = mrrc.parse_xml_to_array(xml_string)\n\n# Works but slower: Python file object requires GIL for .read()\nwith open(\"collection.xml\") as f:\n records = mrrc.parse_xml_to_array(f)\n```\n\nThis pattern should be consistent with how MARCReader path-based usage is already documented across the docs.","created_at":"2026-03-22T20:52:05Z"},{"id":11,"issue_id":"bd-qjjs","author":"dchud","text":"## Consolidated revision: complete gap inventory (supersedes earlier comments)\n\nReview of earlier comments found errors, duplicates, and missing gaps. This comment is the authoritative reference.\n\n---\n\n### Tier 1: Structural / semantic divergences (highest impact)\n\n**A. Separate ControlField class (NEW)**\npymarc has NO ControlField class — both control and data fields are `pymarc.Field`, distinguished by `is_control_field()`. 
mrrc exposes a separate `ControlField` class. This breaks `isinstance(f, Field)` checks and means `get_fields()` returns mixed types in mrrc vs uniform Field in pymarc.\n\n**B. ControlField uses `.value` attribute; pymarc uses `.data` (NEW)**\npymarc control fields store content in `.data`. mrrc's ControlField uses `.value`. Code doing `field.data` on a control field will fail.\n\n**C. Record convenience accessors are methods, pymarc uses @property (Gap 1)**\n`title`, `author`, `isbn`, `issn`, `issn_title`, `issnl`, `subjects`, `notes`, `location`, `series`, `sudoc`, `publisher`, `pubyear`, `physicaldescription`, `uniformtitle`, `addedentries` are all `@property` in pymarc. mrrc has them as methods (and some are missing — see Tier 2).\n\n**D. Record.__getitem__ returns None for missing tags; pymarc raises KeyError (NEW)**\npymarc `record['999']` raises `KeyError`. mrrc returns `None`. This silently changes control flow for code using try/except KeyError.\n\n**E. Field.__str__ returns useless default repr (NEW)**\npymarc `str(field)` returns MARC display format: `=245 10$aThe Great Gatsby$cF. Scott Fitzgerald`. mrrc returns `` because the Python wrapper has no `__str__` and `__getattr__` doesn't intercept dunder methods. The Rust `_inner.__str__` returns `Field(245)` which is also not pymarc-compatible. Note: the rmarc benchmark uses `str(field)` for mrrc as a workaround for missing `value()`, but it doesn't actually produce useful output.\n\n**F. Record.as_marc() / as_marc21() missing (Gap 5)**\nNo way to serialize a Record to ISO 2709 bytes. MARCWriter can write to files, but there's no method to get bytes from a single Record. Critical for round-trip workflows.\n\n---\n\n### Tier 2: Missing methods and signature mismatches\n\n**G. Field.value() missing (Gap 2)**\npymarc `field.value()` returns space-joined subfield values (or `.data` for control fields). mrrc has nothing equivalent. 
NOTE: For ControlField, we can't just add a `value()` method because `.value` is already an attribute — resolve by renaming attribute to `.data` (matching pymarc) per Gap B above.\n\n**H. Field.format_field() missing (Gap 13)**\npymarc `field.format_field()` returns human-readable text (no indicators or subfield codes). Different from `value()` only in edge cases but used internally by Record property helpers like `title`.\n\n**I. Record.as_json(**kwargs) missing (Gap 3)**\npymarc `record.as_json(**kwargs)` wraps `json.dumps(record.as_dict(), **kwargs)`. mrrc only has `to_json()` with no kwargs support. Keep `to_json()` in our public API.\n\n**J. Record.as_dict() missing (Gap 6)**\npymarc builds `as_json()` on `as_dict()`. Should implement both.\n\n**K. parse_xml_to_array() missing (Gap 4)**\npymarc top-level function accepting file paths, file handles, StringIO. mrrc has `xml_to_records()` (string-only). Implementation should avoid holding the GIL: file paths and string content → entirely Rust, zero GIL. Python file objects → GIL only for `.read()`, then Rust parsing. Document preferred path-based usage with examples, consistent with MARCReader docs.\n\n**L. add_field(*fields) signature mismatch (Gap 7)**\npymarc accepts `*fields`. mrrc accepts exactly one.\n\n**M. remove_field(*fields) + remove_fields(*tags) (Gap 8)**\npymarc `remove_field` accepts multiple. mrrc accepts one (Field or str). pymarc also has `remove_fields(*tags)` for bulk removal by tag — mrrc has no equivalent.\n\n**N. add_ordered_field() / add_grouped_field() missing (Gap 9)**\npymarc has both for inserting fields in tag-sorted or tag-grouped positions.\n\n**O. Field.add_subfield() missing pos parameter (Gap 14)**\npymarc: `add_subfield(self, code, value, pos=None)`. mrrc only supports append.\n\n**P. Field.as_marc() / as_marc21() missing (Gap 16)**\npymarc can serialize individual fields to ISO 2709 bytes.\n\n---\n\n### Tier 3: Naming and return type mismatches\n\n**Q. 
Property name mismatches (Gap 10)**\n- pymarc `physicaldescription` → mrrc `physical_description` (underscore)\n- pymarc `uniformtitle` → mrrc `uniform_title` (underscore)\n- pymarc `addedentries` → mrrc: missing entirely (returns 700/710/711/730 fields)\n\n**R. pubyear return type (Gap 11)**\npymarc returns `Optional[str]` (e.g. `\"2020\"`). mrrc returns `Optional[int]` (`2020`). Confirmed: mrrc returns int.\n\n---\n\n### Tier 4: Lower-priority gaps\n\n**S. decode_marc() missing (Gap 15)** — semi-internal, used by MARCReader. Uncommon in user code.\n\n**T. Missing reader/writer variants** — JSONReader, JSONWriter, MARCMakerReader, XMLWriter, TextWriter.\n\n**U. No exception hierarchy** — pymarc has ~15 specific exception classes. mrrc exposes generic errors.\n\n**V. No MARC8 character encoding support.**\n\n**W. Missing convenience functions** — `map_records()`, `map_xml()`, `parse_json_to_array()`, `record_to_xml_node()`.\n\n**X. Missing constants** — `LEADER_LEN`, `DIRECTORY_ENTRY_LEN`, `END_OF_FIELD`, `END_OF_RECORD`, `SUBFIELD_INDICATOR`, etc.\n\n---\n\n### GIL policy (final, supersedes earlier comments)\n\nAvoid holding the GIL whenever possible.\n\n- `as_json()`: no GIL concern — single record, in-memory, fast.\n- `parse_xml_to_array()`: file paths and string content → entirely Rust, zero GIL. 
Python file objects → GIL only for `.read()`, minimize window.\n- `as_marc()` / `as_marc21()`: pure serialization, no GIL needed.\n\n---\n\n### Removed from earlier comments\n- Gap 12 was a duplicate of Gap 2 — removed.\n\nElevated to GitHub issue #65: https://github.com/dchud/mrrc/issues/65","created_at":"2026-03-22T21:00:17Z"}]} @@ -12,7 +12,7 @@ {"id":"bd-vc06","title":"Add negation flag to SubfieldValueQuery","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-03-18T23:57:42.630104Z","created_by":"dchud","updated_at":"2026-04-06T00:39:18.974586Z","closed_at":"2026-04-06T00:39:18.974381Z","close_reason":"Completed — merged in PR #74","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":3,"issue_id":"bd-vc06","author":"dchud","text":"## Motivation\n\nFollow-up to bd-w9cm (negation on SubfieldPatternQuery). The same use case applies to exact and partial string matching: users scanning large record sets for fields where a subfield does NOT equal or contain a value currently must filter in Python, incurring FFI overhead.\n\n## Design\n\nMirror the approach from bd-w9cm. 
SubfieldValueQuery already has a `partial: bool` flag — add `negate: bool` alongside it.\n\n### Rust core (`src/field_query.rs`)\n\n- Add `pub negate: bool` to `SubfieldValueQuery`\n- Update `new()` and `partial()` constructors to set `negate: false`\n- Add `pub fn negated(tag, subfield_code, value) -> Self` and `pub fn partial_negated(tag, subfield_code, value) -> Self` constructors\n- Update `matches()`:\n\n field.get_subfield(self.subfield_code).is_some_and(|value| {\n let matched = if self.partial {\n value.contains(self.value.as_str())\n } else {\n value == self.value.as_str()\n };\n matched != self.negate\n })\n\n### PyO3 bindings (`src-python/src/query.rs`)\n\n- Add `negate` as an optional keyword argument to `PySubfieldValueQuery::new()` (default `False`)\n- Add `#[getter] fn negate(&self) -> bool`\n- Update `__repr__` to include `negate=true` when set\n\n### Python stubs (`mrrc/_mrrc.pyi`)\n\n- Update `SubfieldValueQuery.__init__` signature: add `negate: bool = False`\n- Add `negate` property\n\n### Tests\n\n- Rust unit tests: negated exact match, negated partial match, negated with missing subfield (returns false)\n- Python tests in `tests/python/test_query_dsl.py`: negated value query via kwarg, verify property, integration with `fields_matching_value()`\n\n### Documentation\n\n- Update `docs/guides/query-dsl.md` (SubfieldValueQuery section)\n- Update `docs/tutorials/python/querying-fields.md` if SubfieldValueQuery is covered\n- Update `mrrc/_mrrc.pyi` stubs\n\n### Edge cases\n\nSame as bd-w9cm:\n- Missing subfield + negate=true → false (subfield must exist)\n- Repeating subfields: only the first value is checked (existing `get_subfield` behavior)\n\n### Sequencing\n\nImplement after bd-w9cm lands so both query types get consistent API and we can share test patterns.","created_at":"2026-03-18T23:58:00Z"},{"id":5,"issue_id":"bd-vc06","author":"dchud","text":"Elevated to GitHub issue #64: 
https://github.com/dchud/mrrc/issues/64","created_at":"2026-03-18T23:59:36Z"}]} {"id":"bd-w9cm","title":"Add negation flag to SubfieldPatternQuery","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-03-18T23:53:40.703765Z","created_by":"dchud","updated_at":"2026-04-06T00:13:06.687346Z","closed_at":"2026-04-06T00:13:06.687035Z","close_reason":"Completed — merged in PR #73","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":1,"issue_id":"bd-w9cm","author":"dchud","text":"## Motivation\n\nA user scanning ~250k records needed to find ones where a subfield did NOT match a regex. Currently this requires Python-side filtering (`not re.match(...)`) which means:\n- Extra string copies across the Rust/Python FFI boundary for every record\n- GIL overhead on each field value that crosses into Python\n- Inability to short-circuit inside Rust iteration\n\nAdding a `negate` flag pushes the inversion into Rust so non-matching fields never cross the FFI boundary.\n\n## Design\n\nFollow the existing `partial` flag pattern on `SubfieldValueQuery` — add a `negate: bool` field defaulting to `false`.\n\n### Rust core (`src/field_query.rs`)\n\n- Add `pub negate: bool` to `SubfieldPatternQuery`\n- Update `new()` to set `negate: false`\n- Add `pub fn negated(tag, subfield_code, pattern) -> Result` constructor (sets `negate: true`)\n- Update `matches()`: XOR the regex result with `self.negate` so that when negated, a match means \"subfield exists but does NOT match the pattern\"\n\n### PyO3 bindings (`src-python/src/query.rs`)\n\n- Add `negate` as an optional keyword argument to `PySubfieldPatternQuery::new()` (default `False`)\n- Add `#[getter] fn negate(&self) -> bool`\n- Update `__repr__` to include `negate=true` when set\n\n### Python stubs (`mrrc/_mrrc.pyi`)\n\n- Update `SubfieldPatternQuery.__init__` signature: add `negate: bool = False`\n- Add `negate` property\n\n### Tests\n\n- Rust unit tests in `src/field_query.rs`: negated match, 
negated non-match, negated with missing subfield (should return false — subfield must exist)\n- Python tests in `tests/python/test_query_dsl.py`: negated pattern query via kwarg, verify `negate` property, integration with `fields_matching_pattern()`\n\n### Documentation\n\n- Update `docs/guides/query-dsl.md` with negation examples\n- Update `docs/tutorials/python/querying-fields.md` if SubfieldPatternQuery is covered there\n- Update API reference docs if they document SubfieldPatternQuery parameters\n\n## Edge case: missing subfield\n\nWhen `negate=true` and the subfield doesn't exist on a field, `matches()` should return `false`. Negation means \"the subfield exists but its value doesn't match\" — not \"the subfield is absent.\" This matches user intent (filtering by value, not by presence).\n\n## Scope\n\nThis bead covers the `negate` flag only. A broader query composition framework (NotQuery, AndQuery, OrQuery) is a separate concern.","created_at":"2026-03-18T23:53:58Z"},{"id":2,"issue_id":"bd-w9cm","author":"dchud","text":"## Revision: additional detail and edge cases\n\n### Repeating subfields\n\n`get_subfield()` returns only the **first** subfield with the given code. MARC fields can have repeating subfields (e.g., multiple `$a` in a 650). With `negate=true`, the query checks whether the first value does NOT match — it does not scan all values with that code. This is consistent with the existing non-negated behavior but worth documenting explicitly since negation makes the semantics more consequential. 
If users need \"no subfield with this code matches,\" they can combine with Python-level filtering or we can address multi-value matching in a future enhancement.\n\n### matches() implementation detail\n\nThe cleanest implementation uses `!=` rather than a literal XOR:\n\n field.get_subfield(self.subfield_code)\n .is_some_and(|value| self.pattern.is_match(value) != self.negate)\n\nThis handles all cases correctly:\n- subfield exists, pattern matches, negate=false → true (correct)\n- subfield exists, pattern doesn't match, negate=false → false (correct)\n- subfield exists, pattern matches, negate=true → false (correct: match excluded)\n- subfield exists, pattern doesn't match, negate=true → true (correct: non-match included)\n- subfield missing, any negate value → false (is_some_and short-circuits)\n\nNo extra branching needed for the missing-subfield edge case.\n\n### Follow-up\n\nThe same `negate` flag would be useful on `SubfieldValueQuery` (same user scenario, exact/partial matching). 
That should be a separate bead to keep scope tight.\n\n### Documentation locations confirmed\n\n- `docs/guides/query-dsl.md` — SubfieldPatternQuery section at line 108, examples at line 228\n- `docs/tutorials/python/querying-fields.md` — two usage examples at lines 80, 85\n- `mrrc/_mrrc.pyi` — stub at line 295\n- No separate API reference docs beyond the stubs; mkdocs-material may auto-generate from docstrings","created_at":"2026-03-18T23:56:36Z"},{"id":4,"issue_id":"bd-w9cm","author":"dchud","text":"Elevated to GitHub issue #63: https://github.com/dchud/mrrc/issues/63","created_at":"2026-03-18T23:59:36Z"}]} {"id":"bd-wifu","title":"Pymarc API compatibility: verified gap analysis and implementation plan","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-03-23T00:03:50.574638Z","created_by":"dchud","updated_at":"2026-04-05T21:33:52.664164Z","closed_at":"2026-04-05T21:33:52.663939Z","close_reason":"Superseded by bd-sgwi / GitHub #71 — clean ordered plan replaces accumulated analysis","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":15,"issue_id":"bd-wifu","author":"dchud","text":"Verified pymarc API gap analysis. Every gap confirmed via live introspection of pymarc and mrrc. Replaces bd-qjjs and bd-14jh. Elevated to GitHub issue #65: https://github.com/dchud/mrrc/issues/65\n\n---\n\n## Tier 1: Structural / semantic divergences\n\nThese affect the fundamental object model and break the most pymarc-compatible code.\n\n### A. Separate ControlField class\n\npymarc has NO ControlField — both control and data fields are `pymarc.Field`, distinguished by `is_control_field()`. mrrc exposes a separate `ControlField` class. Breaks `isinstance(f, Field)` checks; `get_fields()` returns mixed types in mrrc vs uniform Field in pymarc.\n\n**Fix:** Unify into a single Field class in the wrapper, with `is_control_field()` dispatching behavior internally.\n\n### B. 
Control field content: mrrc `.value` attribute vs pymarc `.data` attribute\n\npymarc control fields store content in `.data`. mrrc's ControlField uses `.value`. Code doing `field.data` on a control field fails. Also creates a naming collision with `Field.value()` method (Gap G).\n\n**Fix:** Use `.data` when unifying Field (Gap A). Provides pymarc compat and clears the way for `value()` method.\n\n### C. Record convenience accessors are methods; pymarc uses @property\n\nAll 17 verified as properties in pymarc, methods in mrrc: `title`, `author`, `isbn`, `issn`, `issn_title`, `issnl`, `subjects`, `notes`, `location`, `series`, `sudoc`, `publisher`, `pubyear`. Plus 3 missing entirely from mrrc: `physicaldescription`, `uniformtitle`, `addedentries` (see Gap Q).\n\n**Fix:** Convert to `@property` in `mrrc/__init__.py`. Rust `_inner` methods stay as-is. Breaking change for `record.title()` callers — consider deprecation shim.\n\n### D. Record.__getitem__ returns None for missing tags; pymarc raises KeyError\n\nVerified: pymarc `record['999']` raises `KeyError`. mrrc returns `None`. Silently changes control flow for code using try/except KeyError. Note: `Record.get(tag, default)` already exists in both and behaves the same (returns default).\n\n**Fix:** Raise `KeyError` for missing tags in `__getitem__`.\n\n### E. Field.__str__ returns useless default repr\n\nVerified: pymarc `str(field)` → `'=245 10$aThe Great Gatsby$cF. Scott Fitzgerald'`. mrrc `str(field)` → `''`. The Python wrapper has no `__str__`; `__getattr__` doesn't intercept dunder methods. The Rust `_inner.__str__` returns `'Field(245)'` which is also not pymarc-compatible.\n\n**Fix:** Add `__str__` and `__repr__` to the wrapper Field class matching pymarc's MARC display format.\n\n### F. Record.as_marc() / as_marc21() missing\n\nVerified: pymarc `record.as_marc()` returns `bytes` (ISO 2709). mrrc has no equivalent — `MARCWriter.write_record()` writes to a file, returns `None`. 
Critical for round-trip workflows.\n\n**Fix:** Implement in Rust (pure serialization, no GIL needed), expose via PyO3, add wrapper method. `as_marc21()` should be an alias.\n\n### G. JSON serialization format mismatch\n\nVerified structural differences between pymarc `as_json()` and mrrc `to_json()`:\n\n**pymarc** (follows MARC-in-JSON / code4lib proposal):\n```json\n{\"leader\": \"...\", \"fields\": [{\"245\": {\"ind1\": \"1\", \"ind2\": \"0\", \"subfields\": [{\"a\": \"Title\"}, {\"c\": \"Author\"}]}}]}\n```\n\n**mrrc**:\n```json\n[{\"leader\": \"...\"}, {\"245\": {\"ind1\": \"1\", \"ind2\": \"0\", \"subfields\": {\"a\": \"Title\", \"c\": \"Author\"}}}]\n```\n\nTwo differences: (1) Top-level: pymarc uses `{leader, fields}` object, mrrc uses flat array. (2) Subfields: pymarc uses array of single-key objects (preserves order, allows duplicate codes), mrrc uses single object (loses duplicate subfield codes).\n\n**Fix:** `as_json()` and `as_dict()` must produce pymarc-compatible format. Keep `to_json()` as mrrc's native format. This likely needs Rust-side work or a Python-side reformatter.\n\n---\n\n## Tier 2: Missing methods and signature mismatches\n\n### H. Field.value() missing\n\nVerified: pymarc `field.value()` returns space-joined subfield values (or `.data` for control fields). mrrc Field has no `value` attribute or method. Depends on Gap B (renaming `.value` → `.data` on control fields).\n\n**Fix:** Add `def value(self) -> str` to the unified Field class.\n\n### I. Field.format_field() missing\n\nVerified: pymarc `field.format_field()` returns human-readable text without indicators or subfield codes. For the test data, `value()` and `format_field()` return the same string, but they can differ for fields with special formatting.\n\n**Fix:** Add `def format_field(self) -> str` to Field.\n\n### J. Record.as_json(**kwargs) and Record.as_dict() missing\n\nVerified: pymarc has both. 
`as_dict()` returns a plain dict; `as_json(**kwargs)` wraps `json.dumps(as_dict(), **kwargs)`. mrrc has neither. Note: `as_json()` must produce the pymarc-compatible JSON schema (Gap G), not mrrc's native format.\n\n**Fix:** Implement `as_dict()` producing pymarc-compatible structure, then `as_json(**kwargs)` on top. Keep mrrc's `to_json()` as its own format.\n\n### K. parse_xml_to_array() missing\n\nVerified: pymarc `parse_xml_to_array(xml_file, strict=False, normalize_form=None)` accepts file paths, open file handles, and StringIO. mrrc has `xml_to_records(str)` only.\n\n**Fix:** Add `parse_xml_to_array()` to `mrrc/__init__.py`. GIL policy: file paths and string content → entirely Rust, zero GIL. Python file objects → GIL only for `.read()`, minimize window. Document preferred path-based usage with explicit examples and comments explaining why, consistent with MARCReader docs:\n\n```python\n# Preferred: file path — read + parse entirely in Rust, zero GIL\nrecords = mrrc.parse_xml_to_array(\"collection.xml\")\n\n# Also good: string content — parse entirely in Rust, zero GIL\nrecords = mrrc.parse_xml_to_array(xml_string)\n\n# Works but slower: Python file object requires GIL for .read()\nwith open(\"collection.xml\") as f:\n records = mrrc.parse_xml_to_array(f)\n```\n\n### L. add_field(*fields) signature mismatch\n\nVerified: pymarc `add_field(self, *fields)`. mrrc `add_field(self, field: Field) -> None`.\n\n**Fix:** Change signature to accept `*fields` with a loop.\n\n### M. remove_field(*fields) + remove_fields(*tags)\n\nVerified: pymarc `remove_field(self, *fields) -> None` accepts multiple. mrrc `remove_field(self, field: Union[Field, str]) -> List[Field]` accepts one. pymarc also has `remove_fields(self, *tags) -> None` for bulk removal by tag string — mrrc has no equivalent. Also note: pymarc returns None; mrrc returns the removed fields.\n\n**Fix:** Change `remove_field` to accept `*fields`. Add `remove_fields(*tags)`. 
Consider whether to keep mrrc's return value (non-breaking addition).\n\n### N. add_ordered_field() / add_grouped_field() missing\n\nVerified: both exist in pymarc, neither in mrrc.\n\n**Fix:** Implement in wrapper. `add_ordered_field` inserts maintaining tag sort order. `add_grouped_field` inserts after the last field with the same tag.\n\n### O. Field.add_subfield() missing pos parameter\n\nVerified: pymarc `add_subfield(self, code, value, pos=None)`. mrrc `add_subfield(self, code, value)`.\n\n**Fix:** Add optional `pos` parameter. May need Rust-side support for positional insert.\n\n### P. Field.as_marc() / as_marc21() missing\n\nVerified: pymarc Field has both, mrrc Field has neither.\n\n**Fix:** Implement in Rust, expose via PyO3. No GIL needed. `as_marc21` as alias.\n\n### Q. Field.linkage_occurrence_num() missing\n\nVerified: pymarc `field.linkage_occurrence_num() -> Optional[str]` extracts the occurrence number from subfield $6 linkage.\n\n**Fix:** Implement in wrapper or Rust.\n\n---\n\n## Tier 3: Naming and return type mismatches\n\n### R. Property name mismatches\n\nVerified:\n- pymarc `physicaldescription` → mrrc has `physical_description` (underscore) but NOT `physicaldescription`\n- pymarc `uniformtitle` → mrrc has `uniform_title` (underscore) but NOT `uniformtitle`\n- pymarc `addedentries` → mrrc: missing entirely (returns 700/710/711/730 fields)\n\n**Fix:** Add pymarc-compatible names as `@property` (part of Gap C). Keep mrrc underscore names as aliases.\n\n### S. pubyear return type\n\nVerified: mrrc returns `Optional[int]` (e.g. `2020`). pymarc returns `Optional[str]` (e.g. `\"2020\"`).\n\n**Fix:** Return `str` from the property wrapper.\n\n### T. Field.convert_legacy_subfields() missing\n\nVerified: pymarc has `Field.convert_legacy_subfields(subfields: list[str]) -> list[Subfield]` as a classmethod. Converts old `['a', 'value', 'b', 'value']` format to Subfield namedtuples.\n\n**Fix:** Add classmethod to Field wrapper. 
Low priority — only needed for migrating old pymarc code.\n\n---\n\n## Tier 4: Lower priority\n\nThese are real gaps but unlikely to block most pymarc-compatible code.\n\n### U. Record.as_marc() / as_marc21() (already covered as Gap F)\n\n### V. decode_marc()\n\nVerified missing. Semi-internal — populates Record from raw bytes. Uncommon in user code but used by some advanced pymarc scripts.\n\n**Fix:** Could wrap mrrc's MARCReader to parse a single record from bytes.\n\n### W. Reader/writer variants\n\nVerified all missing: JSONReader, JSONWriter, MARCMakerReader, XMLWriter, TextWriter, Reader (base), Writer (base).\n\n**Fix:** Implement as needed. JSONReader/JSONWriter most likely to be used.\n\n### X. Exception hierarchy\n\nVerified: pymarc has 10 specific exception classes (`BadSubfieldCodeWarning`, `BaseAddressInvalid`, `BaseAddressNotFound`, `EndOfRecordNotFound`, `FatalReaderError`, `FieldNotFound`, `PymarcException`, `RecordDirectoryInvalid`, `RecordLeaderInvalid`, `RecordLengthInvalid`). mrrc exports zero exception classes.\n\n**Fix:** Define exception hierarchy in Python wrapper. Map Rust errors to specific exception types.\n\n### Y. MARC8 Python API\n\nVerified: mrrc has MARC8 support in Rust core but exports nothing at the Python level. pymarc exports: `MARC8ToUnicode` class, `marc8_to_unicode()` function, `map_marc8_field()`, `map_marc8_record()`.\n\n**Fix:** Expose Rust MARC8 functionality via PyO3, add pymarc-compatible wrapper names.\n\n### Z. Convenience functions\n\nVerified all missing: `map_records()`, `map_xml()`, `parse_json_to_array()`, `record_to_xml_node()`, `normalize_subfield_code()`.\n\n**Fix:** Implement as wrappers. `parse_json_to_array()` most likely to be used.\n\n### AA. Constants\n\nVerified all missing: `LEADER_LEN`, `DIRECTORY_ENTRY_LEN`, `END_OF_FIELD`, `END_OF_RECORD`, `SUBFIELD_INDICATOR`, `MARC_XML_NS`, `MARC_XML_SCHEMA`.\n\n**Fix:** Export from `mrrc/__init__.py`. These are just integer/string constants.\n\n### BB. 
Record instance attributes\n\npymarc Record has `force_utf8`, `pos`, `to_unicode` as instance attributes (member descriptors). These are used during decode and by some user code. mrrc has none of these.\n\n**Fix:** Low priority. `pos` (file position) could be useful; `force_utf8`/`to_unicode` are decode-time flags.\n\n---\n\n## GIL policy\n\nAvoid holding the GIL whenever possible.\n\n- **as_json(), as_dict(), as_marc(), as_marc21():** In-memory, single record — no GIL concern.\n- **parse_xml_to_array():** File paths and string content → entirely Rust, zero GIL. Python file objects → GIL only for `.read()`, minimize window.\n- **Field.value(), format_field(), as_marc():** Pure computation, no GIL concern.","created_at":"2026-03-23T00:05:04Z"}]} -{"id":"bd-y331","title":"Add pymarc-compatible to_unicode and permissive flags to MARCReader","status":"open","priority":3,"issue_type":"feature","created_at":"2026-04-10T01:47:10.886181Z","created_by":"dchud","updated_at":"2026-04-10T01:48:11.593780Z","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":19,"issue_id":"bd-y331","author":"dchud","text":"Elevated to GitHub issue #78: https://github.com/dchud/mrrc/issues/78","created_at":"2026-04-10T01:48:11Z"}]} +{"id":"bd-y331","title":"Add pymarc-compatible to_unicode and permissive flags to MARCReader","status":"in_progress","priority":3,"issue_type":"feature","created_at":"2026-04-10T01:47:10.886181Z","created_by":"dchud","updated_at":"2026-04-12T21:37:14.182755Z","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":19,"issue_id":"bd-y331","author":"dchud","text":"Elevated to GitHub issue #78: https://github.com/dchud/mrrc/issues/78","created_at":"2026-04-10T01:48:11Z"}]} {"id":"mrrc-086","title":"Add parallel benchmarking suite 
(rayon/threading/multiprocessing)","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-12-31T17:02:57.787595Z","updated_at":"2026-01-06T16:11:40.252865Z","closed_at":"2026-01-06T16:11:40.252865Z","close_reason":"Satisfied by Phase F - Comprehensive benchmarking suite with Criterion.rs, speedup curves (2.04x @ 2-thread, 3.20x @ 4-thread), fixtures (1k/10k/100k records), and pymrrc vs Rust efficiency comparison (92%) completed.","source_repo":".","compaction_level":0,"original_size":0} {"id":"mrrc-08k","title":"Phase 3: Linked field navigation and specialized queries","description":"Phase 3 of Advanced Field Query DSL (see design/FIELD_QUERY_DSL.md).\n\nImplement specialized query patterns for linked fields, authority control, and format-specific queries.\n\nTasks:\n1. Linked field navigation (880 field linkage)\n - Implement subfield 6 parsing for linkage occurrence numbers\n - Methods:\n * field.get_linked_field(record) -> Option (880 partner)\n * record.get_all_linked_fields(field) -> Vec\n * record.get_field_pairs(tag) -> Vec<(Field, Option)> (original + 880)\n - Support for bidirectional lookups\n\n2. Authority control helpers (for authority records)\n - Methods for finding related authority headings\n - Navigate 4XX (see from) and 5XX (see also) tracings\n - Helper: find_related_headings(heading: &Field) -> Vec\n - Extract authority relationship information from 7XX fields\n\n3. Format-specific query traits\n - BibliographicQueries trait (for Record)\n - AuthorityQueries trait (for AuthorityRecord)\n - HoldingsQueries trait (for HoldingsRecord)\n - Each provides domain-specific helper methods\n\n4. Comprehensive testing\n - Test 880 field linkage with real MARC-8 encoded data\n - Test authority record navigation patterns\n - Test complex record type-specific queries\n \nDependencies: Requires Phase 1 (mrrc-9n8) and Phase 2 (mrrc-131) completed","notes":"Phase 3 Status: Linked field navigation COMPLETED (28 tests). 
Authority control helpers TODO. Format-specific traits TODO.","status":"closed","priority":4,"issue_type":"task","created_at":"2025-12-26T17:13:18.416250Z","updated_at":"2025-12-28T16:25:38.021558Z","closed_at":"2025-12-28T16:25:38.021558Z","close_reason":"All three sub-features completed: linked field navigation, authority control helpers, format-specific query traits. 282 library tests passing.","source_repo":".","compaction_level":0,"original_size":0} {"id":"mrrc-0jm","title":"Update check.sh to include mrrc-python Clippy and test checks","status":"closed","priority":1,"issue_type":"task","created_at":"2026-01-06T16:50:13.684695Z","created_by":"dchud","updated_at":"2026-01-06T16:59:17.545907Z","closed_at":"2026-01-06T16:59:17.545907Z","close_reason":"Completed","source_repo":".","compaction_level":0,"original_size":0} diff --git a/docs/getting-started/quickstart-python.md b/docs/getting-started/quickstart-python.md index a0b7c5d..17f2f06 100644 --- a/docs/getting-started/quickstart-python.md +++ b/docs/getting-started/quickstart-python.md @@ -20,6 +20,8 @@ for record in MARCReader("records.mrc"): This approach uses pure Rust I/O and releases Python's GIL, enabling multi-threaded speedups. +For files with malformed records, use `permissive=True` to skip errors (yields `None`), or `recovery_mode="lenient"` to salvage partial data. See the [error handling guide](../guides/migration-from-pymarc.md#error-handling) for details. + ## Access Fields ```python diff --git a/docs/guides/migration-from-pymarc.md b/docs/guides/migration-from-pymarc.md index f06b113..2c55ee2 100644 --- a/docs/guides/migration-from-pymarc.md +++ b/docs/guides/migration-from-pymarc.md @@ -92,6 +92,8 @@ with open('output.mrc', 'wb') as f: | Operation | pymarc | mrrc | Same? 
| |-----------|--------|------|-------| | Create reader | `MARCReader(file_obj)` | `MARCReader('path.mrc')` (recommended) or `MARCReader(file_obj)` | Enhanced | +| Permissive mode | `MARCReader(f, permissive=True)` | `MARCReader(f, permissive=True)` | **Same** | +| Unicode flag | `MARCReader(f, to_unicode=True)` | `MARCReader(f, to_unicode=True)` | **Same** | | Read record | `reader.next()` or `next(reader)` | `next(reader)` | **Same** | | Write record | `writer.write(record)` | `writer.write(record)` | **Same** | | Iterate | `for record in reader:` | `for record in reader:` | **Same** | @@ -338,6 +340,56 @@ from mrrc import MrrcException, MarcError - [ ] Update writers to use context managers: `with mrrc.MARCWriter(f) as w:` (better resource management) - [ ] Use `record.as_marc()`, `record.as_json()`, `record.as_dict()` for serialization +## Error Handling + +### Permissive Mode (pymarc-compatible) + +pymarc's `permissive=True` flag yields `None` for records that fail to parse, +letting callers skip bad records and keep processing. mrrc supports the same +flag with identical behavior: + +```python +# Works the same in both pymarc and mrrc +for record in mrrc.MARCReader('records.mrc', permissive=True): + if record is None: + continue # skip malformed record + print(record.title) +``` + +### to_unicode Flag + +pymarc's `to_unicode=True` (the default) converts MARC-8 encoded records to +UTF-8. mrrc always converts MARC-8 to UTF-8 automatically — the conversion +happens in the Rust parsing layer and cannot be disabled. The `to_unicode` +kwarg is accepted for compatibility so existing scripts work unchanged. +Passing `to_unicode=False` emits a warning but has no effect. + +### Recovery Mode (mrrc-specific) + +mrrc also offers a `recovery_mode` kwarg that goes beyond pymarc's +permissive mode. 
Instead of skipping bad records entirely, recovery mode +attempts to salvage valid fields from damaged records: + +```python +# Attempt to recover partial data from malformed records +reader = mrrc.MARCReader('records.mrc', recovery_mode='lenient') +for record in reader: + print(f"Got {len(record.get_fields())} fields") + +# Even more lenient — accept partial data +reader = mrrc.MARCReader('records.mrc', recovery_mode='permissive') +``` + +Recovery modes: +- `"strict"` (default) — raise on any malformation +- `"lenient"` — attempt to recover, salvage valid fields +- `"permissive"` — very lenient, accept partial data + +Note: `permissive=True` and `recovery_mode` other than `"strict"` cannot +be combined — they represent different error-handling strategies. Use +`permissive=True` for pymarc-compatible "skip bad records" behavior, or +`recovery_mode` for mrrc's "salvage what you can" approach. + ## Known Differences from pymarc 1. **Record constructor**: `mrrc.Record()` works (defaults to `Leader()`), or pass explicit `mrrc.Record(mrrc.Leader())` diff --git a/docs/tutorials/python/reading-records.md b/docs/tutorials/python/reading-records.md index 01f3728..3c2fe79 100644 --- a/docs/tutorials/python/reading-records.md +++ b/docs/tutorials/python/reading-records.md @@ -128,6 +128,33 @@ except FileNotFoundError: print("File not found") ``` +### Permissive Mode + +When processing files that may contain malformed records, use +`permissive=True` to skip bad records instead of raising errors. 
+This matches pymarc's `permissive` flag behavior:
+
+```python
+for record in MARCReader("records.mrc", permissive=True):
+    if record is None:
+        continue  # skip malformed record
+    print(record.title)
+```
+
+### Recovery Mode
+
+For more control over error handling, use `recovery_mode` to attempt
+salvaging valid fields from damaged records:
+
+```python
+# Attempt to recover partial data
+for record in MARCReader("records.mrc", recovery_mode="lenient"):
+    print(f"Got {len(record.get_fields())} fields")
+```
+
+See the [migration guide](../../guides/migration-from-pymarc.md#error-handling)
+for details on the differences between `permissive` and `recovery_mode`.
+
 ## Complete Example
 
 This example analyzes a MARC file to summarize the collection by language and material type:
diff --git a/mrrc/__init__.py b/mrrc/__init__.py
index c2e11df..3826074 100644
--- a/mrrc/__init__.py
+++ b/mrrc/__init__.py
@@ -1464,19 +1464,55 @@ def __hash__(self) -> int:
 
 
 class MARCReader:
-    """MARC Reader wrapper."""
-
-    def __init__(self, file_obj):
+    """MARC Reader wrapper.
+
+    Args:
+        file_obj: File path (str), pathlib.Path, bytes/bytearray, or file-like object.
+        to_unicode: Accepted for pymarc compatibility. mrrc always converts
+            MARC-8 to UTF-8; passing ``False`` emits a warning.
+        permissive: When ``True``, yields ``None`` for records that fail to
+            parse instead of raising, matching pymarc's ``permissive`` behavior.
+        recovery_mode: mrrc-native error handling: ``"strict"`` (default, raise
+            on errors), ``"lenient"`` (attempt to salvage valid fields), or
+            ``"permissive"`` (very lenient, accept partial data). Cannot be
+            combined with ``permissive=True``. 
+ """ + + def __init__(self, file_obj, to_unicode: bool = True, permissive: bool = False, + recovery_mode: str = "strict"): """Create a new MARC reader.""" - self._inner = _MARCReader(file_obj) - + if not to_unicode: + import warnings + warnings.warn( + "mrrc always converts MARC-8 to UTF-8; to_unicode=False has no effect", + stacklevel=2, + ) + if permissive and recovery_mode != "strict": + raise ValueError( + "Cannot combine permissive=True with recovery_mode other than " + "'strict' — they represent different error-handling strategies" + ) + self._permissive = permissive + self._inner = _MARCReader(file_obj, recovery_mode=recovery_mode) + def __iter__(self): """Iterate over records.""" return self - - def __next__(self) -> Record: - """Get next record.""" - record = next(self._inner) + + def __next__(self) -> Optional[Record]: + """Get next record. + + When ``permissive=True``, returns ``None`` for records that fail + to parse instead of raising, matching pymarc behavior. + """ + try: + record = next(self._inner) + except StopIteration: + raise + except Exception: + if self._permissive: + return None + raise wrapper = Record(None) wrapper._inner = record # Create a Leader wrapper from the Rust record's leader @@ -1486,7 +1522,7 @@ def __next__(self) -> Record: wrapper._leader = leader wrapper._leader_modified = False return wrapper - + @property def backend_type(self) -> str: """The backend type: ``"rust_file"``, ``"cursor"``, or ``"python_file"``.""" diff --git a/mrrc/_mrrc.pyi b/mrrc/_mrrc.pyi index 778e13d..5918fa7 100644 --- a/mrrc/_mrrc.pyi +++ b/mrrc/_mrrc.pyi @@ -366,7 +366,7 @@ class MARCReader: futures = [executor.submit(process_file, f) for f in file_list] results = [f.result() for f in futures] """ - def __init__(self, file: Any) -> None: ... + def __init__(self, file: Any, *, recovery_mode: str = "strict") -> None: ... def __repr__(self) -> str: ... def __iter__(self) -> Iterator[Record]: ... 
def __next__(self) -> Record: diff --git a/src-python/src/readers.rs b/src-python/src/readers.rs index e5646e3..d2563e4 100644 --- a/src-python/src/readers.rs +++ b/src-python/src/readers.rs @@ -9,7 +9,7 @@ use crate::batched_reader::BatchedMarcReader; use crate::batched_unified_reader::BatchedUnifiedReader; use crate::parse_error::ParseError; use crate::wrappers::PyRecord; -use mrrc::MarcReader; +use mrrc::{MarcReader, RecoveryMode}; use pyo3::prelude::*; use smallvec::SmallVec; @@ -59,6 +59,8 @@ enum ReaderType { pub struct PyMARCReader { /// Reader backend (Python-based or unified multi-backend) reader: Option, + /// Recovery mode for handling malformed records + recovery_mode: RecoveryMode, } #[pymethods] @@ -73,12 +75,27 @@ impl PyMARCReader { /// /// # Arguments /// * `source` - File path (str), pathlib.Path, bytes/bytearray, or file-like object + /// * `recovery_mode` - Error handling mode: 'strict' (default), 'lenient', 'permissive' #[new] - pub fn new(source: &Bound<'_, PyAny>) -> PyResult { + #[pyo3(signature = (source, recovery_mode = "strict"))] + pub fn new(source: &Bound<'_, PyAny>, recovery_mode: &str) -> PyResult { + let rec_mode = match recovery_mode { + "lenient" => RecoveryMode::Lenient, + "permissive" => RecoveryMode::Permissive, + "strict" => RecoveryMode::Strict, + _ => { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "Invalid recovery_mode '{}': must be 'strict', 'lenient', or 'permissive'", + recovery_mode + ))); + }, + }; + // Try unified reader first (handles file paths and bytes) match BatchedUnifiedReader::new(source) { Ok(unified_reader) => Ok(PyMARCReader { reader: Some(ReaderType::Unified(unified_reader)), + recovery_mode: rec_mode, }), Err(_) => { // Fall back to legacy Python file wrapper @@ -87,6 +104,7 @@ impl PyMARCReader { let batched_reader = BatchedMarcReader::new(file_obj); Ok(PyMARCReader { reader: Some(ReaderType::Python(batched_reader)), + recovery_mode: rec_mode, }) }, } @@ -122,11 +140,12 @@ impl 
PyMARCReader { // Step 2: Parse bytes (GIL released) // CRITICAL FIX: Use Python::detach() which properly releases the GIL. + let rec_mode = self.recovery_mode; let record = py .detach(|| { // Create a cursor from owned bytes let cursor = std::io::Cursor::new(bytes_owned.to_vec()); - let mut parser = MarcReader::new(cursor); + let mut parser = MarcReader::new(cursor).with_recovery_mode(rec_mode); // Parse the single record parser.read_record().map_err(|e| { @@ -229,13 +248,14 @@ impl PyMARCReader { // We defer conversion to PyErr until AFTER detach() returns (GIL re-acquired). // This is required because PyErr construction needs the GIL. // NOTE: Use detach() which properly releases GIL + let rec_mode = slf.recovery_mode; let parse_result: Result, crate::parse_error::ParseError> = py.detach(|| { // This closure runs WITHOUT the GIL held // All data here is owned (no Python references) // Return Rust errors only; defer PyErr conversion to Phase 3 let cursor = std::io::Cursor::new(record_bytes_owned.to_vec()); - let mut parser = MarcReader::new(cursor); + let mut parser = MarcReader::new(cursor).with_recovery_mode(rec_mode); // Parse the single record from bytes // Return ParseError (Rust type), not PyErr diff --git a/tests/python/test_marcreader_flags.py b/tests/python/test_marcreader_flags.py new file mode 100644 index 0000000..f1fc61e --- /dev/null +++ b/tests/python/test_marcreader_flags.py @@ -0,0 +1,212 @@ +""" +Tests for MARCReader to_unicode, permissive, and recovery_mode flags (GitHub issue #78). + +Verifies pymarc-compatible constructor kwargs and mrrc-native recovery_mode. 
+""" + +import io +import warnings +import pytest +import mrrc + + +# ============================================================================ +# Helpers +# ============================================================================ + +def _build_valid_record_bytes() -> bytes: + """Build a minimal valid MARC record and return as bytes.""" + record = mrrc.Record() + record.add_field(mrrc.Field("001", data="test001")) + record.add_field(mrrc.Field("008", data="230327s2023 wau ef 000 1 eng d")) + f245 = mrrc.Field("245", "1", "0") + f245.add_subfield("a", "Test title") + record.add_field(f245) + output = io.BytesIO() + writer = mrrc.MARCWriter(output) + writer.write_record(record) + writer.close() + return output.getvalue() + + +def _build_malformed_record_bytes() -> bytes: + """Build a record with valid leader length but corrupt directory/data. + + The leader claims 50 bytes total with a base address of 25 (1 directory + entry of 12 bytes + field terminator). The directory entry has an invalid + tag and the data area is garbage. This is enough for the batch reader to + read the correct number of bytes but causes a parse error. 
+ """ + # Leader: 00050nam 2200025 4500 (50 bytes total, base address 25) + leader = b"00050nam 2200025 4500" + # Directory: one 12-byte entry with garbage, then field terminator + directory = b"XXX000100000" + b"\x1e" + # Data area: fill to reach 50 bytes total, end with record terminator + data_needed = 50 - len(leader) - len(directory) - 1 # -1 for record terminator + data = b"\xff" * data_needed + b"\x1d" + return leader + directory + data + + +def _build_two_record_stream(inject_bad_second: bool = False) -> bytes: + """Build a stream with two records; optionally corrupt the second.""" + good = _build_valid_record_bytes() + if inject_bad_second: + bad = _build_malformed_record_bytes() + return good + bad + second = good # just duplicate the valid record + return good + second + + +# ============================================================================ +# to_unicode tests +# ============================================================================ + +class TestToUnicode: + """Test to_unicode kwarg behavior.""" + + def test_to_unicode_true_no_warning(self): + """to_unicode=True (default) should not emit a warning.""" + data = _build_valid_record_bytes() + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + mrrc.MARCReader(io.BytesIO(data), to_unicode=True) + assert len(w) == 0 + + def test_to_unicode_default_no_warning(self): + """Default (no to_unicode arg) should not emit a warning.""" + data = _build_valid_record_bytes() + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + mrrc.MARCReader(io.BytesIO(data)) + assert len(w) == 0 + + def test_to_unicode_false_warns(self): + """to_unicode=False should emit a warning.""" + data = _build_valid_record_bytes() + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + mrrc.MARCReader(io.BytesIO(data), to_unicode=False) + assert len(w) == 1 + assert "to_unicode=False has no effect" in str(w[0].message) + + def 
test_to_unicode_false_still_reads(self): + """to_unicode=False should still read records normally.""" + data = _build_valid_record_bytes() + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + reader = mrrc.MARCReader(io.BytesIO(data), to_unicode=False) + records = list(reader) + assert len(records) == 1 + assert records[0].get_fields("001")[0].data == "test001" + + +# ============================================================================ +# permissive tests +# ============================================================================ + +class TestPermissive: + """Test permissive kwarg behavior (pymarc compatibility).""" + + def test_permissive_false_raises_on_bad_record(self): + """Default (permissive=False) should raise on malformed records.""" + data = _build_two_record_stream(inject_bad_second=True) + reader = mrrc.MARCReader(io.BytesIO(data), permissive=False) + # First record should be fine + record = next(reader) + assert record is not None + # Second record is malformed — should raise + with pytest.raises(Exception): + next(reader) + + def test_permissive_true_yields_none_for_bad_record(self): + """permissive=True should yield None for malformed records.""" + data = _build_two_record_stream(inject_bad_second=True) + reader = mrrc.MARCReader(io.BytesIO(data), permissive=True) + records = list(reader) + # Should get the good record and None for the bad one + assert any(r is not None for r in records), "Should have at least one valid record" + assert any(r is None for r in records), "Should have None for malformed record" + + def test_permissive_true_valid_records_normal(self): + """permissive=True should return valid records normally.""" + data = _build_two_record_stream(inject_bad_second=False) + reader = mrrc.MARCReader(io.BytesIO(data), permissive=True) + records = list(reader) + assert len(records) == 2 + assert all(r is not None for r in records) + + def test_permissive_pymarc_pattern(self): + """The standard pymarc 
permissive pattern should work.""" + data = _build_two_record_stream(inject_bad_second=True) + reader = mrrc.MARCReader(io.BytesIO(data), permissive=True) + valid_count = 0 + error_count = 0 + for record in reader: + if record is None: + error_count += 1 + continue + valid_count += 1 + assert valid_count >= 1 + assert error_count >= 1 + + +# ============================================================================ +# recovery_mode tests +# ============================================================================ + +class TestRecoveryMode: + """Test recovery_mode kwarg behavior.""" + + def test_recovery_mode_default_strict(self): + """Default recovery_mode should be strict.""" + data = _build_valid_record_bytes() + reader = mrrc.MARCReader(io.BytesIO(data)) + records = list(reader) + assert len(records) == 1 + + def test_recovery_mode_lenient(self): + """recovery_mode='lenient' should be accepted.""" + data = _build_valid_record_bytes() + reader = mrrc.MARCReader(io.BytesIO(data), recovery_mode="lenient") + records = list(reader) + assert len(records) == 1 + + def test_recovery_mode_permissive(self): + """recovery_mode='permissive' should be accepted.""" + data = _build_valid_record_bytes() + reader = mrrc.MARCReader(io.BytesIO(data), recovery_mode="permissive") + records = list(reader) + assert len(records) == 1 + + def test_recovery_mode_invalid_raises(self): + """Invalid recovery_mode should raise ValueError.""" + data = _build_valid_record_bytes() + with pytest.raises(ValueError, match="Invalid recovery_mode"): + mrrc.MARCReader(io.BytesIO(data), recovery_mode="invalid") + + +# ============================================================================ +# Conflict validation tests +# ============================================================================ + +class TestConflictValidation: + """Test that conflicting options are rejected.""" + + def test_permissive_with_lenient_raises(self): + """permissive=True + recovery_mode='lenient' should raise 
ValueError.""" + data = _build_valid_record_bytes() + with pytest.raises(ValueError, match="Cannot combine"): + mrrc.MARCReader(io.BytesIO(data), permissive=True, recovery_mode="lenient") + + def test_permissive_with_permissive_recovery_raises(self): + """permissive=True + recovery_mode='permissive' should raise ValueError.""" + data = _build_valid_record_bytes() + with pytest.raises(ValueError, match="Cannot combine"): + mrrc.MARCReader(io.BytesIO(data), permissive=True, recovery_mode="permissive") + + def test_permissive_with_strict_ok(self): + """permissive=True + recovery_mode='strict' (default) should be fine.""" + data = _build_valid_record_bytes() + reader = mrrc.MARCReader(io.BytesIO(data), permissive=True, recovery_mode="strict") + records = list(reader) + assert len(records) == 1