diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index c1c41c4..189e40c 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -3,6 +3,7 @@ {"id":"bd-264q","title":"Add __str__ and __repr__ to Python Record wrapper","description":"The Python Record wrapper class in mrrc/__init__.py does not define __str__ or __repr__. Python's data model skips __getattr__ for dunder methods, so str(rec) returns the default '' instead of delegating to the Rust PyRecord's __str__ (which returns 'Record(type=a)'). Need to add explicit __str__ and __repr__ methods that delegate to self._inner. Reported in https://github.com/dchud/mrrc/issues/57 comment.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-03-04T21:06:29.006105Z","created_by":"dchud","updated_at":"2026-03-04T21:53:01.013120Z","closed_at":"2026-03-04T21:53:01.013107Z","close_reason":"Fixed in PR #60, merged","source_repo":".","compaction_level":0,"original_size":0} {"id":"bd-3kee","title":"Add Record.get(tag) alias for pymarc compatibility","description":"pymarc's Record.get(tag, default=None) returns the first field matching a tag, or a default value. mrrc has the equivalent as Record.get_field(tag) but lacks the .get() alias.\n\nAdd a Record.get() method to the Python wrapper in mrrc/__init__.py that delegates to get_field() with an optional default parameter:\n\n def get(self, tag: str, default=None):\n return self.get_field(tag) or default\n\nThis is a pure Python wrapper addition — no Rust changes needed. Update _mrrc.pyi stubs accordingly.\n\npymarc source: Record.get() delegates to Record.__getitem__ with a try/except KeyError fallback.","status":"closed","priority":3,"issue_type":"feature","created_at":"2026-03-04T21:16:19.327037Z","created_by":"dchud","updated_at":"2026-03-04T21:53:01.068246Z","closed_at":"2026-03-04T21:53:01.068233Z","close_reason":"Fixed in PR #60, merged","source_repo":".","compaction_level":0,"original_size":0} {"id":"bd-3peu","title":"Fix check.sh --quick: Python tests run against stale extension","status":"closed","priority":1,"issue_type":"bug","created_at":"2026-03-04T16:56:07.259492Z","created_by":"dchud","updated_at":"2026-03-04T17:35:12.613323Z","closed_at":"2026-03-04T17:35:12.613307Z","close_reason":"Merged in PR #56","source_repo":".","compaction_level":0,"original_size":0} +{"id":"bd-7dre","title":"Fix repeated control fields being dropped (e.g. multiple 007s)","status":"in_progress","priority":1,"issue_type":"bug","created_at":"2026-04-10T01:47:09.917469Z","created_by":"dchud","updated_at":"2026-04-10T01:48:10.983328Z","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":18,"issue_id":"bd-7dre","author":"dchud","text":"Elevated to GitHub issue #77: https://github.com/dchud/mrrc/issues/77","created_at":"2026-04-10T01:48:10Z"}]} {"id":"bd-93qn","title":"record_to_dublin_core_xml not exported from mrrc Python wrapper","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-03-04T18:59:47.002714Z","created_by":"dchud","updated_at":"2026-03-04T19:45:29.894208Z","closed_at":"2026-03-04T19:45:29.894196Z","close_reason":"Fixed in PR #59, merged","source_repo":".","compaction_level":0,"original_size":0} {"id":"bd-b7mg","title":"record_to_json/xml fail on records from xml_to_record()","notes":"GitHub issue: https://github.com/dchud/mrrc/issues/57","status":"closed","priority":1,"issue_type":"bug","created_at":"2026-03-04T18:55:40.894350Z","created_by":"dchud","updated_at":"2026-03-04T19:45:29.002501Z","closed_at":"2026-03-04T19:45:29.002489Z","close_reason":"Fixed in PR #58, merged","source_repo":".","compaction_level":0,"original_size":0} {"id":"bd-qjjs","title":"Pymarc API compatibility: comprehensive gap analysis and implementation plan","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-03-22T20:43:50.852232Z","created_by":"dchud","updated_at":"2026-03-22T22:43:09.600381Z","closed_at":"2026-03-22T22:43:09.600049Z","close_reason":"Replaced by bd-14jh — clean consolidated version","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":6,"issue_id":"bd-qjjs","author":"dchud","text":"## Pymarc API compatibility gaps identified from rmarc bench_deathmatch.py\n\nAnalysis of https://github.com/matthewdeanmartin/rmarc/blob/main/bench/bench_deathmatch.py reveals 4 API differences where mrrc diverges from pymarc's public API.\n\n### Gap 1: Record convenience accessors are methods, pymarc uses properties\n\nIn pymarc, `title`, `author`, `isbn`, `issn`, `subjects`, `notes`, and `location` are all `@property` decorated — accessed as `record.title`, not `record.title()`.\n\nIn mrrc, these are all plain methods requiring parentheses: `record.title()`.\n\nThe benchmark script has a special mrrc branch: `rec.title()` vs `rec.title` for pymarc/rmarc.\n\n**Fix:** Convert these to `@property` in `mrrc/__init__.py`. The underlying Rust `_inner` methods stay as-is; only the Python wrapper changes. This is a breaking change for existing mrrc users calling `record.title()` — could add a deprecation shim that detects callable usage.\n\n### Gap 2: Missing `Field.value()` method\n\npymarc's `Field.value()` returns all subfield values space-joined (or `data` for control fields). mrrc's Field wrapper has no `value()` method. The benchmark uses `str(field)` for mrrc vs `field.value()` for pymarc.\n\n**Fix:** Add `def value(self) -> str` to both `Field` and `ControlField` in the Python wrapper. For `Field`, return space-joined subfield values (matching pymarc). For `ControlField`, return `self.value` (already an attribute, but the method form is what pymarc uses).\n\n### Gap 3: Missing `Record.as_json()` method\n\npymarc uses `record.as_json(**kwargs)` which serializes via `json.dumps(record.as_dict(), **kwargs)`. mrrc has `record.to_json()` instead.\n\n**Fix:** Add `as_json()` as an alias or new method on `Record`. Ideally also add `as_dict()` for full compat. Keep `to_json()` as well since it's already in our public API.\n\n### Gap 4: Missing `parse_xml_to_array()` top-level function\n\npymarc exports `parse_xml_to_array(xml_file, strict=False, normalize_form=None)` as a top-level convenience function. mrrc uses `xml_to_records()` instead, with different argument conventions (takes a string, not a file path/handle).\n\n**Fix:** Add `parse_xml_to_array()` as a top-level function in `mrrc/__init__.py` that wraps `xml_to_records()`. Accept file paths and file-like objects for full pymarc compat.\n\n---\n\nElevated to GitHub issue #65: https://github.com/dchud/mrrc/issues/65","created_at":"2026-03-22T20:44:08Z"},{"id":7,"issue_id":"bd-qjjs","author":"dchud","text":"## Additional pymarc API compatibility gaps (deep comparison)\n\nFull API surface comparison reveals more gaps beyond the 4 from the benchmark.\n\n### Gap 5: Record.as_marc() / as_marc21() — serialize to ISO 2709 bytes\npymarc has `record.as_marc()` and `record.as_marc21()` returning `bytes`. mrrc has no equivalent for serializing a single Record back to raw MARC binary. This is critical for round-trip workflows (read, modify, write back).\n\n### Gap 6: Record.as_dict()\npymarc has `record.as_dict()` returning a plain dict. `as_json()` is built on top of it. mrrc has neither.\n\n### Gap 7: Record.add_field() signature mismatch\npymarc: `add_field(self, *fields)` — accepts multiple fields. mrrc: `add_field(self, field)` — accepts exactly one.\n\n### Gap 8: Record.remove_field() signature mismatch + missing remove_fields()\npymarc: `remove_field(self, *fields)` — accepts multiple. mrrc: one at a time. pymarc also has `remove_fields(self, *tags)` for bulk removal by tag string — mrrc has no equivalent.\n\n### Gap 9: Record.add_ordered_field() / add_grouped_field() missing\npymarc has both for inserting fields in tag-sorted or tag-grouped positions.\n\n### Gap 10: Naming mismatches (no underscore in pymarc)\n- pymarc: `record.physicaldescription` → mrrc: `record.physical_description()`\n- pymarc: `record.uniformtitle` → mrrc: `record.uniform_title()`\n- pymarc: `record.addedentries` → mrrc: missing entirely\n\n### Gap 11: Record.pubyear return type\npymarc returns `Optional[str]` (e.g. \"2024\"). mrrc returns `Optional[int]`. This breaks code doing string operations on pubyear.\n\n### Gap 12: Field.value() (already noted in Gap 2 above)\n\n### Gap 13: Field.format_field() missing\npymarc `field.format_field()` returns a human-readable string. Used internally by Record property helpers like `title`. Different from `value()`.\n\n### Gap 14: Field.add_subfield() missing pos parameter\npymarc: `add_subfield(self, code, value, pos=None)`. mrrc only supports append.\n\n### Gap 15: Record.decode_marc() missing\npymarc has `decode_marc(self, marc, to_unicode=True, ...)` to populate a Record from raw bytes. Used by MARCReader internally but also called directly by users.\n\n### Gap 16: Field.as_marc() / as_marc21() missing\npymarc can serialize individual fields to ISO 2709 bytes.\n\n---\n\n### GIL considerations for as_json() and parse_xml_to_array()\n\nNeither needs GIL release. Both operate on data already in Python memory:\n- as_json() serializes a single already-parsed Record — too fast to justify GIL overhead\n- parse_xml_to_array() wraps xml_to_records() which takes a &str — no Python I/O during Rust parsing\n\nThe GIL 3-phase model is only needed for MARCReader-style iterators that interleave Python file I/O with Rust parsing across many iterations.","created_at":"2026-03-22T20:48:58Z"},{"id":8,"issue_id":"bd-qjjs","author":"dchud","text":"## Correction: parse_xml_to_array() SHOULD follow MARCReader's input dispatch pattern\n\npymarc's parse_xml_to_array() accepts file paths, open file handles, and StringIO objects. If we implement it to match, it should follow the same input-dispatch pattern MARCReader already uses:\n\n- **String path / pathlib.Path** → read file in Rust (RustFile backend, no GIL)\n- **File-like object** → read content via Python (GIL held), then parse in Rust\n- **String content** → parse directly in Rust (no GIL needed)\n\nThe XML *parsing* itself is always GIL-free (pure Rust on &str). The GIL concern is only about the file-reading step. This is exactly the MARCReader pattern from readers.rs — detect input type, use native I/O when possible, fall back to Python I/O with proper GIL management when given a Python file object.\n\nThis means parse_xml_to_array() needs a Rust-side implementation (or at minimum a Python wrapper that handles the dispatch), not just a thin alias over xml_to_records().","created_at":"2026-03-22T20:49:19Z"},{"id":9,"issue_id":"bd-qjjs","author":"dchud","text":"## Clarification: GIL avoidance is the priority\n\nThe goal is to avoid holding the GIL as much as possible, not to replicate the MARCReader pattern specifically. For parse_xml_to_array():\n\n- **String path / pathlib.Path** → read + parse entirely in Rust, zero GIL\n- **String content** → parse entirely in Rust, zero GIL\n- **Python file object** → only case where GIL is unavoidable (need it to call .read())\n\nThe MARCReader's 3-phase model is a solution to a problem we may not have here. If we can handle the common cases (file paths, string content) entirely GIL-free, that's the right approach. Only add GIL management for the Python file object fallback, and even then minimize the window.","created_at":"2026-03-22T20:50:54Z"},{"id":10,"issue_id":"bd-qjjs","author":"dchud","text":"## Documentation: show preferred GIL-free usage with explicit examples\n\nWhen implementing parse_xml_to_array(), update docs (intro guide, API reference, examples) to show the preferred path-based approach with clear comments about why, the same way we document passing paths to MARCReader() as preferred. E.g.:\n\n```python\n# Preferred: file path lets mrrc read + parse entirely in Rust (no GIL)\nrecords = mrrc.parse_xml_to_array(\"collection.xml\")\n\n# Also good: string content parsed entirely in Rust (no GIL)\nrecords = mrrc.parse_xml_to_array(xml_string)\n\n# Works but slower: Python file object requires GIL for .read()\nwith open(\"collection.xml\") as f:\n records = mrrc.parse_xml_to_array(f)\n```\n\nThis pattern should be consistent with how MARCReader path-based usage is already documented across the docs.","created_at":"2026-03-22T20:52:05Z"},{"id":11,"issue_id":"bd-qjjs","author":"dchud","text":"## Consolidated revision: complete gap inventory (supersedes earlier comments)\n\nReview of earlier comments found errors, duplicates, and missing gaps. This comment is the authoritative reference.\n\n---\n\n### Tier 1: Structural / semantic divergences (highest impact)\n\n**A. Separate ControlField class (NEW)**\npymarc has NO ControlField class — both control and data fields are `pymarc.Field`, distinguished by `is_control_field()`. mrrc exposes a separate `ControlField` class. This breaks `isinstance(f, Field)` checks and means `get_fields()` returns mixed types in mrrc vs uniform Field in pymarc.\n\n**B. ControlField uses `.value` attribute; pymarc uses `.data` (NEW)**\npymarc control fields store content in `.data`. mrrc's ControlField uses `.value`. Code doing `field.data` on a control field will fail.\n\n**C. Record convenience accessors are methods, pymarc uses @property (Gap 1)**\n`title`, `author`, `isbn`, `issn`, `issn_title`, `issnl`, `subjects`, `notes`, `location`, `series`, `sudoc`, `publisher`, `pubyear`, `physicaldescription`, `uniformtitle`, `addedentries` are all `@property` in pymarc. mrrc has them as methods (and some are missing — see Tier 2).\n\n**D. Record.__getitem__ returns None for missing tags; pymarc raises KeyError (NEW)**\npymarc `record['999']` raises `KeyError`. mrrc returns `None`. This silently changes control flow for code using try/except KeyError.\n\n**E. Field.__str__ returns useless default repr (NEW)**\npymarc `str(field)` returns MARC display format: `=245 10$aThe Great Gatsby$cF. Scott Fitzgerald`. mrrc returns `` because the Python wrapper has no `__str__` and `__getattr__` doesn't intercept dunder methods. The Rust `_inner.__str__` returns `Field(245)` which is also not pymarc-compatible. Note: the rmarc benchmark uses `str(field)` for mrrc as a workaround for missing `value()`, but it doesn't actually produce useful output.\n\n**F. Record.as_marc() / as_marc21() missing (Gap 5)**\nNo way to serialize a Record to ISO 2709 bytes. MARCWriter can write to files, but there's no method to get bytes from a single Record. Critical for round-trip workflows.\n\n---\n\n### Tier 2: Missing methods and signature mismatches\n\n**G. Field.value() missing (Gap 2)**\npymarc `field.value()` returns space-joined subfield values (or `.data` for control fields). mrrc has nothing equivalent. NOTE: For ControlField, we can't just add a `value()` method because `.value` is already an attribute — resolve by renaming attribute to `.data` (matching pymarc) per Gap B above.\n\n**H. Field.format_field() missing (Gap 13)**\npymarc `field.format_field()` returns human-readable text (no indicators or subfield codes). Different from `value()` only in edge cases but used internally by Record property helpers like `title`.\n\n**I. Record.as_json(**kwargs) missing (Gap 3)**\npymarc `record.as_json(**kwargs)` wraps `json.dumps(record.as_dict(), **kwargs)`. mrrc only has `to_json()` with no kwargs support. Keep `to_json()` in our public API.\n\n**J. Record.as_dict() missing (Gap 6)**\npymarc builds `as_json()` on `as_dict()`. Should implement both.\n\n**K. parse_xml_to_array() missing (Gap 4)**\npymarc top-level function accepting file paths, file handles, StringIO. mrrc has `xml_to_records()` (string-only). Implementation should avoid holding the GIL: file paths and string content → entirely Rust, zero GIL. Python file objects → GIL only for `.read()`, then Rust parsing. Document preferred path-based usage with examples, consistent with MARCReader docs.\n\n**L. add_field(*fields) signature mismatch (Gap 7)**\npymarc accepts `*fields`. mrrc accepts exactly one.\n\n**M. remove_field(*fields) + remove_fields(*tags) (Gap 8)**\npymarc `remove_field` accepts multiple. mrrc accepts one (Field or str). pymarc also has `remove_fields(*tags)` for bulk removal by tag — mrrc has no equivalent.\n\n**N. add_ordered_field() / add_grouped_field() missing (Gap 9)**\npymarc has both for inserting fields in tag-sorted or tag-grouped positions.\n\n**O. Field.add_subfield() missing pos parameter (Gap 14)**\npymarc: `add_subfield(self, code, value, pos=None)`. mrrc only supports append.\n\n**P. Field.as_marc() / as_marc21() missing (Gap 16)**\npymarc can serialize individual fields to ISO 2709 bytes.\n\n---\n\n### Tier 3: Naming and return type mismatches\n\n**Q. Property name mismatches (Gap 10)**\n- pymarc `physicaldescription` → mrrc `physical_description` (underscore)\n- pymarc `uniformtitle` → mrrc `uniform_title` (underscore)\n- pymarc `addedentries` → mrrc: missing entirely (returns 700/710/711/730 fields)\n\n**R. pubyear return type (Gap 11)**\npymarc returns `Optional[str]` (e.g. `\"2020\"`). mrrc returns `Optional[int]` (`2020`). Confirmed: mrrc returns int.\n\n---\n\n### Tier 4: Lower-priority gaps\n\n**S. decode_marc() missing (Gap 15)** — semi-internal, used by MARCReader. Uncommon in user code.\n\n**T. Missing reader/writer variants** — JSONReader, JSONWriter, MARCMakerReader, XMLWriter, TextWriter.\n\n**U. No exception hierarchy** — pymarc has ~15 specific exception classes. mrrc exposes generic errors.\n\n**V. No MARC8 character encoding support.**\n\n**W. Missing convenience functions** — `map_records()`, `map_xml()`, `parse_json_to_array()`, `record_to_xml_node()`.\n\n**X. Missing constants** — `LEADER_LEN`, `DIRECTORY_ENTRY_LEN`, `END_OF_FIELD`, `END_OF_RECORD`, `SUBFIELD_INDICATOR`, etc.\n\n---\n\n### GIL policy (final, supersedes earlier comments)\n\nAvoid holding the GIL whenever possible.\n\n- `as_json()`: no GIL concern — single record, in-memory, fast.\n- `parse_xml_to_array()`: file paths and string content → entirely Rust, zero GIL. Python file objects → GIL only for `.read()`, minimize window.\n- `as_marc()` / `as_marc21()`: pure serialization, no GIL needed.\n\n---\n\n### Removed from earlier comments\n- Gap 12 was a duplicate of Gap 2 — removed.\n\nElevated to GitHub issue #65: https://github.com/dchud/mrrc/issues/65","created_at":"2026-03-22T21:00:17Z"}]} @@ -11,6 +12,7 @@ {"id":"bd-vc06","title":"Add negation flag to SubfieldValueQuery","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-03-18T23:57:42.630104Z","created_by":"dchud","updated_at":"2026-04-06T00:39:18.974586Z","closed_at":"2026-04-06T00:39:18.974381Z","close_reason":"Completed — merged in PR #74","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":3,"issue_id":"bd-vc06","author":"dchud","text":"## Motivation\n\nFollow-up to bd-w9cm (negation on SubfieldPatternQuery). The same use case applies to exact and partial string matching: users scanning large record sets for fields where a subfield does NOT equal or contain a value currently must filter in Python, incurring FFI overhead.\n\n## Design\n\nMirror the approach from bd-w9cm. SubfieldValueQuery already has a `partial: bool` flag — add `negate: bool` alongside it.\n\n### Rust core (`src/field_query.rs`)\n\n- Add `pub negate: bool` to `SubfieldValueQuery`\n- Update `new()` and `partial()` constructors to set `negate: false`\n- Add `pub fn negated(tag, subfield_code, value) -> Self` and `pub fn partial_negated(tag, subfield_code, value) -> Self` constructors\n- Update `matches()`:\n\n field.get_subfield(self.subfield_code).is_some_and(|value| {\n let matched = if self.partial {\n value.contains(self.value.as_str())\n } else {\n value == self.value.as_str()\n };\n matched != self.negate\n })\n\n### PyO3 bindings (`src-python/src/query.rs`)\n\n- Add `negate` as an optional keyword argument to `PySubfieldValueQuery::new()` (default `False`)\n- Add `#[getter] fn negate(&self) -> bool`\n- Update `__repr__` to include `negate=true` when set\n\n### Python stubs (`mrrc/_mrrc.pyi`)\n\n- Update `SubfieldValueQuery.__init__` signature: add `negate: bool = False`\n- Add `negate` property\n\n### Tests\n\n- Rust unit tests: negated exact match, negated partial match, negated with missing subfield (returns false)\n- Python tests in `tests/python/test_query_dsl.py`: negated value query via kwarg, verify property, integration with `fields_matching_value()`\n\n### Documentation\n\n- Update `docs/guides/query-dsl.md` (SubfieldValueQuery section)\n- Update `docs/tutorials/python/querying-fields.md` if SubfieldValueQuery is covered\n- Update `mrrc/_mrrc.pyi` stubs\n\n### Edge cases\n\nSame as bd-w9cm:\n- Missing subfield + negate=true → false (subfield must exist)\n- Repeating subfields: only the first value is checked (existing `get_subfield` behavior)\n\n### Sequencing\n\nImplement after bd-w9cm lands so both query types get consistent API and we can share test patterns.","created_at":"2026-03-18T23:58:00Z"},{"id":5,"issue_id":"bd-vc06","author":"dchud","text":"Elevated to GitHub issue #64: https://github.com/dchud/mrrc/issues/64","created_at":"2026-03-18T23:59:36Z"}]} {"id":"bd-w9cm","title":"Add negation flag to SubfieldPatternQuery","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-03-18T23:53:40.703765Z","created_by":"dchud","updated_at":"2026-04-06T00:13:06.687346Z","closed_at":"2026-04-06T00:13:06.687035Z","close_reason":"Completed — merged in PR #73","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":1,"issue_id":"bd-w9cm","author":"dchud","text":"## Motivation\n\nA user scanning ~250k records needed to find ones where a subfield did NOT match a regex. Currently this requires Python-side filtering (`not re.match(...)`) which means:\n- Extra string copies across the Rust/Python FFI boundary for every record\n- GIL overhead on each field value that crosses into Python\n- Inability to short-circuit inside Rust iteration\n\nAdding a `negate` flag pushes the inversion into Rust so non-matching fields never cross the FFI boundary.\n\n## Design\n\nFollow the existing `partial` flag pattern on `SubfieldValueQuery` — add a `negate: bool` field defaulting to `false`.\n\n### Rust core (`src/field_query.rs`)\n\n- Add `pub negate: bool` to `SubfieldPatternQuery`\n- Update `new()` to set `negate: false`\n- Add `pub fn negated(tag, subfield_code, pattern) -> Result` constructor (sets `negate: true`)\n- Update `matches()`: XOR the regex result with `self.negate` so that when negated, a match means \"subfield exists but does NOT match the pattern\"\n\n### PyO3 bindings (`src-python/src/query.rs`)\n\n- Add `negate` as an optional keyword argument to `PySubfieldPatternQuery::new()` (default `False`)\n- Add `#[getter] fn negate(&self) -> bool`\n- Update `__repr__` to include `negate=true` when set\n\n### Python stubs (`mrrc/_mrrc.pyi`)\n\n- Update `SubfieldPatternQuery.__init__` signature: add `negate: bool = False`\n- Add `negate` property\n\n### Tests\n\n- Rust unit tests in `src/field_query.rs`: negated match, negated non-match, negated with missing subfield (should return false — subfield must exist)\n- Python tests in `tests/python/test_query_dsl.py`: negated pattern query via kwarg, verify `negate` property, integration with `fields_matching_pattern()`\n\n### Documentation\n\n- Update `docs/guides/query-dsl.md` with negation examples\n- Update `docs/tutorials/python/querying-fields.md` if SubfieldPatternQuery is covered there\n- Update API reference docs if they document SubfieldPatternQuery parameters\n\n## Edge case: missing subfield\n\nWhen `negate=true` and the subfield doesn't exist on a field, `matches()` should return `false`. Negation means \"the subfield exists but its value doesn't match\" — not \"the subfield is absent.\" This matches user intent (filtering by value, not by presence).\n\n## Scope\n\nThis bead covers the `negate` flag only. A broader query composition framework (NotQuery, AndQuery, OrQuery) is a separate concern.","created_at":"2026-03-18T23:53:58Z"},{"id":2,"issue_id":"bd-w9cm","author":"dchud","text":"## Revision: additional detail and edge cases\n\n### Repeating subfields\n\n`get_subfield()` returns only the **first** subfield with the given code. MARC fields can have repeating subfields (e.g., multiple `$a` in a 650). With `negate=true`, the query checks whether the first value does NOT match — it does not scan all values with that code. This is consistent with the existing non-negated behavior but worth documenting explicitly since negation makes the semantics more consequential. If users need \"no subfield with this code matches,\" they can combine with Python-level filtering or we can address multi-value matching in a future enhancement.\n\n### matches() implementation detail\n\nThe cleanest implementation uses `!=` rather than a literal XOR:\n\n field.get_subfield(self.subfield_code)\n .is_some_and(|value| self.pattern.is_match(value) != self.negate)\n\nThis handles all cases correctly:\n- subfield exists, pattern matches, negate=false → true (correct)\n- subfield exists, pattern doesn't match, negate=false → false (correct)\n- subfield exists, pattern matches, negate=true → false (correct: match excluded)\n- subfield exists, pattern doesn't match, negate=true → true (correct: non-match included)\n- subfield missing, any negate value → false (is_some_and short-circuits)\n\nNo extra branching needed for the missing-subfield edge case.\n\n### Follow-up\n\nThe same `negate` flag would be useful on `SubfieldValueQuery` (same user scenario, exact/partial matching). That should be a separate bead to keep scope tight.\n\n### Documentation locations confirmed\n\n- `docs/guides/query-dsl.md` — SubfieldPatternQuery section at line 108, examples at line 228\n- `docs/tutorials/python/querying-fields.md` — two usage examples at lines 80, 85\n- `mrrc/_mrrc.pyi` — stub at line 295\n- No separate API reference docs beyond the stubs; mkdocs-material may auto-generate from docstrings","created_at":"2026-03-18T23:56:36Z"},{"id":4,"issue_id":"bd-w9cm","author":"dchud","text":"Elevated to GitHub issue #63: https://github.com/dchud/mrrc/issues/63","created_at":"2026-03-18T23:59:36Z"}]} {"id":"bd-wifu","title":"Pymarc API compatibility: verified gap analysis and implementation plan","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-03-23T00:03:50.574638Z","created_by":"dchud","updated_at":"2026-04-05T21:33:52.664164Z","closed_at":"2026-04-05T21:33:52.663939Z","close_reason":"Superseded by bd-sgwi / GitHub #71 — clean ordered plan replaces accumulated analysis","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":15,"issue_id":"bd-wifu","author":"dchud","text":"Verified pymarc API gap analysis. Every gap confirmed via live introspection of pymarc and mrrc. Replaces bd-qjjs and bd-14jh. Elevated to GitHub issue #65: https://github.com/dchud/mrrc/issues/65\n\n---\n\n## Tier 1: Structural / semantic divergences\n\nThese affect the fundamental object model and break the most pymarc-compatible code.\n\n### A. Separate ControlField class\n\npymarc has NO ControlField — both control and data fields are `pymarc.Field`, distinguished by `is_control_field()`. mrrc exposes a separate `ControlField` class. Breaks `isinstance(f, Field)` checks; `get_fields()` returns mixed types in mrrc vs uniform Field in pymarc.\n\n**Fix:** Unify into a single Field class in the wrapper, with `is_control_field()` dispatching behavior internally.\n\n### B. Control field content: mrrc `.value` attribute vs pymarc `.data` attribute\n\npymarc control fields store content in `.data`. mrrc's ControlField uses `.value`. Code doing `field.data` on a control field fails. Also creates a naming collision with `Field.value()` method (Gap G).\n\n**Fix:** Use `.data` when unifying Field (Gap A). Provides pymarc compat and clears the way for `value()` method.\n\n### C. Record convenience accessors are methods; pymarc uses @property\n\nAll 17 verified as properties in pymarc, methods in mrrc: `title`, `author`, `isbn`, `issn`, `issn_title`, `issnl`, `subjects`, `notes`, `location`, `series`, `sudoc`, `publisher`, `pubyear`. Plus 3 missing entirely from mrrc: `physicaldescription`, `uniformtitle`, `addedentries` (see Gap Q).\n\n**Fix:** Convert to `@property` in `mrrc/__init__.py`. Rust `_inner` methods stay as-is. Breaking change for `record.title()` callers — consider deprecation shim.\n\n### D. Record.__getitem__ returns None for missing tags; pymarc raises KeyError\n\nVerified: pymarc `record['999']` raises `KeyError`. mrrc returns `None`. Silently changes control flow for code using try/except KeyError. Note: `Record.get(tag, default)` already exists in both and behaves the same (returns default).\n\n**Fix:** Raise `KeyError` for missing tags in `__getitem__`.\n\n### E. Field.__str__ returns useless default repr\n\nVerified: pymarc `str(field)` → `'=245 10$aThe Great Gatsby$cF. Scott Fitzgerald'`. mrrc `str(field)` → `''`. The Python wrapper has no `__str__`; `__getattr__` doesn't intercept dunder methods. The Rust `_inner.__str__` returns `'Field(245)'` which is also not pymarc-compatible.\n\n**Fix:** Add `__str__` and `__repr__` to the wrapper Field class matching pymarc's MARC display format.\n\n### F. Record.as_marc() / as_marc21() missing\n\nVerified: pymarc `record.as_marc()` returns `bytes` (ISO 2709). mrrc has no equivalent — `MARCWriter.write_record()` writes to a file, returns `None`. Critical for round-trip workflows.\n\n**Fix:** Implement in Rust (pure serialization, no GIL needed), expose via PyO3, add wrapper method. `as_marc21()` should be an alias.\n\n### G. JSON serialization format mismatch\n\nVerified structural differences between pymarc `as_json()` and mrrc `to_json()`:\n\n**pymarc** (follows MARC-in-JSON / code4lib proposal):\n```json\n{\"leader\": \"...\", \"fields\": [{\"245\": {\"ind1\": \"1\", \"ind2\": \"0\", \"subfields\": [{\"a\": \"Title\"}, {\"c\": \"Author\"}]}}]}\n```\n\n**mrrc**:\n```json\n[{\"leader\": \"...\"}, {\"245\": {\"ind1\": \"1\", \"ind2\": \"0\", \"subfields\": {\"a\": \"Title\", \"c\": \"Author\"}}}]\n```\n\nTwo differences: (1) Top-level: pymarc uses `{leader, fields}` object, mrrc uses flat array. (2) Subfields: pymarc uses array of single-key objects (preserves order, allows duplicate codes), mrrc uses single object (loses duplicate subfield codes).\n\n**Fix:** `as_json()` and `as_dict()` must produce pymarc-compatible format. Keep `to_json()` as mrrc's native format. This likely needs Rust-side work or a Python-side reformatter.\n\n---\n\n## Tier 2: Missing methods and signature mismatches\n\n### H. Field.value() missing\n\nVerified: pymarc `field.value()` returns space-joined subfield values (or `.data` for control fields). mrrc Field has no `value` attribute or method. Depends on Gap B (renaming `.value` → `.data` on control fields).\n\n**Fix:** Add `def value(self) -> str` to the unified Field class.\n\n### I. Field.format_field() missing\n\nVerified: pymarc `field.format_field()` returns human-readable text without indicators or subfield codes. For the test data, `value()` and `format_field()` return the same string, but they can differ for fields with special formatting.\n\n**Fix:** Add `def format_field(self) -> str` to Field.\n\n### J. Record.as_json(**kwargs) and Record.as_dict() missing\n\nVerified: pymarc has both. `as_dict()` returns a plain dict; `as_json(**kwargs)` wraps `json.dumps(as_dict(), **kwargs)`. mrrc has neither. Note: `as_json()` must produce the pymarc-compatible JSON schema (Gap G), not mrrc's native format.\n\n**Fix:** Implement `as_dict()` producing pymarc-compatible structure, then `as_json(**kwargs)` on top. Keep mrrc's `to_json()` as its own format.\n\n### K. parse_xml_to_array() missing\n\nVerified: pymarc `parse_xml_to_array(xml_file, strict=False, normalize_form=None)` accepts file paths, open file handles, and StringIO. mrrc has `xml_to_records(str)` only.\n\n**Fix:** Add `parse_xml_to_array()` to `mrrc/__init__.py`. GIL policy: file paths and string content → entirely Rust, zero GIL. Python file objects → GIL only for `.read()`, minimize window. Document preferred path-based usage with explicit examples and comments explaining why, consistent with MARCReader docs:\n\n```python\n# Preferred: file path — read + parse entirely in Rust, zero GIL\nrecords = mrrc.parse_xml_to_array(\"collection.xml\")\n\n# Also good: string content — parse entirely in Rust, zero GIL\nrecords = mrrc.parse_xml_to_array(xml_string)\n\n# Works but slower: Python file object requires GIL for .read()\nwith open(\"collection.xml\") as f:\n records = mrrc.parse_xml_to_array(f)\n```\n\n### L. add_field(*fields) signature mismatch\n\nVerified: pymarc `add_field(self, *fields)`. mrrc `add_field(self, field: Field) -> None`.\n\n**Fix:** Change signature to accept `*fields` with a loop.\n\n### M. remove_field(*fields) + remove_fields(*tags)\n\nVerified: pymarc `remove_field(self, *fields) -> None` accepts multiple. mrrc `remove_field(self, field: Union[Field, str]) -> List[Field]` accepts one. pymarc also has `remove_fields(self, *tags) -> None` for bulk removal by tag string — mrrc has no equivalent. Also note: pymarc returns None; mrrc returns the removed fields.\n\n**Fix:** Change `remove_field` to accept `*fields`. Add `remove_fields(*tags)`. Consider whether to keep mrrc's return value (non-breaking addition).\n\n### N. add_ordered_field() / add_grouped_field() missing\n\nVerified: both exist in pymarc, neither in mrrc.\n\n**Fix:** Implement in wrapper. `add_ordered_field` inserts maintaining tag sort order. `add_grouped_field` inserts after the last field with the same tag.\n\n### O. Field.add_subfield() missing pos parameter\n\nVerified: pymarc `add_subfield(self, code, value, pos=None)`. mrrc `add_subfield(self, code, value)`.\n\n**Fix:** Add optional `pos` parameter. May need Rust-side support for positional insert.\n\n### P. Field.as_marc() / as_marc21() missing\n\nVerified: pymarc Field has both, mrrc Field has neither.\n\n**Fix:** Implement in Rust, expose via PyO3. No GIL needed. `as_marc21` as alias.\n\n### Q. Field.linkage_occurrence_num() missing\n\nVerified: pymarc `field.linkage_occurrence_num() -> Optional[str]` extracts the occurrence number from subfield $6 linkage.\n\n**Fix:** Implement in wrapper or Rust.\n\n---\n\n## Tier 3: Naming and return type mismatches\n\n### R. Property name mismatches\n\nVerified:\n- pymarc `physicaldescription` → mrrc has `physical_description` (underscore) but NOT `physicaldescription`\n- pymarc `uniformtitle` → mrrc has `uniform_title` (underscore) but NOT `uniformtitle`\n- pymarc `addedentries` → mrrc: missing entirely (returns 700/710/711/730 fields)\n\n**Fix:** Add pymarc-compatible names as `@property` (part of Gap C). Keep mrrc underscore names as aliases.\n\n### S. pubyear return type\n\nVerified: mrrc returns `Optional[int]` (e.g. `2020`). pymarc returns `Optional[str]` (e.g. `\"2020\"`).\n\n**Fix:** Return `str` from the property wrapper.\n\n### T. Field.convert_legacy_subfields() missing\n\nVerified: pymarc has `Field.convert_legacy_subfields(subfields: list[str]) -> list[Subfield]` as a classmethod. Converts old `['a', 'value', 'b', 'value']` format to Subfield namedtuples.\n\n**Fix:** Add classmethod to Field wrapper. Low priority — only needed for migrating old pymarc code.\n\n---\n\n## Tier 4: Lower priority\n\nThese are real gaps but unlikely to block most pymarc-compatible code.\n\n### U. Record.as_marc() / as_marc21() (already covered as Gap F)\n\n### V. decode_marc()\n\nVerified missing. Semi-internal — populates Record from raw bytes. Uncommon in user code but used by some advanced pymarc scripts.\n\n**Fix:** Could wrap mrrc's MARCReader to parse a single record from bytes.\n\n### W. Reader/writer variants\n\nVerified all missing: JSONReader, JSONWriter, MARCMakerReader, XMLWriter, TextWriter, Reader (base), Writer (base).\n\n**Fix:** Implement as needed. JSONReader/JSONWriter most likely to be used.\n\n### X. Exception hierarchy\n\nVerified: pymarc has 10 specific exception classes (`BadSubfieldCodeWarning`, `BaseAddressInvalid`, `BaseAddressNotFound`, `EndOfRecordNotFound`, `FatalReaderError`, `FieldNotFound`, `PymarcException`, `RecordDirectoryInvalid`, `RecordLeaderInvalid`, `RecordLengthInvalid`). mrrc exports zero exception classes.\n\n**Fix:** Define exception hierarchy in Python wrapper. Map Rust errors to specific exception types.\n\n### Y. MARC8 Python API\n\nVerified: mrrc has MARC8 support in Rust core but exports nothing at the Python level. pymarc exports: `MARC8ToUnicode` class, `marc8_to_unicode()` function, `map_marc8_field()`, `map_marc8_record()`.\n\n**Fix:** Expose Rust MARC8 functionality via PyO3, add pymarc-compatible wrapper names.\n\n### Z. Convenience functions\n\nVerified all missing: `map_records()`, `map_xml()`, `parse_json_to_array()`, `record_to_xml_node()`, `normalize_subfield_code()`.\n\n**Fix:** Implement as wrappers. `parse_json_to_array()` most likely to be used.\n\n### AA. Constants\n\nVerified all missing: `LEADER_LEN`, `DIRECTORY_ENTRY_LEN`, `END_OF_FIELD`, `END_OF_RECORD`, `SUBFIELD_INDICATOR`, `MARC_XML_NS`, `MARC_XML_SCHEMA`.\n\n**Fix:** Export from `mrrc/__init__.py`. These are just integer/string constants.\n\n### BB. Record instance attributes\n\npymarc Record has `force_utf8`, `pos`, `to_unicode` as instance attributes (member descriptors). These are used during decode and by some user code. mrrc has none of these.\n\n**Fix:** Low priority. `pos` (file position) could be useful; `force_utf8`/`to_unicode` are decode-time flags.\n\n---\n\n## GIL policy\n\nAvoid holding the GIL whenever possible.\n\n- **as_json(), as_dict(), as_marc(), as_marc21():** In-memory, single record — no GIL concern.\n- **parse_xml_to_array():** File paths and string content → entirely Rust, zero GIL. Python file objects → GIL only for `.read()`, minimize window.\n- **Field.value(), format_field(), as_marc():** Pure computation, no GIL concern.","created_at":"2026-03-23T00:05:04Z"}]} +{"id":"bd-y331","title":"Add pymarc-compatible to_unicode and permissive flags to MARCReader","status":"open","priority":3,"issue_type":"feature","created_at":"2026-04-10T01:47:10.886181Z","created_by":"dchud","updated_at":"2026-04-10T01:48:11.593780Z","source_repo":".","compaction_level":0,"original_size":0,"comments":[{"id":19,"issue_id":"bd-y331","author":"dchud","text":"Elevated to GitHub issue #78: https://github.com/dchud/mrrc/issues/78","created_at":"2026-04-10T01:48:11Z"}]} {"id":"mrrc-086","title":"Add parallel benchmarking suite (rayon/threading/multiprocessing)","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-12-31T17:02:57.787595Z","updated_at":"2026-01-06T16:11:40.252865Z","closed_at":"2026-01-06T16:11:40.252865Z","close_reason":"Satisfied by Phase F - Comprehensive benchmarking suite with Criterion.rs, speedup curves (2.04x @ 2-thread, 3.20x @ 4-thread), fixtures (1k/10k/100k records), and pymrrc vs Rust efficiency comparison (92%) completed.","source_repo":".","compaction_level":0,"original_size":0} {"id":"mrrc-08k","title":"Phase 3: Linked field navigation and specialized queries","description":"Phase 3 of Advanced Field Query DSL (see design/FIELD_QUERY_DSL.md).\n\nImplement specialized query patterns for linked fields, authority control, and format-specific queries.\n\nTasks:\n1. Linked field navigation (880 field linkage)\n - Implement subfield 6 parsing for linkage occurrence numbers\n - Methods:\n * field.get_linked_field(record) -> Option (880 partner)\n * record.get_all_linked_fields(field) -> Vec\n * record.get_field_pairs(tag) -> Vec<(Field, Option)> (original + 880)\n - Support for bidirectional lookups\n\n2. Authority control helpers (for authority records)\n - Methods for finding related authority headings\n - Navigate 4XX (see from) and 5XX (see also) tracings\n - Helper: find_related_headings(heading: &Field) -> Vec\n - Extract authority relationship information from 7XX fields\n\n3. Format-specific query traits\n - BibliographicQueries trait (for Record)\n - AuthorityQueries trait (for AuthorityRecord)\n - HoldingsQueries trait (for HoldingsRecord)\n - Each provides domain-specific helper methods\n\n4. Comprehensive testing\n - Test 880 field linkage with real MARC-8 encoded data\n - Test authority record navigation patterns\n - Test complex record type-specific queries\n \nDependencies: Requires Phase 1 (mrrc-9n8) and Phase 2 (mrrc-131) completed","notes":"Phase 3 Status: Linked field navigation COMPLETED (28 tests). Authority control helpers TODO. Format-specific traits TODO.","status":"closed","priority":4,"issue_type":"task","created_at":"2025-12-26T17:13:18.416250Z","updated_at":"2025-12-28T16:25:38.021558Z","closed_at":"2025-12-28T16:25:38.021558Z","close_reason":"All three sub-features completed: linked field navigation, authority control helpers, format-specific query traits. 282 library tests passing.","source_repo":".","compaction_level":0,"original_size":0} {"id":"mrrc-0jm","title":"Update check.sh to include mrrc-python Clippy and test checks","status":"closed","priority":1,"issue_type":"task","created_at":"2026-01-06T16:50:13.684695Z","created_by":"dchud","updated_at":"2026-01-06T16:59:17.545907Z","closed_at":"2026-01-06T16:59:17.545907Z","close_reason":"Completed","source_repo":".","compaction_level":0,"original_size":0} diff --git a/CHANGELOG.md b/CHANGELOG.md index 60e8bbb..774c445 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,13 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -### Added +### Fixed -### Changed +- **Repeated control fields silently dropped** ([#77](https://github.com/dchud/mrrc/issues/77), [#79](https://github.com/dchud/mrrc/pull/79)): Control fields (e.g., multiple 007s) were stored in `IndexMap`, causing later values to overwrite earlier ones. Changed to `IndexMap>` across all three record types (`Record`, `AuthorityRecord`, `HoldingsRecord`). Updated all 7 serialization formats (ISO 2709, JSON, MARCJSON, MARCXML, CSV, Dublin Core/MODS, BIBFRAME) to emit all values. The pymarc-compatible `get_fields()` API now correctly returns all repeated control fields, matching pymarc behavior. -### Fixed +### Changed -### Documentation +- **Simplified control field API**: Removed `get_control_field_values()` accessor from `Record` — redundant with direct access to the public `control_fields` map. `get_control_field()` remains as a convenience for non-repeatable tags (001, 003, 005, 008). ## [0.7.5] - 2026-04-05 diff --git a/mrrc/__init__.py b/mrrc/__init__.py index 36da3eb..c2e11df 100644 --- a/mrrc/__init__.py +++ b/mrrc/__init__.py @@ -849,8 +849,7 @@ def get_fields(self, *tags: str) -> List['Field']: if not tags: # Return all control fields, then all data fields for tag in _CONTROL_TAGS: - value = self._inner.control_field(tag) - if value is not None: + for value in self._inner.control_field_values(tag): result.append(Field(tag, data=value)) for field in self._inner.fields(): wrapper = Field.__new__(Field) @@ -861,8 +860,7 @@ def get_fields(self, *tags: str) -> List['Field']: # Return fields for specified tags for tag in tags: if _is_control_tag(tag): - value = self._inner.control_field(tag) - if value is not None: + for value in self._inner.control_field_values(tag): result.append(Field(tag, data=value)) else: for field in self._inner.get_fields(tag): @@ -1453,11 +1451,10 @@ def __eq__(self, other: Any) -> bool: return False # Compare control fields - for code, value in self._inner.control_fields(): - if self._inner.control_field(code) != value: - return False - if other._inner.control_field(code) != value: - return False + self_cfs = self._inner.control_fields() + other_cfs = other._inner.control_fields() + if self_cfs != other_cfs: + return False return True diff --git a/mrrc/_mrrc.pyi b/mrrc/_mrrc.pyi index 2c52fc6..778e13d 100644 --- a/mrrc/_mrrc.pyi +++ b/mrrc/_mrrc.pyi @@ -202,7 +202,17 @@ class Record: """ ... def control_fields(self) -> List[tuple[str, str]]: - """Get all control fields as (tag, value) tuples.""" + """Get all control fields as (tag, value) tuples. + + Repeated tags (e.g., multiple 007 fields) produce multiple entries. + """ + ... + def control_field_values(self, tag: str) -> List[str]: + """Get all values for a control field tag. + + Returns all values for tags that may be repeated (e.g., 006, 007). + Returns an empty list if the tag doesn't exist. + """ ... def get_field(self, tag: str) -> Optional[Field]: ... def get_fields(self, tag: str) -> List[Field]: ... diff --git a/src-python/src/wrappers.rs b/src-python/src/wrappers.rs index 0c31efe..4b39c44 100644 --- a/src-python/src/wrappers.rs +++ b/src-python/src/wrappers.rs @@ -705,15 +705,28 @@ impl PyRecord { result } - /// Get all control fields as a dict-like structure + /// Get all control fields as a list of (tag, value) tuples + /// + /// Repeated tags (e.g., multiple 007 fields) produce multiple entries. pub fn control_fields(&self) -> Vec<(String, String)> { self.inner .control_fields .iter() - .map(|(tag, value)| (tag.clone(), value.clone())) + .flat_map(|(tag, values)| values.iter().map(move |value| (tag.clone(), value.clone()))) .collect() } + /// Get all values for a control field tag + /// + /// Returns all values for tags that may be repeated (e.g., 006, 007). + pub fn control_field_values(&self, tag: &str) -> Vec { + self.inner + .control_fields + .get(tag) + .map(|values| values.iter().map(String::clone).collect()) + .unwrap_or_default() + } + /// Remove all fields with a given tag /// /// Returns the removed fields. diff --git a/src/authority_record.rs b/src/authority_record.rs index ea14c43..5dbf9e8 100644 --- a/src/authority_record.rs +++ b/src/authority_record.rs @@ -19,7 +19,8 @@ pub struct AuthorityRecord { /// Record leader (24 bytes) pub leader: Leader, /// Control fields (000-009) - preserves insertion order - pub control_fields: IndexMap, + /// Multiple values per tag are supported (e.g., repeated 006/007 fields) + pub control_fields: IndexMap>, /// Variable fields (010+) - unified storage, preserves insertion order pub fields: IndexMap>, } @@ -100,13 +101,16 @@ impl AuthorityRecord { /// Add a control field pub fn add_control_field(&mut self, tag: String, value: String) { - self.control_fields.insert(tag, value); + self.control_fields.entry(tag).or_default().push(value); } - /// Get a control field value + /// Get the first control field value for a tag #[must_use] pub fn get_control_field(&self, tag: &str) -> Option<&str> { - self.control_fields.get(tag).map(String::as_str) + self.control_fields + .get(tag) + .and_then(|v| v.first()) + .map(String::as_str) } /// Set the heading (1XX field) @@ -327,19 +331,25 @@ impl MarcRecord for AuthorityRecord { } fn add_control_field(&mut self, tag: impl Into, value: impl Into) { - self.control_fields.insert(tag.into(), value.into()); + self.control_fields + .entry(tag.into()) + .or_default() + .push(value.into()); } fn get_control_field(&self, tag: &str) -> Option<&str> { - self.control_fields.get(tag).map(String::as_str) + self.control_fields + .get(tag) + .and_then(|v| v.first()) + .map(String::as_str) } fn control_fields_iter(&self) -> Box + '_> { - Box::new( - self.control_fields + Box::new(self.control_fields.iter().flat_map(|(tag, values)| { + values .iter() - .map(|(tag, value)| (tag.as_str(), value.as_str())), - ) + .map(move |value| (tag.as_str(), value.as_str())) + })) } fn get_fields(&self, tag: &str) -> Option<&[Field]> { diff --git a/src/authority_writer.rs b/src/authority_writer.rs index f814831..a2b4625 100644 --- a/src/authority_writer.rs +++ b/src/authority_writer.rs @@ -87,8 +87,10 @@ impl AuthorityMarcWriter { }; // Write control fields - for (tag, value) in &record.control_fields { - add_field(tag, None, Some(value), None, &mut directory, &mut data)?; + for (tag, values) in &record.control_fields { + for value in values { + add_field(tag, None, Some(value), None, &mut directory, &mut data)?; + } } // Write all variable fields (010+) in tag order diff --git a/src/bibframe/converter.rs b/src/bibframe/converter.rs index 8cfed76..22a09f4 100644 --- a/src/bibframe/converter.rs +++ b/src/bibframe/converter.rs @@ -220,6 +220,7 @@ impl<'a> MarcToBibframeConverter<'a> { self.record .control_fields .get("001") + .and_then(|v| v.first()) .map_or("unknown", String::as_str) } else { "unknown" @@ -1637,6 +1638,7 @@ impl<'a> MarcToBibframeConverter<'a> { self.record .control_fields .get("001") + .and_then(|v| v.first()) .map_or("unknown", String::as_str) } else { "unknown" @@ -1674,7 +1676,12 @@ impl<'a> MarcToBibframeConverter<'a> { } // Add creation date from 008/00-05 if available - if let Some(field_008) = self.record.control_fields.get("008") { + if let Some(field_008) = self + .record + .control_fields + .get("008") + .and_then(|v| v.first()) + { if field_008.len() >= 6 { let date_entered = &field_008[0..6]; self.graph.add( diff --git a/src/csv.rs b/src/csv.rs index d5fab3a..3721537 100644 --- a/src/csv.rs +++ b/src/csv.rs @@ -118,10 +118,12 @@ pub fn records_to_csv(records: &[Record]) -> Result { for record in records { // Write control fields - for (tag, value) in &record.control_fields { - // Escape value if needed - let escaped_value = escape_csv_value(value); - writeln!(output, "{tag},,,{escaped_value}").ok(); + for (tag, values) in &record.control_fields { + for value in values { + // Escape value if needed + let escaped_value = escape_csv_value(value); + writeln!(output, "{tag},,,{escaped_value}").ok(); + } } // Write data fields with subfields @@ -188,10 +190,12 @@ where for record in records { // Write control fields - for (tag, value) in &record.control_fields { + for (tag, values) in &record.control_fields { if filter(tag) { - let escaped_value = escape_csv_value(value); - writeln!(output, "{tag},,,{escaped_value}").ok(); + for value in values { + let escaped_value = escape_csv_value(value); + writeln!(output, "{tag},,,{escaped_value}").ok(); + } } } diff --git a/src/dublin_core.rs b/src/dublin_core.rs index b8d161b..60adc0c 100644 --- a/src/dublin_core.rs +++ b/src/dublin_core.rs @@ -309,7 +309,7 @@ fn extract_identifiers(record: &Record, dc: &mut DublinCoreRecord) { } // Control number (001) - if let Some(control_001) = record.control_fields.get("001") { + if let Some(control_001) = record.control_fields.get("001").and_then(|v| v.first()) { dc.identifier.push(format!("Control#: {control_001}")); } } diff --git a/src/encoding_validation.rs b/src/encoding_validation.rs index 307cb7a..c53e21d 100644 --- a/src/encoding_validation.rs +++ b/src/encoding_validation.rs @@ -46,13 +46,15 @@ impl EncodingValidator { let mut inconsistent_field_count = 0usize; // Check control fields - for value in record.control_fields.values() { - if Self::is_likely_different_encoding(value, primary_encoding) { - inconsistent_field_count += 1; - let detected = Self::detect_encoding_from_string(value); - if let Some(enc) = detected { - if enc != primary_encoding && !mixed_encodings.contains(&enc) { - mixed_encodings.push(enc); + for values in record.control_fields.values() { + for value in values { + if Self::is_likely_different_encoding(value, primary_encoding) { + inconsistent_field_count += 1; + let detected = Self::detect_encoding_from_string(value); + if let Some(enc) = detected { + if enc != primary_encoding && !mixed_encodings.contains(&enc) { + mixed_encodings.push(enc); + } } } } diff --git a/src/holdings_reader.rs b/src/holdings_reader.rs index a31d06f..a223fdd 100644 --- a/src/holdings_reader.rs +++ b/src/holdings_reader.rs @@ -143,7 +143,7 @@ impl HoldingsMarcReader { // Parse directory (entries are 12 bytes each: 3 bytes tag + 4 bytes length + 5 bytes start position) let mut fields: indexmap::IndexMap> = indexmap::IndexMap::new(); - let mut control_fields: indexmap::IndexMap = indexmap::IndexMap::new(); + let mut control_fields: indexmap::IndexMap> = indexmap::IndexMap::new(); let mut i = 0; while i + 12 <= directory_bytes.len() { @@ -180,7 +180,7 @@ impl HoldingsMarcReader { MarcError::InvalidRecord("Invalid control field encoding".to_string()) })? .to_string(); - control_fields.insert(tag, value); + control_fields.entry(tag).or_default().push(value); } else { // Data field if field_data.len() < 3 { @@ -238,8 +238,10 @@ impl HoldingsMarcReader { let mut record = HoldingsRecord::new(leader); // Add control fields - for (tag, value) in control_fields { - record.add_control_field(tag, value); + for (tag, values) in control_fields { + for value in values { + record.add_control_field(tag.clone(), value); + } } // Organize data fields by their functional role diff --git a/src/holdings_record.rs b/src/holdings_record.rs index f6a6e37..a3a7a58 100644 --- a/src/holdings_record.rs +++ b/src/holdings_record.rs @@ -19,7 +19,8 @@ pub struct HoldingsRecord { /// Record leader (24 bytes) pub leader: Leader, /// Control fields (000-009) - preserves insertion order - pub control_fields: IndexMap, + /// Multiple values per tag are supported (e.g., repeated 006/007 fields) + pub control_fields: IndexMap>, /// Variable fields (010+) - unified storage, preserves insertion order pub fields: IndexMap>, } @@ -103,13 +104,16 @@ impl HoldingsRecord { /// Add a control field pub fn add_control_field(&mut self, tag: String, value: String) { - self.control_fields.insert(tag, value); + self.control_fields.entry(tag).or_default().push(value); } - /// Get a control field value + /// Get the first control field value for a tag #[must_use] pub fn get_control_field(&self, tag: &str) -> Option<&str> { - self.control_fields.get(tag).map(String::as_str) + self.control_fields + .get(tag) + .and_then(|v| v.first()) + .map(String::as_str) } /// Add a location field (852) @@ -371,19 +375,25 @@ impl MarcRecord for HoldingsRecord { } fn add_control_field(&mut self, tag: impl Into, value: impl Into) { - self.control_fields.insert(tag.into(), value.into()); + self.control_fields + .entry(tag.into()) + .or_default() + .push(value.into()); } fn get_control_field(&self, tag: &str) -> Option<&str> { - self.control_fields.get(tag).map(String::as_str) + self.control_fields + .get(tag) + .and_then(|v| v.first()) + .map(String::as_str) } fn control_fields_iter(&self) -> Box + '_> { - Box::new( - self.control_fields + Box::new(self.control_fields.iter().flat_map(|(tag, values)| { + values .iter() - .map(|(tag, value)| (tag.as_str(), value.as_str())), - ) + .map(move |value| (tag.as_str(), value.as_str())) + })) } fn get_fields(&self, tag: &str) -> Option<&[Field]> { diff --git a/src/holdings_writer.rs b/src/holdings_writer.rs index 0ac4bf6..0358682 100644 --- a/src/holdings_writer.rs +++ b/src/holdings_writer.rs @@ -87,8 +87,10 @@ impl HoldingsMarcWriter { }; // Write control fields - for (tag, value) in &record.control_fields { - add_field(tag, None, Some(value), None, &mut directory, &mut data)?; + for (tag, values) in &record.control_fields { + for value in values { + add_field(tag, None, Some(value), None, &mut directory, &mut data)?; + } } // Write data fields (all organized in fields map) diff --git a/src/json.rs b/src/json.rs index 1acb3ac..55bd389 100644 --- a/src/json.rs +++ b/src/json.rs @@ -58,10 +58,12 @@ pub fn record_to_json(record: &Record) -> Result { })); // Add control fields - for (tag, value) in &record.control_fields { - fields.push(json!({ - tag: value - })); + for (tag, values) in &record.control_fields { + for value in values { + fields.push(json!({ + tag: value + })); + } } // Add data fields diff --git a/src/marcjson.rs b/src/marcjson.rs index 714e174..c5aaa2c 100644 --- a/src/marcjson.rs +++ b/src/marcjson.rs @@ -49,10 +49,12 @@ pub fn record_to_marcjson(record: &Record) -> Result { })); // Add control fields (001-009) - for (tag, value) in &record.control_fields { - let mut field = serde_json::Map::new(); - field.insert(tag.clone(), Value::String(value.clone())); - fields.push(Value::Object(field)); + for (tag, values) in &record.control_fields { + for value in values { + let mut field = serde_json::Map::new(); + field.insert(tag.clone(), Value::String(value.clone())); + fields.push(Value::Object(field)); + } } // Add data fields (010+) diff --git a/src/marcxml.rs b/src/marcxml.rs index 75116f7..cba63a1 100644 --- a/src/marcxml.rs +++ b/src/marcxml.rs @@ -162,11 +162,13 @@ pub fn record_to_marcxml(record: &Record) -> Result { let leader_str = String::from_utf8_lossy(&leader_bytes).to_string(); let mut controlfields = Vec::new(); - for (tag, value) in &record.control_fields { - controlfields.push(MarcxmlControlField { - tag: tag.clone(), - value: value.clone(), - }); + for (tag, values) in &record.control_fields { + for value in values { + controlfields.push(MarcxmlControlField { + tag: tag.clone(), + value: value.clone(), + }); + } } let mut datafields = Vec::new(); diff --git a/src/mods.rs b/src/mods.rs index 0a248c8..fb86182 100644 --- a/src/mods.rs +++ b/src/mods.rs @@ -388,7 +388,7 @@ fn write_identifiers(xml: &mut String, record: &Record) { } // Control number (001) - if let Some(control_001) = record.control_fields.get("001") { + if let Some(control_001) = record.control_fields.get("001").and_then(|v| v.first()) { writeln!( xml, " {}", diff --git a/src/record.rs b/src/record.rs index 5ac437b..c063a5c 100644 --- a/src/record.rs +++ b/src/record.rs @@ -65,8 +65,9 @@ use std::ops::Index; pub struct Record { /// Record leader (24 bytes) pub leader: Leader, - /// Control fields (000-009) - tag -> value, preserves insertion order - pub control_fields: IndexMap, + /// Control fields (000-009) - tag -> values, preserves insertion order + /// Multiple values per tag are supported (e.g., repeated 006/007 fields) + pub control_fields: IndexMap>, /// Data fields (010+) - tag -> fields, preserves insertion order pub fields: IndexMap>, } @@ -146,8 +147,10 @@ impl Record { } /// Add a control field (000-009) + /// + /// Multiple values per tag are supported (e.g., repeated 006/007 fields). pub fn add_control_field(&mut self, tag: String, value: String) { - self.control_fields.insert(tag, value); + self.control_fields.entry(tag).or_default().push(value); } /// Add a control field using string slices @@ -157,12 +160,16 @@ impl Record { self.add_control_field(tag.to_string(), value.to_string()); } - /// Get a control field value + /// Get the first control field value for a tag + /// + /// For tags that may have repeated values (e.g., 006, 007), access + /// `control_fields` directly to retrieve all values. #[must_use] pub fn get_control_field(&self, tag: &str) -> Option<&str> { self.control_fields .get(tag) - .map(std::string::String::as_str) + .and_then(|v| v.first()) + .map(String::as_str) } /// Add a data field @@ -209,11 +216,14 @@ impl Record { /// Iterate over all control fields /// - /// Returns an iterator of (tag, value) tuples. + /// Returns an iterator of (tag, value) tuples, yielding one entry per + /// value. Repeated tags (e.g., two 007 fields) produce multiple entries. pub fn control_fields_iter(&self) -> impl Iterator { - self.control_fields - .iter() - .map(|(tag, value)| (tag.as_str(), value.as_str())) + self.control_fields.iter().flat_map(|(tag, values)| { + values + .iter() + .map(move |value| (tag.as_str(), value.as_str())) + }) } // ============================================================================ @@ -813,19 +823,25 @@ impl MarcRecord for Record { } fn add_control_field(&mut self, tag: impl Into, value: impl Into) { - self.control_fields.insert(tag.into(), value.into()); + self.control_fields + .entry(tag.into()) + .or_default() + .push(value.into()); } fn get_control_field(&self, tag: &str) -> Option<&str> { - self.control_fields.get(tag).map(String::as_str) + self.control_fields + .get(tag) + .and_then(|v| v.first()) + .map(String::as_str) } fn control_fields_iter(&self) -> Box + '_> { - Box::new( - self.control_fields + Box::new(self.control_fields.iter().flat_map(|(tag, values)| { + values .iter() - .map(|(tag, value)| (tag.as_str(), value.as_str())), - ) + .map(move |value| (tag.as_str(), value.as_str())) + })) } fn get_fields(&self, tag: &str) -> Option<&[Field]> { diff --git a/src/record_validation.rs b/src/record_validation.rs index 6ed4de8..c6acae5 100644 --- a/src/record_validation.rs +++ b/src/record_validation.rs @@ -197,8 +197,8 @@ impl RecordStructureValidator { /// Returns `Err` if directory structure is invalid. pub fn validate_directory_structure(record: &Record) -> Result<()> { // Calculate expected directory length (12 bytes per field entry + 1 for terminator) - let total_fields = - record.control_fields.len() + record.fields.values().map(Vec::len).sum::(); + let total_fields = record.control_fields.values().map(Vec::len).sum::() + + record.fields.values().map(Vec::len).sum::(); let directory_length = (total_fields * 12) + 1; // Validate that base address would fit in 5-digit field @@ -211,8 +211,10 @@ impl RecordStructureValidator { // Validate that record length would fit in 5-digit field let mut total_length = base_address; - for value in record.control_fields.values() { - total_length += value.len() + 1; // +1 for field terminator + for values in record.control_fields.values() { + for value in values { + total_length += value.len() + 1; // +1 for field terminator + } } for fields in record.fields.values() { diff --git a/src/writer.rs b/src/writer.rs index 60fa7d1..9011dd5 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -146,20 +146,22 @@ impl MarcWriter { let mut current_position = 0; // Write control fields first (001-009) - for (tag, value) in &record.control_fields { + for (tag, values) in &record.control_fields { if tag.as_str() < "010" { - let field_data = value.as_bytes(); - let field_length = field_data.len() + 1; // +1 for terminator - - // Add directory entry - directory.extend_from_slice(tag.as_bytes()); - directory.extend_from_slice(format!("{field_length:04}").as_bytes()); - directory.extend_from_slice(format!("{current_position:05}").as_bytes()); - - // Add data - data_area.extend_from_slice(field_data); - data_area.push(FIELD_TERMINATOR); - current_position += field_length; + for value in values { + let field_data = value.as_bytes(); + let field_length = field_data.len() + 1; // +1 for terminator + + // Add directory entry + directory.extend_from_slice(tag.as_bytes()); + directory.extend_from_slice(format!("{field_length:04}").as_bytes()); + directory.extend_from_slice(format!("{current_position:05}").as_bytes()); + + // Add data + data_area.extend_from_slice(field_data); + data_area.push(FIELD_TERMINATOR); + current_position += field_length; + } } } diff --git a/tests/memory_safety_asan.rs b/tests/memory_safety_asan.rs index 0832139..f7fe547 100644 --- a/tests/memory_safety_asan.rs +++ b/tests/memory_safety_asan.rs @@ -272,9 +272,7 @@ mod asan_memory_safety_tests { let mut record = Record::new(leader); // Add some control fields - record - .control_fields - .insert("008".to_string(), "Test008".to_string()); + record.add_control_field("008".to_string(), "Test008".to_string()); // Add variable fields using builder pattern let field = Field::builder("245".to_string(), '1', '4') diff --git a/tests/python/test_repeated_control_fields.py b/tests/python/test_repeated_control_fields.py new file mode 100644 index 0000000..5a236b1 --- /dev/null +++ b/tests/python/test_repeated_control_fields.py @@ -0,0 +1,303 @@ +""" +Tests for repeated control fields (GitHub issue #77). + +MARC records can have repeated control fields, particularly 006 and 007. +This test suite verifies that mrrc correctly preserves all instances of +repeated control fields through parsing, access, serialization, and +round-tripping. + +The bug: control fields were stored in IndexMap, causing +later values to overwrite earlier ones for the same tag. + +Reference record: NLS catalog BRC01945 +https://search.nlscatalog.loc.gov/instances/f851ab51-0ca0-516a-8871-002de5d575b3 +""" + +import io +import pytest +import mrrc + +try: + import pymarc + HAS_PYMARC = True +except ImportError: + HAS_PYMARC = False + +requires_pymarc = pytest.mark.skipif( + not HAS_PYMARC, reason="pymarc not installed" +) + + +# ============================================================================ +# Fixture builders — mrrc-only (no pymarc dependency) +# ============================================================================ + +def _build_record_with_two_007s_mrrc() -> bytes: + """Build a MARC record with two 007 fields using mrrc, return as bytes. + + Same content as the NLS catalog record (BRC01945) which has both a + computer resource 007 and a microform 007. + """ + record = mrrc.Record() + record.add_field(mrrc.Field("001", data="1118690")) + record.add_field(mrrc.Field("007", data="cr|nn ||||||aa")) + record.add_field(mrrc.Field("007", data="fb|a bnnnn")) + record.add_field(mrrc.Field("008", data="230327s2023 wau ef 000 1 eng d")) + f245 = mrrc.Field("245", "1", "0") + f245.add_subfield("a", "Test title") + record.add_field(f245) + output = io.BytesIO() + writer = mrrc.MARCWriter(output) + writer.write_record(record) + writer.close() + return output.getvalue() + + +def _build_record_with_repeated_006_007_mrrc() -> bytes: + """Build a MARC record with repeated 006 AND 007 fields using mrrc.""" + record = mrrc.Record() + record.add_field(mrrc.Field("001", data="9999999")) + record.add_field(mrrc.Field("006", data="m o d ")) + record.add_field(mrrc.Field("006", data="j s n ")) + record.add_field(mrrc.Field("007", data="cr |||||||||||")) + record.add_field(mrrc.Field("007", data="sd fsngnnmmned")) + record.add_field(mrrc.Field("007", data="vf cbahos")) + record.add_field(mrrc.Field("008", data="230327s2023 wau ef 000 1 eng d")) + f245 = mrrc.Field("245", "1", "0") + f245.add_subfield("a", "Multi-format item") + record.add_field(f245) + output = io.BytesIO() + writer = mrrc.MARCWriter(output) + writer.write_record(record) + writer.close() + return output.getvalue() + + +# ============================================================================ +# Fixture builders — pymarc (for parity tests) +# ============================================================================ + +def _build_record_with_two_007s_pymarc() -> bytes: + """Build a MARC record with two 007 fields using pymarc, return as bytes.""" + record = pymarc.Record() + record.add_field(pymarc.Field(tag="001", data="1118690")) + record.add_field(pymarc.Field(tag="007", data="cr|nn ||||||aa")) + record.add_field(pymarc.Field(tag="007", data="fb|a bnnnn")) + record.add_field( + pymarc.Field( + tag="008", + data="230327s2023 wau ef 000 1 eng d", + ) + ) + record.add_field( + pymarc.Field( + tag="245", + indicators=["1", "0"], + subfields=[pymarc.Subfield(code="a", value="Test title")], + ) + ) + return record.as_marc() + + +def _build_record_with_repeated_006_007_pymarc() -> bytes: + """Build a MARC record with repeated 006 AND 007 fields using pymarc.""" + record = pymarc.Record() + record.add_field(pymarc.Field(tag="001", data="9999999")) + record.add_field(pymarc.Field(tag="006", data="m o d ")) + record.add_field(pymarc.Field(tag="006", data="j s n ")) + record.add_field(pymarc.Field(tag="007", data="cr |||||||||||")) + record.add_field(pymarc.Field(tag="007", data="sd fsngnnmmned")) + record.add_field(pymarc.Field(tag="007", data="vf cbahos")) + record.add_field( + pymarc.Field( + tag="008", + data="230327s2023 wau ef 000 1 eng d", + ) + ) + record.add_field( + pymarc.Field( + tag="245", + indicators=["1", "0"], + subfields=[pymarc.Subfield(code="a", value="Multi-format item")], + ) + ) + return record.as_marc() + + +# ============================================================================ +# mrrc-only tests (always run, no pymarc dependency) +# ============================================================================ + +class TestMrrcRepeatedControlFields: + """Test that mrrc preserves repeated control fields through parsing.""" + + def test_mrrc_returns_two_007s(self): + """mrrc must return both 007 fields from a parsed record.""" + marc_bytes = _build_record_with_two_007s_mrrc() + reader = mrrc.MARCReader(io.BytesIO(marc_bytes)) + record = reader.read_record() + + fields_007 = record.get_fields("007") + assert len(fields_007) == 2, ( + f"Expected 2 007 fields, got {len(fields_007)}: " + f"{[f.data for f in fields_007]}" + ) + assert fields_007[0].data == "cr|nn ||||||aa" + assert fields_007[1].data == "fb|a bnnnn" + + def test_mrrc_returns_repeated_006_and_007(self): + """mrrc must return all repeated 006 and 007 fields.""" + marc_bytes = _build_record_with_repeated_006_007_mrrc() + reader = mrrc.MARCReader(io.BytesIO(marc_bytes)) + record = reader.read_record() + + fields_006 = record.get_fields("006") + assert len(fields_006) == 2, ( + f"Expected 2 006 fields, got {len(fields_006)}" + ) + + fields_007 = record.get_fields("007") + assert len(fields_007) == 3, ( + f"Expected 3 007 fields, got {len(fields_007)}" + ) + + def test_mrrc_add_field_preserves_repeated_control_fields(self): + """Adding multiple control fields with the same tag must preserve all.""" + record = mrrc.Record() + record.add_field(mrrc.Field("007", data="cr|nn ||||||aa")) + record.add_field(mrrc.Field("007", data="fb|a bnnnn")) + + fields_007 = record.get_fields("007") + assert len(fields_007) == 2 + assert fields_007[0].data == "cr|nn ||||||aa" + assert fields_007[1].data == "fb|a bnnnn" + + +class TestMrrcRepeatedControlFieldRoundTrip: + """Test round-tripping records with repeated control fields.""" + + def test_roundtrip_preserves_two_007s(self): + """Serialize then parse must preserve repeated 007 fields.""" + marc_bytes = _build_record_with_two_007s_mrrc() + reader = mrrc.MARCReader(io.BytesIO(marc_bytes)) + record = reader.read_record() + + # Re-serialize + output = io.BytesIO() + writer = mrrc.MARCWriter(output) + writer.write_record(record) + writer.close() + + # Re-parse + reader2 = mrrc.MARCReader(io.BytesIO(output.getvalue())) + record2 = reader2.read_record() + + fields_007 = record2.get_fields("007") + assert len(fields_007) == 2 + assert fields_007[0].data == "cr|nn ||||||aa" + assert fields_007[1].data == "fb|a bnnnn" + + def test_roundtrip_preserves_leader_length(self): + """Record length in leader must account for all control fields.""" + marc_bytes = _build_record_with_two_007s_mrrc() + original_length = int(marc_bytes[:5].decode("ascii")) + + reader = mrrc.MARCReader(io.BytesIO(marc_bytes)) + record = reader.read_record() + + output = io.BytesIO() + writer = mrrc.MARCWriter(output) + writer.write_record(record) + writer.close() + + roundtrip_bytes = output.getvalue() + roundtrip_length = int(roundtrip_bytes[:5].decode("ascii")) + + assert roundtrip_length == original_length, ( + f"Round-trip leader length {roundtrip_length} != original " + f"{original_length} — likely a repeated control field was dropped" + ) + + def test_all_fields_returns_all_control_fields(self): + """get_fields() with no args must include all repeated control fields.""" + marc_bytes = _build_record_with_two_007s_mrrc() + + reader = mrrc.MARCReader(io.BytesIO(marc_bytes)) + record = reader.read_record() + + all_fields = record.get_fields() + tags_007 = [f for f in all_fields if f.tag == "007"] + assert len(tags_007) == 2 + + +# ============================================================================ +# Pymarc parity tests (skipped when pymarc is not installed) +# ============================================================================ + +@requires_pymarc +class TestPymarcBaselineRepeatedControlFields: + """Verify pymarc itself handles repeated control fields correctly. + + These tests establish the expected behavior that mrrc must match. + """ + + def test_pymarc_returns_two_007s(self): + """Baseline: pymarc returns both 007 fields.""" + marc_bytes = _build_record_with_two_007s_pymarc() + reader = pymarc.MARCReader(io.BytesIO(marc_bytes)) + record = next(reader) + + fields_007 = record.get_fields("007") + assert len(fields_007) == 2 + assert fields_007[0].data == "cr|nn ||||||aa" + assert fields_007[1].data == "fb|a bnnnn" + + def test_pymarc_returns_repeated_006_and_007(self): + """Baseline: pymarc returns all repeated 006 and 007 fields.""" + marc_bytes = _build_record_with_repeated_006_007_pymarc() + reader = pymarc.MARCReader(io.BytesIO(marc_bytes)) + record = next(reader) + + fields_006 = record.get_fields("006") + assert len(fields_006) == 2 + + fields_007 = record.get_fields("007") + assert len(fields_007) == 3 + + +@requires_pymarc +class TestMrrcPymarcControlFieldParity: + """Direct comparison between pymarc and mrrc output for repeated fields.""" + + def test_get_fields_returns_same_count(self): + """get_fields('007') must return same count in pymarc and mrrc.""" + marc_bytes = _build_record_with_two_007s_pymarc() + + pymarc_reader = pymarc.MARCReader(io.BytesIO(marc_bytes)) + pymarc_record = next(pymarc_reader) + + mrrc_reader = mrrc.MARCReader(io.BytesIO(marc_bytes)) + mrrc_record = mrrc_reader.read_record() + + pymarc_007s = pymarc_record.get_fields("007") + mrrc_007s = mrrc_record.get_fields("007") + + assert len(mrrc_007s) == len(pymarc_007s), ( + f"mrrc returned {len(mrrc_007s)} 007 fields but pymarc returned " + f"{len(pymarc_007s)}" + ) + + def test_get_fields_returns_same_values(self): + """get_fields('007') values must match between pymarc and mrrc.""" + marc_bytes = _build_record_with_two_007s_pymarc() + + pymarc_reader = pymarc.MARCReader(io.BytesIO(marc_bytes)) + pymarc_record = next(pymarc_reader) + + mrrc_reader = mrrc.MARCReader(io.BytesIO(marc_bytes)) + mrrc_record = mrrc_reader.read_record() + + pymarc_values = [f.data for f in pymarc_record.get_fields("007")] + mrrc_values = [f.data for f in mrrc_record.get_fields("007")] + + assert mrrc_values == pymarc_values diff --git a/tests/record_builder_generic.rs b/tests/record_builder_generic.rs index 6c43958..5484332 100644 --- a/tests/record_builder_generic.rs +++ b/tests/record_builder_generic.rs @@ -100,11 +100,18 @@ fn test_generic_builder_multiple_control_fields() { } #[test] -fn test_generic_builder_overwrite_field() { +fn test_generic_builder_repeated_control_field() { let record = GenericRecordBuilder::new(Record::new(make_leader())) - .control_field("001", "first_value") - .control_field("001", "second_value") // Overwrite + .control_field("007", "cr|nn ||||||aa") + .control_field("007", "fb|a bnnnn") .build(); - assert_eq!(record.get_control_field("001"), Some("second_value")); + // get_control_field returns the first value + assert_eq!(record.get_control_field("007"), Some("cr|nn ||||||aa")); + + // Access all values directly via the public control_fields map + let values = record.control_fields.get("007").unwrap(); + assert_eq!(values.len(), 2); + assert_eq!(values[0], "cr|nn ||||||aa"); + assert_eq!(values[1], "fb|a bnnnn"); }