diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..0290ca7 --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,3 @@ +## 2026-05-27 - ElementTree.iterparse Memory Leaks and Overhead +**Learning:** `ElementTree.iterparse` keeps accumulating memory when you listen only for `end` events and call `.clear()` on child elements, because the `root` element still holds references to its children until the document finishes parsing. Also, extracting the local name from a namespaced tag with a custom helper (like `.rsplit()`) on every element adds overhead that grows linearly with the size of the XML. +**Action:** When parsing large XML files with `iterparse`, always request both the `start` and `end` events, grab the `root` element from the first `start` event, and call both `elem.clear()` and `root.clear()` on the `end` events. Use simple string checks like `tag.endswith("}segment") or tag == "segment"` to avoid the string-splitting overhead. diff --git a/verify_nzb.py b/verify_nzb.py index 953dccd..9d5d3ce 100644 --- a/verify_nzb.py +++ b/verify_nzb.py @@ -80,23 +80,21 @@ class MissingArticleError(NntpError): """The NNTP server does not have the requested article.""" -def _local_name(tag: str) -> str: - if "}" in tag: - return tag.rsplit("}", 1)[-1] - return tag - - def parse_nzb_message_ids(path: str | Path) -> Iterator[str]: """Yield message IDs from elements in an NZB file.""" with open(path, "rb") as handle: - for event, elem in ET.iterparse(handle, events=("end",)): - if _local_name(elem.tag) != "segment": - continue - text = (elem.text or "").strip() - if text: - yield text - elem.clear() + context = ET.iterparse(handle, events=("start", "end")) + _, root = next(context) + for event, elem in context: + if event == "end": + tag = elem.tag + if tag.endswith("}segment") or tag == "segment": + text = (elem.text or "").strip() + if text: + yield text + elem.clear() + root.clear() def normalize_message_id(message_id: str) -> str: