From 64d253a3c93c9bf9b5880a9a53da775dd63f6cec Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 27 May 2026 00:29:10 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Optimize=20XML=20iterparse?= =?UTF-8?q?=20to=20prevent=20memory=20leak=20and=20speed=20up=20tag=20matc?= =?UTF-8?q?hing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: xbmc4lyfe <273732874+xbmc4lyfe@users.noreply.github.com> --- .jules/bolt.md | 3 +++ verify_nzb.py | 24 +++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) create mode 100644 .jules/bolt.md diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..0290ca7 --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,3 @@ +## 2026-05-27 - ElementTree.iterparse Memory Leaks and Overhead +**Learning:** `ElementTree.iterparse` leaks memory when you listen only for `end` events and call `.clear()` on child elements, because the `root` element retains references to its children until the document finishes parsing. Also, extracting each element's local tag name with a custom helper (e.g. one built on `.rsplit()`) costs an extra function call and string split per element, so the overhead grows linearly with the size of the XML. +**Action:** For parsing large XML files using `iterparse`, always fetch the `start` and `end` events, grab the `root` element on the first `start` event, and call both `elem.clear()` and `root.clear()` on the `end` events. Use simple string checks like `tag.endswith("}segment") or tag == "segment"` to save string parsing overhead. 
diff --git a/verify_nzb.py b/verify_nzb.py index 953dccd..9d5d3ce 100644 --- a/verify_nzb.py +++ b/verify_nzb.py @@ -80,23 +80,21 @@ class MissingArticleError(NntpError): """The NNTP server does not have the requested article.""" -def _local_name(tag: str) -> str: - if "}" in tag: - return tag.rsplit("}", 1)[-1] - return tag - - def parse_nzb_message_ids(path: str | Path) -> Iterator[str]: """Yield message IDs from elements in an NZB file.""" with open(path, "rb") as handle: - for event, elem in ET.iterparse(handle, events=("end",)): - if _local_name(elem.tag) != "segment": - continue - text = (elem.text or "").strip() - if text: - yield text - elem.clear() + context = ET.iterparse(handle, events=("start", "end")) + _, root = next(context) + for event, elem in context: + if event == "end": + tag = elem.tag + if tag.endswith("}segment") or tag == "segment": + text = (elem.text or "").strip() + if text: + yield text + elem.clear() + root.clear() def normalize_message_id(message_id: str) -> str: