From 86870c0b75b5ba022fccbcf34848968906bcc097 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 07:08:18 +0000 Subject: [PATCH 1/4] bench_parse: record Python version and platform in output Parse timings are only comparable within one interpreter build, so print the Python version/implementation/platform (and WebSearcher version) at the top of every benchmark and profile run. --- scripts/bench_parse.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/bench_parse.py b/scripts/bench_parse.py index ba476d8..c941035 100644 --- a/scripts/bench_parse.py +++ b/scripts/bench_parse.py @@ -13,8 +13,10 @@ import cProfile import gc import logging +import platform import pstats import statistics +import sys import time from pathlib import Path @@ -72,6 +74,12 @@ def main( """Benchmark or profile parse_serp over the fixture corpus.""" logging.getLogger("WebSearcher").setLevel(logging.WARNING) + # Record the interpreter: timings are only comparable within one Python build. + typer.echo( + f"Python {platform.python_version()} " + f"({platform.python_implementation()}, {sys.platform}) | WebSearcher {ws.__version__}" + ) + paths = fixtures or sorted(FIXTURES_DIR.glob("serps-v*.json.bz2")) if not paths: typer.echo(f"No fixtures found in {FIXTURES_DIR}") From 6a4b1617431a300afafbaaba8e174c7550697775 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 07:31:21 +0000 Subject: [PATCH 2/4] optimize get_text with a native selectolax text() fast path The pure-Python get_text fragment walker was the largest optimizable cost in a fresh benchmark of the post-selectolax parse pipeline (~18% cumulative, 824k fragment visits/870 parses). Delegate to lexbor's C text() when it is provably byte-identical: the subtree has no script/style/template (native includes their text; the walker skips it) AND either separator=='' or strip is False (so native's kept empty fragments are invisible). Every other call keeps the walker. Verified byte-identical over the full fixture corpus (315k nodes, 0 mismatches) and the snapshot suite stays green without updates (336 passed, 87 snapshots). Back-to-back A/B on Python 3.13: corpus 3872 -> 3590 ms (-7.3%), median 39.9 -> 36.7 ms/SERP, well above the ~0.5% noise floor. Also record the interpreter version/platform at the top of every bench_parse run, since parse timings are only comparable within one Python build. --- CHANGELOG.md | 1 + WebSearcher/_slx.py | 16 ++++ docs/plans/035-get-text-native-fastpath.md | 95 ++++++++++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 docs/plans/035-get-text-native-fastpath.md diff --git a/CHANGELOG.md b/CHANGELOG.md index c67ce7b..79acbd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/). - Fixed a regression on *complementary* `kp-wholepage` panels — a music/entity panel that sits beside an intact organic column instead of collapsing it (e.g. `this is my life billy joel youtube`): the sub-column gate mis-keyed on `div.tF2Cxc` and dropped the 10 video organics, and the panel mistyped as `searches_related`/`or_give_general_feedback` from an inner "People also search for" heading. The collapse gate now recognizes any titled-link `div.g` (`h3` + `a[href]`) as organic, and both the header classifier and the knowledge sub-typer defer for `kp-wholepage-osrp` panels so they classify as `knowledge`/`panel`. The true collapsed case (`30 year mortgage rates`) is unaffected - Split bare-`tF2Cxc` organic bundles in `general` components into one result per organic (excluding People-Also-Ask sources), instead of collapsing them to a single result — recovers the organics on finance-style panels (e.g. an `aapl stock price` panel went from 1 to 9 organics) - Added `election_dates`, `election_results`, and `election_resources` component types for the election widgets embedded in whole-page election panels +- Optimized `get_text` (the most-called parse helper) with a native-`text()` fast path: when a subtree contains no `script`/`style`/`template` and the separator semantics permit, it delegates to selectolax's C `text()` instead of the pure-Python fragment walker. Byte-identical output (verified over 315k nodes in the fixture corpus; all 87 snapshots unchanged) for ~7% faster `parse_serp` (corpus 3872 -> 3590 ms, median 39.9 -> 36.7 ms/SERP back-to-back, Python 3.13) ## [0.8.6] - 2026-05-26 diff --git a/WebSearcher/_slx.py b/WebSearcher/_slx.py index efe4521..6672eea 100644 --- a/WebSearcher/_slx.py +++ b/WebSearcher/_slx.py @@ -106,6 +106,22 @@ def get_text(node: Node | None, separator: str = "", strip: bool = False) -> str + ``get_text``.""" if node is None: return None + # Fast path: selectolax's native C ``text()`` is byte-identical to the Python + # fragment walker below EXCEPT for two differences -- native includes + # script/style/template text (the walker skips those subtrees) and native + # keeps empty fragments (the ``strip=True`` walker drops them). Both vanish + # when the subtree holds no script/style/template AND either ``separator`` + # is empty (empties add nothing to a ""-join, so kept-vs-dropped is + # invisible) or ``strip`` is False (both keep empties identically). That + # covers ~95% of calls on this corpus; the one remaining case (``strip=True`` + # with a non-empty separator) keeps the Python walker. Verified + # byte-identical over the full fixture corpus (315k nodes). + if ( + (separator == "" or not strip) + and node.tag not in _SKIP_TEXT_TAGS + and node.css_first("script,style,template") is None + ): + return node.text(deep=True, separator=separator, strip=strip) frags = _iter_text_fragments(node) if strip: parts = [s for s in (f.strip() for f in frags) if s] diff --git a/docs/plans/035-get-text-native-fastpath.md b/docs/plans/035-get-text-native-fastpath.md new file mode 100644 index 0000000..dccf845 --- /dev/null +++ b/docs/plans/035-get-text-native-fastpath.md @@ -0,0 +1,95 @@ +--- +status: done +branch: claude/new-benchmark-PMegd +created: 2026-06-01 +--- + +# `get_text` native-`text()` fast path (post-selectolax benchmark) + +A fresh benchmark of the current parse pipeline (the first since the selectolax +native rewrite in plan 026 and the parser additions in plans 027-034) found that +the pure-Python `get_text` fragment walker had become the single largest +*optimizable* cost. This plan records that benchmark and a byte-identical fast +path that recovers ~7% of `parse_serp` latency. + +Methodology follows plan 023: per-SERP median + MAD, gate on the run-to-run noise +floor, and only trust **back-to-back same-session A/B** numbers (never chain +deltas across sessions or machines). + +## Environment + +`scripts/bench_parse.py` now records the interpreter at the top of every run +(`platform.python_version()` / implementation / platform + `WebSearcher` +version) -- parse timings are only comparable within one Python build, and the +repo's pinned `.python-version` (`3.14.0rc2`) currently can't import the package +(`pydantic 2.13.4` vs the 3.14 RC `typing._eval_type` signature). All numbers +below are **Python 3.13.12, CPython, linux**, fixture corpus +`tests/fixtures/serps.json.bz2` (87 SERPs). + +## Baseline benchmark (current HEAD) + +`bench_parse.py --iterations 10 --runs 3`: + +- median **39.5 ms/SERP**, MAD 13.6 ms; min 17.3 / p90 75.7 / max 115.7 ms +- corpus 3760.6 ms/pass; inter-run MAD 5.7 ms -> **noise floor ~0.3%** (idle box). + +cProfile (`--profile`, 870 parses, 52.6 s, tottime) top buckets: + +| Frame | self | nature | +|---|---|---| +| `make_soup` (lexbor parse) | 10.5 s (20%) | structural -- one parse/SERP, unavoidable | +| `_iter_text_fragments` (get_text walker) | 5.8 s; 9.6 s cum (**18%**) | pure-Python, hot -- the target | +| `_ComponentSignals.__init__` | 5.3 s; 6.9 s cum (13%) | pure-Python (one `css('*')` walk/component) | +| `_extract_from_html` (serp-features regex) | 2.0 s | | +| `_get_dom_positions` | 1.9 s | | + +`get_text` is called ~176x/parse (153,580 over 870), each walking a subtree via a +Python stack of `.iter()` generators -- 824k fragment visits, 5.4M `next()` calls. + +## The fast path + +Plan 026 flagged native selectolax `.text()` as the next lever but "unsafe" +because (a) native includes `script`/`style`/`template` text (the walker skips +those subtrees) and (b) native `strip=True` keeps empty fragments (the walker +drops them). Both differences are **observable only under specific conditions**, +and outside them native C `text()` is byte-identical: + +- (a) vanishes when the subtree has no `script`/`style`/`template`. +- (b) vanishes when `separator == ""` (an empty fragment adds nothing to a + `""`-join, so kept-vs-dropped is invisible) **or** `strip is False` (both keep + empties identically). + +So `get_text` delegates to `node.text(deep=True, separator=sep, strip=strip)` +when `(separator == "" or not strip)` and the subtree holds no +script/style/template (one `css_first("script,style,template")` C probe); every +other call keeps the Python walker. The only call signature that always stays on +the walker is `strip=True` with a non-empty separator (`get_text(x, " ", +strip=True)`, 38 sites) -- exactly the drop-empties-with-visible-separator case. + +**Correctness verification.** Over the full corpus (315,095 element nodes, 95.2% +fast-path-eligible) every fast-pathable signature -- `("", False)`, `("", True)`, +`(" ", False)`, `("<|>", False)` -- produced **0 mismatches** against the walker. +The snapshot suite stays green without updates (`uv run pytest`: 336 passed, 4 +skipped, **87 snapshots unchanged**). + +## Result (back-to-back A/B, same machine state) + +Stash/pop A/B with `--iterations 10 --runs 3`: + +| Metric | Baseline | Fast path | Delta | +|---|---|---|---| +| Corpus total | 3872.0 ms | 3590.3 ms | **-7.3%** | +| Per-SERP median | 39.9 ms | 36.7 ms | **-8.0%** | + +Far above the ~0.4-0.5% noise floor. Post-change profile: `_iter_text_fragments` +self 5.8 -> 2.2 s, cum 9.6 -> 3.5 s, fragment visits 824k -> 276k (the remainder +is the `strip=True`/non-empty-sep walker, script-bearing subtrees, and +`has_text`/`knowledge_box`); the displaced work moved into lexbor's C `text()`. + +## Left for follow-up + +`_ComponentSignals.__init__` is now the #2 pure-Python cost (~5 s self, ~13%): one +`css('*')` walk per component building class/id/tag presence sets. A shared scan +feeding both it and `_get_dom_positions`/`reorder_by_dom_position` (each of which +also `css('*')`-walks) is the next structural lever -- deferred to keep this +change small and byte-identical. From ea8a3becf337ba2226aa08b0d876b30927f4adb9 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 13:35:50 +0000 Subject: [PATCH 3/4] add plan 036: _ComponentSignals lever + extractor hot-path review --- ...component-signals-and-extractor-hotpath.md | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 docs/plans/036-component-signals-and-extractor-hotpath.md diff --git a/docs/plans/036-component-signals-and-extractor-hotpath.md b/docs/plans/036-component-signals-and-extractor-hotpath.md new file mode 100644 index 0000000..9af4979 --- /dev/null +++ b/docs/plans/036-component-signals-and-extractor-hotpath.md @@ -0,0 +1,96 @@ +--- +status: proposed +branch: TBD +created: 2026-06-01 +--- + +# `_ComponentSignals` consolidation + extractor hot-path review + +Follow-up to [plan 035](035-get-text-native-fastpath.md). With the `get_text` +native fast path banked (~7% off `parse_serp`), the profile's #2 *optimizable* +cost is now `classifiers/main.py:_ComponentSignals.__init__`, and the extractor +phase is the largest unprofiled-in-detail bucket worth a pass. This plan scopes +both. Same methodology as plans 023/035: per-SERP median + MAD, gate on the +run-to-run noise floor (~0.3-0.5% on the current idle box), and trust only +**back-to-back same-session A/B** numbers. + +## Current baseline (Python 3.13.12, 87-SERP corpus) + +From plan 035's profile (`bench_parse.py --profile`, 870 parses, post-fast-path): + +| Frame | self | nature | +|---|---|---| +| `make_soup` (lexbor parse) | 10.1 s (~20%) | structural -- one parse/SERP | +| `_ComponentSignals.__init__` | 5.1 s; 6.7 s cum (**~13%**) | pure-Python, **this plan** | +| `_iter_text_fragments` (residual walker) | 2.2 s; 3.5 s cum | already optimized in 035 | +| `_get_dom_positions` | 1.5 s | full-document `css('*')` walk | +| `_ai_overview_payloads._iter_payload_blobs` | 1.6 s; 2.1 s cum | recent addition | +| `is_valid` (`extractor_main.py:568`) | 1.0 s; 1.8 s cum | extraction | +| `extract_from_standard` (`extractor_main.py:333`) | 0.8 s; 2.4 s cum | extraction | + +## Lever 1: `_ComponentSignals` (primary) + +`ClassifyMain.classify` builds a `_ComponentSignals` per main component -- one +`cmpt.css('*')` descendant walk that fills three sets (class tokens, ids, tag +names), feeding the necessary-signal preconditions on the classifier chain +(plan 023 item 3a). It is called ~13x/parse (11,390 over 870 parses) and is now +~13% of parse time. The cost is the per-element Python loop: +`set.update(cls.split())` (1.67M updates), `set.add(tag)` (2.47M adds), +`str.split` (1.72M), `el.attrs.get`/`el.id`/`el.tag` per element. + +Candidate directions (each must stay byte-identical -- preconditions are +*necessary* conditions, so any change must not drop a real signal; pin with the +87-snapshot suite, no updates): + +1. **Build only the signals the chain actually consults.** The `names` set is + queried for ~8 custom-element tags (`g-scrolling-carousel`, `g-tray-header`, + `block-component`, `h2`, `promo-throttler`, `product-viewer-group`, + `g-more-link`), yet every element's tag is added (2.47M adds). The `ids` set + is queried for a similarly small fixed set. Restricting `names`/`ids` to a + precomputed interest set (membership-test on add) trades 2.47M unconditional + adds for 2.47M cheap `in` checks against a small frozenset -- measure whether + that nets out, since the `in` check isn't free either. Classes are broadly + consulted and likely must stay full. +2. **Lexbor-side signal extraction.** Investigate whether a small number of + targeted `css_first(...)` C probes for the gated signals beat one Python + `css('*')` walk that materializes three sets -- i.e. revisit whether the 3a + "presence set" is still the right shape now that the walker (not `find` + misses) is the cost. This is the inverse of the 023 decision and must be + re-measured, not assumed. +3. **Share one document walk.** `_get_dom_positions` (1.5 s) already walks the + whole document with `css('*')`, and `reorder_by_dom_position._range` walks + each main-component subtree with `css('*')` again. A single document walk that + yields both the position map and per-component signal sets would remove + redundant traversals -- but it couples currently-independent phases and risks + the byte-identical contract; scope carefully and gate hard. + +Recommended first step: option 1 (lowest risk, local to `_ComponentSignals`), +A/B it, then decide whether option 3's shared walk is worth the coupling. + +## Lever 2: extractor hot-path review (investigate) + +`ExtractorMain` (`extract_from_standard`, `get_layout`, `is_valid`, +`_ads_bottom`) is the largest phase after `make_soup` once classify is addressed, +and it grew ~648 lines since plan 023 (new layouts, kp-wholepage sub-columns, +complementary panels). It has not had a dedicated profiling pass on the +selectolax backend. Worth investigating: + +- `is_valid` (1.8 s cum, 25,460 calls) runs per candidate component -- re-check + the bad-label text scan and survey-throttler probe on lexbor nodes (the 023 + bounds were tuned for bs4). +- `extract_from_standard` / the `_StandardLayout` dispatch -- look for repeated + `css`/`css_first` over the same subtrees across layout detection and block + collection that could be hoisted or shared. +- `_iter_payload_blobs` (ai_overview, 2.1 s cum) -- a recent addition; confirm it + isn't re-walking payload subtrees. + +No commitment yet -- this lever is a profiling/scoping task that may or may not +surface a gateable win. Capture a `--profile-sort cumulative` split by phase +(as plan 023 did) before touching extractor code. + +## Verification gate (per change) + +- `uv run pytest` -- 87 snapshots green **without updates**, full suite passing. +- Back-to-back A/B of `scripts/bench_parse.py` over the fixture corpus, same + session, clearing the noise floor; record numbers in this plan's Log. +- `ruff check` / `ruff format --check` clean. From e8564c46472d12d6aa3b91cec5deefb81a787fa6 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 14:10:58 +0000 Subject: [PATCH 4/4] clarify get_text fast-path comment: walker skips script/style/template only as descendants Addresses a PR review note -- the Python walker skips those tags only when they are descendants, not when the root node is itself one, which is why the fast path needs both the node.tag guard and the css_first descendant probe. --- WebSearcher/_slx.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/WebSearcher/_slx.py b/WebSearcher/_slx.py index 6672eea..75fd9d6 100644 --- a/WebSearcher/_slx.py +++ b/WebSearcher/_slx.py @@ -108,12 +108,15 @@ def get_text(node: Node | None, separator: str = "", strip: bool = False) -> str return None # Fast path: selectolax's native C ``text()`` is byte-identical to the Python # fragment walker below EXCEPT for two differences -- native includes - # script/style/template text (the walker skips those subtrees) and native - # keeps empty fragments (the ``strip=True`` walker drops them). Both vanish - # when the subtree holds no script/style/template AND either ``separator`` - # is empty (empties add nothing to a ""-join, so kept-vs-dropped is - # invisible) or ``strip`` is False (both keep empties identically). That - # covers ~95% of calls on this corpus; the one remaining case (``strip=True`` + # script/style/template text and native keeps empty fragments (the + # ``strip=True`` walker drops them). On the script/style/template side, the + # walker skips those tags only as *descendants* (``_iter_text_fragments`` + # does not special-case a root that is itself one), so two guards are needed: + # ``node.tag not in _SKIP_TEXT_TAGS`` (root) and the ``css_first`` probe + # (descendants). The empty-fragment difference vanishes when ``separator`` is + # empty (empties add nothing to a ""-join, so kept-vs-dropped is invisible) + # or ``strip`` is False (both keep empties identically). Together these + # cover ~95% of calls on this corpus; the one remaining case (``strip=True`` # with a non-empty separator) keeps the Python walker. Verified # byte-identical over the full fixture corpus (315k nodes). if (