From 1669db1f00386141c1fee87b764b0260285b165c Mon Sep 17 00:00:00 2001 From: "clementine-oaklight[bot]" <290556973+clementine-oaklight[bot]@users.noreply.github.com> Date: Thu, 11 Jun 2026 21:47:19 -0500 Subject: [PATCH 1/2] readability: extend score propagation for deeply nested SPA pages Modern SPA-rendered pages (React, Vue, etc.) wrap content in 3-5 levels of wrapper divs. The original algorithm only propagated paragraph scores to parent (1x) and grandparent (0.5x), so the actual article container never accumulated enough score to be selected as best candidate. Changes: - Extend score propagation to 4 ancestor levels (1x, 0.5x, 0.33x, 0.25x) using a configurable _ANCESTOR_WEIGHTS list - Lower sibling inclusion threshold from 0.2 to 0.1 to capture more content sections when the best candidate is found This fixes extraction of documentation pages like Alibaba Cloud help docs where rendered HTML uses deeply nested div structures with no <p>
tags. Before: 97 chars extracted (single paragraph). After: 1118 chars (full article). Regression tested against GitHub Docs and large Alibaba Cloud ECS pages - traditional article extraction is unaffected. --- readability/readability.py | 49 +++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/readability/readability.py b/readability/readability.py index 4fb0c35..4d51197 100644 --- a/readability/readability.py +++ b/readability/readability.py @@ -695,11 +695,17 @@ def _transform_divs_to_paragraphs(self) -> None: # ── Scoring ────────────────────────────────────────────────────────── + # Maximum ancestor levels for score propagation. The original + # algorithm only propagated to parent (1x) and grandparent (0.5x). + # Modern SPA-rendered pages nest content 3-5 levels deep in wrapper + # divs, so we extend propagation with diminishing weights. + _ANCESTOR_WEIGHTS = [1.0, 0.5, 0.333, 0.25] + def _score_paragraphs(self) -> dict[int, dict[str, Any]]: """Score paragraph-like nodes and propagate to ancestors. Returns: - Dict mapping ``id(tag)`` → ``{"tag": tag, "score": float}``. + Dict mapping ``id(tag)`` -> ``{"tag": tag, "score": float}``. """ candidates: dict[int, dict[str, Any]] = {} @@ -708,21 +714,22 @@ def _score_paragraphs(self) -> dict[int, dict[str, Any]]: if len(inner_text) < MIN_PARAGRAPH_LENGTH: continue - parent = tag.parent - grandparent = parent.parent if parent is not None else None - - # Ensure parent is initialised. - if parent is not None and id(parent) not in candidates: - candidates[id(parent)] = { - "tag": parent, - "score": self._init_score(parent), - } - # Ensure grandparent is initialised. - if grandparent is not None and id(grandparent) not in candidates: - candidates[id(grandparent)] = { - "tag": grandparent, - "score": self._init_score(grandparent), - } + # Collect ancestors up to the propagation depth. 
+ ancestors: list[Any] = [] + cur = tag.parent + for _ in range(len(self._ANCESTOR_WEIGHTS)): + if cur is None or cur.name in ("html", "body", "[document]"): + break + ancestors.append(cur) + cur = cur.parent + + # Ensure each ancestor is initialised in the candidate map. + for anc in ancestors: + if id(anc) not in candidates: + candidates[id(anc)] = { + "tag": anc, + "score": self._init_score(anc), + } # Content score for this paragraph. inner_len = len(inner_text) @@ -730,11 +737,9 @@ def _score_paragraphs(self) -> dict[int, dict[str, Any]]: content_score += len(COMMAS_RE.findall(inner_text)) content_score += min(inner_len / 100.0, 3.0) - # Propagate to parent (full) and grandparent (half). - if parent is not None and id(parent) in candidates: - candidates[id(parent)]["score"] += content_score - if grandparent is not None and id(grandparent) in candidates: - candidates[id(grandparent)]["score"] += content_score / 2.0 + # Propagate to ancestors with diminishing weights. + for i, anc in enumerate(ancestors): + candidates[id(anc)]["score"] += content_score * self._ANCESTOR_WEIGHTS[i] # Scale scores by link density. for entry in candidates.values(): @@ -804,7 +809,7 @@ def _get_article( # Create an article wrapper. article = self._Tag("div") - sibling_threshold = max(10.0, best["score"] * 0.2) + sibling_threshold = max(10.0, best["score"] * 0.1) # If there's no parent, use the candidate itself. 
if parent is None: From ca0f8c9c8730a7a5ab13fb05999ddc4aaf2e99ea Mon Sep 17 00:00:00 2001 From: Peng Ding Date: Fri, 12 Jun 2026 16:33:09 -0500 Subject: [PATCH 2/2] fix: wrap long line in readability score propagation (E501) --- readability/readability.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/readability/readability.py b/readability/readability.py index 4d51197..9409d3e 100644 --- a/readability/readability.py +++ b/readability/readability.py @@ -739,7 +739,8 @@ def _score_paragraphs(self) -> dict[int, dict[str, Any]]: # Propagate to ancestors with diminishing weights. for i, anc in enumerate(ancestors): - candidates[id(anc)]["score"] += content_score * self._ANCESTOR_WEIGHTS[i] + weight = self._ANCESTOR_WEIGHTS[i] + candidates[id(anc)]["score"] += content_score * weight # Scale scores by link density. for entry in candidates.values():