Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 28 additions & 22 deletions readability/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,11 +695,17 @@ def _transform_divs_to_paragraphs(self) -> None:

# ── Scoring ──────────────────────────────────────────────────────────

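# Per-level weights for propagating a paragraph's score to its ancestors,
# nearest first (parent, grandparent, ...); the length of this list is the
# maximum number of ancestor levels that receive a share. The original
# algorithm only propagated to parent (1x) and grandparent (0.5x).
# Modern SPA-rendered pages nest content 3-5 levels deep in wrapper
# divs, so we extend propagation with diminishing weights.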
_ANCESTOR_WEIGHTS = [1.0, 0.5, 0.333, 0.25]

def _score_paragraphs(self) -> dict[int, dict[str, Any]]:
"""Score paragraph-like nodes and propagate to ancestors.

Returns:
Dict mapping ``id(tag)`` ``{"tag": tag, "score": float}``.
Dict mapping ``id(tag)`` -> ``{"tag": tag, "score": float}``.
"""
candidates: dict[int, dict[str, Any]] = {}

Expand All @@ -708,33 +714,33 @@ def _score_paragraphs(self) -> dict[int, dict[str, Any]]:
if len(inner_text) < MIN_PARAGRAPH_LENGTH:
continue

parent = tag.parent
grandparent = parent.parent if parent is not None else None

# Ensure parent is initialised.
if parent is not None and id(parent) not in candidates:
candidates[id(parent)] = {
"tag": parent,
"score": self._init_score(parent),
}
# Ensure grandparent is initialised.
if grandparent is not None and id(grandparent) not in candidates:
candidates[id(grandparent)] = {
"tag": grandparent,
"score": self._init_score(grandparent),
}
# Collect ancestors up to the propagation depth.
ancestors: list[Any] = []
cur = tag.parent
for _ in range(len(self._ANCESTOR_WEIGHTS)):
if cur is None or cur.name in ("html", "body", "[document]"):
break
ancestors.append(cur)
cur = cur.parent

# Ensure each ancestor is initialised in the candidate map.
for anc in ancestors:
if id(anc) not in candidates:
candidates[id(anc)] = {
"tag": anc,
"score": self._init_score(anc),
}

# Content score for this paragraph.
inner_len = len(inner_text)
content_score = 1.0
content_score += len(COMMAS_RE.findall(inner_text))
content_score += min(inner_len / 100.0, 3.0)

# Propagate to parent (full) and grandparent (half).
if parent is not None and id(parent) in candidates:
candidates[id(parent)]["score"] += content_score
if grandparent is not None and id(grandparent) in candidates:
candidates[id(grandparent)]["score"] += content_score / 2.0
# Propagate to ancestors with diminishing weights.
for i, anc in enumerate(ancestors):
weight = self._ANCESTOR_WEIGHTS[i]
candidates[id(anc)]["score"] += content_score * weight

# Scale scores by link density.
for entry in candidates.values():
Expand Down Expand Up @@ -804,7 +810,7 @@ def _get_article(

# Create an article wrapper.
article = self._Tag("div")
sibling_threshold = max(10.0, best["score"] * 0.2)
sibling_threshold = max(10.0, best["score"] * 0.1)

# If there's no parent, use the candidate itself.
if parent is None:
Expand Down
Loading