Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion docs/entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,34 @@ Five-tier matching against document metadata and content:

- Longer alias matches are preferred (e.g. "Kit Martin" matches before "Kit")
- Very short single names (<=3 chars, e.g. "Ed", "Jo") are skipped for content and title matching (still matched via tags)
- Single names 4+ chars (e.g. "Anders", "Wren") are matched in both content and title
- Single names 4+ chars (e.g. "Wren") are matched in both content and title
- File-stem aliases with hyphens (e.g. "dave-martin") are excluded from content matching

#### Bare ambiguous first names (#36)

A **bare single first name** (≥4 chars, e.g. "Alexandre", "Thomas", "Jean") that is
**ambiguous** — claimed by 2+ entities — does **not** auto-link on its own in the
participant, title, or content tiers. It links only when the entity is *corroborated*
elsewhere in the same document by a stronger signal:

- a tag (`tagged`) or source-ref (`source_ref`) match,
- a multi-word / full-name match (`participant`, `title`, or `discussed`),
- an exact title-part match, or
- appearing in the document's frontmatter `attendees`.

Otherwise the bare match is dropped (it's a likely false positive — the wrong colleague).
**Unambiguous** bare first names (owned by exactly one entity) still link as before.

Ambiguity is computed by `build_first_name_index()`, which **accent-folds** names
(`Jérémy` → `jeremy`) so an accented name and its ASCII near-twin collide into the same
ambiguity bucket. Folding never broadens a title or content match: it only widens the
ambiguity set (making the matcher more conservative) and is used when comparing entity
names against the frontmatter `attendees`. So a bare ASCII "Jeremy" that could be
either "Jérémy Cotineau" or "Jeremy Brown" is gated until one of them is corroborated.

The indexer passes the document's `attendees` and a once-built `name_owners` index into
`find_entity_mentions`. A genuinely wrong link that survives anyway can be removed with
`kbx entity unlink` (see §Suppressions).

## Suppressions (manual unlink/relink)

Automatic linking is heuristic, so it occasionally produces a false positive (a bare
Expand Down
185 changes: 148 additions & 37 deletions src/kb/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import json
import re
import unicodedata
from pathlib import Path
from typing import TYPE_CHECKING, Any

Expand Down Expand Up @@ -589,6 +590,52 @@ def _boundary_pattern(escaped: str, name: str, flags: int = 0) -> re.Pattern[str
return re.compile(rf"\b{escaped}(?=\s|$|[^\w])", flags)


def _fold_name(text: str) -> str:
"""Lowercase + strip accents (NFKD → ASCII) for collision detection (#36).

Used only to *detect* ambiguity between an accented name and its ASCII near-twin
(e.g. "Jérémy" / "Jeremy"). Folding here only *widens* the set of entities a bare
name could refer to — making the matcher more conservative — never broadening a match.
"""
return unicodedata.normalize("NFKD", text.lower()).encode("ascii", "ignore").decode("ascii")


def _is_gateable_single_name(name: str) -> bool:
"""True for a single-word personal first name that auto-links via content/title today.

These are the matches #36 gates on ambiguity. Mirrors the inclusion rules in
``_build_name_patterns``: excludes file-stem aliases, short all-caps abbreviations,
common English words used as team names, ``src:`` IDs, and names < 4 chars (already
skipped for content matching).
"""
if len(name.split()) != 1:
return False
if len(name) < 4:
return False
if "-" in name and name == name.lower():
return False # file stem (e.g. "dave-kowalski")
if name.isupper() and len(name) <= 4:
return False # abbreviation (e.g. "GG")
if name.lower() in _COMMON_WORDS:
return False
return not name.startswith("src:")


def build_first_name_index(entities: list[Entity]) -> dict[str, set[int]]:
    """Build the folded-name → owning-entity-ids index used for ambiguity checks (#36).

    For every entity, its ``name`` and each of its ``aliases`` is considered; those
    accepted by ``_is_gateable_single_name`` are keyed by ``_fold_name`` and the
    entity's ``id`` is recorded under that key. A key whose set holds 2+ ids is an
    *ambiguous* bare name. Intended to be built once per entity set and handed to
    ``find_entity_mentions``.
    """
    index: dict[str, set[int]] = {}
    for ent in entities:
        candidates = [ent.name]
        candidates.extend(ent.aliases)
        for candidate in candidates:
            if not _is_gateable_single_name(candidate):
                continue
            key = _fold_name(candidate)
            if key not in index:
                index[key] = set()
            index[key].add(ent.id)
    return index


def _build_name_patterns(entity: Entity) -> list[tuple[re.Pattern[str], int]]:
"""Build regex patterns for matching entity names/aliases in content.

Expand Down Expand Up @@ -670,6 +717,8 @@ def find_entity_mentions(
*,
cached_patterns: list[tuple[re.Pattern[str], Entity]] | None = None,
suppressed_ids: set[int] | None = None,
attendees: list[str] | None = None,
name_owners: dict[str, set[int]] | None = None,
) -> list[EntityMention]:
"""Find entity mentions in a document's metadata and content.

Expand All @@ -682,12 +731,24 @@ def find_entity_mentions(
Pass cached_patterns (from build_entity_patterns()) to avoid recompiling
regex patterns on every call. If None, patterns are built on the fly.

Disambiguation: prefer longer alias matches. If ambiguous (e.g. "Anders"),
match all possible entities.
First-name disambiguation (#36): a *bare* single first name that is **ambiguous**
(claimed by 2+ entities — accent-folded, so "Jérémy"/"Jeremy" collide) does not
auto-link on its own. It only links if the entity is *corroborated* in the same
document — by a tag, title participant, source-ref, full-name match, or by appearing
in ``attendees``. Unambiguous bare first names still link as before. Pass
``name_owners`` (from build_first_name_index()) to skip rebuilding the ambiguity map.
"""
mentions: list[EntityMention] = []
seen: set[tuple[int, str]] = set()
_suppressed = suppressed_ids or set()
if name_owners is None:
name_owners = build_first_name_index(entities)

# Entities anchored by a strong signal in this doc. A bare ambiguous first name
# only links if its entity ends up here (#36). Built incrementally as we go.
corroborated: set[int] = set()
# Deferred bare-ambiguous matches, resolved once all strong matches are known.
pending: list[tuple[int, str]] = []

def _add(entity_id: int, mention_type: str) -> None:
if entity_id in _suppressed:
Expand All @@ -697,70 +758,120 @@ def _add(entity_id: int, mention_type: str) -> None:
seen.add(key)
mentions.append(EntityMention(entity_id=entity_id, mention_type=mention_type))

# 1. Tag matching
def _add_strong(entity_id: int, mention_type: str) -> None:
"""Add a mention and corroborate the entity (rescues its bare-name matches)."""
_add(entity_id, mention_type)
corroborated.add(entity_id)

def _is_ambiguous_bare(name: str) -> bool:
"""A bare first name claimed by 2+ entities (accent-folded) — #36."""
return _is_gateable_single_name(name) and len(name_owners.get(_fold_name(name), ())) >= 2

# 1. Tag matching — explicit annotation, always strong
for tag in tags:
tag_lower = tag.lower().strip()
if not tag_lower:
continue
for entity in entities:
all_names = [entity.name.lower()] + [a.lower() for a in entity.aliases]
if tag_lower in all_names:
_add(entity.id, "tagged")
_add_strong(entity.id, "tagged")

# 3.5. Source ID matching — unambiguous, case-sensitive substring check; strong
for entity in entities:
for alias in entity.aliases:
if not alias.startswith("src:"):
continue
# Partial match: tag is a first name or short form
for name in all_names:
if tag_lower == name:
_add(entity.id, "tagged")
break

# 2. Title participant parsing
# Split on common separators: " / ", " x ", " & ", " vs "
source_id = alias[4:] # strip "src:" prefix
if source_id in content:
_add_strong(entity.id, "source_ref")
break # one source_ref match per entity is enough

# Attendees corroborate (no mention emitted) — #36 context from frontmatter.
if attendees:
attendee_folded = {_fold_name(a) for a in attendees if a}
for entity in entities:
candidate = {_fold_name(entity.name), *(_fold_name(a) for a in entity.aliases)}
if candidate & attendee_folded:
corroborated.add(entity.id)

# 2. Title participant parsing — split on " / ", " x ", " & ", " vs "
parts = re.split(r"\s+/\s+|\s+x\s+|\s+&\s+|\s+vs\s+", title, flags=re.IGNORECASE)
for part in parts:
part = part.strip()
if not part:
continue
part_lower = part.lower()
for entity in entities:
all_names = [entity.name.lower()] + [a.lower() for a in entity.aliases]
if part_lower in all_names or any(
re.search(rf"\b{re.escape(n)}\b", part_lower)
for n in all_names
if len(n) >= 4 # skip very short names for substring matching
):
_add(entity.id, "participant")

# 3. Title substring matching — catch names embedded in the title
# e.g. "Anders Sync Notes", "Wren 1:1", "Helix Refactor Review"
matched_strong = False
matched_bare_ambiguous = False
for name in [entity.name, *entity.aliases]:
n_lower = name.lower()
is_exact = part_lower == n_lower
is_substr = len(n_lower) >= 4 and (
re.search(rf"\b{re.escape(n_lower)}\b", part_lower) is not None
)
if not (is_exact or is_substr):
continue
if _is_ambiguous_bare(name):
matched_bare_ambiguous = True
continue # keep looking for a stronger name for this entity
# Exact whole-part match or a multi-word name corroborates; an
# unambiguous bare substring just links.
if is_exact or len(name.split()) >= 2:
_add_strong(entity.id, "participant")
else:
_add(entity.id, "participant")
matched_strong = True
break
if matched_bare_ambiguous and not matched_strong:
pending.append((entity.id, "participant"))

# 3. Title substring matching — names embedded in the title (e.g. "Wren 1:1")
title_lower = title.lower()
for entity in entities:
all_names = [entity.name, *list(entity.aliases)]
for name in all_names:
matched_strong = False
matched_bare_ambiguous = False
for name in [entity.name, *list(entity.aliases)]:
# Skip very short names and file-stem aliases
if len(name) <= 3:
continue
if "-" in name and name == name.lower():
continue
if re.search(rf"\b{re.escape(name)}\b", title_lower, re.IGNORECASE):
_add(entity.id, "title")
break # one match per entity is enough

# 3.5. Source ID matching — unambiguous, case-sensitive substring check
for entity in entities:
for alias in entity.aliases:
if not alias.startswith("src:"):
if not re.search(rf"\b{re.escape(name)}\b", title_lower, re.IGNORECASE):
continue
source_id = alias[4:] # strip "src:" prefix
if source_id in content:
_add(entity.id, "source_ref")
break # one source_ref match per entity is enough
if _is_ambiguous_bare(name):
matched_bare_ambiguous = True
continue # keep looking for a stronger name for this entity
if len(name.split()) >= 2:
_add_strong(entity.id, "title")
else:
_add(entity.id, "title")
matched_strong = True
break # one strong match per entity is enough
if matched_bare_ambiguous and not matched_strong:
pending.append((entity.id, "title"))

# 4. Content name matching with disambiguation
if cached_patterns is None:
cached_patterns = build_entity_patterns(entities)

for pattern, entity in cached_patterns:
if pattern.search(content):
m = pattern.search(content)
if m is None:
continue
matched = m.group(0)
if len(matched.split()) == 1 and len(name_owners.get(_fold_name(matched), ())) >= 2:
pending.append((entity.id, "discussed")) # bare ambiguous — defer
continue
if len(matched.split()) >= 2:
_add_strong(entity.id, "discussed")
else:
_add(entity.id, "discussed")

# Resolve deferred bare-ambiguous matches: link only if corroborated elsewhere.
for entity_id, mention_type in pending:
if entity_id in corroborated:
_add(entity_id, mention_type)

return mentions
7 changes: 7 additions & 0 deletions src/kb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Entity,
EntityMention,
build_entity_patterns,
build_first_name_index,
find_entity_mentions,
load_entities,
seed_entities,
Expand Down Expand Up @@ -178,6 +179,8 @@ def index_all(
seed_entities(db, project_root)
entities = load_entities(db)
entity_patterns = build_entity_patterns(entities)
# Ambiguous-first-name index (#36): folded single name → owning entity ids.
name_owners = build_first_name_index(entities)

# Entity-link suppressions (#35): per-document "do not link entity X here", kept in a
# sidecar so they survive reindex + Granola sync. Resolve names → entity ids once.
Expand Down Expand Up @@ -322,13 +325,16 @@ def _flush_embeddings() -> None:
_suppressed_ids = {
entity_name_to_id[n] for n in _doc_suppressed if n in entity_name_to_id
}
_attendee_names = [a.get("name", "") for a in doc.attendees if a.get("name")]
mentions = find_entity_mentions(
doc.title,
doc.tags,
section_content,
entities,
cached_patterns=entity_patterns,
suppressed_ids=_suppressed_ids,
attendees=_attendee_names,
name_owners=name_owners,
)
entity_id_set = {m.entity_id for m in mentions}
result.entities_linked += len(mentions)
Expand Down Expand Up @@ -439,6 +445,7 @@ def _flush_embeddings() -> None:
if result.entities_created > 0:
entities = load_entities(db)
entity_patterns = build_entity_patterns(entities)
name_owners = build_first_name_index(entities)
if embedder:
embedder.release_gpu_memory()
gc.collect()
Expand Down
Loading