From f921d4fac668d9720926a0d7ca2ca96b1d9d6253 Mon Sep 17 00:00:00 2001 From: Jeremy Brown Date: Thu, 25 Jun 2026 14:19:35 +0200 Subject: [PATCH] feat(entities): gate ambiguous bare first names on corroboration (refs #36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bare common first names (≥4 chars, e.g. "Alexandre", "Thomas", "Jean") were auto-linked anywhere they appeared in a title or content, so any occurrence attached to whichever entity carried that name — frequently the wrong colleague, and worse for accented names whose ASCII near-twins collide. A bare single first name that is *ambiguous* (claimed by 2+ entities) now links only when its entity is corroborated in the same document — by a tag, source ref, full-name match, an exact title-part, or by appearing in the frontmatter attendees. Unambiguous bare names still link as before. Ambiguity is computed by build_first_name_index(), which accent-folds names so "Jérémy"/"Jeremy" share a bucket; folding only *widens* the ambiguity set (more conservative) and never broadens a match. The indexer threads the document's attendees and a once-built name_owners index into find_entity_mentions. --- docs/entities.md | 27 ++++- src/kb/entities.py | 185 ++++++++++++++++++++++++++++------- src/kb/indexer.py | 7 ++ tests/test_disambiguation.py | 94 ++++++++++++++++++ tests/test_entities.py | 141 +++++++++++++++++++++++--- 5 files changed, 403 insertions(+), 51 deletions(-) create mode 100644 tests/test_disambiguation.py diff --git a/docs/entities.md b/docs/entities.md index 52f63a6..04d630d 100644 --- a/docs/entities.md +++ b/docs/entities.md @@ -108,9 +108,34 @@ Five-tier matching against document metadata and content: - Longer alias matches are preferred (e.g. "Kit Martin" matches before "Kit") - Very short single names (<=3 chars, e.g. "Ed", "Jo") are skipped for content and title matching (still matched via tags) -- Single names 4+ chars (e.g. 
"Anders", "Wren") are matched in both content and title +- Single names 4+ chars (e.g. "Wren") are matched in both content and title (ambiguous bare first names are gated — see below) - File-stem aliases with hyphens (e.g. "dave-martin") are excluded from content matching +#### Bare ambiguous first names (#36) + +A **bare single first name** (≥4 chars, e.g. "Alexandre", "Thomas", "Jean") that is +**ambiguous** — claimed by 2+ entities — does **not** auto-link on its own in the +participant, title, or content tiers. It links only when the entity is *corroborated* +elsewhere in the same document by a stronger signal: + +- a tag (`tagged`) or source-ref (`source_ref`) match, +- a multi-word / full-name match (`participant`, `title`, or `discussed`), +- an exact title-part match, or +- appearing in the document's frontmatter `attendees`. + +Otherwise the bare match is dropped (it's a likely false positive — the wrong colleague). +**Unambiguous** bare first names (owned by exactly one entity) still link as before. + +Ambiguity is computed by `build_first_name_index()`, which **accent-folds** names +(`Jérémy` → `jeremy`) so an accented name and its ASCII near-twin collide into the same +ambiguity bucket. Folding is used to widen the ambiguity set (making the matcher +more conservative) and to compare frontmatter `attendees` with entity names for corroboration — it never creates a link on its own. So a bare ASCII "Jeremy" that could be +either "Jérémy Cotineau" or "Jeremy Brown" is gated until one of them is corroborated. + +The indexer passes the document's `attendees` and a once-built `name_owners` index into +`find_entity_mentions`. A genuinely wrong link that survives anyway can be removed with +`kbx entity unlink` (see §Suppressions).
+ ## Suppressions (manual unlink/relink) Automatic linking is heuristic, so it occasionally produces a false positive (a bare diff --git a/src/kb/entities.py b/src/kb/entities.py index ad32121..53f2cb9 100644 --- a/src/kb/entities.py +++ b/src/kb/entities.py @@ -4,6 +4,7 @@ import json import re +import unicodedata from pathlib import Path from typing import TYPE_CHECKING, Any @@ -589,6 +590,52 @@ def _boundary_pattern(escaped: str, name: str, flags: int = 0) -> re.Pattern[str return re.compile(rf"\b{escaped}(?=\s|$|[^\w])", flags) +def _fold_name(text: str) -> str: + """Lowercase + strip accents (NFKD → ASCII) for collision detection (#36). + + Used only to *detect* ambiguity between an accented name and its ASCII near-twin + (e.g. "Jérémy" / "Jeremy"). Folding here only *widens* the set of entities a bare + name could refer to — making the matcher more conservative — never broadening a match. + """ + return unicodedata.normalize("NFKD", text.lower()).encode("ascii", "ignore").decode("ascii") + + +def _is_gateable_single_name(name: str) -> bool: + """True for a single-word personal first name that auto-links via content/title today. + + These are the matches #36 gates on ambiguity. Mirrors the inclusion rules in + ``_build_name_patterns``: excludes file-stem aliases, short all-caps abbreviations, + common English words used as team names, ``src:`` IDs, and names < 4 chars (already + skipped for content matching). + """ + if len(name.split()) != 1: + return False + if len(name) < 4: + return False + if "-" in name and name == name.lower(): + return False # file stem (e.g. "dave-kowalski") + if name.isupper() and len(name) <= 4: + return False # abbreviation (e.g. "GG") + if name.lower() in _COMMON_WORDS: + return False + return not name.startswith("src:") + + +def build_first_name_index(entities: list[Entity]) -> dict[str, set[int]]: + """Map each folded single first-name → set of entity ids that claim it (#36). 
+ + A folded name owned by 2+ entities is *ambiguous*: a bare occurrence of it in a + document can't be attributed to a single person without corroboration. Built once + per entity set (like ``build_entity_patterns``) and passed to ``find_entity_mentions``. + """ + owners: dict[str, set[int]] = {} + for entity in entities: + for name in (entity.name, *entity.aliases): + if _is_gateable_single_name(name): + owners.setdefault(_fold_name(name), set()).add(entity.id) + return owners + + def _build_name_patterns(entity: Entity) -> list[tuple[re.Pattern[str], int]]: """Build regex patterns for matching entity names/aliases in content. @@ -670,6 +717,8 @@ def find_entity_mentions( *, cached_patterns: list[tuple[re.Pattern[str], Entity]] | None = None, suppressed_ids: set[int] | None = None, + attendees: list[str] | None = None, + name_owners: dict[str, set[int]] | None = None, ) -> list[EntityMention]: """Find entity mentions in a document's metadata and content. @@ -682,12 +731,24 @@ def find_entity_mentions( Pass cached_patterns (from build_entity_patterns()) to avoid recompiling regex patterns on every call. If None, patterns are built on the fly. - Disambiguation: prefer longer alias matches. If ambiguous (e.g. "Anders"), - match all possible entities. + First-name disambiguation (#36): a *bare* single first name that is **ambiguous** + (claimed by 2+ entities — accent-folded, so "Jérémy"/"Jeremy" collide) does not + auto-link on its own. It only links if the entity is *corroborated* in the same + document — by a tag, title participant, source-ref, full-name match, or by appearing + in ``attendees``. Unambiguous bare first names still link as before. Pass + ``name_owners`` (from build_first_name_index()) to skip rebuilding the ambiguity map. 
""" mentions: list[EntityMention] = [] seen: set[tuple[int, str]] = set() _suppressed = suppressed_ids or set() + if name_owners is None: + name_owners = build_first_name_index(entities) + + # Entities anchored by a strong signal in this doc. A bare ambiguous first name + # only links if its entity ends up here (#36). Built incrementally as we go. + corroborated: set[int] = set() + # Deferred bare-ambiguous matches, resolved once all strong matches are known. + pending: list[tuple[int, str]] = [] def _add(entity_id: int, mention_type: str) -> None: if entity_id in _suppressed: @@ -697,7 +758,16 @@ def _add(entity_id: int, mention_type: str) -> None: seen.add(key) mentions.append(EntityMention(entity_id=entity_id, mention_type=mention_type)) - # 1. Tag matching + def _add_strong(entity_id: int, mention_type: str) -> None: + """Add a mention and corroborate the entity (rescues its bare-name matches).""" + _add(entity_id, mention_type) + corroborated.add(entity_id) + + def _is_ambiguous_bare(name: str) -> bool: + """A bare first name claimed by 2+ entities (accent-folded) — #36.""" + return _is_gateable_single_name(name) and len(name_owners.get(_fold_name(name), ())) >= 2 + + # 1. Tag matching — explicit annotation, always strong for tag in tags: tag_lower = tag.lower().strip() if not tag_lower: @@ -705,16 +775,27 @@ def _add(entity_id: int, mention_type: str) -> None: for entity in entities: all_names = [entity.name.lower()] + [a.lower() for a in entity.aliases] if tag_lower in all_names: - _add(entity.id, "tagged") + _add_strong(entity.id, "tagged") + + # 3.5. Source ID matching — unambiguous, case-sensitive substring check; strong + for entity in entities: + for alias in entity.aliases: + if not alias.startswith("src:"): continue - # Partial match: tag is a first name or short form - for name in all_names: - if tag_lower == name: - _add(entity.id, "tagged") - break - - # 2. 
Title participant parsing - # Split on common separators: " / ", " x ", " & ", " vs " + source_id = alias[4:] # strip "src:" prefix + if source_id in content: + _add_strong(entity.id, "source_ref") + break # one source_ref match per entity is enough + + # Attendees corroborate (no mention emitted) — #36 context from frontmatter. + if attendees: + attendee_folded = {_fold_name(a) for a in attendees if a} + for entity in entities: + candidate = {_fold_name(entity.name), *(_fold_name(a) for a in entity.aliases)} + if candidate & attendee_folded: + corroborated.add(entity.id) + + # 2. Title participant parsing — split on " / ", " x ", " & ", " vs " parts = re.split(r"\s+/\s+|\s+x\s+|\s+&\s+|\s+vs\s+", title, flags=re.IGNORECASE) for part in parts: part = part.strip() @@ -722,45 +803,75 @@ def _add(entity_id: int, mention_type: str) -> None: continue part_lower = part.lower() for entity in entities: - all_names = [entity.name.lower()] + [a.lower() for a in entity.aliases] - if part_lower in all_names or any( - re.search(rf"\b{re.escape(n)}\b", part_lower) - for n in all_names - if len(n) >= 4 # skip very short names for substring matching - ): - _add(entity.id, "participant") - - # 3. Title substring matching — catch names embedded in the title - # e.g. "Anders Sync Notes", "Wren 1:1", "Helix Refactor Review" + matched_strong = False + matched_bare_ambiguous = False + for name in [entity.name, *entity.aliases]: + n_lower = name.lower() + is_exact = part_lower == n_lower + is_substr = len(n_lower) >= 4 and ( + re.search(rf"\b{re.escape(n_lower)}\b", part_lower) is not None + ) + if not (is_exact or is_substr): + continue + if _is_ambiguous_bare(name): + matched_bare_ambiguous = True + continue # keep looking for a stronger name for this entity + # Exact whole-part match or a multi-word name corroborates; an + # unambiguous bare substring just links. 
+ if is_exact or len(name.split()) >= 2: + _add_strong(entity.id, "participant") + else: + _add(entity.id, "participant") + matched_strong = True + break + if matched_bare_ambiguous and not matched_strong: + pending.append((entity.id, "participant")) + + # 3. Title substring matching — names embedded in the title (e.g. "Wren 1:1") title_lower = title.lower() for entity in entities: - all_names = [entity.name, *list(entity.aliases)] - for name in all_names: + matched_strong = False + matched_bare_ambiguous = False + for name in [entity.name, *list(entity.aliases)]: # Skip very short names and file-stem aliases if len(name) <= 3: continue if "-" in name and name == name.lower(): continue - if re.search(rf"\b{re.escape(name)}\b", title_lower, re.IGNORECASE): - _add(entity.id, "title") - break # one match per entity is enough - - # 3.5. Source ID matching — unambiguous, case-sensitive substring check - for entity in entities: - for alias in entity.aliases: - if not alias.startswith("src:"): + if not re.search(rf"\b{re.escape(name)}\b", title_lower, re.IGNORECASE): continue - source_id = alias[4:] # strip "src:" prefix - if source_id in content: - _add(entity.id, "source_ref") - break # one source_ref match per entity is enough + if _is_ambiguous_bare(name): + matched_bare_ambiguous = True + continue # keep looking for a stronger name for this entity + if len(name.split()) >= 2: + _add_strong(entity.id, "title") + else: + _add(entity.id, "title") + matched_strong = True + break # one strong match per entity is enough + if matched_bare_ambiguous and not matched_strong: + pending.append((entity.id, "title")) # 4. 
Content name matching with disambiguation if cached_patterns is None: cached_patterns = build_entity_patterns(entities) for pattern, entity in cached_patterns: - if pattern.search(content): + m = pattern.search(content) + if m is None: + continue + matched = m.group(0) + if len(matched.split()) == 1 and len(name_owners.get(_fold_name(matched), ())) >= 2: + pending.append((entity.id, "discussed")) # bare ambiguous — defer + continue + if len(matched.split()) >= 2: + _add_strong(entity.id, "discussed") + else: _add(entity.id, "discussed") + # Resolve deferred bare-ambiguous matches: link only if corroborated elsewhere. + for entity_id, mention_type in pending: + if entity_id in corroborated: + _add(entity_id, mention_type) + return mentions diff --git a/src/kb/indexer.py b/src/kb/indexer.py index fab098d..c25baab 100644 --- a/src/kb/indexer.py +++ b/src/kb/indexer.py @@ -13,6 +13,7 @@ Entity, EntityMention, build_entity_patterns, + build_first_name_index, find_entity_mentions, load_entities, seed_entities, @@ -178,6 +179,8 @@ def index_all( seed_entities(db, project_root) entities = load_entities(db) entity_patterns = build_entity_patterns(entities) + # Ambiguous-first-name index (#36): folded single name → owning entity ids. + name_owners = build_first_name_index(entities) # Entity-link suppressions (#35): per-document "do not link entity X here", kept in a # sidecar so they survive reindex + Granola sync. Resolve names → entity ids once. 
@@ -322,6 +325,7 @@ def _flush_embeddings() -> None: _suppressed_ids = { entity_name_to_id[n] for n in _doc_suppressed if n in entity_name_to_id } + _attendee_names = [a.get("name", "") for a in doc.attendees if a.get("name")] mentions = find_entity_mentions( doc.title, doc.tags, @@ -329,6 +333,8 @@ def _flush_embeddings() -> None: entities, cached_patterns=entity_patterns, suppressed_ids=_suppressed_ids, + attendees=_attendee_names, + name_owners=name_owners, ) entity_id_set = {m.entity_id for m in mentions} result.entities_linked += len(mentions) @@ -439,6 +445,7 @@ def _flush_embeddings() -> None: if result.entities_created > 0: entities = load_entities(db) entity_patterns = build_entity_patterns(entities) + name_owners = build_first_name_index(entities) if embedder: embedder.release_gpu_memory() gc.collect() diff --git a/tests/test_disambiguation.py b/tests/test_disambiguation.py new file mode 100644 index 0000000..5e22a48 --- /dev/null +++ b/tests/test_disambiguation.py @@ -0,0 +1,94 @@ +"""Integration tests for bare first-name disambiguation through index_all (#36).""" + +from __future__ import annotations + +import sqlite3 +import tempfile +from pathlib import Path + + +def _count_mentions(conn: sqlite3.Connection, entity_name: str, doc_like: str) -> int: + row = conn.execute( + """ + SELECT COUNT(*) AS n + FROM entity_mentions em + JOIN entities e ON e.id = em.entity_id + JOIN documents d ON d.id = em.document_id + WHERE e.name = ? AND d.path LIKE ? 
+ """, + (entity_name, doc_like), + ).fetchone() + return int(row["n"]) + + +def _seed_two_alexandres(root: Path) -> None: + people = root / "memory" / "people" + people.mkdir(parents=True) + (people / "alexandre-dupont.md").write_text( + "# Alexandre Dupont\n\n**Also known as:** Alexandre\n\n**Role:** Engineer\n" + ) + (people / "alexandre-martin.md").write_text( + "# Alexandre Martin\n\n**Also known as:** Alexandre\n\n**Role:** Designer\n" + ) + + +def _write_meeting(root: Path, name: str, body: str, attendees_yaml: str = "") -> None: + d = root / "memory" / "meetings" / "2026" / "05" / "24" + d.mkdir(parents=True, exist_ok=True) + front = "---\ntitle: Sync\ndate: 2026-05-24\ntype: notes\ngranola_id: " + name[:8] + "\n" + front += attendees_yaml + front += "---\n\n## Notes\n\n" + body + "\n" + (d / f"{name}.granola.notes.md").write_text(front) + + +class TestFirstNameDisambiguationIntegration: + def test_ambiguous_bare_first_name_not_linked(self, tmp_db): + from kb.indexer import index_all + + db, _ = tmp_db + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + _seed_two_alexandres(root) + _write_meeting(root, "aaaa1111_bare", "Alexandre walked us through the roadmap.") + + index_all(db, None, root, full=True) + conn = db.get_sqlite_conn() + assert _count_mentions(conn, "Alexandre Dupont", "%bare%") == 0 + assert _count_mentions(conn, "Alexandre Martin", "%bare%") == 0 + + def test_full_name_corroborates_one_alexandre(self, tmp_db): + from kb.indexer import index_all + + db, _ = tmp_db + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + _seed_two_alexandres(root) + _write_meeting( + root, + "bbbb2222_full", + "Alexandre Dupont opened. 
Later Alexandre summarised the actions.", + ) + + index_all(db, None, root, full=True) + conn = db.get_sqlite_conn() + assert _count_mentions(conn, "Alexandre Dupont", "%full%") >= 1 + assert _count_mentions(conn, "Alexandre Martin", "%full%") == 0 + + def test_attendee_corroborates_one_alexandre(self, tmp_db): + from kb.indexer import index_all + + db, _ = tmp_db + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + _seed_two_alexandres(root) + _write_meeting( + root, + "cccc3333_att", + "Alexandre walked us through the roadmap.", + attendees_yaml="attendees:\n - name: Alexandre Dupont\n email: ad@example.com\n", + ) + + index_all(db, None, root, full=True) + conn = db.get_sqlite_conn() + assert _count_mentions(conn, "Alexandre Dupont", "%att%") >= 1 + assert _count_mentions(conn, "Alexandre Martin", "%att%") == 0 diff --git a/tests/test_entities.py b/tests/test_entities.py index 2ed818e..62e9407 100644 --- a/tests/test_entities.py +++ b/tests/test_entities.py @@ -271,29 +271,32 @@ def test_content_word_boundary(self, sample_entities): assert len(charles_mentions) == 0 def test_short_first_name_matched_in_content(self, sample_entities): - """First names >3 chars should match in content (threshold lowered from 6 to 3).""" + """An *unambiguous* first name >3 chars should match in content (threshold rule). + + ("Soren" is a 5-char single name owned by exactly one entity; ambiguous bare + first names are gated separately — see TestFirstNameDisambiguation, #36.) 
+ """ from kb.entities import find_entity_mentions mentions = find_entity_mentions( title="Random meeting", tags=[], - content="Anders presented the quarterly results.", + content="Soren presented the quarterly results.", entities=sample_entities, ) discussed = [m for m in mentions if m.mention_type == "discussed"] - david_ids = {m.entity_id for m in discussed if m.entity_id in (3, 4)} - # "Anders" (5 chars) should now match — threshold lowered to 3 - assert len(david_ids) >= 1 + assert 2 in {m.entity_id for m in discussed} # Soren Vance (unambiguous) def test_suppressed_ids_are_not_linked(self, sample_entities): """find_entity_mentions skips entity ids in suppressed_ids (#35).""" from kb.entities import find_entity_mentions + # Use a full name so the match is robust to #36's bare-name gating. base = find_entity_mentions( title="Random meeting", tags=[], - content="Anders presented the quarterly results.", + content="Soren Vance presented the quarterly results.", entities=sample_entities, ) base_ids = {m.entity_id for m in base} @@ -302,7 +305,7 @@ def test_suppressed_ids_are_not_linked(self, sample_entities): suppressed = find_entity_mentions( title="Random meeting", tags=[], - content="Anders presented the quarterly results.", + content="Soren Vance presented the quarterly results.", entities=sample_entities, suppressed_ids={target}, ) @@ -347,21 +350,21 @@ def test_short_first_name_matches_via_tag(self, sample_entities): assert 4 in david_ids def test_title_substring_matching(self, sample_entities): - """Entity names appearing as substrings in the title should match as 'title' type.""" + """An unambiguous name appearing as a substring in the title matches as 'title'. + + ("Wren" is owned by one entity; ambiguous bare names in titles are gated — #36.) 
+ """ from kb.entities import find_entity_mentions mentions = find_entity_mentions( - title="Anders Sync Notes", + title="Wren Sync Notes", tags=[], content="Short content with no names.", entities=sample_entities, ) title_mentions = [m for m in mentions if m.mention_type == "title"] - title_ids = {m.entity_id for m in title_mentions} - # Both Davids should match via title substring - assert 3 in title_ids # Soren Vance - assert 4 in title_ids # Kit Larsen (alias "Anders") + assert 1 in {m.entity_id for m in title_mentions} # Wren Kasper (unambiguous) def test_title_substring_skips_short_names(self, sample_entities): """Title substring matching should skip very short names (<=3 chars).""" @@ -430,6 +433,118 @@ def test_combined_mention_types(self, sample_entities): assert "discussed" in types +class TestFirstNameDisambiguation: + """Bare ambiguous first names must not auto-link without corroboration (#36).""" + + def _alex_pair(self): + from kb.entities import Entity + + return [ + Entity(id=101, name="Alexandre Dupont", entity_type="person", aliases=["Alexandre"]), + Entity(id=102, name="Alexandre Martin", entity_type="person", aliases=["Alexandre"]), + ] + + def test_build_first_name_index_flags_ambiguous(self): + from kb.entities import build_first_name_index + + owners = build_first_name_index(self._alex_pair()) + assert owners.get("alexandre") == {101, 102} + + def test_build_first_name_index_folds_accents(self): + """An accented name and its ASCII near-twin collide in the ambiguity index.""" + from kb.entities import Entity, build_first_name_index + + ents = [ + Entity(id=201, name="Jérémy Cotineau", entity_type="person", aliases=["Jérémy"]), + Entity(id=202, name="Jeremy Brown", entity_type="person", aliases=["Jeremy"]), + ] + owners = build_first_name_index(ents) + assert owners.get("jeremy") == {201, 202} + + def test_ambiguous_bare_first_name_not_linked_without_context(self): + from kb.entities import find_entity_mentions + + mentions = 
find_entity_mentions( + title="Weekly sync", + tags=[], + content="Alexandre opened the meeting and walked through the roadmap.", + entities=self._alex_pair(), + ) + discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"} + assert discussed == set(), "ambiguous bare first name should not auto-link" + + def test_ambiguous_bare_name_linked_when_full_name_corroborates(self): + from kb.entities import find_entity_mentions + + mentions = find_entity_mentions( + title="Weekly sync", + tags=[], + content="Alexandre Dupont opened. Later Alexandre summarised the actions.", + entities=self._alex_pair(), + ) + discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"} + assert 101 in discussed, "full-name match should corroborate this Alexandre" + assert 102 not in discussed, "the other Alexandre stays unlinked" + + def test_ambiguous_bare_name_linked_when_attendee_corroborates(self): + from kb.entities import find_entity_mentions + + mentions = find_entity_mentions( + title="Weekly sync", + tags=[], + content="Alexandre opened the meeting and walked through the roadmap.", + entities=self._alex_pair(), + attendees=["Alexandre Dupont"], + ) + discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"} + assert 101 in discussed, "attendee should corroborate this Alexandre" + assert 102 not in discussed + + def test_accent_twin_gates_an_otherwise_unambiguous_match(self): + """A bare ASCII 'Jeremy' that could be the accented 'Jérémy' is gated.""" + from kb.entities import Entity, find_entity_mentions + + ents = [ + Entity(id=201, name="Jérémy Cotineau", entity_type="person", aliases=["Jérémy"]), + Entity(id=202, name="Jeremy Brown", entity_type="person", aliases=["Jeremy"]), + ] + mentions = find_entity_mentions( + title="Review", + tags=[], + content="Jeremy attended the review and gave feedback.", + entities=ents, + ) + discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"} + assert discussed == 
set(), "accent-fold collision should gate the bare match" + + def test_ambiguous_bare_name_in_title_gated(self): + from kb.entities import find_entity_mentions + + mentions = find_entity_mentions( + title="Alexandre Sync Notes", + tags=[], + content="Short content with no names.", + entities=self._alex_pair(), + ) + title_ids = {m.entity_id for m in mentions if m.mention_type == "title"} + assert title_ids == set(), "ambiguous bare first name in title should be gated" + + def test_unambiguous_first_name_still_links(self): + from kb.entities import Entity, find_entity_mentions + + ents = [ + Entity(id=301, name="Soren Vance", entity_type="person", aliases=["Soren"]), + ] + mentions = find_entity_mentions( + title="Weekly sync", + tags=[], + content="Soren walked the team through the new design.", + entities=ents, + ) + discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"} + assert 301 in discussed, "an unambiguous first name must still auto-link" + + class TestSeedEntitiesNonDestructive: """seed_entities() must be non-destructive: upsert entities, preserve mentions."""