From f921d4fac668d9720926a0d7ca2ca96b1d9d6253 Mon Sep 17 00:00:00 2001
From: Jeremy Brown <jeremy@tenfourty.com>
Date: Thu, 25 Jun 2026 14:19:35 +0200
Subject: [PATCH] feat(entities): gate ambiguous bare first names on
 corroboration (refs #36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bare common first names (≥4 chars, e.g. "Alexandre", "Thomas", "Jean") were
auto-linked anywhere they appeared in a title or content, so any occurrence
attached to whichever entity carried that name — frequently the wrong
colleague, and worse for accented names whose ASCII near-twins collide.

A bare single first name that is *ambiguous* (claimed by 2+ entities) now links
only when its entity is corroborated in the same document — by a tag, source
ref, full-name match, an exact title-part, or by appearing in the frontmatter
attendees. Unambiguous bare names still link as before. Ambiguity is computed
by build_first_name_index(), which accent-folds names so "Jérémy"/"Jeremy"
share a bucket; folding only *widens* the ambiguity set (more conservative) and
never broadens a match. The indexer threads the document's attendees and a
once-built name_owners index into find_entity_mentions.
---
 docs/entities.md             |  27 ++++-
 src/kb/entities.py           | 185 ++++++++++++++++++++++++++++-------
 src/kb/indexer.py            |   7 ++
 tests/test_disambiguation.py |  94 ++++++++++++++++++
 tests/test_entities.py       | 141 +++++++++++++++++++++++---
 5 files changed, 403 insertions(+), 51 deletions(-)
 create mode 100644 tests/test_disambiguation.py

diff --git a/docs/entities.md b/docs/entities.md
index 52f63a6..04d630d 100644
--- a/docs/entities.md
+++ b/docs/entities.md
@@ -108,9 +108,34 @@ Five-tier matching against document metadata and content:
 
 - Longer alias matches are preferred (e.g. "Kit Martin" matches before "Kit")
 - Very short single names (<=3 chars, e.g. "Ed", "Jo") are skipped for content and title matching (still matched via tags)
-- Single names 4+ chars (e.g. "Anders", "Wren") are matched in both content and title
+- Single names 4+ chars (e.g. "Wren") are matched in both content and title (ambiguous ones are gated — see below)
 - File-stem aliases with hyphens (e.g. "dave-martin") are excluded from content matching
 
+#### Bare ambiguous first names (#36)
+
+A **bare single first name** (≥4 chars, e.g. "Alexandre", "Thomas", "Jean") that is
+**ambiguous** — claimed by 2+ entities — does **not** auto-link on its own in the
+participant, title, or content tiers. It links only when the entity is *corroborated*
+elsewhere in the same document by a stronger signal:
+
+- a tag (`tagged`) or source-ref (`source_ref`) match,
+- a multi-word / full-name match (`participant`, `title`, or `discussed`),
+- an exact title-part match on a name that is not itself an ambiguous bare first name, or
+- appearing in the document's frontmatter `attendees`.
+
+Otherwise the bare match is dropped (it's a likely false positive — the wrong colleague).
+**Unambiguous** bare first names (owned by exactly one entity) still link as before.
+
+Ambiguity is computed by `build_first_name_index()`, which **accent-folds** names
+(`Jérémy` → `jeremy`) so an accented name and its ASCII near-twin collide into the same
+ambiguity bucket. Folding is used *only* to widen the ambiguity set (making the matcher
+more conservative) — it never broadens a match. So a bare ASCII "Jeremy" that could be
+either "Jérémy Cotineau" or "Jeremy Brown" is gated until one of them is corroborated.
+
+The indexer passes the document's `attendees` and a once-built `name_owners` index into
+`find_entity_mentions`. A genuinely wrong link that survives anyway can be removed with
+`kbx entity unlink` (see §Suppressions).
+
 ## Suppressions (manual unlink/relink)
 
 Automatic linking is heuristic, so it occasionally produces a false positive (a bare
diff --git a/src/kb/entities.py b/src/kb/entities.py
index ad32121..53f2cb9 100644
--- a/src/kb/entities.py
+++ b/src/kb/entities.py
@@ -4,6 +4,7 @@
 
 import json
 import re
+import unicodedata
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
@@ -589,6 +590,52 @@ def _boundary_pattern(escaped: str, name: str, flags: int = 0) -> re.Pattern[str
     return re.compile(rf"\b{escaped}(?=\s|$|[^\w])", flags)
 
 
+def _fold_name(text: str) -> str:
+    """Lowercase + strip accents (NFKD, combining marks removed) for name comparison (#36).
+
+    Buckets an accented name with its ASCII twin ("Jérémy" / "Jeremy"). Only combining marks
+    are dropped, so non-Latin names (e.g. Cyrillic) keep their letters instead of folding to "".
+    """
+    decomposed = unicodedata.normalize("NFKD", text.lower())
+    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
+
+
+def _is_gateable_single_name(name: str) -> bool:
+    """True for a single-word personal first name that auto-links via content/title today.
+
+    These are the matches #36 gates on ambiguity. Mirrors the inclusion rules in
+    ``_build_name_patterns``: excludes file-stem aliases, short all-caps abbreviations,
+    common English words used as team names, ``src:`` IDs, and names < 4 chars (already
+    skipped for content matching).
+    """
+    if len(name.split()) != 1:
+        return False
+    if len(name) < 4:
+        return False
+    if "-" in name and name == name.lower():
+        return False  # file stem (e.g. "dave-kowalski")
+    if name.isupper() and len(name) <= 4:
+        return False  # 4-char all-caps abbreviation; shorter names were rejected above
+    if name.lower() in _COMMON_WORDS:
+        return False
+    return not name.startswith("src:")
+
+
+def build_first_name_index(entities: list[Entity]) -> dict[str, set[int]]:
+    """Map each folded single first-name → set of entity ids that claim it (#36).
+
+    A folded name owned by 2+ entities is *ambiguous*: a bare occurrence of it in a
+    document can't be attributed to a single person without corroboration. Built once
+    per entity set (like ``build_entity_patterns``) and passed to ``find_entity_mentions``.
+    """
+    owners: dict[str, set[int]] = {}
+    for entity in entities:
+        for name in (entity.name, *entity.aliases):
+            if _is_gateable_single_name(name):
+                owners.setdefault(_fold_name(name), set()).add(entity.id)
+    return owners
+
+
 def _build_name_patterns(entity: Entity) -> list[tuple[re.Pattern[str], int]]:
     """Build regex patterns for matching entity names/aliases in content.
 
@@ -670,6 +717,8 @@ def find_entity_mentions(
     *,
     cached_patterns: list[tuple[re.Pattern[str], Entity]] | None = None,
     suppressed_ids: set[int] | None = None,
+    attendees: list[str] | None = None,
+    name_owners: dict[str, set[int]] | None = None,
 ) -> list[EntityMention]:
     """Find entity mentions in a document's metadata and content.
 
@@ -682,12 +731,24 @@ def find_entity_mentions(
     Pass cached_patterns (from build_entity_patterns()) to avoid recompiling
     regex patterns on every call. If None, patterns are built on the fly.
 
-    Disambiguation: prefer longer alias matches. If ambiguous (e.g. "Anders"),
-    match all possible entities.
+    First-name disambiguation (#36): a *bare* single first name that is **ambiguous**
+    (claimed by 2+ entities — accent-folded, so "Jérémy"/"Jeremy" collide) does not
+    auto-link on its own. It only links if the entity is *corroborated* in the same
+    document — by a tag, title participant, source-ref, full-name match, or by appearing
+    in ``attendees``. Unambiguous bare first names still link as before. Pass
+    ``name_owners`` (from build_first_name_index()) to skip rebuilding the ambiguity map.
     """
     mentions: list[EntityMention] = []
     seen: set[tuple[int, str]] = set()
     _suppressed = suppressed_ids or set()
+    if name_owners is None:
+        name_owners = build_first_name_index(entities)
+
+    # Entities anchored by a strong signal in this doc. A bare ambiguous first name
+    # only links if its entity ends up here (#36). Built incrementally as we go.
+    corroborated: set[int] = set()
+    # Deferred bare-ambiguous matches, resolved once all strong matches are known.
+    pending: list[tuple[int, str]] = []
 
     def _add(entity_id: int, mention_type: str) -> None:
         if entity_id in _suppressed:
@@ -697,7 +758,16 @@ def _add(entity_id: int, mention_type: str) -> None:
             seen.add(key)
             mentions.append(EntityMention(entity_id=entity_id, mention_type=mention_type))
 
-    # 1. Tag matching
+    def _add_strong(entity_id: int, mention_type: str) -> None:
+        """Add a mention and corroborate the entity (rescues its bare-name matches)."""
+        _add(entity_id, mention_type)
+        corroborated.add(entity_id)
+
+    def _is_ambiguous_bare(name: str) -> bool:
+        """A bare first name claimed by 2+ entities (accent-folded) — #36."""
+        return _is_gateable_single_name(name) and len(name_owners.get(_fold_name(name), ())) >= 2
+
+    # 1. Tag matching — explicit annotation, always strong
     for tag in tags:
         tag_lower = tag.lower().strip()
         if not tag_lower:
@@ -705,16 +775,27 @@ def _add(entity_id: int, mention_type: str) -> None:
         for entity in entities:
             all_names = [entity.name.lower()] + [a.lower() for a in entity.aliases]
             if tag_lower in all_names:
-                _add(entity.id, "tagged")
+                _add_strong(entity.id, "tagged")
+
+    # 3.5. Source ID matching — unambiguous, case-sensitive substring check; strong
+    for entity in entities:
+        for alias in entity.aliases:
+            if not alias.startswith("src:"):
                 continue
-            # Partial match: tag is a first name or short form
-            for name in all_names:
-                if tag_lower == name:
-                    _add(entity.id, "tagged")
-                    break
-
-    # 2. Title participant parsing
-    # Split on common separators: " / ", " x ", " & ", " vs "
+            source_id = alias[4:]  # strip "src:" prefix
+            if source_id in content:
+                _add_strong(entity.id, "source_ref")
+                break  # one source_ref match per entity is enough
+
+    # Attendees corroborate (no mention emitted), but never via an ambiguous bare name — #36.
+    if attendees:
+        attendee_folded = {_fold_name(a) for a in attendees if a}
+        for entity in entities:
+            strong = (n for n in (entity.name, *entity.aliases) if not _is_ambiguous_bare(n))
+            if not attendee_folded.isdisjoint(map(_fold_name, strong)):
+                corroborated.add(entity.id)
+
+    # 2. Title participant parsing — split on " / ", " x ", " & ", " vs "
     parts = re.split(r"\s+/\s+|\s+x\s+|\s+&\s+|\s+vs\s+", title, flags=re.IGNORECASE)
     for part in parts:
         part = part.strip()
@@ -722,45 +803,75 @@ def _add(entity_id: int, mention_type: str) -> None:
             continue
         part_lower = part.lower()
         for entity in entities:
-            all_names = [entity.name.lower()] + [a.lower() for a in entity.aliases]
-            if part_lower in all_names or any(
-                re.search(rf"\b{re.escape(n)}\b", part_lower)
-                for n in all_names
-                if len(n) >= 4  # skip very short names for substring matching
-            ):
-                _add(entity.id, "participant")
-
-    # 3. Title substring matching — catch names embedded in the title
-    # e.g. "Anders Sync Notes", "Wren 1:1", "Helix Refactor Review"
+            matched_strong = False
+            matched_bare_ambiguous = False
+            for name in [entity.name, *entity.aliases]:
+                n_lower = name.lower()
+                is_exact = part_lower == n_lower
+                is_substr = len(n_lower) >= 4 and (
+                    re.search(rf"\b{re.escape(n_lower)}\b", part_lower) is not None
+                )
+                if not (is_exact or is_substr):
+                    continue
+                if _is_ambiguous_bare(name):
+                    matched_bare_ambiguous = True
+                    continue  # keep looking for a stronger name for this entity
+                # Exact whole-part match or a multi-word name corroborates; an
+                # unambiguous bare substring just links.
+                if is_exact or len(name.split()) >= 2:
+                    _add_strong(entity.id, "participant")
+                else:
+                    _add(entity.id, "participant")
+                matched_strong = True
+                break
+            if matched_bare_ambiguous and not matched_strong:
+                pending.append((entity.id, "participant"))
+
+    # 3. Title substring matching — names embedded in the title (e.g. "Wren 1:1")
     title_lower = title.lower()
     for entity in entities:
-        all_names = [entity.name, *list(entity.aliases)]
-        for name in all_names:
+        matched_strong = False
+        matched_bare_ambiguous = False
+        for name in [entity.name, *list(entity.aliases)]:
             # Skip very short names and file-stem aliases
             if len(name) <= 3:
                 continue
             if "-" in name and name == name.lower():
                 continue
-            if re.search(rf"\b{re.escape(name)}\b", title_lower, re.IGNORECASE):
-                _add(entity.id, "title")
-                break  # one match per entity is enough
-
-    # 3.5. Source ID matching — unambiguous, case-sensitive substring check
-    for entity in entities:
-        for alias in entity.aliases:
-            if not alias.startswith("src:"):
+            if not re.search(rf"\b{re.escape(name)}\b", title_lower, re.IGNORECASE):
                 continue
-            source_id = alias[4:]  # strip "src:" prefix
-            if source_id in content:
-                _add(entity.id, "source_ref")
-                break  # one source_ref match per entity is enough
+            if _is_ambiguous_bare(name):
+                matched_bare_ambiguous = True
+                continue  # keep looking for a stronger name for this entity
+            if len(name.split()) >= 2:
+                _add_strong(entity.id, "title")
+            else:
+                _add(entity.id, "title")
+            matched_strong = True
+            break  # one strong match per entity is enough
+        if matched_bare_ambiguous and not matched_strong:
+            pending.append((entity.id, "title"))
 
     # 4. Content name matching with disambiguation
     if cached_patterns is None:
         cached_patterns = build_entity_patterns(entities)
 
     for pattern, entity in cached_patterns:
-        if pattern.search(content):
+        m = pattern.search(content)
+        if m is None:
+            continue
+        matched = m.group(0)
+        if len(matched.split()) == 1 and len(name_owners.get(_fold_name(matched), ())) >= 2:
+            pending.append((entity.id, "discussed"))  # bare ambiguous — defer
+            continue
+        if len(matched.split()) >= 2:
+            _add_strong(entity.id, "discussed")
+        else:
             _add(entity.id, "discussed")
 
+    # Resolve deferred bare-ambiguous matches: link only if corroborated elsewhere.
+    for entity_id, mention_type in pending:
+        if entity_id in corroborated:
+            _add(entity_id, mention_type)
+
     return mentions
diff --git a/src/kb/indexer.py b/src/kb/indexer.py
index fab098d..c25baab 100644
--- a/src/kb/indexer.py
+++ b/src/kb/indexer.py
@@ -13,6 +13,7 @@
     Entity,
     EntityMention,
     build_entity_patterns,
+    build_first_name_index,
     find_entity_mentions,
     load_entities,
     seed_entities,
@@ -178,6 +179,8 @@ def index_all(
         seed_entities(db, project_root)
     entities = load_entities(db)
     entity_patterns = build_entity_patterns(entities)
+    # Ambiguous-first-name index (#36): folded single name → owning entity ids.
+    name_owners = build_first_name_index(entities)
 
     # Entity-link suppressions (#35): per-document "do not link entity X here", kept in a
     # sidecar so they survive reindex + Granola sync. Resolve names → entity ids once.
@@ -322,6 +325,7 @@ def _flush_embeddings() -> None:
             _suppressed_ids = {
                 entity_name_to_id[n] for n in _doc_suppressed if n in entity_name_to_id
             }
+            _attendee_names = [a.get("name", "") for a in doc.attendees if a.get("name")]
             mentions = find_entity_mentions(
                 doc.title,
                 doc.tags,
@@ -329,6 +333,8 @@ def _flush_embeddings() -> None:
                 entities,
                 cached_patterns=entity_patterns,
                 suppressed_ids=_suppressed_ids,
+                attendees=_attendee_names,
+                name_owners=name_owners,
             )
             entity_id_set = {m.entity_id for m in mentions}
             result.entities_linked += len(mentions)
@@ -439,6 +445,7 @@ def _flush_embeddings() -> None:
             if result.entities_created > 0:
                 entities = load_entities(db)
                 entity_patterns = build_entity_patterns(entities)
+                name_owners = build_first_name_index(entities)
             if embedder:
                 embedder.release_gpu_memory()
             gc.collect()
diff --git a/tests/test_disambiguation.py b/tests/test_disambiguation.py
new file mode 100644
index 0000000..5e22a48
--- /dev/null
+++ b/tests/test_disambiguation.py
@@ -0,0 +1,94 @@
+"""Integration tests for bare first-name disambiguation through index_all (#36)."""
+
+from __future__ import annotations
+
+import sqlite3
+import tempfile
+from pathlib import Path
+
+
+def _count_mentions(conn: sqlite3.Connection, entity_name: str, doc_like: str) -> int:
+    row = conn.execute(
+        """
+        SELECT COUNT(*) AS n
+        FROM entity_mentions em
+        JOIN entities e ON e.id = em.entity_id
+        JOIN documents d ON d.id = em.document_id
+        WHERE e.name = ? AND d.path LIKE ?
+        """,
+        (entity_name, doc_like),
+    ).fetchone()
+    return int(row["n"])
+
+
+def _seed_two_alexandres(root: Path) -> None:
+    people = root / "memory" / "people"
+    people.mkdir(parents=True)
+    (people / "alexandre-dupont.md").write_text(
+        "# Alexandre Dupont\n\n**Also known as:** Alexandre\n\n**Role:** Engineer\n"
+    )
+    (people / "alexandre-martin.md").write_text(
+        "# Alexandre Martin\n\n**Also known as:** Alexandre\n\n**Role:** Designer\n"
+    )
+
+
+def _write_meeting(root: Path, name: str, body: str, attendees_yaml: str = "") -> None:
+    d = root / "memory" / "meetings" / "2026" / "05" / "24"
+    d.mkdir(parents=True, exist_ok=True)
+    front = "---\ntitle: Sync\ndate: 2026-05-24\ntype: notes\ngranola_id: " + name[:8] + "\n"
+    front += attendees_yaml
+    front += "---\n\n## Notes\n\n" + body + "\n"
+    (d / f"{name}.granola.notes.md").write_text(front)
+
+
+class TestFirstNameDisambiguationIntegration:
+    def test_ambiguous_bare_first_name_not_linked(self, tmp_db):
+        from kb.indexer import index_all
+
+        db, _ = tmp_db
+        with tempfile.TemporaryDirectory() as tmpdir:
+            root = Path(tmpdir)
+            _seed_two_alexandres(root)
+            _write_meeting(root, "aaaa1111_bare", "Alexandre walked us through the roadmap.")
+
+            index_all(db, None, root, full=True)
+            conn = db.get_sqlite_conn()
+            assert _count_mentions(conn, "Alexandre Dupont", "%bare%") == 0
+            assert _count_mentions(conn, "Alexandre Martin", "%bare%") == 0
+
+    def test_full_name_corroborates_one_alexandre(self, tmp_db):
+        from kb.indexer import index_all
+
+        db, _ = tmp_db
+        with tempfile.TemporaryDirectory() as tmpdir:
+            root = Path(tmpdir)
+            _seed_two_alexandres(root)
+            _write_meeting(
+                root,
+                "bbbb2222_full",
+                "Alexandre Dupont opened. Later Alexandre summarised the actions.",
+            )
+
+            index_all(db, None, root, full=True)
+            conn = db.get_sqlite_conn()
+            assert _count_mentions(conn, "Alexandre Dupont", "%full%") >= 1
+            assert _count_mentions(conn, "Alexandre Martin", "%full%") == 0
+
+    def test_attendee_corroborates_one_alexandre(self, tmp_db):
+        from kb.indexer import index_all
+
+        db, _ = tmp_db
+        with tempfile.TemporaryDirectory() as tmpdir:
+            root = Path(tmpdir)
+            _seed_two_alexandres(root)
+            _write_meeting(
+                root,
+                "cccc3333_att",
+                "Alexandre walked us through the roadmap.",
+                attendees_yaml="attendees:\n  - name: Alexandre Dupont\n    email: ad@example.com\n",
+            )
+
+            index_all(db, None, root, full=True)
+            conn = db.get_sqlite_conn()
+            assert _count_mentions(conn, "Alexandre Dupont", "%att%") >= 1
+            assert _count_mentions(conn, "Alexandre Martin", "%att%") == 0
diff --git a/tests/test_entities.py b/tests/test_entities.py
index 2ed818e..62e9407 100644
--- a/tests/test_entities.py
+++ b/tests/test_entities.py
@@ -271,29 +271,32 @@ def test_content_word_boundary(self, sample_entities):
         assert len(charles_mentions) == 0
 
     def test_short_first_name_matched_in_content(self, sample_entities):
-        """First names >3 chars should match in content (threshold lowered from 6 to 3)."""
+        """An *unambiguous* first name >3 chars should match in content (threshold rule).
+
+        ("Soren" is a 5-char single name owned by exactly one entity; ambiguous bare
+        first names are gated separately — see TestFirstNameDisambiguation, #36.)
+        """
         from kb.entities import find_entity_mentions
 
         mentions = find_entity_mentions(
             title="Random meeting",
             tags=[],
-            content="Anders presented the quarterly results.",
+            content="Soren presented the quarterly results.",
             entities=sample_entities,
         )
 
         discussed = [m for m in mentions if m.mention_type == "discussed"]
-        david_ids = {m.entity_id for m in discussed if m.entity_id in (3, 4)}
-        # "Anders" (5 chars) should now match — threshold lowered to 3
-        assert len(david_ids) >= 1
+        assert 2 in {m.entity_id for m in discussed}  # Soren Vance (unambiguous)
 
     def test_suppressed_ids_are_not_linked(self, sample_entities):
         """find_entity_mentions skips entity ids in suppressed_ids (#35)."""
         from kb.entities import find_entity_mentions
 
+        # Use a full name so the match is robust to #36's bare-name gating.
         base = find_entity_mentions(
             title="Random meeting",
             tags=[],
-            content="Anders presented the quarterly results.",
+            content="Soren Vance presented the quarterly results.",
             entities=sample_entities,
         )
         base_ids = {m.entity_id for m in base}
@@ -302,7 +305,7 @@ def test_suppressed_ids_are_not_linked(self, sample_entities):
         suppressed = find_entity_mentions(
             title="Random meeting",
             tags=[],
-            content="Anders presented the quarterly results.",
+            content="Soren Vance presented the quarterly results.",
             entities=sample_entities,
             suppressed_ids={target},
         )
@@ -347,21 +350,21 @@ def test_short_first_name_matches_via_tag(self, sample_entities):
         assert 4 in david_ids
 
     def test_title_substring_matching(self, sample_entities):
-        """Entity names appearing as substrings in the title should match as 'title' type."""
+        """An unambiguous name appearing as a substring in the title matches as 'title'.
+
+        ("Wren" is owned by one entity; ambiguous bare names in titles are gated — #36.)
+        """
         from kb.entities import find_entity_mentions
 
         mentions = find_entity_mentions(
-            title="Anders Sync Notes",
+            title="Wren Sync Notes",
             tags=[],
             content="Short content with no names.",
             entities=sample_entities,
         )
 
         title_mentions = [m for m in mentions if m.mention_type == "title"]
-        title_ids = {m.entity_id for m in title_mentions}
-        # Both Davids should match via title substring
-        assert 3 in title_ids  # Soren Vance
-        assert 4 in title_ids  # Kit Larsen (alias "Anders")
+        assert 1 in {m.entity_id for m in title_mentions}  # Wren Kasper (unambiguous)
 
     def test_title_substring_skips_short_names(self, sample_entities):
         """Title substring matching should skip very short names (<=3 chars)."""
@@ -430,6 +433,118 @@ def test_combined_mention_types(self, sample_entities):
         assert "discussed" in types
 
 
+class TestFirstNameDisambiguation:
+    """Bare ambiguous first names must not auto-link without corroboration (#36)."""
+
+    def _alex_pair(self):
+        from kb.entities import Entity
+
+        return [
+            Entity(id=101, name="Alexandre Dupont", entity_type="person", aliases=["Alexandre"]),
+            Entity(id=102, name="Alexandre Martin", entity_type="person", aliases=["Alexandre"]),
+        ]
+
+    def test_build_first_name_index_flags_ambiguous(self):
+        from kb.entities import build_first_name_index
+
+        owners = build_first_name_index(self._alex_pair())
+        assert owners.get("alexandre") == {101, 102}
+
+    def test_build_first_name_index_folds_accents(self):
+        """An accented name and its ASCII near-twin collide in the ambiguity index."""
+        from kb.entities import Entity, build_first_name_index
+
+        ents = [
+            Entity(id=201, name="Jérémy Cotineau", entity_type="person", aliases=["Jérémy"]),
+            Entity(id=202, name="Jeremy Brown", entity_type="person", aliases=["Jeremy"]),
+        ]
+        owners = build_first_name_index(ents)
+        assert owners.get("jeremy") == {201, 202}
+
+    def test_ambiguous_bare_first_name_not_linked_without_context(self):
+        from kb.entities import find_entity_mentions
+
+        mentions = find_entity_mentions(
+            title="Weekly sync",
+            tags=[],
+            content="Alexandre opened the meeting and walked through the roadmap.",
+            entities=self._alex_pair(),
+        )
+        discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"}
+        assert discussed == set(), "ambiguous bare first name should not auto-link"
+
+    def test_ambiguous_bare_name_linked_when_full_name_corroborates(self):
+        from kb.entities import find_entity_mentions
+
+        mentions = find_entity_mentions(
+            title="Weekly sync",
+            tags=[],
+            content="Alexandre Dupont opened. Later Alexandre summarised the actions.",
+            entities=self._alex_pair(),
+        )
+        discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"}
+        assert 101 in discussed, "full-name match should corroborate this Alexandre"
+        assert 102 not in discussed, "the other Alexandre stays unlinked"
+
+    def test_ambiguous_bare_name_linked_when_attendee_corroborates(self):
+        from kb.entities import find_entity_mentions
+
+        mentions = find_entity_mentions(
+            title="Weekly sync",
+            tags=[],
+            content="Alexandre opened the meeting and walked through the roadmap.",
+            entities=self._alex_pair(),
+            attendees=["Alexandre Dupont"],
+        )
+        discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"}
+        assert 101 in discussed, "attendee should corroborate this Alexandre"
+        assert 102 not in discussed
+
+    def test_accent_twin_gates_an_otherwise_unambiguous_match(self):
+        """A bare ASCII 'Jeremy' that could be the accented 'Jérémy' is gated."""
+        from kb.entities import Entity, find_entity_mentions
+
+        ents = [
+            Entity(id=201, name="Jérémy Cotineau", entity_type="person", aliases=["Jérémy"]),
+            Entity(id=202, name="Jeremy Brown", entity_type="person", aliases=["Jeremy"]),
+        ]
+        mentions = find_entity_mentions(
+            title="Review",
+            tags=[],
+            content="Jeremy attended the review and gave feedback.",
+            entities=ents,
+        )
+        discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"}
+        assert discussed == set(), "accent-fold collision should gate the bare match"
+
+    def test_ambiguous_bare_name_in_title_gated(self):
+        from kb.entities import find_entity_mentions
+
+        mentions = find_entity_mentions(
+            title="Alexandre Sync Notes",
+            tags=[],
+            content="Short content with no names.",
+            entities=self._alex_pair(),
+        )
+        title_ids = {m.entity_id for m in mentions if m.mention_type == "title"}
+        assert title_ids == set(), "ambiguous bare first name in title should be gated"
+
+    def test_unambiguous_first_name_still_links(self):
+        from kb.entities import Entity, find_entity_mentions
+
+        ents = [
+            Entity(id=301, name="Soren Vance", entity_type="person", aliases=["Soren"]),
+        ]
+        mentions = find_entity_mentions(
+            title="Weekly sync",
+            tags=[],
+            content="Soren walked the team through the new design.",
+            entities=ents,
+        )
+        discussed = {m.entity_id for m in mentions if m.mention_type == "discussed"}
+        assert 301 in discussed, "an unambiguous first name must still auto-link"
+
+
 class TestSeedEntitiesNonDestructive:
     """seed_entities() must be non-destructive: upsert entities, preserve mentions."""