diff --git a/lib/kith/contacts.ex b/lib/kith/contacts.ex index 1427759..d92f9e4 100644 --- a/lib/kith/contacts.ex +++ b/lib/kith/contacts.ex @@ -1740,14 +1740,28 @@ defmodule Kith.Contacts do end, set: [contact_id: survivor.id] ) - # Remap photos - |> Ecto.Multi.update_all( - :remap_photos, - fn _changes -> + # Remap photos (delete duplicates by content_hash first, then move remaining) + |> Ecto.Multi.run(:remap_photos, fn repo, _changes -> + # Delete photos from non-survivor that already exist on survivor (same content_hash) + repo.query( + """ + DELETE FROM photos + WHERE contact_id = $1 + AND content_hash IS NOT NULL + AND content_hash IN ( + SELECT content_hash FROM photos WHERE contact_id = $2 AND content_hash IS NOT NULL + ) + """, + [non_survivor.id, survivor.id] + ) + + # Move remaining photos + {count, _} = from(p in Photo, where: p.contact_id == ^non_survivor.id) - end, - set: [contact_id: survivor.id] - ) + |> repo.update_all(set: [contact_id: survivor.id]) + + {:ok, count} + end) # Remap addresses |> Ecto.Multi.update_all( :remap_addresses, diff --git a/lib/kith/duplicate_detection.ex b/lib/kith/duplicate_detection.ex index a7599d0..359417e 100644 --- a/lib/kith/duplicate_detection.ex +++ b/lib/kith/duplicate_detection.ex @@ -4,13 +4,19 @@ defmodule Kith.DuplicateDetection do alias Kith.Contacts.DuplicateCandidate alias Kith.Repo + @default_page_size 20 + def list_candidates(account_id, opts \\ []) do status = Keyword.get(opts, :status, "pending") + limit = Keyword.get(opts, :limit, @default_page_size) + offset = Keyword.get(opts, :offset, 0) DuplicateCandidate |> scope_to_account(account_id) |> where([d], d.status == ^status) |> order_by([d], desc: d.score) + |> limit(^limit) + |> offset(^offset) |> Repo.all() |> Repo.preload([:contact, :duplicate_contact]) end diff --git a/lib/kith/workers/duplicate_detection_worker.ex b/lib/kith/workers/duplicate_detection_worker.ex index 8e67e29..efea61d 100644 --- a/lib/kith/workers/duplicate_detection_worker.ex +++ b/lib/kith/workers/duplicate_detection_worker.ex @@ -5,9 +5,18 @@ defmodule Kith.Workers.DuplicateDetectionWorker do Detection algorithm: 1. Name similarity via pg_trgm similarity() on display_name (threshold: 0.5) - 2. Exact email match across contact_fields - 3. Exact phone match across contact_fields - 4. Weighted score: name(0.4) + email(0.35) + phone(0.25) + 2. Case-insensitive email match across contact_fields + 3. Normalized phone match across contact_fields (digits only) + 4. Address match on line1 + postal_code + + Scoring (max-signal + bonus): + Each signal has an independent base score: + - email_match: 0.85 + - phone_match: 0.75 + - address_match: 0.60 + - name_match: the raw pg_trgm similarity (> 0.5) + Final score = max(base scores) + 0.05 per additional signal, capped at 1.0 + Threshold: >= 0.5 """ use Oban.Worker, @@ -15,7 +24,7 @@ defmodule Kith.Workers.DuplicateDetectionWorker do max_attempts: 3 import Ecto.Query - alias Kith.Contacts.{Contact, ContactField, DuplicateCandidate} + alias Kith.Contacts.{Address, Contact, ContactField, ContactFieldType, DuplicateCandidate} alias Kith.Repo @impl Oban.Worker @@ -36,33 +45,25 @@ defmodule Kith.Workers.DuplicateDetectionWorker do end defp detect_duplicates(account_id) do - # Get active contacts for this account - contacts = + contact_count = Contact |> where([c], c.account_id == ^account_id) |> where([c], is_nil(c.deleted_at)) - |> select([c], %{id: c.id, display_name: c.display_name}) - |> Repo.all() + |> Repo.aggregate(:count) - if length(contacts) < 2, do: :ok, else: find_duplicates(account_id, contacts) + if contact_count >= 2, do: find_duplicates(account_id) end - defp find_duplicates(account_id, _contacts) do - # Find name-based duplicates using pg_trgm + defp find_duplicates(account_id) do name_matches = find_name_matches(account_id) - - # Find email-based duplicates email_matches = find_email_matches(account_id) - - # Find phone-based duplicates phone_matches = find_phone_matches(account_id) + address_matches = find_address_matches(account_id) - # Merge and score all matches all_pairs = - merge_matches(name_matches, email_matches, phone_matches) - |> Enum.filter(fn {_pair, score, _reasons} -> score >= 0.4 end) + merge_matches(name_matches, email_matches, phone_matches, address_matches) + |> Enum.filter(fn {_pair, score, _reasons} -> score >= 0.5 end) - # Get existing pending/dismissed candidates to avoid re-inserting existing = DuplicateCandidate |> where([d], d.account_id == ^account_id) @@ -73,9 +74,7 @@ defmodule Kith.Workers.DuplicateDetectionWorker do now = DateTime.utc_now() |> DateTime.truncate(:second) - # Insert new candidates Enum.each(all_pairs, fn {{id1, id2}, score, reasons} -> - # Canonicalize: smaller id first {contact_id, dup_id} = if id1 < id2, do: {id1, id2}, else: {id2, id1} unless MapSet.member?(existing, {contact_id, dup_id}) do @@ -93,7 +92,6 @@ defmodule Kith.Workers.DuplicateDetectionWorker do end defp find_name_matches(account_id) do - # Use pg_trgm similarity for fuzzy name matching query = """ SELECT c1.id AS id1, c2.id AS id2, similarity(c1.display_name, c2.display_name) AS sim FROM contacts c1 @@ -102,6 +100,8 @@ defmodule Kith.Workers.DuplicateDetectionWorker do WHERE c1.account_id = $1 AND c1.deleted_at IS NULL AND c2.deleted_at IS NULL + AND c1.display_name IS NOT NULL AND c1.display_name != '' + AND c2.display_name IS NOT NULL AND c2.display_name != '' AND similarity(c1.display_name, c2.display_name) > 0.5 ORDER BY sim DESC LIMIT 500 @@ -119,55 +119,94 @@ defmodule Kith.Workers.DuplicateDetectionWorker do end defp find_email_matches(account_id) do - # Find contacts that share an exact email address + # Case-insensitive email match, both fields verified as email type query = from cf1 in ContactField, join: cf2 in ContactField, - on: cf1.value == cf2.value and cf1.id < cf2.id, - join: cft in assoc(cf1, :contact_field_type), + on: + fragment("LOWER(?)", cf1.value) == fragment("LOWER(?)", cf2.value) and + cf1.id < cf2.id, + join: cft1 in ContactFieldType, + on: cf1.contact_field_type_id == cft1.id, + join: cft2 in ContactFieldType, + on: cf2.contact_field_type_id == cft2.id, where: cf1.account_id == ^account_id, where: cf2.account_id == ^account_id, - where: cft.protocol == "mailto:", + where: fragment("? LIKE 'mailto%'", cft1.protocol), + where: fragment("? LIKE 'mailto%'", cft2.protocol), where: cf1.contact_id != cf2.contact_id, + where: cf1.value != "" and not is_nil(cf1.value), select: {cf1.contact_id, cf2.contact_id} query |> Repo.all() - |> Enum.uniq() |> Enum.map(fn {id1, id2} -> - {id1, id2} = if id1 < id2, do: {id1, id2}, else: {id2, id1} - {{id1, id2}, 1.0, ["email_match"]} + if id1 < id2, do: {id1, id2}, else: {id2, id1} end) - |> Enum.uniq_by(fn {pair, _, _} -> pair end) + |> Enum.uniq() + |> Enum.map(fn {id1, id2} -> {{id1, id2}, 1.0, ["email_match"]} end) end defp find_phone_matches(account_id) do - # Find contacts that share an exact phone number (normalized: digits only) + # Normalized phone match (digits only), both fields verified as phone type query = from cf1 in ContactField, join: cf2 in ContactField, on: fragment("regexp_replace(?, '[^0-9]', '', 'g')", cf1.value) == - fragment("regexp_replace(?, '[^0-9]', '', 'g')", cf2.value) and cf1.id < cf2.id, - join: cft in assoc(cf1, :contact_field_type), + fragment("regexp_replace(?, '[^0-9]', '', 'g')", cf2.value) and + cf1.id < cf2.id, + join: cft1 in ContactFieldType, + on: cf1.contact_field_type_id == cft1.id, + join: cft2 in ContactFieldType, + on: cf2.contact_field_type_id == cft2.id, where: cf1.account_id == ^account_id, where: cf2.account_id == ^account_id, - where: cft.protocol == "tel:", + where: fragment("? LIKE 'tel%'", cft1.protocol), + where: fragment("? LIKE 'tel%'", cft2.protocol), where: cf1.contact_id != cf2.contact_id, + where: cf1.value != "" and not is_nil(cf1.value), select: {cf1.contact_id, cf2.contact_id} query |> Repo.all() + |> Enum.map(fn {id1, id2} -> + if id1 < id2, do: {id1, id2}, else: {id2, id1} + end) |> Enum.uniq() + |> Enum.map(fn {id1, id2} -> {{id1, id2}, 1.0, ["phone_match"]} end) + end + + defp find_address_matches(account_id) do + # Match on normalized line1 + postal_code + query = + from a1 in Address, + join: a2 in Address, + on: + fragment("LOWER(TRIM(?))", a1.line1) == fragment("LOWER(TRIM(?))", a2.line1) and + fragment("LOWER(TRIM(?))", a1.postal_code) == + fragment("LOWER(TRIM(?))", a2.postal_code) and + a1.id < a2.id, + where: a1.account_id == ^account_id, + where: a2.account_id == ^account_id, + where: a1.contact_id != a2.contact_id, + where: a1.line1 != "" and not is_nil(a1.line1), + where: a1.postal_code != "" and not is_nil(a1.postal_code), + where: a2.line1 != "" and not is_nil(a2.line1), + where: a2.postal_code != "" and not is_nil(a2.postal_code), + select: {a1.contact_id, a2.contact_id} + + query + |> Repo.all() |> Enum.map(fn {id1, id2} -> - {id1, id2} = if id1 < id2, do: {id1, id2}, else: {id2, id1} - {{id1, id2}, 1.0, ["phone_match"]} + if id1 < id2, do: {id1, id2}, else: {id2, id1} end) - |> Enum.uniq_by(fn {pair, _, _} -> pair end) + |> Enum.uniq() + |> Enum.map(fn {id1, id2} -> {{id1, id2}, 1.0, ["address_match"]} end) end - defp merge_matches(name_matches, email_matches, phone_matches) do - (name_matches ++ email_matches ++ phone_matches) + defp merge_matches(name_matches, email_matches, phone_matches, address_matches) do + (name_matches ++ email_matches ++ phone_matches ++ address_matches) |> Enum.group_by(fn {pair, _score, _reasons} -> pair end) |> Enum.map(&compute_merged_score/1) end @@ -176,9 +215,19 @@ defmodule Kith.Workers.DuplicateDetectionWorker do reasons = matches |> Enum.flat_map(fn {_, _, r} -> r end) |> Enum.uniq() name_sim = Enum.find_value(matches, 0.0, &extract_name_score/1) - email_weight = if "email_match" in reasons, do: 0.35, else: 0.0 - phone_weight = if "phone_match" in reasons, do: 0.25, else: 0.0 - score = min(name_sim * 0.4 + email_weight + phone_weight, 1.0) + # Base score for each signal type + base_scores = + [] + |> then(fn acc -> if "email_match" in reasons, do: [0.85 | acc], else: acc end) + |> then(fn acc -> if "phone_match" in reasons, do: [0.75 | acc], else: acc end) + |> then(fn acc -> if "address_match" in reasons, do: [0.60 | acc], else: acc end) + |> then(fn acc -> if name_sim > 0.0, do: [name_sim | acc], else: acc end) + + signal_count = length(base_scores) + max_score = Enum.max(base_scores, fn -> 0.0 end) + bonus = max(signal_count - 1, 0) * 0.05 + + score = min(max_score + bonus, 1.0) {pair, Float.round(score, 2), reasons} end diff --git a/lib/kith/workers/import_source_worker.ex b/lib/kith/workers/import_source_worker.ex index 6cdf883..d5feaa6 100644 --- a/lib/kith/workers/import_source_worker.ex +++ b/lib/kith/workers/import_source_worker.ex @@ -11,6 +11,7 @@ defmodule Kith.Workers.ImportSourceWorker do alias Kith.Imports alias Kith.Storage + alias Kith.Workers.DuplicateDetectionWorker @impl Oban.Worker def perform(%Oban.Job{args: %{"import_id" => import_id}}) do @@ -33,6 +34,9 @@ defmodule Kith.Workers.ImportSourceWorker do topic = "import:#{import.account_id}" Phoenix.PubSub.broadcast(Kith.PubSub, topic, {:import_complete, summary_map}) + # Trigger duplicate detection for newly imported contacts + Oban.insert(DuplicateDetectionWorker.new(%{account_id: import.account_id})) + Logger.info("Import #{import_id} completed: #{inspect(summary_map)}") :ok else diff --git a/lib/kith/workers/import_worker.ex b/lib/kith/workers/import_worker.ex index 790620e..68dde77 100644 --- a/lib/kith/workers/import_worker.ex +++ b/lib/kith/workers/import_worker.ex @@ -11,6 +11,7 @@ defmodule Kith.Workers.ImportWorker do alias Kith.Contacts alias Kith.VCard.Parser + alias Kith.Workers.DuplicateDetectionWorker @impl Oban.Worker def perform(%Oban.Job{ @@ -42,6 +43,9 @@ defmodule Kith.Workers.ImportWorker do {:import_complete, results} ) + # Trigger duplicate detection for newly imported contacts + Oban.insert(DuplicateDetectionWorker.new(%{account_id: account_id})) + Logger.info( "vCard import complete for account #{account_id}: " <> "#{results.imported} imported, #{results.skipped} skipped" diff --git a/lib/kith/workers/monica_api_crawl_worker.ex b/lib/kith/workers/monica_api_crawl_worker.ex index b5355ba..f366140 100644 --- a/lib/kith/workers/monica_api_crawl_worker.ex +++ b/lib/kith/workers/monica_api_crawl_worker.ex @@ -15,6 +15,7 @@ defmodule Kith.Workers.MonicaApiCrawlWorker do alias Kith.Imports alias Kith.Imports.Sources.MonicaApi + alias Kith.Workers.DuplicateDetectionWorker @impl Oban.Worker def perform(%Oban.Job{args: %{"import_id" => import_id}}) do @@ -47,6 +48,9 @@ defmodule Kith.Workers.MonicaApiCrawlWorker do topic = "import:#{import_job.account_id}" Phoenix.PubSub.broadcast(Kith.PubSub, topic, {:import_complete, summary_map}) + # Trigger duplicate detection for newly imported contacts + Oban.insert(DuplicateDetectionWorker.new(%{account_id: import_job.account_id})) + Logger.info("MonicaApi import #{import_id} completed: #{inspect(summary_map)}") :ok else diff --git a/lib/kith_web/live/contact_live/duplicates.ex b/lib/kith_web/live/contact_live/duplicates.ex index 1dad4f5..36c951b 100644 --- a/lib/kith_web/live/contact_live/duplicates.ex +++ b/lib/kith_web/live/contact_live/duplicates.ex @@ -7,12 +7,16 @@ defmodule KithWeb.ContactLive.Duplicates do alias Kith.Policy alias Kith.Workers.DuplicateDetectionWorker + @page_size 20 + @impl true def mount(_params, _session, socket) do {:ok, socket |> assign(:page_title, "Duplicate Contacts") - |> assign(:candidates, [])} + |> assign(:candidates, []) + |> assign(:has_more, false) + |> assign(:total_count, 0)} end @impl true @@ -20,12 +24,15 @@ defmodule KithWeb.ContactLive.Duplicates do scope = socket.assigns.current_scope account_id = scope.account.id - candidates = DuplicateDetection.list_candidates(account_id) + candidates = DuplicateDetection.list_candidates(account_id, limit: @page_size) + total_count = DuplicateDetection.pending_count(account_id) {:noreply, socket |> assign(:account_id, account_id) - |> assign(:candidates, candidates)} + |> assign(:candidates, candidates) + |> assign(:total_count, total_count) + |> assign(:has_more, length(candidates) >= @page_size)} end @impl true @@ -35,15 +42,32 @@ defmodule KithWeb.ContactLive.Duplicates do {:ok, _} = DuplicateDetection.dismiss_candidate(candidate) - candidates = DuplicateDetection.list_candidates(socket.assigns.account_id) + candidates = Enum.reject(socket.assigns.candidates, &(&1.id == candidate.id)) + total_count = socket.assigns.total_count - 1 {:noreply, socket |> assign(:candidates, candidates) - |> assign(:pending_duplicates_count, length(candidates)) + |> assign(:total_count, total_count) + |> assign(:pending_duplicates_count, total_count) |> put_flash(:info, "Duplicate dismissed.")} end + def handle_event("load_more", _params, socket) do + offset = length(socket.assigns.candidates) + + more = + DuplicateDetection.list_candidates(socket.assigns.account_id, + limit: @page_size, + offset: offset + ) + + {:noreply, + socket + |> assign(:candidates, socket.assigns.candidates ++ more) + |> assign(:has_more, length(more) >= @page_size)} + end + def handle_event("scan", _params, socket) do user = socket.assigns.current_scope.user @@ -79,7 +103,7 @@ defmodule KithWeb.ContactLive.Duplicates do

Duplicate Contacts

- {length(@candidates)} potential duplicate{if length(@candidates) != 1, do: "s"} found + {@total_count} potential duplicate{if @total_count != 1, do: "s"} found

+ {candidate.duplicate_contact.display_name} + - <% end %> + +
+ <.link + navigate={ + ~p"/contacts/#{candidate.contact.id}/merge?with=#{candidate.duplicate_contact.id}&candidate_id=#{candidate.id}" + } + class="inline-flex items-center gap-1.5 rounded-[var(--radius-md)] bg-[var(--color-accent)] text-[var(--color-accent-foreground)] px-3 py-1.5 text-xs font-medium hover:bg-[var(--color-accent-hover)] transition-colors" + > + <.icon name="hero-arrows-right-left" class="size-4" /> Merge + + +
+ + +
+ +
<% end %> diff --git a/lib/kith_web/live/contact_live/index.ex b/lib/kith_web/live/contact_live/index.ex index c1f3b55..491b9b4 100644 --- a/lib/kith_web/live/contact_live/index.ex +++ b/lib/kith_web/live/contact_live/index.ex @@ -33,6 +33,8 @@ defmodule KithWeb.ContactLive.Index do |> assign(:meta, nil) |> assign(:tags, Contacts.list_tags(account_id)) |> assign(:candidates, []) + |> assign(:duplicates_total, 0) + |> assign(:duplicates_has_more, false) |> assign(:trashed_contacts, [])} end @@ -55,12 +57,18 @@ defmodule KithWeb.ContactLive.Index do |> load_contacts() end + @duplicates_page_size 20 + defp apply_action(socket, :duplicates, _params) do - candidates = DuplicateDetection.list_candidates(socket.assigns.account_id) + account_id = socket.assigns.account_id + candidates = DuplicateDetection.list_candidates(account_id, limit: @duplicates_page_size) + total_count = DuplicateDetection.pending_count(account_id) socket |> assign(:page_title, "Duplicate Contacts") |> assign(:candidates, candidates) + |> assign(:duplicates_total, total_count) + |> assign(:duplicates_has_more, length(candidates) >= @duplicates_page_size) end defp apply_action(socket, :trash, _params) do @@ -245,15 +253,32 @@ defmodule KithWeb.ContactLive.Index do {:ok, _} = DuplicateDetection.dismiss_candidate(candidate) - candidates = DuplicateDetection.list_candidates(socket.assigns.account_id) + candidates = Enum.reject(socket.assigns.candidates, &(&1.id == candidate.id)) + total = socket.assigns.duplicates_total - 1 {:noreply, socket |> assign(:candidates, candidates) - |> assign(:pending_duplicates_count, length(candidates)) + |> assign(:duplicates_total, total) + |> assign(:pending_duplicates_count, total) |> put_flash(:info, "Duplicate dismissed.")} end + def handle_event("load_more_duplicates", _params, socket) do + offset = length(socket.assigns.candidates) + + more = + DuplicateDetection.list_candidates(socket.assigns.account_id, + limit: @duplicates_page_size, + offset: offset + ) + + {:noreply, + socket + |> assign(:candidates, socket.assigns.candidates ++ more) + |> assign(:duplicates_has_more, length(more) >= @duplicates_page_size)} + end + def handle_event("scan", _params, socket) do user = socket.assigns.current_scope.user diff --git a/lib/kith_web/live/contact_live/index.html.heex b/lib/kith_web/live/contact_live/index.html.heex index 7e08df4..3db183f 100644 --- a/lib/kith_web/live/contact_live/index.html.heex +++ b/lib/kith_web/live/contact_live/index.html.heex @@ -381,7 +381,7 @@

- {length(@candidates)} potential duplicate{if length(@candidates) != 1, do: "s"} found + {@duplicates_total} potential duplicate{if @duplicates_total != 1, do: "s"} found

<% end %> + + <%= if @duplicates_has_more do %> +
+ +
+ <% end %> <% end %> <% end %> diff --git a/lib/kith_web/live/contact_live/merge.ex b/lib/kith_web/live/contact_live/merge.ex index 6bdad13..f45dbbc 100644 --- a/lib/kith_web/live/contact_live/merge.ex +++ b/lib/kith_web/live/contact_live/merge.ex @@ -150,13 +150,7 @@ defmodule KithWeb.ContactLive.Merge do end end - # ── Step 3: Preview ──────────────────────────────────────────────────── - - def handle_event("confirm-merge", _params, socket) do - {:noreply, assign(socket, :step, 4)} - end - - # ── Step 4: Execute ──────────────────────────────────────────────────── + # ── Step 3: Preview & Execute ─────────────────────────────────────────── def handle_event("execute-merge", _params, socket) do contact_a = socket.assigns.contact_a @@ -233,14 +227,14 @@ defmodule KithWeb.ContactLive.Merge do Merge Contacts - Step {@step} of 4 — {step_label(@step)} + Step {@step} of 3 — {step_label(@step)} <%!-- Step indicator (horizontal stepper) --%>
- Back - Confirm Merge -
- -
- - <%!-- Step 4: Confirm & Execute --%> -
- -
-

- Final Confirmation -

-

- This action cannot be easily undone. Are you sure you want to merge - - {@contact_b.display_name} - - into {@contact_a.display_name}? -

- -
- - Go Back - - - {if @merging, do: "Merging...", else: "Merge Contacts"} - -
+ Back + + {if @merging, do: "Merging...", else: "Merge Contacts"} +
@@ -463,8 +432,7 @@ defmodule KithWeb.ContactLive.Merge do defp step_label(1), do: "Select contact" defp step_label(2), do: "Choose fields" - defp step_label(3), do: "Preview" - defp step_label(4), do: "Confirm" + defp step_label(3), do: "Review & merge" defp default_field_choices do @mergeable_fields diff --git a/test/kith/workers/duplicate_detection_worker_test.exs b/test/kith/workers/duplicate_detection_worker_test.exs new file mode 100644 index 0000000..8f2728e --- /dev/null +++ b/test/kith/workers/duplicate_detection_worker_test.exs @@ -0,0 +1,601 @@ +defmodule Kith.Workers.DuplicateDetectionWorkerTest do + use Kith.DataCase, async: true + use Oban.Testing, repo: Kith.Repo + + import Kith.Factory + import Kith.ContactsFixtures + + alias Kith.Contacts.DuplicateCandidate + alias Kith.Workers.DuplicateDetectionWorker + + setup do + seed_reference_data!() + {account, _user} = setup_account() + + email_type = + Repo.one!( + from t in "contact_field_types", + where: t.protocol == "mailto:", + select: %{id: t.id}, + limit: 1 + ) + + phone_type = + Repo.one!( + from t in "contact_field_types", + where: t.protocol == "tel:", + select: %{id: t.id}, + limit: 1 + ) + + %{account: account, email_type_id: email_type.id, phone_type_id: phone_type.id} + end + + defp run_detection(account_id) do + perform_job(DuplicateDetectionWorker, %{account_id: account_id}) + end + + defp pending_candidates(account_id) do + DuplicateCandidate + |> where([d], d.account_id == ^account_id) + |> where([d], d.status == "pending") + |> order_by([d], desc: d.score) + |> Repo.all() + end + + describe "name matching" do + test "detects contacts with similar display names", %{account: account} do + insert(:contact, + account: account, + display_name: "John Smith", + first_name: "John", + last_name: "Smith" + ) + + insert(:contact, + account: account, + display_name: "John Smithe", + first_name: "John", + last_name: "Smithe" + ) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "name_match" in hd(candidates).reasons + assert hd(candidates).score >= 0.5 + end + + test "does not match dissimilar names", %{account: account} do + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + end + + describe "email matching" do + test "detects contacts sharing the same email", %{ + account: account, + email_type_id: email_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "email_match" in hd(candidates).reasons + assert hd(candidates).score >= 0.8 + end + + test "email matching is case-insensitive", %{account: account, email_type_id: email_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "SHARED@Example.COM"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "email_match" in hd(candidates).reasons + end + + test "email-only match scores around 0.85", %{account: account, email_type_id: email_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Completely Different", + first_name: "Completely", + last_name: "Different" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Totally Unique", + first_name: "Totally", + last_name: "Unique" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "same@email.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "same@email.com"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert hd(candidates).score == 0.85 + end + end + + describe "phone matching" do + test "detects contacts sharing the same phone number", %{ + account: account, + phone_type_id: phone_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, phone_type_id, %{"value" => "+1-555-1234"}) + contact_field_fixture(c2, phone_type_id, %{"value" => "+1-555-1234"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "phone_match" in hd(candidates).reasons + assert hd(candidates).score >= 0.7 + end + + test "phone matching normalizes formatting", %{account: account, phone_type_id: phone_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, phone_type_id, %{"value" => "+1-555-1234"}) + contact_field_fixture(c2, phone_type_id, %{"value" => "15551234"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "phone_match" in hd(candidates).reasons + end + + test "phone-only match scores 0.75", %{account: account, phone_type_id: phone_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Completely Different", + first_name: "Completely", + last_name: "Different" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Totally Unique", + first_name: "Totally", + last_name: "Unique" + ) + + contact_field_fixture(c1, phone_type_id, %{"value" => "5559876"}) + contact_field_fixture(c2, phone_type_id, %{"value" => "5559876"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert hd(candidates).score == 0.75 + end + end + + describe "address matching" do + test "detects contacts sharing the same address", %{account: account} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + address_fixture(c1, %{"line1" => "123 Main St", "postal_code" => "90210"}) + address_fixture(c2, %{"line1" => "123 Main St", "postal_code" => "90210"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "address_match" in hd(candidates).reasons + assert hd(candidates).score == 0.6 + end + + test "address matching is case-insensitive and trims whitespace", %{account: account} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + address_fixture(c1, %{"line1" => " 123 Main St ", "postal_code" => "90210"}) + address_fixture(c2, %{"line1" => "123 MAIN ST", "postal_code" => "90210"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "address_match" in hd(candidates).reasons + end + + test "does not match on postal_code alone", %{account: account} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + address_fixture(c1, %{"line1" => "123 Main St", "postal_code" => "90210"}) + address_fixture(c2, %{"line1" => "456 Oak Ave", "postal_code" => "90210"}) + + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + end + + describe "combined signals" do + test "email + name match scores higher than email alone", %{ + account: account, + email_type_id: email_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "John Smith", + first_name: "John", + last_name: "Smith" + ) + + c2 = + insert(:contact, + account: account, + display_name: "John Smithe", + first_name: "John", + last_name: "Smithe" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "john@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "john@example.com"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + candidate = hd(candidates) + assert "name_match" in candidate.reasons + assert "email_match" in candidate.reasons + # email base (0.85) + bonus for name signal (0.05) = 0.90 + assert candidate.score > 0.85 + end + + test "email + phone match boosts score", %{ + account: account, + email_type_id: email_type_id, + phone_type_id: phone_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Completely Different", + first_name: "Completely", + last_name: "Different" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Totally Unique", + first_name: "Totally", + last_name: "Unique" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "same@email.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "same@email.com"}) + contact_field_fixture(c1, phone_type_id, %{"value" => "5551234"}) + contact_field_fixture(c2, phone_type_id, %{"value" => "5551234"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + candidate = hd(candidates) + assert "email_match" in candidate.reasons + assert "phone_match" in candidate.reasons + # email base (0.85) + 1 bonus (0.05) = 0.90 + assert candidate.score == 0.9 + end + end + + describe "edge cases" do + test "skips soft-deleted contacts", %{account: account, email_type_id: email_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams", + deleted_at: DateTime.utc_now(:second) + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + + test "does not re-insert existing pending candidates", %{ + account: account, + email_type_id: email_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + # First run + assert :ok = run_detection(account.id) + assert length(pending_candidates(account.id)) == 1 + + # Second run should not create duplicates + assert :ok = run_detection(account.id) + assert length(pending_candidates(account.id)) == 1 + end + + test "does not re-insert dismissed candidates", %{ + account: account, + email_type_id: email_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + # First run, then dismiss + assert :ok = run_detection(account.id) + [candidate] = pending_candidates(account.id) + Kith.DuplicateDetection.dismiss_candidate(candidate) + + # Second run should not re-create dismissed candidate + assert :ok = run_detection(account.id) + assert pending_candidates(account.id) == [] + end + + test "account isolation — only detects within same account", %{email_type_id: email_type_id} do + {account1, _} = setup_account() + {account2, _} = setup_account() + + c1 = + insert(:contact, + account: account1, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account2, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + assert :ok = run_detection(account1.id) + assert :ok = run_detection(account2.id) + + assert pending_candidates(account1.id) == [] + assert pending_candidates(account2.id) == [] + end + + test "handles fewer than 2 contacts gracefully", %{account: account} do + insert(:contact, + account: account, + display_name: "Only Contact", + first_name: "Only", + last_name: "Contact" + ) + + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + + test "handles zero contacts gracefully", %{account: account} do + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + end + + describe "cron mode" do + test "runs for all accounts when no account_id provided" do + {account1, _} = setup_account() + {account2, _} = setup_account() + + insert(:contact, + account: account1, + display_name: "John Smith", + first_name: "John", + last_name: "Smith" + ) + + insert(:contact, + account: account1, + display_name: "John Smithe", + first_name: "John", + last_name: "Smithe" + ) + + insert(:contact, + account: account2, + display_name: "Jane Doe", + first_name: "Jane", + last_name: "Doe" + ) + + insert(:contact, + account: account2, + display_name: "Jane Doee", + first_name: "Jane", + last_name: "Doee" + ) + + assert :ok = perform_job(DuplicateDetectionWorker, %{}) + + assert length(pending_candidates(account1.id)) == 1 + assert length(pending_candidates(account2.id)) == 1 + end + end +end