From 38cadb8cdc82a7765b3c8edaa333208ab4845f5b Mon Sep 17 00:00:00 2001 From: Bashar Qassis <23612682+bashar-qassis@users.noreply.github.com> Date: Sat, 4 Apr 2026 18:03:14 +0300 Subject: [PATCH 1/4] fix: overhaul duplicate detection scoring, add address matching, trigger after imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The duplicate detection worker had several bugs preventing it from catching obvious duplicates: - Scoring formula (name*0.4 + email*0.35 + phone*0.25 with threshold 0.4) meant contacts sharing the same email but with different names scored 0.35, below the threshold — silently missed. - Email comparison was case-sensitive. - Only one side of email/phone field pairs had its type verified. - Address data was completely ignored. - No import worker triggered duplicate detection after completion. Fixes: - Replace additive scoring with max-signal + bonus approach where each signal independently qualifies (email=0.85, phone=0.75, address=0.60, name=similarity) - Add case-insensitive email matching via LOWER() fragments - Filter both cf1 and cf2 contact_field_types in email/phone queries - Use LIKE 'mailto%' pattern to handle protocol colon inconsistency - Add address matching on normalized line1 + postal_code - Enqueue DuplicateDetectionWorker after successful completion in all three import workers (MonicaApiCrawlWorker, ImportSourceWorker, ImportWorker) - Add comprehensive test suite (20 tests) for the detection worker --- .../workers/duplicate_detection_worker.ex | 133 ++-- lib/kith/workers/import_source_worker.ex | 4 + lib/kith/workers/import_worker.ex | 4 + lib/kith/workers/monica_api_crawl_worker.ex | 4 + .../duplicate_detection_worker_test.exs | 601 ++++++++++++++++++ 5 files changed, 704 insertions(+), 42 deletions(-) create mode 100644 test/kith/workers/duplicate_detection_worker_test.exs diff --git a/lib/kith/workers/duplicate_detection_worker.ex b/lib/kith/workers/duplicate_detection_worker.ex index 8e67e29..efea61d 100644 --- a/lib/kith/workers/duplicate_detection_worker.ex +++ b/lib/kith/workers/duplicate_detection_worker.ex @@ -5,9 +5,18 @@ defmodule Kith.Workers.DuplicateDetectionWorker do Detection algorithm: 1. Name similarity via pg_trgm similarity() on display_name (threshold: 0.5) - 2. Exact email match across contact_fields - 3. Exact phone match across contact_fields - 4. Weighted score: name(0.4) + email(0.35) + phone(0.25) + 2. Case-insensitive email match across contact_fields + 3. Normalized phone match across contact_fields (digits only) + 4. Address match on line1 + postal_code + + Scoring (max-signal + bonus): + Each signal has an independent base score: + - email_match: 0.85 + - phone_match: 0.75 + - address_match: 0.60 + - name_match: the raw pg_trgm similarity (> 0.5) + Final score = max(base scores) + 0.05 per additional signal, capped at 1.0 + Threshold: >= 0.5 """ use Oban.Worker, @@ -15,7 +24,7 @@ defmodule Kith.Workers.DuplicateDetectionWorker do max_attempts: 3 import Ecto.Query - alias Kith.Contacts.{Contact, ContactField, DuplicateCandidate} + alias Kith.Contacts.{Address, Contact, ContactField, ContactFieldType, DuplicateCandidate} alias Kith.Repo @impl Oban.Worker @@ -36,33 +45,25 @@ defmodule Kith.Workers.DuplicateDetectionWorker do end defp detect_duplicates(account_id) do - # Get active contacts for this account - contacts = + contact_count = Contact |> where([c], c.account_id == ^account_id) |> where([c], is_nil(c.deleted_at)) - |> select([c], %{id: c.id, display_name: c.display_name}) - |> Repo.all() + |> Repo.aggregate(:count) - if length(contacts) < 2, do: :ok, else: find_duplicates(account_id, contacts) + if contact_count >= 2, do: find_duplicates(account_id) end - defp find_duplicates(account_id, _contacts) do - # Find name-based duplicates using pg_trgm + defp find_duplicates(account_id) do name_matches = find_name_matches(account_id) - - # Find email-based duplicates email_matches = find_email_matches(account_id) - - # Find phone-based duplicates phone_matches = find_phone_matches(account_id) + address_matches = find_address_matches(account_id) - # Merge and score all matches all_pairs = - merge_matches(name_matches, email_matches, phone_matches) - |> Enum.filter(fn {_pair, score, _reasons} -> score >= 0.4 end) + merge_matches(name_matches, email_matches, phone_matches, address_matches) + |> Enum.filter(fn {_pair, score, _reasons} -> score >= 0.5 end) - # Get existing pending/dismissed candidates to avoid re-inserting existing = DuplicateCandidate |> where([d], d.account_id == ^account_id) @@ -73,9 +74,7 @@ defmodule Kith.Workers.DuplicateDetectionWorker do now = DateTime.utc_now() |> DateTime.truncate(:second) - # Insert new candidates Enum.each(all_pairs, fn {{id1, id2}, score, reasons} -> - # Canonicalize: smaller id first {contact_id, dup_id} = if id1 < id2, do: {id1, id2}, else: {id2, id1} unless MapSet.member?(existing, {contact_id, dup_id}) do @@ -93,7 +92,6 @@ defmodule Kith.Workers.DuplicateDetectionWorker do end defp find_name_matches(account_id) do - # Use pg_trgm similarity for fuzzy name matching query = """ SELECT c1.id AS id1, c2.id AS id2, similarity(c1.display_name, c2.display_name) AS sim FROM contacts c1 @@ -102,6 +100,8 @@ defmodule Kith.Workers.DuplicateDetectionWorker do WHERE c1.account_id = $1 AND c1.deleted_at IS NULL AND c2.deleted_at IS NULL + AND c1.display_name IS NOT NULL AND c1.display_name != '' + AND c2.display_name IS NOT NULL AND c2.display_name != '' AND similarity(c1.display_name, c2.display_name) > 0.5 ORDER BY sim DESC LIMIT 500 @@ -119,55 +119,94 @@ defmodule Kith.Workers.DuplicateDetectionWorker do end defp find_email_matches(account_id) do - # Find contacts that share an exact email address + # Case-insensitive email match, both fields verified as email type query = from cf1 in ContactField, join: cf2 in ContactField, - on: cf1.value == cf2.value and cf1.id < cf2.id, - join: cft in assoc(cf1, :contact_field_type), + on: + fragment("LOWER(?)", cf1.value) == fragment("LOWER(?)", cf2.value) and + cf1.id < cf2.id, + join: cft1 in ContactFieldType, + on: cf1.contact_field_type_id == cft1.id, + join: cft2 in ContactFieldType, + on: cf2.contact_field_type_id == cft2.id, where: cf1.account_id == ^account_id, where: cf2.account_id == ^account_id, - where: cft.protocol == "mailto:", + where: fragment("? LIKE 'mailto%'", cft1.protocol), + where: fragment("? LIKE 'mailto%'", cft2.protocol), where: cf1.contact_id != cf2.contact_id, + where: cf1.value != "" and not is_nil(cf1.value), select: {cf1.contact_id, cf2.contact_id} query |> Repo.all() - |> Enum.uniq() |> Enum.map(fn {id1, id2} -> - {id1, id2} = if id1 < id2, do: {id1, id2}, else: {id2, id1} - {{id1, id2}, 1.0, ["email_match"]} + if id1 < id2, do: {id1, id2}, else: {id2, id1} end) - |> Enum.uniq_by(fn {pair, _, _} -> pair end) + |> Enum.uniq() + |> Enum.map(fn {id1, id2} -> {{id1, id2}, 1.0, ["email_match"]} end) end defp find_phone_matches(account_id) do - # Find contacts that share an exact phone number (normalized: digits only) + # Normalized phone match (digits only), both fields verified as phone type query = from cf1 in ContactField, join: cf2 in ContactField, on: fragment("regexp_replace(?, '[^0-9]', '', 'g')", cf1.value) == - fragment("regexp_replace(?, '[^0-9]', '', 'g')", cf2.value) and cf1.id < cf2.id, - join: cft in assoc(cf1, :contact_field_type), + fragment("regexp_replace(?, '[^0-9]', '', 'g')", cf2.value) and + cf1.id < cf2.id, + join: cft1 in ContactFieldType, + on: cf1.contact_field_type_id == cft1.id, + join: cft2 in ContactFieldType, + on: cf2.contact_field_type_id == cft2.id, where: cf1.account_id == ^account_id, where: cf2.account_id == ^account_id, - where: cft.protocol == "tel:", + where: fragment("? LIKE 'tel%'", cft1.protocol), + where: fragment("? LIKE 'tel%'", cft2.protocol), where: cf1.contact_id != cf2.contact_id, + where: cf1.value != "" and not is_nil(cf1.value), select: {cf1.contact_id, cf2.contact_id} query |> Repo.all() + |> Enum.map(fn {id1, id2} -> + if id1 < id2, do: {id1, id2}, else: {id2, id1} + end) |> Enum.uniq() + |> Enum.map(fn {id1, id2} -> {{id1, id2}, 1.0, ["phone_match"]} end) + end + + defp find_address_matches(account_id) do + # Match on normalized line1 + postal_code + query = + from a1 in Address, + join: a2 in Address, + on: + fragment("LOWER(TRIM(?))", a1.line1) == fragment("LOWER(TRIM(?))", a2.line1) and + fragment("LOWER(TRIM(?))", a1.postal_code) == + fragment("LOWER(TRIM(?))", a2.postal_code) and + a1.id < a2.id, + where: a1.account_id == ^account_id, + where: a2.account_id == ^account_id, + where: a1.contact_id != a2.contact_id, + where: a1.line1 != "" and not is_nil(a1.line1), + where: a1.postal_code != "" and not is_nil(a1.postal_code), + where: a2.line1 != "" and not is_nil(a2.line1), + where: a2.postal_code != "" and not is_nil(a2.postal_code), + select: {a1.contact_id, a2.contact_id} + + query + |> Repo.all() |> Enum.map(fn {id1, id2} -> - {id1, id2} = if id1 < id2, do: {id1, id2}, else: {id2, id1} - {{id1, id2}, 1.0, ["phone_match"]} + if id1 < id2, do: {id1, id2}, else: {id2, id1} end) - |> Enum.uniq_by(fn {pair, _, _} -> pair end) + |> Enum.uniq() + |> Enum.map(fn {id1, id2} -> {{id1, id2}, 1.0, ["address_match"]} end) end - defp merge_matches(name_matches, email_matches, phone_matches) do - (name_matches ++ email_matches ++ phone_matches) + defp merge_matches(name_matches, email_matches, phone_matches, address_matches) do + (name_matches ++ email_matches ++ phone_matches ++ address_matches) |> Enum.group_by(fn {pair, _score, _reasons} -> pair end) |> Enum.map(&compute_merged_score/1) end @@ -176,9 +215,19 @@ defmodule Kith.Workers.DuplicateDetectionWorker do reasons = matches |> Enum.flat_map(fn {_, _, r} -> r end) |> Enum.uniq() name_sim = Enum.find_value(matches, 0.0, &extract_name_score/1) - email_weight = if "email_match" in reasons, do: 0.35, else: 0.0 - phone_weight = if "phone_match" in reasons, do: 0.25, else: 0.0 - score = min(name_sim * 0.4 + email_weight + phone_weight, 1.0) + # Base score for each signal type + base_scores = + [] + |> then(fn acc -> if "email_match" in reasons, do: [0.85 | acc], else: acc end) + |> then(fn acc -> if "phone_match" in reasons, do: [0.75 | acc], else: acc end) + |> then(fn acc -> if "address_match" in reasons, do: [0.60 | acc], else: acc end) + |> then(fn acc -> if name_sim > 0.0, do: [name_sim | acc], else: acc end) + + signal_count = length(base_scores) + max_score = Enum.max(base_scores, fn -> 0.0 end) + bonus = max(signal_count - 1, 0) * 0.05 + + score = min(max_score + bonus, 1.0) {pair, Float.round(score, 2), reasons} end diff --git a/lib/kith/workers/import_source_worker.ex b/lib/kith/workers/import_source_worker.ex index 6cdf883..d5feaa6 100644 --- a/lib/kith/workers/import_source_worker.ex +++ b/lib/kith/workers/import_source_worker.ex @@ -11,6 +11,7 @@ defmodule Kith.Workers.ImportSourceWorker do alias Kith.Imports alias Kith.Storage + alias Kith.Workers.DuplicateDetectionWorker @impl Oban.Worker def perform(%Oban.Job{args: %{"import_id" => import_id}}) do @@ -33,6 +34,9 @@ defmodule Kith.Workers.ImportSourceWorker do topic = "import:#{import.account_id}" Phoenix.PubSub.broadcast(Kith.PubSub, topic, {:import_complete, summary_map}) + # Trigger duplicate detection for newly imported contacts + Oban.insert(DuplicateDetectionWorker.new(%{account_id: import.account_id})) + Logger.info("Import #{import_id} completed: #{inspect(summary_map)}") :ok else diff --git a/lib/kith/workers/import_worker.ex b/lib/kith/workers/import_worker.ex index 790620e..68dde77 100644 --- a/lib/kith/workers/import_worker.ex +++ b/lib/kith/workers/import_worker.ex @@ -11,6 +11,7 @@ defmodule Kith.Workers.ImportWorker do alias Kith.Contacts alias Kith.VCard.Parser + alias Kith.Workers.DuplicateDetectionWorker @impl Oban.Worker def perform(%Oban.Job{ @@ -42,6 +43,9 @@ defmodule Kith.Workers.ImportWorker do {:import_complete, results} ) + # Trigger duplicate detection for newly imported contacts + Oban.insert(DuplicateDetectionWorker.new(%{account_id: account_id})) + Logger.info( "vCard import complete for account #{account_id}: " <> "#{results.imported} imported, #{results.skipped} skipped" diff --git a/lib/kith/workers/monica_api_crawl_worker.ex b/lib/kith/workers/monica_api_crawl_worker.ex index b5355ba..f366140 100644 --- a/lib/kith/workers/monica_api_crawl_worker.ex +++ b/lib/kith/workers/monica_api_crawl_worker.ex @@ -15,6 +15,7 @@ defmodule Kith.Workers.MonicaApiCrawlWorker do alias Kith.Imports alias Kith.Imports.Sources.MonicaApi + alias Kith.Workers.DuplicateDetectionWorker @impl Oban.Worker def perform(%Oban.Job{args: %{"import_id" => import_id}}) do @@ -47,6 +48,9 @@ defmodule Kith.Workers.MonicaApiCrawlWorker do topic = "import:#{import_job.account_id}" Phoenix.PubSub.broadcast(Kith.PubSub, topic, {:import_complete, summary_map}) + # Trigger duplicate detection for newly imported contacts + Oban.insert(DuplicateDetectionWorker.new(%{account_id: import_job.account_id})) + Logger.info("MonicaApi import #{import_id} completed: #{inspect(summary_map)}") :ok else diff --git a/test/kith/workers/duplicate_detection_worker_test.exs b/test/kith/workers/duplicate_detection_worker_test.exs new file mode 100644 index 0000000..8f2728e --- /dev/null +++ b/test/kith/workers/duplicate_detection_worker_test.exs @@ -0,0 +1,601 @@ +defmodule Kith.Workers.DuplicateDetectionWorkerTest do + use Kith.DataCase, async: true + use Oban.Testing, repo: Kith.Repo + + import Kith.Factory + import Kith.ContactsFixtures + + alias Kith.Contacts.DuplicateCandidate + alias Kith.Workers.DuplicateDetectionWorker + + setup do + seed_reference_data!() + {account, _user} = setup_account() + + email_type = + Repo.one!( + from t in "contact_field_types", + where: t.protocol == "mailto:", + select: %{id: t.id}, + limit: 1 + ) + + phone_type = + Repo.one!( + from t in "contact_field_types", + where: t.protocol == "tel:", + select: %{id: t.id}, + limit: 1 + ) + + %{account: account, email_type_id: email_type.id, phone_type_id: phone_type.id} + end + + defp run_detection(account_id) do + perform_job(DuplicateDetectionWorker, %{account_id: account_id}) + end + + defp pending_candidates(account_id) do + DuplicateCandidate + |> where([d], d.account_id == ^account_id) + |> where([d], d.status == "pending") + |> order_by([d], desc: d.score) + |> Repo.all() + end + + describe "name matching" do + test "detects contacts with similar display names", %{account: account} do + insert(:contact, + account: account, + display_name: "John Smith", + first_name: "John", + last_name: "Smith" + ) + + insert(:contact, + account: account, + display_name: "John Smithe", + first_name: "John", + last_name: "Smithe" + ) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "name_match" in hd(candidates).reasons + assert hd(candidates).score >= 0.5 + end + + test "does not match dissimilar names", %{account: account} do + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + end + + describe "email matching" do + test "detects contacts sharing the same email", %{ + account: account, + email_type_id: email_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "email_match" in hd(candidates).reasons + assert hd(candidates).score >= 0.8 + end + + test "email matching is case-insensitive", %{account: account, email_type_id: email_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "SHARED@Example.COM"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "email_match" in hd(candidates).reasons + end + + test "email-only match scores around 0.85", %{account: account, email_type_id: email_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Completely Different", + first_name: "Completely", + last_name: "Different" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Totally Unique", + first_name: "Totally", + last_name: "Unique" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "same@email.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "same@email.com"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert hd(candidates).score == 0.85 + end + end + + describe "phone matching" do + test "detects contacts sharing the same phone number", %{ + account: account, + phone_type_id: phone_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, phone_type_id, %{"value" => "+1-555-1234"}) + contact_field_fixture(c2, phone_type_id, %{"value" => "+1-555-1234"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "phone_match" in hd(candidates).reasons + assert hd(candidates).score >= 0.7 + end + + test "phone matching normalizes formatting", %{account: account, phone_type_id: phone_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, phone_type_id, %{"value" => "+1-555-1234"}) + contact_field_fixture(c2, phone_type_id, %{"value" => "15551234"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "phone_match" in hd(candidates).reasons + end + + test "phone-only match scores 0.75", %{account: account, phone_type_id: phone_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Completely Different", + first_name: "Completely", + last_name: "Different" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Totally Unique", + first_name: "Totally", + last_name: "Unique" + ) + + contact_field_fixture(c1, phone_type_id, %{"value" => "5559876"}) + contact_field_fixture(c2, phone_type_id, %{"value" => "5559876"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert hd(candidates).score == 0.75 + end + end + + describe "address matching" do + test "detects contacts sharing the same address", %{account: account} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + address_fixture(c1, %{"line1" => "123 Main St", "postal_code" => "90210"}) + address_fixture(c2, %{"line1" => "123 Main St", "postal_code" => "90210"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "address_match" in hd(candidates).reasons + assert hd(candidates).score == 0.6 + end + + test "address matching is case-insensitive and trims whitespace", %{account: account} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + address_fixture(c1, %{"line1" => " 123 Main St ", "postal_code" => "90210"}) + address_fixture(c2, %{"line1" => "123 MAIN ST", "postal_code" => "90210"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + assert "address_match" in hd(candidates).reasons + end + + test "does not match on postal_code alone", %{account: account} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + address_fixture(c1, %{"line1" => "123 Main St", "postal_code" => "90210"}) + address_fixture(c2, %{"line1" => "456 Oak Ave", "postal_code" => "90210"}) + + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + end + + describe "combined signals" do + test "email + name match scores higher than email alone", %{ + account: account, + email_type_id: email_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "John Smith", + first_name: "John", + last_name: "Smith" + ) + + c2 = + insert(:contact, + account: account, + display_name: "John Smithe", + first_name: "John", + last_name: "Smithe" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "john@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "john@example.com"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + candidate = hd(candidates) + assert "name_match" in candidate.reasons + assert "email_match" in candidate.reasons + # email base (0.85) + bonus for name signal (0.05) = 0.90 + assert candidate.score > 0.85 + end + + test "email + phone match boosts score", %{ + account: account, + email_type_id: email_type_id, + phone_type_id: phone_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Completely Different", + first_name: "Completely", + last_name: "Different" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Totally Unique", + first_name: "Totally", + last_name: "Unique" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "same@email.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "same@email.com"}) + contact_field_fixture(c1, phone_type_id, %{"value" => "5551234"}) + contact_field_fixture(c2, phone_type_id, %{"value" => "5551234"}) + + assert :ok = run_detection(account.id) + + candidates = pending_candidates(account.id) + assert length(candidates) == 1 + candidate = hd(candidates) + assert "email_match" in candidate.reasons + assert "phone_match" in candidate.reasons + # email base (0.85) + 1 bonus (0.05) = 0.90 + assert candidate.score == 0.9 + end + end + + describe "edge cases" do + test "skips soft-deleted contacts", %{account: account, email_type_id: email_type_id} do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams", + deleted_at: DateTime.utc_now(:second) + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + + test "does not re-insert existing pending candidates", %{ + account: account, + email_type_id: email_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + # First run + assert :ok = run_detection(account.id) + assert length(pending_candidates(account.id)) == 1 + + # Second run should not create duplicates + assert :ok = run_detection(account.id) + assert length(pending_candidates(account.id)) == 1 + end + + test "does not re-insert dismissed candidates", %{ + account: account, + email_type_id: email_type_id + } do + c1 = + insert(:contact, + account: account, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + # First run, then dismiss + assert :ok = run_detection(account.id) + [candidate] = pending_candidates(account.id) + Kith.DuplicateDetection.dismiss_candidate(candidate) + + # Second run should not re-create dismissed candidate + assert :ok = run_detection(account.id) + assert pending_candidates(account.id) == [] + end + + test "account isolation — only detects within same account", %{email_type_id: email_type_id} do + {account1, _} = setup_account() + {account2, _} = setup_account() + + c1 = + insert(:contact, + account: account1, + display_name: "Alice Johnson", + first_name: "Alice", + last_name: "Johnson" + ) + + c2 = + insert(:contact, + account: account2, + display_name: "Bob Williams", + first_name: "Bob", + last_name: "Williams" + ) + + contact_field_fixture(c1, email_type_id, %{"value" => "shared@example.com"}) + contact_field_fixture(c2, email_type_id, %{"value" => "shared@example.com"}) + + assert :ok = run_detection(account1.id) + assert :ok = run_detection(account2.id) + + assert pending_candidates(account1.id) == [] + assert pending_candidates(account2.id) == [] + end + + test "handles fewer than 2 contacts gracefully", %{account: account} do + insert(:contact, + account: account, + display_name: "Only Contact", + first_name: "Only", + last_name: "Contact" + ) + + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + + test "handles zero contacts gracefully", %{account: account} do + assert :ok = run_detection(account.id) + + assert pending_candidates(account.id) == [] + end + end + + describe "cron mode" do + test "runs for all accounts when no account_id provided" do + {account1, _} = setup_account() + {account2, _} = setup_account() + + insert(:contact, + account: account1, + display_name: "John Smith", + first_name: "John", + last_name: "Smith" + ) + + insert(:contact, + account: account1, + display_name: "John Smithe", + first_name: "John", + last_name: "Smithe" + ) + + insert(:contact, + account: account2, + display_name: "Jane Doe", + first_name: "Jane", + last_name: "Doe" + ) + + insert(:contact, + account: account2, + display_name: "Jane Doee", + first_name: "Jane", + last_name: "Doee" + ) + + assert :ok = perform_job(DuplicateDetectionWorker, %{}) + + assert length(pending_candidates(account1.id)) == 1 + assert length(pending_candidates(account2.id)) == 1 + end + end +end From 38cba058db077fdfa0bd5500e92f2d3b5ac78ff6 Mon Sep 17 00:00:00 2001 From: Bashar Qassis <23612682+bashar-qassis@users.noreply.github.com> Date: Sat, 4 Apr 2026 19:51:21 +0300 Subject: [PATCH 2/4] fix: paginate duplicates page to prevent timeout on large result sets list_candidates now takes limit/offset opts (default 20 per page). The LiveView loads one page at a time with a "Load more" button. Dismiss removes the candidate from the current list without reloading. --- lib/kith/duplicate_detection.ex | 6 + lib/kith_web/live/contact_live/duplicates.ex | 150 ++++++++++++------- 2 files changed, 98 insertions(+), 58 deletions(-) diff --git a/lib/kith/duplicate_detection.ex b/lib/kith/duplicate_detection.ex index a7599d0..359417e 100644 --- a/lib/kith/duplicate_detection.ex +++ b/lib/kith/duplicate_detection.ex @@ -4,13 +4,19 @@ defmodule Kith.DuplicateDetection do alias Kith.Contacts.DuplicateCandidate alias Kith.Repo + @default_page_size 20 + def list_candidates(account_id, opts \\ []) do status = Keyword.get(opts, :status, "pending") + limit = Keyword.get(opts, :limit, @default_page_size) + offset = Keyword.get(opts, :offset, 0) DuplicateCandidate |> scope_to_account(account_id) |> where([d], d.status == ^status) |> order_by([d], desc: d.score) + |> limit(^limit) + |> offset(^offset) |> Repo.all() |> Repo.preload([:contact, :duplicate_contact]) end diff --git a/lib/kith_web/live/contact_live/duplicates.ex b/lib/kith_web/live/contact_live/duplicates.ex index 1dad4f5..36c951b 100644 --- a/lib/kith_web/live/contact_live/duplicates.ex +++ b/lib/kith_web/live/contact_live/duplicates.ex @@ -7,12 +7,16 @@ defmodule KithWeb.ContactLive.Duplicates do alias Kith.Policy alias Kith.Workers.DuplicateDetectionWorker + @page_size 20 + @impl true def mount(_params, _session, socket) do {:ok, socket |> assign(:page_title, "Duplicate Contacts") - |> assign(:candidates, [])} + |> assign(:candidates, []) + |> assign(:has_more, false) + |> assign(:total_count, 0)} end @impl true @@ -20,12 +24,15 @@ defmodule KithWeb.ContactLive.Duplicates do scope = socket.assigns.current_scope account_id = scope.account.id - candidates = DuplicateDetection.list_candidates(account_id) + candidates = DuplicateDetection.list_candidates(account_id, limit: @page_size) + total_count = DuplicateDetection.pending_count(account_id) {:noreply, socket |> assign(:account_id, account_id) - |> assign(:candidates, candidates)} + |> assign(:candidates, candidates) + |> assign(:total_count, total_count) + |> assign(:has_more, length(candidates) >= @page_size)} end @impl true @@ -35,15 +42,32 @@ defmodule KithWeb.ContactLive.Duplicates do {:ok, _} = DuplicateDetection.dismiss_candidate(candidate) - candidates = DuplicateDetection.list_candidates(socket.assigns.account_id) + candidates = Enum.reject(socket.assigns.candidates, &(&1.id == candidate.id)) + total_count = socket.assigns.total_count - 1 {:noreply, socket |> assign(:candidates, candidates) - |> assign(:pending_duplicates_count, length(candidates)) + |> assign(:total_count, total_count) + |> assign(:pending_duplicates_count, total_count) |> put_flash(:info, "Duplicate dismissed.")} end + def handle_event("load_more", _params, socket) do + offset = length(socket.assigns.candidates) + + more = + DuplicateDetection.list_candidates(socket.assigns.account_id, + limit: @page_size, + offset: offset + ) + + {:noreply, + socket + |> assign(:candidates, socket.assigns.candidates ++ more) + |> assign(:has_more, length(more) >= @page_size)} + end + def handle_event("scan", _params, socket) do user = socket.assigns.current_scope.user @@ -79,7 +103,7 @@ defmodule KithWeb.ContactLive.Duplicates do
- {length(@candidates)} potential duplicate{if length(@candidates) != 1, do: "s"} found + {@total_count} potential duplicate{if @total_count != 1, do: "s"} found
- {length(@candidates)} potential duplicate{if length(@candidates) != 1, do: "s"} found + {@duplicates_total} potential duplicate{if @duplicates_total != 1, do: "s"} found
- This action cannot be easily undone. Are you sure you want to merge - - {@contact_b.display_name} - - into {@contact_a.display_name}? -
- -