diff --git a/runner/runner.go b/runner/runner.go index ae90cfbc..d0150661 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -95,7 +95,7 @@ type Runner struct { browser *Browser ditClassifier *dit.Classifier pHashClusters []pHashCluster - simHashes gcache.Cache[uint64, struct{}] // Include simHashes for efficient duplicate detection + simHashes gcache.Cache[uint64, []string] httpApiEndpoint *Server authProvider authprovider.AuthProvider interruptCh chan struct{} @@ -430,7 +430,7 @@ func New(options *Options) (*Runner, error) { runner.HostErrorsCache = gc } - runner.simHashes = gcache.New[uint64, struct{}](1000).ARC().Build() + runner.simHashes = gcache.New[uint64, []string](1000).ARC().Build() if options.JSONOutput || options.CSVOutput || len(options.OutputFilterPageType) > 0 { ditClassifier, err := dit.New() if err != nil { @@ -639,19 +639,21 @@ func (r *Runner) seen(k string) bool { func (r *Runner) duplicate(result *Result) bool { respSimHash := simhash.Simhash(simhash.NewWordFeatureSet(converstionutil.Bytes(result.Raw))) - if r.simHashes.Has(respSimHash) { - gologger.Debug().Msgf("Skipping duplicate response with simhash %d for URL %s\n", respSimHash, result.URL) - return true - } + ip := result.HostIP - for simHash := range r.simHashes.GetALL(false) { - // lower threshold for increased precision - if simhash.Compare(simHash, respSimHash) <= 3 { - gologger.Debug().Msgf("Skipping near-duplicate response with simhash %d for URL %s\n", respSimHash, result.URL) + for storedHash, storedIPs := range r.simHashes.GetALL(false) { + if simhash.Compare(storedHash, respSimHash) > 3 { + continue + } + if ip == "" || sliceutil.Contains(storedIPs, ip) { + gologger.Debug().Msgf("Skipping duplicate response (simhash %d, ip %s) for URL %s\n", respSimHash, ip, result.URL) return true } + _ = r.simHashes.Set(storedHash, append(storedIPs, ip)) + return false } - _ = r.simHashes.Set(respSimHash, struct{}{}) + + _ = r.simHashes.Set(respSimHash, []string{ip}) return false } diff --git a/runner/runner_test.go b/runner/runner_test.go index bd96b81d..415a6f51 100644 --- a/runner/runner_test.go +++ b/runner/runner_test.go @@ -386,6 +386,147 @@ func TestRunner_testAndSet_concurrent(t *testing.T) { require.Equal(t, 1, winCount, "exactly one goroutine should win testAndSet for the same key") } +func TestRunner_duplicate(t *testing.T) { + const ( + pageA = "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\nWelcomeHello world default page content here" + pageB = "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\nDashboardCompletely different application running on this server" + ) + + t.Run("same content same IP is duplicate", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"} + second := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://b.example.com"} + + require.False(t, r.duplicate(first), "first result should not be duplicate") + require.True(t, r.duplicate(second), "same content + same IP should be duplicate") + }) + + t.Run("same content different IP is NOT duplicate", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"} + second := &Result{Raw: pageA, HostIP: "2.2.2.2", URL: "https://b.example.com"} + + require.False(t, r.duplicate(first)) + require.False(t, r.duplicate(second), "same content but different IP should NOT be duplicate") + }) + + t.Run("different content same IP is NOT duplicate", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"} + second := &Result{Raw: pageB, HostIP: "1.1.1.1", URL: "https://b.example.com"} + + require.False(t, r.duplicate(first)) + require.False(t, r.duplicate(second), "different content on same IP should NOT be duplicate") + }) + + t.Run("different content different IP is NOT duplicate", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"} + second := &Result{Raw: pageB, HostIP: "2.2.2.2", URL: "https://b.example.com"} + + require.False(t, r.duplicate(first)) + require.False(t, r.duplicate(second), "different content + different IP should NOT be duplicate") + }) + + t.Run("third subdomain same content same IP is duplicate", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"} + second := &Result{Raw: pageA, HostIP: "2.2.2.2", URL: "https://b.example.com"} + third := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://c.example.com"} + + require.False(t, r.duplicate(first)) + require.False(t, r.duplicate(second), "different IP should be kept") + require.True(t, r.duplicate(third), "same content + same IP as first should be duplicate") + }) + + t.Run("near-duplicate content same IP is duplicate", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"} + nearDup := &Result{ + Raw: "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\nWelcomeHello world default page content here!", + HostIP: "1.1.1.1", + URL: "https://b.example.com", + } + + require.False(t, r.duplicate(first)) + require.True(t, r.duplicate(nearDup), "near-duplicate content from same IP should be duplicate") + }) + + t.Run("near-duplicate content different IP is NOT duplicate", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"} + nearDup := &Result{ + Raw: "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\nWelcomeHello world default page content here!", + HostIP: "3.3.3.3", + URL: "https://b.example.com", + } + + require.False(t, r.duplicate(first)) + require.False(t, r.duplicate(nearDup), "near-duplicate content from different IP should NOT be duplicate") + }) + + t.Run("empty IP falls back to content-only dedup", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + first := &Result{Raw: pageA, HostIP: "", URL: "https://a.example.com"} + second := &Result{Raw: pageA, HostIP: "", URL: "https://b.example.com"} + + require.False(t, r.duplicate(first)) + require.True(t, r.duplicate(second), "empty IP should fall back to content-only dedup") + }) + + t.Run("many subdomains same default page same IP", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + kept := 0 + for i := 0; i < 50; i++ { + res := &Result{ + Raw: pageA, + HostIP: "10.0.0.1", + URL: fmt.Sprintf("https://sub%d.example.com", i), + } + if !r.duplicate(res) { + kept++ + } + } + require.Equal(t, 1, kept, "50 subdomains with identical content on same IP should keep exactly 1") + }) + + t.Run("many subdomains same default page different IPs", func(t *testing.T) { + r, err := New(&Options{}) + require.Nil(t, err) + + kept := 0 + for i := 0; i < 50; i++ { + res := &Result{ + Raw: pageA, + HostIP: fmt.Sprintf("10.0.0.%d", i+1), + URL: fmt.Sprintf("https://sub%d.example.com", i), + } + if !r.duplicate(res) { + kept++ + } + } + require.Equal(t, 50, kept, "50 subdomains with identical content but different IPs should keep all 50") + }) +} + func TestCreateNetworkpolicyInstance_AllowDenyFlags(t *testing.T) { runner := &Runner{}