diff --git a/runner/runner.go b/runner/runner.go
index ae90cfbc..d0150661 100644
--- a/runner/runner.go
+++ b/runner/runner.go
@@ -95,7 +95,7 @@ type Runner struct {
browser *Browser
ditClassifier *dit.Classifier
pHashClusters []pHashCluster
- simHashes gcache.Cache[uint64, struct{}] // Include simHashes for efficient duplicate detection
+ simHashes gcache.Cache[uint64, []string]
httpApiEndpoint *Server
authProvider authprovider.AuthProvider
interruptCh chan struct{}
@@ -430,7 +430,7 @@ func New(options *Options) (*Runner, error) {
runner.HostErrorsCache = gc
}
- runner.simHashes = gcache.New[uint64, struct{}](1000).ARC().Build()
+ runner.simHashes = gcache.New[uint64, []string](1000).ARC().Build()
if options.JSONOutput || options.CSVOutput || len(options.OutputFilterPageType) > 0 {
ditClassifier, err := dit.New()
if err != nil {
@@ -639,19 +639,49 @@ func (r *Runner) seen(k string) bool {
 
+// duplicate reports whether result repeats a response that has already been
+// recorded and should therefore be skipped. A result is a duplicate when a
+// stored simhash lies within a simhash.Compare distance of 3 of its own hash
+// and either result.HostIP is empty (content-only dedup) or that IP is already
+// listed under the matching hash. A non-duplicate result is recorded as a side
+// effect: its IP is appended to the closest near-matching entry, or a new
+// entry keyed by its own hash is created when no stored hash is close enough.
 func (r *Runner) duplicate(result *Result) bool {
 	respSimHash := simhash.Simhash(simhash.NewWordFeatureSet(converstionutil.Bytes(result.Raw)))
-	if r.simHashes.Has(respSimHash) {
-		gologger.Debug().Msgf("Skipping duplicate response with simhash %d for URL %s\n", respSimHash, result.URL)
-		return true
-	}
+	ip := result.HostIP
 
-	for simHash := range r.simHashes.GetALL(false) {
+	// Closest near-matching entry that does not list ip yet; closestDist < 0
+	// means no stored hash was within the threshold.
+	var (
+		closestHash uint64
+		closestIPs  []string
+	)
+	closestDist := -1
+	for storedHash, storedIPs := range r.simHashes.GetALL(false) {
 		// lower threshold for increased precision
-		if simhash.Compare(simHash, respSimHash) <= 3 {
-			gologger.Debug().Msgf("Skipping near-duplicate response with simhash %d for URL %s\n", respSimHash, result.URL)
+		dist := int(simhash.Compare(storedHash, respSimHash))
+		if dist > 3 {
+			continue
+		}
+		if ip == "" || sliceutil.Contains(storedIPs, ip) {
+			gologger.Debug().Msgf("Skipping duplicate response (simhash %d, ip %s) for URL %s\n", respSimHash, ip, result.URL)
 			return true
 		}
+		// Do not stop at the first near match: iteration order over the cached
+		// entries is not deterministic and another stored hash within the
+		// threshold may already list ip. Ties are broken on the hash value so
+		// the entry chosen for recording is stable.
+		if closestDist < 0 || dist < closestDist || (dist == closestDist && storedHash < closestHash) {
+			closestHash, closestIPs, closestDist = storedHash, storedIPs, dist
+		}
 	}
-	_ = r.simHashes.Set(respSimHash, struct{}{})
+
+	// Recording is best-effort: a failed Set only means a later duplicate may
+	// slip through, so the error is deliberately ignored.
+	if closestDist >= 0 {
+		// The full slice expression caps capacity so append copies instead of
+		// writing into a backing array that may still be shared with the
+		// slice held by the cache.
+		_ = r.simHashes.Set(closestHash, append(closestIPs[:len(closestIPs):len(closestIPs)], ip))
+		return false
+	}
+	_ = r.simHashes.Set(respSimHash, []string{ip})
 	return false
 }
 
diff --git a/runner/runner_test.go b/runner/runner_test.go
index bd96b81d..415a6f51 100644
--- a/runner/runner_test.go
+++ b/runner/runner_test.go
@@ -386,6 +386,150 @@ func TestRunner_testAndSet_concurrent(t *testing.T) {
 	require.Equal(t, 1, winCount, "exactly one goroutine should win testAndSet for the same key")
 }
 
+// TestRunner_duplicate exercises Runner.duplicate across combinations of
+// identical, near-identical and different response bodies served from the
+// same, different, or empty host IPs.
+func TestRunner_duplicate(t *testing.T) {
+	const (
+		pageA = "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\nWelcomeHello world default page content here"
+		pageB = "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\nDashboardCompletely different application running on this server"
+	)
+
+	t.Run("same content same IP is duplicate", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"}
+		second := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://b.example.com"}
+
+		require.False(t, r.duplicate(first), "first result should not be duplicate")
+		require.True(t, r.duplicate(second), "same content + same IP should be duplicate")
+	})
+
+	t.Run("same content different IP is NOT duplicate", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"}
+		second := &Result{Raw: pageA, HostIP: "2.2.2.2", URL: "https://b.example.com"}
+
+		require.False(t, r.duplicate(first))
+		require.False(t, r.duplicate(second), "same content but different IP should NOT be duplicate")
+	})
+
+	t.Run("different content same IP is NOT duplicate", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"}
+		second := &Result{Raw: pageB, HostIP: "1.1.1.1", URL: "https://b.example.com"}
+
+		require.False(t, r.duplicate(first))
+		require.False(t, r.duplicate(second), "different content on same IP should NOT be duplicate")
+	})
+
+	t.Run("different content different IP is NOT duplicate", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"}
+		second := &Result{Raw: pageB, HostIP: "2.2.2.2", URL: "https://b.example.com"}
+
+		require.False(t, r.duplicate(first))
+		require.False(t, r.duplicate(second), "different content + different IP should NOT be duplicate")
+	})
+
+	t.Run("third subdomain same content same IP is duplicate", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"}
+		second := &Result{Raw: pageA, HostIP: "2.2.2.2", URL: "https://b.example.com"}
+		third := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://c.example.com"}
+
+		require.False(t, r.duplicate(first))
+		require.False(t, r.duplicate(second), "different IP should be kept")
+		require.True(t, r.duplicate(third), "same content + same IP as first should be duplicate")
+	})
+
+	t.Run("near-duplicate content same IP is duplicate", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"}
+		nearDup := &Result{
+			Raw:    "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\nWelcomeHello world default page content here!",
+			HostIP: "1.1.1.1",
+			URL:    "https://b.example.com",
+		}
+
+		require.False(t, r.duplicate(first))
+		require.True(t, r.duplicate(nearDup), "near-duplicate content from same IP should be duplicate")
+	})
+
+	t.Run("near-duplicate content different IP is NOT duplicate", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		first := &Result{Raw: pageA, HostIP: "1.1.1.1", URL: "https://a.example.com"}
+		nearDup := &Result{
+			Raw:    "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\nWelcomeHello world default page content here!",
+			HostIP: "3.3.3.3",
+			URL:    "https://b.example.com",
+		}
+
+		require.False(t, r.duplicate(first))
+		require.False(t, r.duplicate(nearDup), "near-duplicate content from different IP should NOT be duplicate")
+	})
+
+	t.Run("empty IP falls back to content-only dedup", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		first := &Result{Raw: pageA, HostIP: "", URL: "https://a.example.com"}
+		second := &Result{Raw: pageA, HostIP: "", URL: "https://b.example.com"}
+
+		require.False(t, r.duplicate(first))
+		require.True(t, r.duplicate(second), "empty IP should fall back to content-only dedup")
+	})
+
+	t.Run("many subdomains same default page same IP", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		kept := 0
+		for i := 0; i < 50; i++ {
+			res := &Result{
+				Raw:    pageA,
+				HostIP: "10.0.0.1",
+				URL:    fmt.Sprintf("https://sub%d.example.com", i),
+			}
+			if !r.duplicate(res) {
+				kept++
+			}
+		}
+		require.Equal(t, 1, kept, "50 subdomains with identical content on same IP should keep exactly 1")
+	})
+
+	t.Run("many subdomains same default page different IPs", func(t *testing.T) {
+		r, err := New(&Options{})
+		require.Nil(t, err)
+
+		kept := 0
+		for i := 0; i < 50; i++ {
+			res := &Result{
+				Raw:    pageA,
+				HostIP: fmt.Sprintf("10.0.0.%d", i+1),
+				URL:    fmt.Sprintf("https://sub%d.example.com", i),
+			}
+			if !r.duplicate(res) {
+				kept++
+			}
+		}
+		require.Equal(t, 50, kept, "50 subdomains with identical content but different IPs should keep all 50")
+	})
+}
+
func TestCreateNetworkpolicyInstance_AllowDenyFlags(t *testing.T) {
runner := &Runner{}