Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ type Runner struct {
browser *Browser
ditClassifier *dit.Classifier
pHashClusters []pHashCluster
simHashes gcache.Cache[uint64, struct{}] // Include simHashes for efficient duplicate detection
simHashes gcache.Cache[uint64, []string]
httpApiEndpoint *Server
authProvider authprovider.AuthProvider
interruptCh chan struct{}
Expand Down Expand Up @@ -430,7 +430,7 @@ func New(options *Options) (*Runner, error) {
runner.HostErrorsCache = gc
}

runner.simHashes = gcache.New[uint64, struct{}](1000).ARC().Build()
runner.simHashes = gcache.New[uint64, []string](1000).ARC().Build()
if options.JSONOutput || options.CSVOutput || len(options.OutputFilterPageType) > 0 {
ditClassifier, err := dit.New()
if err != nil {
Expand Down Expand Up @@ -639,19 +639,21 @@ func (r *Runner) seen(k string) bool {

func (r *Runner) duplicate(result *Result) bool {
respSimHash := simhash.Simhash(simhash.NewWordFeatureSet(converstionutil.Bytes(result.Raw)))
if r.simHashes.Has(respSimHash) {
gologger.Debug().Msgf("Skipping duplicate response with simhash %d for URL %s\n", respSimHash, result.URL)
return true
}
ip := result.HostIP

for simHash := range r.simHashes.GetALL(false) {
// lower threshold for increased precision
if simhash.Compare(simHash, respSimHash) <= 3 {
gologger.Debug().Msgf("Skipping near-duplicate response with simhash %d for URL %s\n", respSimHash, result.URL)
for storedHash, storedIPs := range r.simHashes.GetALL(false) {
if simhash.Compare(storedHash, respSimHash) > 3 {
continue
}
if ip == "" || sliceutil.Contains(storedIPs, ip) {
gologger.Debug().Msgf("Skipping duplicate response (simhash %d, ip %s) for URL %s\n", respSimHash, ip, result.URL)
return true
}
_ = r.simHashes.Set(storedHash, append(storedIPs, ip))
return false
}
_ = r.simHashes.Set(respSimHash, struct{}{})

_ = r.simHashes.Set(respSimHash, []string{ip})
Comment on lines +644 to +656
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Scan all matching simhashes before deciding this response is unique.

Because GetALL(false) is iterated as a map, the order here is not stable. If two stored hashes are both within the <= 3 threshold of respSimHash, Line 652 can attach ip to the first one seen and Line 653 returns false before a later matching hash that already contains ip is checked. That makes dedupe results nondeterministic and can leak duplicates.

💡 Proposed fix
 func (r *Runner) duplicate(result *Result) bool {
 	respSimHash := simhash.Simhash(simhash.NewWordFeatureSet(converstionutil.Bytes(result.Raw)))
 	ip := result.HostIP
 
 	for storedHash, storedIPs := range r.simHashes.GetALL(false) {
 		if simhash.Compare(storedHash, respSimHash) > 3 {
 			continue
 		}
 		if ip == "" || sliceutil.Contains(storedIPs, ip) {
 			gologger.Debug().Msgf("Skipping duplicate response (simhash %d, ip %s) for URL %s\n", respSimHash, ip, result.URL)
 			return true
 		}
-		_ = r.simHashes.Set(storedHash, append(storedIPs, ip))
-		return false
 	}
 
-	_ = r.simHashes.Set(respSimHash, []string{ip})
+	if storedIPs, err := r.simHashes.GetIFPresent(respSimHash); err == nil {
+		_ = r.simHashes.Set(respSimHash, append(storedIPs, ip))
+	} else {
+		_ = r.simHashes.Set(respSimHash, []string{ip})
+	}
 	return false
 }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@runner/runner.go` around lines 644 - 656, The loop over
r.simHashes.GetALL(false) currently returns or mutates on the first matching
storedHash, causing nondeterministic dedupe when multiple stored hashes are
within threshold; change the logic in the block that uses simhash.Compare,
sliceutil.Contains, and r.simHashes.Set so you first scan all stored entries:
track whether any stored hash is within threshold and whether any of those
already contains ip, and only after the loop either: (a) return true if any
matching storedIPs already contained ip, or (b) attach ip to an appropriate
stored hash (or create a new entry for respSimHash) and return false; remove the
early r.simHashes.Set(...) and return inside the loop to ensure all matches are
considered before deciding.

return false
}

Expand Down
141 changes: 141 additions & 0 deletions runner/runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,147 @@ func TestRunner_testAndSet_concurrent(t *testing.T) {
require.Equal(t, 1, winCount, "exactly one goroutine should win testAndSet for the same key")
}

// TestRunner_duplicate checks the verdicts returned by Runner.duplicate for
// sequences of results that vary in raw response content and host IP.
//
// Each scenario starts from a fresh Runner, so state recorded by one scenario
// never influences another.
func TestRunner_duplicate(t *testing.T) {
	const (
		pageA = "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n<html><head><title>Welcome</title></head><body>Hello world default page content here</body></html>"
		pageB = "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n<html><head><title>Dashboard</title></head><body>Completely different application running on this server</body></html>"
		// pageANear is pageA with a single "!" appended to the body text.
		pageANear = "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n<html><head><title>Welcome</title></head><body>Hello world default page content here!</body></html>"
	)

	// probe is one call to duplicate together with the verdict it must return.
	type probe struct {
		raw     string
		hostIP  string
		url     string
		wantDup bool
		msg     string // optional failure message; empty means none is passed
	}

	// Ordered scenarios: probes are fed to the same Runner one after another.
	scenarios := []struct {
		name   string
		probes []probe
	}{
		{
			name: "same content same IP is duplicate",
			probes: []probe{
				{raw: pageA, hostIP: "1.1.1.1", url: "https://a.example.com", wantDup: false, msg: "first result should not be duplicate"},
				{raw: pageA, hostIP: "1.1.1.1", url: "https://b.example.com", wantDup: true, msg: "same content + same IP should be duplicate"},
			},
		},
		{
			name: "same content different IP is NOT duplicate",
			probes: []probe{
				{raw: pageA, hostIP: "1.1.1.1", url: "https://a.example.com", wantDup: false},
				{raw: pageA, hostIP: "2.2.2.2", url: "https://b.example.com", wantDup: false, msg: "same content but different IP should NOT be duplicate"},
			},
		},
		{
			name: "different content same IP is NOT duplicate",
			probes: []probe{
				{raw: pageA, hostIP: "1.1.1.1", url: "https://a.example.com", wantDup: false},
				{raw: pageB, hostIP: "1.1.1.1", url: "https://b.example.com", wantDup: false, msg: "different content on same IP should NOT be duplicate"},
			},
		},
		{
			name: "different content different IP is NOT duplicate",
			probes: []probe{
				{raw: pageA, hostIP: "1.1.1.1", url: "https://a.example.com", wantDup: false},
				{raw: pageB, hostIP: "2.2.2.2", url: "https://b.example.com", wantDup: false, msg: "different content + different IP should NOT be duplicate"},
			},
		},
		{
			name: "third subdomain same content same IP is duplicate",
			probes: []probe{
				{raw: pageA, hostIP: "1.1.1.1", url: "https://a.example.com", wantDup: false},
				{raw: pageA, hostIP: "2.2.2.2", url: "https://b.example.com", wantDup: false, msg: "different IP should be kept"},
				{raw: pageA, hostIP: "1.1.1.1", url: "https://c.example.com", wantDup: true, msg: "same content + same IP as first should be duplicate"},
			},
		},
		{
			name: "near-duplicate content same IP is duplicate",
			probes: []probe{
				{raw: pageA, hostIP: "1.1.1.1", url: "https://a.example.com", wantDup: false},
				{raw: pageANear, hostIP: "1.1.1.1", url: "https://b.example.com", wantDup: true, msg: "near-duplicate content from same IP should be duplicate"},
			},
		},
		{
			name: "near-duplicate content different IP is NOT duplicate",
			probes: []probe{
				{raw: pageA, hostIP: "1.1.1.1", url: "https://a.example.com", wantDup: false},
				{raw: pageANear, hostIP: "3.3.3.3", url: "https://b.example.com", wantDup: false, msg: "near-duplicate content from different IP should NOT be duplicate"},
			},
		},
		{
			name: "empty IP falls back to content-only dedup",
			probes: []probe{
				{raw: pageA, hostIP: "", url: "https://a.example.com", wantDup: false},
				{raw: pageA, hostIP: "", url: "https://b.example.com", wantDup: true, msg: "empty IP should fall back to content-only dedup"},
			},
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			r, err := New(&Options{})
			require.Nil(t, err)

			for _, p := range sc.probes {
				got := r.duplicate(&Result{Raw: p.raw, HostIP: p.hostIP, URL: p.url})

				// Pick the assertion matching the expected verdict.
				verdict := require.False
				if p.wantDup {
					verdict = require.True
				}
				if p.msg == "" {
					verdict(t, got)
					continue
				}
				verdict(t, got, p.msg)
			}
		})
	}

	// Bulk scenarios: 50 results sharing pageA, counting how many are kept
	// (i.e. reported as not duplicate).
	bulk := []struct {
		name     string
		hostIP   func(i int) string
		wantKept int
		msg      string
	}{
		{
			name:     "many subdomains same default page same IP",
			hostIP:   func(int) string { return "10.0.0.1" },
			wantKept: 1,
			msg:      "50 subdomains with identical content on same IP should keep exactly 1",
		},
		{
			name:     "many subdomains same default page different IPs",
			hostIP:   func(i int) string { return fmt.Sprintf("10.0.0.%d", i+1) },
			wantKept: 50,
			msg:      "50 subdomains with identical content but different IPs should keep all 50",
		},
	}

	for _, b := range bulk {
		t.Run(b.name, func(t *testing.T) {
			r, err := New(&Options{})
			require.Nil(t, err)

			var kept int
			for i := 0; i < 50; i++ {
				candidate := &Result{
					Raw:    pageA,
					HostIP: b.hostIP(i),
					URL:    fmt.Sprintf("https://sub%d.example.com", i),
				}
				if r.duplicate(candidate) {
					continue
				}
				kept++
			}
			require.Equal(t, b.wantKept, kept, b.msg)
		})
	}
}

func TestCreateNetworkpolicyInstance_AllowDenyFlags(t *testing.T) {
runner := &Runner{}

Expand Down
Loading