From 090f0c22aa106d4de4436531282737dded723d02 Mon Sep 17 00:00:00 2001 From: Mzack9999 Date: Sat, 21 Mar 2026 18:47:59 +0100 Subject: [PATCH 1/2] parser for fqdn --- common/httpx/domains.go | 442 +++++++- common/httpx/domains_test.go | 1081 ++++++++++++++++++++ common/httpx/test-data/sample_with_js.html | 52 + go.mod | 2 + go.sum | 4 + 5 files changed, 1561 insertions(+), 20 deletions(-) create mode 100644 common/httpx/test-data/sample_with_js.html diff --git a/common/httpx/domains.go b/common/httpx/domains.go index 3a410e56..d705fb00 100644 --- a/common/httpx/domains.go +++ b/common/httpx/domains.go @@ -1,25 +1,32 @@ package httpx import ( + "bytes" + "net/url" "regexp" "strings" "unicode" + "github.com/PuerkitoBio/goquery" + "github.com/dop251/goja/ast" + "github.com/dop251/goja/parser" mapsutil "github.com/projectdiscovery/utils/maps" stringsutil "github.com/projectdiscovery/utils/strings" "github.com/weppos/publicsuffix-go/publicsuffix" ) const ( - // group 1 is actual domain regex while group 0 and group 2 are used to filter out invalid matches (by skipping irrelevant contexts) potentialDomainRegex = `(?:^|['"/@])` + `([a-z0-9]+[a-z0-9.-]*\.[a-z]{2,})` + `(?:['"/@]|$)` ) var ( - // potentialDomainsCompiled is a compiled regex for potential domains (aka domain names) potentialDomainsCompiled = regexp.MustCompile(potentialDomainRegex) defaultDenylist = []string{".3g2", ".3gp", ".7z", ".apk", ".arj", ".avi", ".axd", ".bmp", ".csv", ".deb", ".dll", ".doc", ".drv", ".eot", ".exe", ".flv", ".gif", ".gifv", ".gz", ".h264", ".ico", ".iso", ".jar", ".jpeg", ".jpg", ".lock", ".m4a", ".m4v", ".map", ".mkv", ".mov", ".mp3", ".mp4", ".mpeg", ".mpg", ".msi", ".ogg", ".ogm", ".ogv", ".otf", ".pdf", ".pkg", ".png", ".ppt", ".psd", ".rar", ".rm", ".rpm", ".svg", ".swf", ".sys", ".tar.gz", ".tar", ".tif", ".tiff", ".ttf", ".txt", ".vob", ".wav", ".webm", ".webp", ".wmv", ".woff", ".woff2", ".xcf", ".xls", ".xlsx", ".zip", ".css", ".js", ".map", ".php", ".sheet", ".ms", ".wp", ".html", ".htm", ".md"} suffixBlacklist = map[string]struct{}{} + + urlAttrs = []string{"href", "src", "action", "formaction", "poster", "cite", "data-url", "data-href"} + + maxInlineScriptSize = 512 * 1024 // skip JS AST parsing for scripts larger than 512KB ) type BodyDomain struct { @@ -31,34 +38,429 @@ func (h *HTTPX) BodyDomainGrab(r *Response) *BodyDomain { domains := make(map[string]struct{}) fqdns := make(map[string]struct{}) - for _, tmp := range potentialDomainsCompiled.FindAllStringSubmatch(r.Raw, -1) { - // only interested in 1st group + // Only run HTML/JS parsers if the body looks like HTML + if len(r.Data) > 0 && looksLikeHTML(r.Data) { + inlineScripts := extractDomainsFromHTML(r.Data, domains, fqdns, r.Input) + + for _, script := range inlineScripts { + if len(script) <= maxInlineScriptSize { + extractDomainsFromJS(script, domains, fqdns, r.Input) + } + } + } + + // Regex fallback on the raw response (catches anything the parsers miss) + extractDomainsFromRegex(r.Raw, domains, fqdns, r.Input) + + return &BodyDomain{Domains: mapsutil.GetKeys(domains), Fqdns: mapsutil.GetKeys(fqdns)} +} + +func looksLikeHTML(data []byte) bool { + prefix := data + if len(prefix) > 1024 { + prefix = prefix[:1024] + } + trimmed := bytes.TrimSpace(prefix) + return len(trimmed) > 0 && trimmed[0] == '<' +} + +// extractDomainsFromHTML parses HTML and extracts hostnames from URL-bearing +// attributes (href, src, action, etc.), meta tags, and srcset values. +// It returns the text content of inline ` + response := &Response{ + Raw: html, + Data: []byte(html), + } + bd := ht.BodyDomainGrab(response) + require.NotNil(t, bd) +} + +func TestExtractDomainsFromJS(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + script := ` + var url = "https://api.test.example.com/v1/users"; + var cdn = "https://cdn.test.example.net/assets/main.js"; + var num = 42; + var noDomain = "just a plain string"; + ` + extractDomainsFromJS(script, domains, fqdns, "") + + require.Contains(t, fqdns, "api.test.example.com") + require.Contains(t, fqdns, "cdn.test.example.net") + require.Equal(t, 2, len(fqdns)) +} + +func TestExtractDomainsFromHTML(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + html := []byte(` + + link + + + + `) + + inlineScripts := extractDomainsFromHTML(html, domains, fqdns, "") + + require.Contains(t, fqdns, "link.test.example.com") + require.Contains(t, fqdns, "img.test.example.org") + require.Len(t, inlineScripts, 1) + require.Contains(t, inlineScripts[0], "inline.test.example.net") +} + +// --- False positive rejection tests --- + +func TestFalsePositive_IPAddresses(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + router + internal + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + for _, d := range bd.Domains { + require.False(t, isAllNumericParts(d), "IP address should not appear in domains: %s", d) + } + for _, f := range bd.Fqdns { + require.False(t, isAllNumericParts(f), "IP address should not appear in fqdns: %s", f) + } +} + +func TestFalsePositive_PackageNames(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + for _, f := range bd.Fqdns { + require.NotContains(t, f, "com.google.android", "Java/Android package name should be rejected: %s", f) + require.NotContains(t, f, "org.apache.commons", "Java package name should be rejected: %s", f) + require.NotContains(t, f, "io.netty.handler", "Java package name should be rejected: %s", f) + } +} + +func TestFalsePositive_FileExtensions(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + report + css + js + + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + for _, f := range bd.Fqdns { + require.NotEqual(t, "report.pdf", f) + require.NotEqual(t, "style.css", f) + require.NotEqual(t, "app.js", f) + require.NotEqual(t, "logo.png", f) + } +} + +func TestFalsePositive_VersionStrings(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + +

Running version 2.4.1 of the software

+ ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Empty(t, bd.Fqdns, "version strings should not produce fqdns") + require.Empty(t, bd.Domains, "version strings should not produce domains") +} + +func TestFalsePositive_CSSClassNames(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` +
test
+ + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + for _, f := range bd.Fqdns { + require.NotContains(t, f, "header.main") + require.NotContains(t, f, "container.fluid") + require.NotContains(t, f, "nav.active") + } +} + +func TestFalsePositive_MinifiedJSVars(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + // minified JS often has expressions like e.target, n.value, t.id + html := `` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Empty(t, bd.Fqdns, "minified JS property accesses should not produce fqdns") + require.Empty(t, bd.Domains, "minified JS property accesses should not produce domains") +} + +func TestFalsePositive_MailtoAndTelLinks(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + email + call + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + // mailto and tel links should not be parsed by the HTML extractor + // but the regex fallback may still catch domains from the raw text + require.NotNil(t, bd) +} + +func TestFalsePositive_DataURIs(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + data link + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Empty(t, bd.Fqdns, "data URIs should not produce fqdns") + require.Empty(t, bd.Domains, "data URIs should not produce domains") +} + +func TestFalsePositive_WebpackChunks(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + for _, f := range bd.Fqdns { + require.NotContains(t, f, "chunk") + require.NotContains(t, f, "runtime") + } +} + +// --- Edge case tests --- + +func TestEdgeCase_ProtocolRelativeURLs(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "cdn.proto-relative.example.com") + require.Contains(t, bd.Fqdns, "images.proto-relative.example.net") + require.Contains(t, bd.Fqdns, "fonts.proto-relative.example.org") +} + +func TestEdgeCase_MetaRefreshRedirect(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "redirect.meta-refresh.example.com") +} + +func TestEdgeCase_JSONLDScripts(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "www.jsonld-org.example.com") + require.Contains(t, bd.Fqdns, "cdn.jsonld-org.example.com") +} + +func TestEdgeCase_TrailingDots(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + link + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "trailing-dot.example.com") + for _, f := range bd.Fqdns { + require.False(t, strings.HasSuffix(f, "."), "domain should not have trailing dot: %s", f) + } +} + +func TestEdgeCase_Deduplication(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + link1 + link2 + link3 + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + count := 0 + for _, f := range bd.Fqdns { + if f == "dedup.example.com" { + count++ + } + } + require.Equal(t, 1, count, "duplicate fqdns should be deduplicated") +} + +func TestEdgeCase_InputDomainExclusion(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + self + other + ` + response := &Response{ + Raw: html, + Data: []byte(html), + Input: "example.com", + } + bd := ht.BodyDomainGrab(response) + + // example.com is the input domain, should be excluded from domains list + for _, d := range bd.Domains { + require.NotEqual(t, "example.com", d, "input domain should be excluded from domains") + } + // self.example.com equals the input, should be excluded from fqdns + for _, f := range bd.Fqdns { + require.NotEqual(t, "example.com", f, "input should be excluded from fqdns") + } + require.Contains(t, bd.Domains, "different.net") + require.Contains(t, bd.Fqdns, "other.different.net") +} + +func TestEdgeCase_EmptyAndWhitespaceScripts(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.NotNil(t, bd) + require.Empty(t, bd.Fqdns) +} + +func TestEdgeCase_MultipleScriptsSomeBroken(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + + + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "valid.multi-script.example.com") + require.Contains(t, bd.Fqdns, "also-valid.multi-script.example.net") +} + +func TestEdgeCase_URLsWithQueryAndFragment(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + query + fragment + both + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "query.example.com") + require.Contains(t, bd.Fqdns, "fragment.example.com") + require.Contains(t, bd.Fqdns, "both.example.com") +} + +func TestEdgeCase_MixedCaseURLs(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + upper + mixed + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "upper.case.example.com") + require.Contains(t, bd.Fqdns, "mixed.case.example.net") +} + +func TestEdgeCase_URLsWithPorts(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + port + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "ported.example.com") + require.Contains(t, bd.Fqdns, "ported-js.example.net") + for _, f := range bd.Fqdns { + require.NotContains(t, f, ":", "port numbers should not appear in extracted domains") + } +} + +func TestEdgeCase_URLsWithAuth(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + authed + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "authed.example.com") +} + +func TestEdgeCase_JSArrowFunctions(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + script := ` + const fetchData = () => fetch("https://api.arrow.example.com/data"); + const urls = ["a", "b"].map(x => "https://map." + x + ".example.net"); + const handler = async () => { + const res = await fetch("https://async-arrow.example.org/endpoint"); + return res; + }; + ` + extractDomainsFromJS(script, domains, fqdns, "") + + require.Contains(t, fqdns, "api.arrow.example.com") + require.Contains(t, fqdns, "async-arrow.example.org") +} + +func TestEdgeCase_JSObjectNesting(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + script := ` + var config = { + api: { + base: "https://nested-api.example.com/v1", + endpoints: { + users: "https://nested-users.example.com/users", + deep: { + level: "https://nested-deep.example.net/deep" + } + } + } + }; + ` + extractDomainsFromJS(script, domains, fqdns, "") + + require.Contains(t, fqdns, "nested-api.example.com") + require.Contains(t, fqdns, "nested-users.example.com") + require.Contains(t, fqdns, "nested-deep.example.net") +} + +func TestEdgeCase_JSTryCatchFinally(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + script := ` + try { + fetch("https://try-block.example.com/api"); + } catch(e) { + fetch("https://catch-block.example.net/error"); + } finally { + fetch("https://finally-block.example.org/cleanup"); + } + ` + extractDomainsFromJS(script, domains, fqdns, "") + + require.Contains(t, fqdns, "try-block.example.com") + require.Contains(t, fqdns, "catch-block.example.net") + require.Contains(t, fqdns, "finally-block.example.org") +} + +func TestEdgeCase_JSConditionalTernary(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + script := ` + var url = isProd + ? "https://prod.ternary.example.com/api" + : "https://dev.ternary.example.net/api"; + ` + extractDomainsFromJS(script, domains, fqdns, "") + + require.Contains(t, fqdns, "prod.ternary.example.com") + require.Contains(t, fqdns, "dev.ternary.example.net") +} + +func TestEdgeCase_JSSwitchCase(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + script := ` + switch(env) { + case "prod": + url = "https://prod.switch.example.com/api"; + break; + case "staging": + url = "https://staging.switch.example.net/api"; + break; + default: + url = "https://default.switch.example.org/api"; + } + ` + extractDomainsFromJS(script, domains, fqdns, "") + + require.Contains(t, fqdns, "prod.switch.example.com") + require.Contains(t, fqdns, "staging.switch.example.net") + require.Contains(t, fqdns, "default.switch.example.org") +} + +func TestEdgeCase_JSLoops(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + script := ` + for (var i = 0; i < 10; i++) { + fetch("https://for-loop.example.com/item"); + } + while (true) { + fetch("https://while-loop.example.net/poll"); + break; + } + do { + fetch("https://do-while.example.org/retry"); + } while (false); + ` + extractDomainsFromJS(script, domains, fqdns, "") + + require.Contains(t, fqdns, "for-loop.example.com") + require.Contains(t, fqdns, "while-loop.example.net") + require.Contains(t, fqdns, "do-while.example.org") +} + +func TestEdgeCase_JSIfElse(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + + script := ` + if (condition) { + url = "https://if-branch.example.com/a"; + } else if (other) { + url = "https://elseif-branch.example.net/b"; + } else { + url = "https://else-branch.example.org/c"; + } + ` + extractDomainsFromJS(script, domains, fqdns, "") + + require.Contains(t, fqdns, "if-branch.example.com") + require.Contains(t, fqdns, "elseif-branch.example.net") + require.Contains(t, fqdns, "else-branch.example.org") +} + +func TestEdgeCase_HTMLEntitiesInURLs(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + // goquery auto-decodes & to & + html := ` + entity + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "entity.example.com") +} + +func TestEdgeCase_FormactionAttribute(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` +
+ +
+ ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "formaction-btn.example.com") +} + +func TestEdgeCase_CiteAttribute(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` +
quoted text
+ ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "cite-source.example.com") +} + +func TestEdgeCase_OpenGraphAndTwitterMeta(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "og-url.example.com") + require.Contains(t, bd.Fqdns, "og-image.example.net") + require.Contains(t, bd.Fqdns, "twitter-img.example.org") +} + +func TestEdgeCase_SrcsetMultipleEntries(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "srcset1.example.com") + require.Contains(t, bd.Fqdns, "srcset2.example.net") + require.Contains(t, bd.Fqdns, "srcset3.example.org") +} + +func TestEdgeCase_DataAttributes(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` +
+
+ ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "data-url-attr.example.com") + require.Contains(t, bd.Fqdns, "data-href-attr.example.net") +} + +func TestEdgeCase_LargeBodyNoPanic(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + // generate a large body with repeated content + var builder strings.Builder + builder.WriteString("") + for i := 0; i < 1000; i++ { + builder.WriteString(`link`) + } + builder.WriteString("") + body := builder.String() + + response := &Response{Raw: body, Data: []byte(body)} + bd := ht.BodyDomainGrab(response) + + require.NotNil(t, bd) + require.Contains(t, bd.Fqdns, "bulk.example.com") +} + +func TestEdgeCase_OnlyRawNoData(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + // r.Data is nil but r.Raw has content — HTML parser skipped, regex catches it + raw := `HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\ntest` + response := &Response{Raw: raw, Data: nil} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "raw-only.example.com") +} + +func TestEdgeCase_SubdomainVsDomain(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + root domain + subdomain + deep sub + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Domains, "example.com") + require.Contains(t, bd.Fqdns, "sub.example.com") + require.Contains(t, bd.Fqdns, "deep.sub.example.com") + // root domain (example.com) should be in domains but NOT in fqdns + // (because d == val for a root domain, so the fqdn branch is skipped) + for _, f := range bd.Fqdns { + require.NotEqual(t, "example.com", f, "root domain should not appear in fqdns list") + } +} + +func TestEdgeCase_InternationalTLDs(t *testing.T) { + ht, err := New(&DefaultOptions) + require.Nil(t, err) + + html := ` + uk + au + ` + response := &Response{Raw: html, Data: []byte(html)} + bd := ht.BodyDomainGrab(response) + + require.Contains(t, bd.Fqdns, "test.example.co.uk") + require.Contains(t, bd.Fqdns, "test.example.com.au") +} + +// helper for IP check test +func isAllNumericParts(d string) bool { + for _, part := range strings.Split(d, ".") { + allDigits := true + for _, c := range part { + if c < '0' || c > '9' { + allDigits = false + break + } + } + if !allDigits { + return false + } + } + return true +} + +// --- Unit tests for hostnameFromURL --- + +func TestHostnameFromURL(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + {"https url", "https://example.com/path", "example.com"}, + {"http url", "http://sub.example.com/path", "sub.example.com"}, + {"protocol-relative", "//cdn.example.com/file.js", "cdn.example.com"}, + {"with port", "https://example.com:8443/api", "example.com"}, + {"with auth", "https://user:pass@example.com/page", "example.com"}, + {"with query", "https://example.com/path?q=1", "example.com"}, + {"with fragment", "https://example.com/path#section", "example.com"}, + {"empty string", "", ""}, + {"hash only", "#section", ""}, + {"javascript scheme", "javascript:void(0)", ""}, + {"data uri", "data:text/html,

hi

", ""}, + {"mailto", "mailto:user@example.com", ""}, + {"tel", "tel:+1234567890", ""}, + {"blob", "blob:https://example.com/uuid", ""}, + {"about", "about:blank", ""}, + {"relative path", "/path/to/page", ""}, + {"bare domain no scheme", "example.com", ""}, + {"no dots", "https://localhost/path", ""}, + {"ip address", "https://192.168.1.1/admin", "192.168.1.1"}, + {"uppercase scheme", "HTTPS://EXAMPLE.COM/PATH", "EXAMPLE.COM"}, + {"whitespace", " https://trimmed.example.com/path ", "trimmed.example.com"}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := hostnameFromURL(tc.input) + require.Equal(t, tc.expected, result) + }) + } +} + +// --- Unit tests for addDomainCandidate --- + +func TestAddDomainCandidate(t *testing.T) { + t.Run("valid fqdn", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("sub.example.com", domains, fqdns, "") + require.Contains(t, fqdns, "sub.example.com") + require.Contains(t, domains, "example.com") + }) + + t.Run("root domain only goes to domains not fqdns", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("example.com", domains, fqdns, "") + require.Contains(t, domains, "example.com") + require.Empty(t, fqdns) + }) + + t.Run("trailing dot is stripped", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("sub.example.com.", domains, fqdns, "") + require.Contains(t, fqdns, "sub.example.com") + }) + + t.Run("uppercase is lowered", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("SUB.EXAMPLE.COM", domains, fqdns, "") + require.Contains(t, fqdns, "sub.example.com") + }) + + t.Run("input domain excluded from domains", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("sub.example.com", domains, fqdns, "example.com") + require.Empty(t, domains) + require.Contains(t, fqdns, "sub.example.com") + }) + + t.Run("input fqdn excluded from fqdns", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("sub.example.com", domains, fqdns, "sub.example.com") + require.Empty(t, fqdns) + }) + + t.Run("empty string rejected", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("", domains, fqdns, "") + require.Empty(t, domains) + require.Empty(t, fqdns) + }) + + t.Run("whitespace only rejected", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate(" ", domains, fqdns, "") + require.Empty(t, domains) + require.Empty(t, fqdns) + }) + + t.Run("ip address rejected", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("192.168.1.1", domains, fqdns, "") + require.Empty(t, domains) + require.Empty(t, fqdns) + }) + + t.Run("single label rejected", func(t *testing.T) { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + addDomainCandidate("localhost", domains, fqdns, "") + require.Empty(t, domains) + require.Empty(t, fqdns) + }) +} + +// --- Benchmarks --- + +func BenchmarkBodyDomainGrab_HackerOne(b *testing.B) { + ht, err := New(&DefaultOptions) + if err != nil { + b.Fatal(err) + } + response := &Response{ + Raw: rawResponse, + Data: []byte(rawResponse), + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + ht.BodyDomainGrab(response) + } +} + +func BenchmarkBodyDomainGrab_SmallPage(b *testing.B) { + ht, err := New(&DefaultOptions) + if err != nil { + b.Fatal(err) + } + response := &Response{ + Raw: sampleWithJS, + Data: []byte(sampleWithJS), + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + ht.BodyDomainGrab(response) + } +} + +func BenchmarkBodyDomainGrab_RegexOnly(b *testing.B) { + response := &Response{ + Raw: rawResponse, + Data: []byte(rawResponse), + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + extractDomainsFromRegex(response.Raw, domains, fqdns, "") + } +} + +func BenchmarkBodyDomainGrab_HTMLOnly(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + extractDomainsFromHTML([]byte(rawResponse), domains, fqdns, "") + } +} + +func BenchmarkBodyDomainGrab_JSOnly(b *testing.B) { + scripts := []string{} + extractDomainsFromHTML([]byte(rawResponse), make(map[string]struct{}), make(map[string]struct{}), "") + doc, _ := goquery.NewDocumentFromReader(bytes.NewReader([]byte(rawResponse))) + doc.Find("script").Each(func(_ int, s *goquery.Selection) { + if _, ok := s.Attr("src"); !ok { + if text := s.Text(); strings.TrimSpace(text) != "" { + scripts = append(scripts, text) + } + } + }) + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, script := range scripts { + domains := make(map[string]struct{}) + fqdns := make(map[string]struct{}) + extractDomainsFromJS(script, domains, fqdns, "") + } + } +} + +func BenchmarkBodyDomainGrab_JSON(b *testing.B) { + ht, err := New(&DefaultOptions) + if err != nil { + b.Fatal(err) + } + json := `{"url":"https://api.example.com/v1","cdn":"https://cdn.example.net/assets","callback":"https://hooks.example.org/notify"}` + response := &Response{ + Raw: json, + Data: []byte(json), + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + ht.BodyDomainGrab(response) + } +} + +func BenchmarkBodyDomainGrab_PlainText(b *testing.B) { + ht, err := New(&DefaultOptions) + if err != nil { + b.Fatal(err) + } + text := `'api.example.com' and 'cdn.example.net' are the endpoints` + response := &Response{ + Raw: text, + Data: []byte(text), + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + ht.BodyDomainGrab(response) + } +} diff --git a/common/httpx/test-data/sample_with_js.html b/common/httpx/test-data/sample_with_js.html new file mode 100644 index 00000000..93b7eab1 --- /dev/null +++ b/common/httpx/test-data/sample_with_js.html @@ -0,0 +1,52 @@ + + + + Test Page + + + + + + + Link + Another + Relative + Hash + JS + + photo + +
+ +
+ + + +
+ + + + + + + + Text with a bare domain like bare.textdomain.example.com mentioned here. + + diff --git a/go.mod b/go.mod index a5791088..226dfe71 100644 --- a/go.mod +++ b/go.mod @@ -50,6 +50,7 @@ require ( ) require ( + github.com/dop251/goja v0.0.0-20260311135729-065cd970411c github.com/dustin/go-humanize v1.0.1 github.com/go-sql-driver/mysql v1.9.3 github.com/go-viper/mapstructure/v2 v2.5.0 @@ -97,6 +98,7 @@ require ( github.com/felixge/fgprof v0.9.5 // indirect github.com/gaissmai/bart v0.26.0 // indirect github.com/go-ole/go-ole v1.2.6 // indirect + github.com/go-sourcemap/sourcemap v2.1.3+incompatible // indirect github.com/google/certificate-transparency-go v1.3.2 // indirect github.com/google/go-github/v30 v30.1.0 // indirect github.com/google/go-querystring v1.1.0 // indirect diff --git a/go.sum b/go.sum index 3903a95c..20c1b59c 100644 --- a/go.sum +++ b/go.sum @@ -111,6 +111,8 @@ github.com/dlclark/regexp2 v1.11.5 h1:Q/sSnsKerHeCkc/jSTNq1oCm7KiVgUMZRDUoRu0JQZ github.com/dlclark/regexp2 v1.11.5/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/dop251/goja v0.0.0-20260311135729-065cd970411c h1:OcLmPfx1T1RmZVHHFwWMPaZDdRf0DBMZOFMVWJa7Pdk= +github.com/dop251/goja v0.0.0-20260311135729-065cd970411c/go.mod h1:MxLav0peU43GgvwVgNbLAj1s/bSGboKkhuULvq/7hx4= github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707 h1:2tV76y6Q9BB+NEBasnqvs7e49aEBFI8ejC89PSnWH+4= github.com/dsnet/compress v0.0.2-0.20230904184137-39efe44ab707/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= @@ -137,6 +139,8 @@ github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-rod/rod v0.116.2 h1:A5t2Ky2A+5eD/ZJQr1EfsQSe5rms5Xof/qj296e+ZqA= github.com/go-rod/rod v0.116.2/go.mod h1:H+CMO9SCNc2TJ2WfrG+pKhITz57uGNYU43qYHh438Mg= +github.com/go-sourcemap/sourcemap v2.1.3+incompatible h1:W1iEw64niKVGogNgBN3ePyLFfuisuzeidWPMPWmECqU= +github.com/go-sourcemap/sourcemap v2.1.3+incompatible/go.mod h1:F8jJfvm2KbVjc5NqelyYJmf/v5J0dwNLS2mL4sNA1Jg= github.com/go-sql-driver/mysql v1.9.3 h1:U/N249h2WzJ3Ukj8SowVFjdtZKfu9vlLZxjPXV1aweo= github.com/go-sql-driver/mysql v1.9.3/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU= github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro= From c3b06b505f80970d4b328db31cf9ca00a4c701fc Mon Sep 17 00:00:00 2001 From: Mzack9999 Date: Sat, 21 Mar 2026 18:54:07 +0100 Subject: [PATCH 2/2] fixing lint --- common/httpx/domains.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/httpx/domains.go b/common/httpx/domains.go index d705fb00..76da3b70 100644 --- a/common/httpx/domains.go +++ b/common/httpx/domains.go @@ -148,7 +148,9 @@ func extractDomainsFromJS(script string, domains, fqdns map[string]struct{}, inp if err != nil { return } - defer func() { recover() }() + defer func() { + _ = recover() + }() walkProgram(program, func(value string) { for _, match := range potentialDomainsCompiled.FindAllStringSubmatch(value, -1) { if len(match) >= 2 {