def _AsyncClient_noproxy(self, *args, **kwargs):
    """Return a cached BBOTAsyncClient built from a config with the proxy disabled.

    Clients are cached in ``self.noproxy_web_clients`` so repeated calls with
    the same retry count reuse one client instead of creating a new one.

    Args:
        *args: Forwarded to ``BBOTAsyncClient.from_config``.
        **kwargs: Forwarded to ``BBOTAsyncClient.from_config``. ``retries``
            (default ``1``) is also used as the cache lookup key.

    Returns:
        The cached or newly created no-proxy client.
    """
    retries = kwargs.get("retries", 1)
    try:
        return self.noproxy_web_clients[retries]
    except KeyError:
        from .client import BBOTAsyncClient

        # Shallow-copy the config and its "web" section so that clearing the
        # proxy below does not modify the mapping stored on self.config.
        noproxy_config = dict(self.config)
        noproxy_web = dict(noproxy_config.get("web", {}))
        noproxy_web["http_proxy"] = None
        noproxy_config["web"] = noproxy_web
        client = BBOTAsyncClient.from_config(noproxy_config, self.target, *args, **kwargs)
        # Store under the retry count the client itself reports.
        self.noproxy_web_clients[client.retries] = client
        return client


def _get_client_for_url(self, url, client=None):
    """Pick the HTTP client for ``url`` according to the proxy exclusion rules.

    Args:
        url: Target URL; converted with ``str()`` before parsing.
        client: Explicit client chosen by the caller. When given, it is
            returned unchanged and the exclusion rules are not consulted.

    Returns:
        ``client`` if provided; otherwise ``self.noproxy_web_client`` when
        exclusions are active and either ``self.proxy_bypass_all`` is set or
        the URL's hostname matches ``self.proxy_exclusion_target``; otherwise
        ``self.web_client``.
    """
    if client is not None:
        return client
    # No no-proxy client means exclusions are inactive; an empty URL has no host to match.
    if self.noproxy_web_client is None or not url:
        return self.web_client
    if self.proxy_bypass_all:
        return self.noproxy_web_client
    try:
        hostname = urlparse(str(url)).hostname
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. unmatched IPv6 brackets).
        # Callers invoke this helper before entering their error-handling
        # context, so fall back to the default client instead of raising here
        # and let the request itself report the bad URL.
        return self.web_client
    if hostname and self.proxy_exclusion_target.get(hostname):
        return self.noproxy_web_client
    return self.web_client
allow_redirects = kwargs.pop("allow_redirects", None) @@ -79,6 +139,8 @@ async def request(self, *args, **kwargs): if client_kwargs: client = self.AsyncClient(**client_kwargs) + else: + client = self._get_client_for_url(url, explicit_client) try: async with self._acatch(url, raise_error): @@ -144,7 +206,8 @@ async def stream_request(self, url, **kwargs): chunk_size = 8192 chunks = [] - async with self._acatch(url, raise_error=True), self.web_client.stream(url=url, **kwargs) as response: + stream_client = self._get_client_for_url(url) + async with self._acatch(url, raise_error=True), stream_client.stream(url=url, **kwargs) as response: agen = response.aiter_bytes(chunk_size=chunk_size) async for chunk in agen: _chunk_size = len(chunk) diff --git a/bbot/defaults.yml b/bbot/defaults.yml index fa3727b1b4..caa1a505e5 100644 --- a/bbot/defaults.yml +++ b/bbot/defaults.yml @@ -84,6 +84,9 @@ dns: web: # HTTP proxy http_proxy: + # Hosts/CIDRs to exclude from HTTP proxy (NO_PROXY equivalent) + # Examples: ["localhost", "*.internal.corp", "10.0.0.0/8", "elastic.mycompany.com"] + http_proxy_exclude: [] # Web user-agent user_agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.2151.97 # Suffix to append to user-agent (e.g. 
for tracking or identification) diff --git a/bbot/scanner/preset/args.py b/bbot/scanner/preset/args.py index ffda0de2a5..9813735697 100644 --- a/bbot/scanner/preset/args.py +++ b/bbot/scanner/preset/args.py @@ -177,6 +177,9 @@ def preset_from_args(self): if self.parsed.proxy: args_preset.core.merge_custom({"web": {"http_proxy": self.parsed.proxy}}) + if self.parsed.no_proxy: + args_preset.core.merge_custom({"web": {"http_proxy_exclude": self.parsed.no_proxy}}) + if self.parsed.custom_headers: args_preset.core.merge_custom({"web": {"http_headers": self.parsed.custom_headers}}) @@ -372,6 +375,13 @@ def create_parser(self, *args, **kwargs): misc = p.add_argument_group(title="Misc") misc.add_argument("--version", action="store_true", help="show BBOT version and exit") misc.add_argument("--proxy", help="Use this proxy for all HTTP requests", metavar="HTTP_PROXY") + misc.add_argument( + "--no-proxy", + nargs="+", + default=[], + help="Exclude these hosts from proxy (e.g. localhost *.internal.corp 10.0.0.0/8)", + metavar="HOST", + ) misc.add_argument( "-H", "--custom-headers", diff --git a/bbot/scanner/preset/environ.py b/bbot/scanner/preset/environ.py index a222dd1bb3..2a833d0097 100644 --- a/bbot/scanner/preset/environ.py +++ b/bbot/scanner/preset/environ.py @@ -125,6 +125,13 @@ def prepare(self): environ.pop("HTTP_PROXY", None) environ.pop("HTTPS_PROXY", None) + # handle proxy exclusions (NO_PROXY) + http_proxy_exclude = self.preset.config.get("web", {}).get("http_proxy_exclude", []) + if http_proxy_exclude: + environ["NO_PROXY"] = ",".join(str(x) for x in http_proxy_exclude) + else: + environ.pop("NO_PROXY", None) + # ssl verification import urllib3 diff --git a/bbot/scanner/scanner.py b/bbot/scanner/scanner.py index b2883cded6..505d4e6311 100644 --- a/bbot/scanner/scanner.py +++ b/bbot/scanner/scanner.py @@ -232,6 +232,7 @@ def __init__( max_redirects = web_config.get("http_max_redirects", 5) self.web_max_redirects = max(max_redirects, self.web_spider_distance) 
@pytest.mark.asyncio
async def test_http_proxy_exclude(bbot_scanner, bbot_httpserver, proxy_server):
    """Verify that requests to excluded hosts bypass the proxy."""
    endpoint = "/test_http_proxy_exclude"
    url = bbot_httpserver.url_for(endpoint)
    bbot_httpserver.expect_request(uri=endpoint).respond_with_data("proxy_exclude_works")

    proxy_address = f"http://127.0.0.1:{proxy_server.server_address[1]}"
    # Exclude 127.0.0.1 from proxy
    scan = bbot_scanner(
        "127.0.0.1",
        config={
            "web": {
                "http_proxy": proxy_address,
                "http_proxy_exclude": ["127.0.0.1"],
            }
        },
    )

    await scan._prep()
    try:
        # start from an empty record so only this request is counted
        proxy_server.RequestHandlerClass.urls.clear()
        r = await scan.helpers.request(url)

        # Request should NOT go through proxy
        assert len(proxy_server.RequestHandlerClass.urls) == 0, "Request should have bypassed proxy but went through it"
        assert r.status_code == 200 and r.text == "proxy_exclude_works"
    finally:
        # always clean up the scan, even when an assertion above fails
        await scan._cleanup()


@pytest.mark.asyncio
async def test_http_proxy_exclude_passthrough(bbot_scanner, bbot_httpserver, proxy_server):
    """Verify that non-excluded hosts still go through the proxy."""
    endpoint = "/test_proxy_passthrough"
    url = bbot_httpserver.url_for(endpoint)
    bbot_httpserver.expect_request(uri=endpoint).respond_with_data("passthrough_works")

    proxy_address = f"http://127.0.0.1:{proxy_server.server_address[1]}"
    # Exclude a different host, not the one we're requesting
    scan = bbot_scanner(
        "127.0.0.1",
        config={
            "web": {
                "http_proxy": proxy_address,
                "http_proxy_exclude": ["10.0.0.0/8"],
            }
        },
    )

    await scan._prep()
    try:
        # start from an empty record so only this request is counted
        proxy_server.RequestHandlerClass.urls.clear()
        r = await scan.helpers.request(url)

        # Request SHOULD go through proxy (127.0.0.1 not in exclusion list)
        assert len(proxy_server.RequestHandlerClass.urls) == 1, (
            f"Request to {url} should have gone through proxy but didn't"
        )
        assert r.status_code == 200 and r.text == "passthrough_works"
    finally:
        # always clean up the scan, even when an assertion above fails
        await scan._cleanup()