# Locates the start of an inline comment: one whitespace character immediately
# followed by "#". Requiring the whitespace is what keeps URL fragments such as
# "http://example.com/page#section" from being mistaken for comments.
_comment_re = re.compile(r"\s#")


def strip_comments(line):
    """Strip #-style comments from a line.

    Handles full-line comments (``# ...``) and inline comments (``target # ...``).
    The ``#`` must be preceded by whitespace to count as an inline comment,
    so URL fragments like ``http://example.com/page#section`` are preserved.

    When an inline comment is removed, all whitespace separating the content
    from the comment is removed with it, so aligned comments
    (``target      # ...``) do not leave trailing whitespace behind.
    Lines without a comment are returned unchanged.

    Args:
        line (str): A single line of text, e.g. a target entry or a line read
            from a target file.

    Returns:
        str: The line with any comment removed. A full-line comment yields
            an empty string.

    Examples:
        >>> strip_comments("evilcorp.com # main domain")
        'evilcorp.com'
        >>> strip_comments("evilcorp.com     # aligned comment")
        'evilcorp.com'
        >>> strip_comments("# full line comment")
        ''
        >>> strip_comments("http://example.com/page#section")
        'http://example.com/page#section'
    """
    # A line whose first non-whitespace character is "#" is entirely a comment.
    if line.lstrip().startswith("#"):
        return ""
    m = _comment_re.search(line)
    if m is None:
        # No comment present: hand the line back exactly as received.
        return line
    # The regex only consumes the last whitespace character before "#";
    # rstrip() drops the rest of that whitespace run so the result is clean.
    return line[: m.start()].rstrip()
@@ -1161,6 +1189,8 @@ def chain_lists( l = [l] final_list = {} for entry in l: + if _strip_comments: + entry = strip_comments(entry) for s in split_regex.split(entry): f = s.strip() if validate: @@ -1172,6 +1202,8 @@ def chain_lists( new_msg = str(msg).format(filename=f_path) log.info(new_msg) for line in str_or_file(f): + if _strip_comments: + line = strip_comments(line) final_list[line] = None else: final_list[f] = None diff --git a/bbot/scanner/preset/args.py b/bbot/scanner/preset/args.py index ffda0de2a5..f99305e5d8 100644 --- a/bbot/scanner/preset/args.py +++ b/bbot/scanner/preset/args.py @@ -403,14 +403,14 @@ def sanitize_args(self): self.parsed.exclude_modules = chain_lists(self.parsed.exclude_modules) self.parsed.output_modules = chain_lists(self.parsed.output_modules) self.parsed.targets = chain_lists( - self.parsed.targets, try_files=True, msg="Reading targets from file: {filename}" + self.parsed.targets, try_files=True, msg="Reading targets from file: {filename}", _strip_comments=True ) if self.parsed.seeds is not None: self.parsed.seeds = chain_lists( - self.parsed.seeds, try_files=True, msg="Reading seeds from file: {filename}" + self.parsed.seeds, try_files=True, msg="Reading seeds from file: {filename}", _strip_comments=True ) self.parsed.blacklist = chain_lists( - self.parsed.blacklist, try_files=True, msg="Reading blacklist from file: {filename}" + self.parsed.blacklist, try_files=True, msg="Reading blacklist from file: {filename}", _strip_comments=True ) self.parsed.flags = chain_lists(self.parsed.flags) self.parsed.exclude_flags = chain_lists(self.parsed.exclude_flags) diff --git a/bbot/scanner/preset/preset.py b/bbot/scanner/preset/preset.py index 01e6ba681f..c6ab3a8220 100644 --- a/bbot/scanner/preset/preset.py +++ b/bbot/scanner/preset/preset.py @@ -673,6 +673,7 @@ def from_dict(cls, preset_dict, name=None, _exclude=None, _log=False): cls._resolve_file_entries(target_vals), try_files=True, msg="Reading targets from preset file: {filename}", + 
_strip_comments=True, ) seeds = preset_dict.get("seeds") if seeds is not None: @@ -680,6 +681,7 @@ def from_dict(cls, preset_dict, name=None, _exclude=None, _log=False): cls._resolve_file_entries(seeds), try_files=True, msg="Reading seeds from preset file: {filename}", + _strip_comments=True, ) blacklist = preset_dict.get("blacklist") if blacklist is not None: @@ -687,6 +689,7 @@ def from_dict(cls, preset_dict, name=None, _exclude=None, _log=False): cls._resolve_file_entries(blacklist), try_files=True, msg="Reading blacklist from preset file: {filename}", + _strip_comments=True, ) new_preset = cls( *targets, diff --git a/bbot/scanner/target.py b/bbot/scanner/target.py index ff5c0d4548..d029b6f567 100644 --- a/bbot/scanner/target.py +++ b/bbot/scanner/target.py @@ -17,7 +17,7 @@ def _fnv1a_64(data_strings): from bbot.errors import * from bbot.core.event import is_event from bbot.core.event.helpers import EventSeed, BaseEventSeed -from bbot.core.helpers.misc import is_dns_name, is_ip, is_ip_type +from bbot.core.helpers.misc import is_dns_name, is_ip, is_ip_type, strip_comments log = logging.getLogger("bbot.core.target") @@ -61,8 +61,8 @@ class BaseTarget: accept_target_types = ["TARGET"] def __init__(self, *targets, strict_scope=False, acl_mode=False): - # ignore blank targets (sometimes happens as a symptom of .splitlines()) - targets = [stripped for t in targets if (stripped := (t.strip() if isinstance(t, str) else t))] + # strip comments and ignore blank targets + targets = [stripped for t in targets if (stripped := (strip_comments(t).strip() if isinstance(t, str) else t))] self.strict_scope = strict_scope self._rt = RadixTarget(strict_scope=strict_scope, acl_mode=acl_mode) self.event_seeds = set() diff --git a/bbot/test/test_step_1/test_target.py b/bbot/test/test_step_1/test_target.py index 0039fce14b..20977e8422 100644 --- a/bbot/test/test_step_1/test_target.py +++ b/bbot/test/test_step_1/test_target.py @@ -710,3 +710,126 @@ def test_target_pickle(): # hashes 
def test_target_comments():
    """Full-line and inline ``#`` comments are dropped from target strings."""
    from bbot.scanner.target import BBOTTarget

    entries = [
        "# this is a full-line comment",
        "evilcorp.com # main evilcorp domain",
        " # indented comment ",
        "1.2.3.0/24 # internal network",
        "othercorp.com",
    ]
    target = BBOTTarget(target=entries)

    # only the three real entries survive; comment-only lines add nothing
    assert len(target.target) == 3

    # whatever precedes an inline comment behaves like a normal target
    for host in ("evilcorp.com", "www.evilcorp.com", "1.2.3.4", "othercorp.com"):
        assert target.in_target(host)

    # words that only appear inside the comment text never become targets
    for word in ("main", "internal"):
        assert not target.in_target(word)


def test_target_comments_url_fragment_not_stripped():
    """A ``#`` that is part of a URL fragment is not a comment marker.

    The fragment itself may or may not survive URL normalisation; what is
    checked here is that the URL's host still ends up as a single valid target.
    """
    from bbot.scanner.target import BBOTTarget

    url = "http://evilcorp.com/page#section"
    target = BBOTTarget(target=[url])
    assert target.in_target("evilcorp.com")
    assert len(target.target) == 1


def test_target_comments_blacklist():
    """Blacklist entries accept the same comment syntax as targets."""
    from bbot.scanner.target import BBOTTarget

    blacklist_entries = [
        "# don't scan the blog",
        "blog.evilcorp.com # unstable host",
    ]
    target = BBOTTarget(target=["evilcorp.com"], blacklist=blacklist_entries)

    assert target.in_scope("www.evilcorp.com")
    assert not target.in_scope("blog.evilcorp.com")
    assert len(target.blacklist) == 1


def test_target_comments_seeds():
    """Seed entries accept the same comment syntax as targets."""
    from bbot.scanner.target import BBOTTarget

    seed_entries = [
        "# seed comment",
        "evilcorp.com # the main domain",
    ]
    target = BBOTTarget(target=["evilcorp.com"], seeds=seed_entries)

    assert "evilcorp.com" in target.seeds
    assert len(target.seeds) == 1


def test_target_comments_from_file(tmp_path):
    """Loading a target file through chain_lists removes its comments."""
    from bbot.core.helpers.misc import chain_lists

    file_lines = [
        "# My target list",
        "evilcorp.com # main domain",
        "",
        " # another comment",
        "othercorp.com",
        "192.168.1.0/24 # lab network",
        "http://example.com/page#fragment # with a URL fragment",
    ]
    target_file = tmp_path / "targets.txt"
    # every entry, including the last one, is terminated by a newline
    target_file.write_text("".join(f"{text}\n" for text in file_lines))

    result = chain_lists([str(target_file)], try_files=True, _strip_comments=True)

    expected_entries = (
        "evilcorp.com",
        "othercorp.com",
        "192.168.1.0/24",
        "http://example.com/page#fragment",
    )
    for expected in expected_entries:
        assert expected in result

    # neither comment lines nor blank lines make it into the result
    assert all(not r.lstrip().startswith("#") for r in result)
    assert len(result) == 4


def test_strip_comments_helper():
    """Table-driven checks for the strip_comments helper."""
    from bbot.core.helpers.misc import strip_comments

    cases = [
        # full-line comments
        ("# comment", ""),
        (" # indented comment", ""),
        # inline comments
        ("evilcorp.com # main domain", "evilcorp.com"),
        ("1.2.3.0/24\t# tab comment", "1.2.3.0/24"),
        # no comment at all
        ("evilcorp.com", "evilcorp.com"),
        # URL fragment: no whitespace before '#', so it is kept
        ("http://example.com/page#section", "http://example.com/page#section"),
        # URL fragment followed by a real inline comment
        ("http://example.com/page#section # a comment", "http://example.com/page#section"),
        # empty / whitespace-only input
        ("", ""),
        (" ", " "),
    ]
    for raw, expected in cases:
        assert strip_comments(raw) == expected