From de11ef24dbb9c2beadea0c239f9947329e4f74ac Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Tue, 28 Oct 2025 17:18:34 +0100 Subject: [PATCH 01/16] Include apex domain for prediction --- subwiz/main.py | 22 ++++++++++++++-------- subwiz/model.py | 10 +++++++++- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/subwiz/main.py b/subwiz/main.py index 4e0a1a3..65c8e79 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -32,8 +32,8 @@ MODEL_REPO = "HadrianSecurity/subwiz" -MODEL_FILE = "model.pt" -TOKENIZER_FILE = "tokenizer.json" +MODEL_FILE = "model_v2.pt" +TOKENIZER_FILE = "tokenizer_v2.json" CONFIG_FILE = "config.json" @@ -102,13 +102,19 @@ def run_inference( Set of predicted domain objects """ - apex = next(iter(input_domains)).apex_domain subs = [dom.subdomain for dom in input_domains] - tokenizer_input = ",".join(sorted(subs)) + "[DELIM]" - # TODO: pick a different subset, if some were out of context last iteration + apex_domain = input_domains[0].apex_domain + subdomains_tokenizer_input = ",".join(sorted(subs)) + "[DELIM]" + apex_tokenizer_input = "[BOS]" + apex_domain + "[DELIM]" - x = tokenizer.encode(tokenizer_input) - x = [1] * (gpt_model.config.block_size - len(x)) + x + subs_x = tokenizer.encode(subdomains_tokenizer_input) + apex_x = tokenizer.encode(apex_tokenizer_input) + + # Trim subs to account for the apex part, grab last part + subs_x = subs_x[:gpt_model.config.block_size - len(apex_x)] + + x = apex_x + subs_x + x = [gpt_model.pad_token] * (gpt_model.config.block_size - len(x)) + x x = torch.tensor(x) blocked_outputs = {dom.subdomain for dom in blocked_domains} @@ -128,7 +134,7 @@ def run_inference( for pred in predictions } - predictions: set[str] = {sub + "." + apex for sub in predictions} + predictions: set[str] = {sub + "." 
+ apex_domain for sub in predictions} predicted_domains: set[Domain] = set() for pred in predictions: diff --git a/subwiz/model.py b/subwiz/model.py index f230d45..1691695 100644 --- a/subwiz/model.py +++ b/subwiz/model.py @@ -428,6 +428,10 @@ def generate( idx = idx.to(self.device) sequences = idx.unsqueeze(0) + # Locate the first delimiter token and get its position + delim_token_id = self.delim_token + trimming_position = (sequences == delim_token_id).nonzero(as_tuple=True)[1][0] + probabilities = torch.tensor([1.0], device=self.device) finished_sequences = torch.tensor([], device=self.device) @@ -441,7 +445,11 @@ def generate( on_iteration() # trim the sequences down to block size - sequences = sequences[:, -self.config.block_size :] + if sequences.shape[1] > self.config.block_size: + sequences = torch.cat( + (sequences[:, :trimming_position], sequences[:, trimming_position + 1:]), + dim=1, + ) # remove any invalid subdomain starts outputs = self.tokenizer.batch_decode(sequences[:, -i:]) From b641f51bd86da9d71af604c08f9bba3814b1b3aa Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 29 Oct 2025 19:33:31 +0100 Subject: [PATCH 02/16] Working version --- subwiz/main.py | 14 +++++++------- subwiz/model.py | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/subwiz/main.py b/subwiz/main.py index 65c8e79..b665e0d 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -17,10 +17,10 @@ import torch from transformers import PreTrainedTokenizerFast -from subwiz.cli_printer import print_hello, print_log, print_progress_dot -from subwiz.model import GPT -from subwiz.resolve import get_registered_domains -from subwiz.type import ( +from subwiz_v2.cli_printer import print_hello, print_log, print_progress_dot +from subwiz_v2.model import GPT +from subwiz_v2.resolve import get_registered_domains +from subwiz_v2.type import ( Domain, input_domains_type, device_type, @@ -103,15 +103,15 @@ def run_inference( """ subs = 
[dom.subdomain for dom in input_domains] - apex_domain = input_domains[0].apex_domain + apex_domain = next(iter(input_domains)).apex_domain subdomains_tokenizer_input = ",".join(sorted(subs)) + "[DELIM]" - apex_tokenizer_input = "[BOS]" + apex_domain + "[DELIM]" + apex_tokenizer_input = apex_domain + "[DELIM]" subs_x = tokenizer.encode(subdomains_tokenizer_input) apex_x = tokenizer.encode(apex_tokenizer_input) # Trim subs to account for the apex part, grab last part - subs_x = subs_x[:gpt_model.config.block_size - len(apex_x)] + subs_x = subs_x[-(gpt_model.config.block_size - len(apex_x)):] x = apex_x + subs_x x = [gpt_model.pad_token] * (gpt_model.config.block_size - len(x)) + x diff --git a/subwiz/model.py b/subwiz/model.py index 1691695..84be7de 100644 --- a/subwiz/model.py +++ b/subwiz/model.py @@ -248,6 +248,7 @@ def __init__(self, config: GPTConfig): self.end_token = self.tokenizer("[END]")["input_ids"][0] self.comma_token = self.tokenizer(",")["input_ids"][0] self.delim_token = self.tokenizer("[DELIM]")["input_ids"][0] + self.pad_token = self.tokenizer("[PAD]")["input_ids"][0] self.transformer = nn.ModuleDict( dict( @@ -382,6 +383,28 @@ def device(self) -> str: # assign model inputs to the right device return next(self.lm_head.parameters()).device.type + def _trim_subdomains( + self, + sequences: torch.Tensor, + num_initial_pad_tokens: int, + num_tokens_generated: int, + ) -> torch.Tensor: + if num_tokens_generated == 0: + return sequences + + if num_tokens_generated > num_initial_pad_tokens: + # Trim after the fist delim token, which is the same for every element of batch + trimming_position = 1 + (sequences == self.delim_token).nonzero(as_tuple=True)[1][0] + else: + # Remove the first pad token + trimming_position = 0 + + sequences = torch.cat( + (sequences[:, :trimming_position], sequences[:, trimming_position + 1:]), + dim=1, + ) + return sequences + @torch.no_grad() def generate( self, @@ -426,12 +449,9 @@ def generate( run_uuid = uuid.uuid4() idx = 
idx.to(self.device) + num_initial_pad_tokens = (idx == self.pad_token).sum().item() sequences = idx.unsqueeze(0) - # Locate the first delimiter token and get its position - delim_token_id = self.delim_token - trimming_position = (sequences == delim_token_id).nonzero(as_tuple=True)[1][0] - probabilities = torch.tensor([1.0], device=self.device) finished_sequences = torch.tensor([], device=self.device) @@ -445,11 +465,11 @@ def generate( on_iteration() # trim the sequences down to block size - if sequences.shape[1] > self.config.block_size: - sequences = torch.cat( - (sequences[:, :trimming_position], sequences[:, trimming_position + 1:]), - dim=1, - ) + sequences = self._trim_subdomains( + sequences, + num_initial_pad_tokens=num_initial_pad_tokens, + num_tokens_generated=i, + ) # remove any invalid subdomain starts outputs = self.tokenizer.batch_decode(sequences[:, -i:]) From 89a29337b643d4227d0758e81e6ba9c6cee92b06 Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 29 Oct 2025 19:34:49 +0100 Subject: [PATCH 03/16] Fix imports --- subwiz/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subwiz/main.py b/subwiz/main.py index b665e0d..16b4761 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -17,10 +17,10 @@ import torch from transformers import PreTrainedTokenizerFast -from subwiz_v2.cli_printer import print_hello, print_log, print_progress_dot -from subwiz_v2.model import GPT -from subwiz_v2.resolve import get_registered_domains -from subwiz_v2.type import ( +from subwiz.cli_printer import print_hello, print_log, print_progress_dot +from subwiz.model import GPT +from subwiz.resolve import get_registered_domains +from subwiz.type import ( Domain, input_domains_type, device_type, From 74e875f91762199b99c8328f01c9eca12afa42fc Mon Sep 17 00:00:00 2001 From: klaasmeinke <48212135+klaasmeinke@users.noreply.github.com> Date: Wed, 5 Nov 2025 15:54:46 +0100 Subject: [PATCH 04/16] Update subwiz/model.py --- subwiz/model.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/subwiz/model.py b/subwiz/model.py index 84be7de..a8a14fd 100644 --- a/subwiz/model.py +++ b/subwiz/model.py @@ -388,7 +388,7 @@ def _trim_subdomains( sequences: torch.Tensor, num_initial_pad_tokens: int, num_tokens_generated: int, - ) -> torch.Tensor: + ) -> torch.Tensor: if num_tokens_generated == 0: return sequences From 7907b60170120aad871691d7b824911df0f632d8 Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:06:07 +0100 Subject: [PATCH 05/16] More efficient way of trimming when there is no pad --- subwiz/model.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/subwiz/model.py b/subwiz/model.py index a8a14fd..a469a9d 100644 --- a/subwiz/model.py +++ b/subwiz/model.py @@ -386,18 +386,17 @@ def device(self) -> str: def _trim_subdomains( self, sequences: torch.Tensor, - num_initial_pad_tokens: int, + apex_unpadded_position: int, num_tokens_generated: int, ) -> torch.Tensor: if num_tokens_generated == 0: return sequences - if num_tokens_generated > num_initial_pad_tokens: - # Trim after the fist delim token, which is the same for every element of batch - trimming_position = 1 + (sequences == self.delim_token).nonzero(as_tuple=True)[1][0] - else: - # Remove the first pad token - trimming_position = 0 + trimming_position = ( + apex_unpadded_position + if num_tokens_generated > apex_unpadded_position + else num_tokens_generated + ) sequences = torch.cat( (sequences[:, :trimming_position], sequences[:, trimming_position + 1:]), @@ -452,6 +451,10 @@ def generate( num_initial_pad_tokens = (idx == self.pad_token).sum().item() sequences = idx.unsqueeze(0) + apex_padded_position = (sequences == self.delim_token).nonzero(as_tuple=True)[1][0] + + apex_unpadded_position = 1 + apex_padded_position - num_initial_pad_tokens + probabilities = torch.tensor([1.0], device=self.device) finished_sequences = torch.tensor([], device=self.device) @@ -467,7 +470,7 @@ def generate( 
# trim the sequences down to block size sequences = self._trim_subdomains( sequences, - num_initial_pad_tokens=num_initial_pad_tokens, + apex_unpadded_position=apex_unpadded_position, num_tokens_generated=i, ) From 192dd23be4bacc3fe024db30ad9848ec46f855a1 Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:07:51 +0100 Subject: [PATCH 06/16] linting --- subwiz/main.py | 2 +- subwiz/model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/subwiz/main.py b/subwiz/main.py index 16b4761..5959a1b 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -111,7 +111,7 @@ def run_inference( apex_x = tokenizer.encode(apex_tokenizer_input) # Trim subs to account for the apex part, grab last part - subs_x = subs_x[-(gpt_model.config.block_size - len(apex_x)):] + subs_x = subs_x[-(gpt_model.config.block_size - len(apex_x)) :] x = apex_x + subs_x x = [gpt_model.pad_token] * (gpt_model.config.block_size - len(x)) + x diff --git a/subwiz/model.py b/subwiz/model.py index a469a9d..6c9360d 100644 --- a/subwiz/model.py +++ b/subwiz/model.py @@ -399,7 +399,7 @@ def _trim_subdomains( ) sequences = torch.cat( - (sequences[:, :trimming_position], sequences[:, trimming_position + 1:]), + (sequences[:, :trimming_position], sequences[:, trimming_position + 1 :]), dim=1, ) return sequences From 732c052c76b504bdd488ac73f1225be4e4eb1d9d Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:37:02 +0100 Subject: [PATCH 07/16] add revision system --- subwiz/main.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/subwiz/main.py b/subwiz/main.py index 5959a1b..e826262 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -12,6 +12,8 @@ from collections import defaultdict from typing import Callable +from importlib.metadata import version + from huggingface_hub import hf_hub_download from huggingface_hub.utils import disable_progress_bars, enable_progress_bars import torch @@ -35,12 +37,13 @@ MODEL_FILE 
= "model_v2.pt" TOKENIZER_FILE = "tokenizer_v2.json" CONFIG_FILE = "config.json" - +REVISION = "9a2c505d0312ad6938b27d9b4338020fe37883e8" def get_model_and_tokenizer( force_download: bool, device: str, quiet: bool, + version: str = "weights_v2", ) -> tuple[GPT, PreTrainedTokenizerFast]: """Download files from HuggingFace to run subwiz. Caches in local file system. @@ -54,14 +57,24 @@ def get_model_and_tokenizer( if quiet: disable_progress_bars() + model_path = hf_hub_download( - repo_id=MODEL_REPO, filename=MODEL_FILE, force_download=force_download + repo_id=MODEL_REPO, + filename=MODEL_FILE, + force_download=force_download, + revision=REVISION, ) tokenizer_path = hf_hub_download( - repo_id=MODEL_REPO, filename=TOKENIZER_FILE, force_download=force_download + repo_id=MODEL_REPO, + filename=TOKENIZER_FILE, + force_download=force_download, + revision=REVISION, ) hf_hub_download( - repo_id=MODEL_REPO, filename=CONFIG_FILE, force_download=force_download + repo_id=MODEL_REPO, + filename=CONFIG_FILE, + force_download=force_download, + revision=REVISION, ) if quiet: enable_progress_bars() @@ -304,8 +317,15 @@ def run( "Use the --multi-apex flag to process them all." 
) + # Auto-detect version from package + try: + pkg_version = version("subwiz") + model_version = "weights_v2" if pkg_version >= "0.5.0" else "weights_v1" + except Exception: + model_version = "weights_v1" # default to old module version + gpt_model, tokenizer = get_model_and_tokenizer( - force_download, device=device, quiet=quiet + force_download, device=device, quiet=quiet, version=model_version ) found_domains = set() From ca6b50802a9dcdeb297e8b117fcd4f755ee4d287 Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:37:55 +0100 Subject: [PATCH 08/16] lint --- subwiz/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/subwiz/model.py b/subwiz/model.py index 6c9360d..730ae9d 100644 --- a/subwiz/model.py +++ b/subwiz/model.py @@ -451,7 +451,9 @@ def generate( num_initial_pad_tokens = (idx == self.pad_token).sum().item() sequences = idx.unsqueeze(0) - apex_padded_position = (sequences == self.delim_token).nonzero(as_tuple=True)[1][0] + apex_padded_position = ( + sequences == self.delim_token + ).nonzero(as_tuple=True)[1][0] apex_unpadded_position = 1 + apex_padded_position - num_initial_pad_tokens From 9b0ec959d94e18012459ef82c261d2d1eefb0257 Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:38:59 +0100 Subject: [PATCH 09/16] wicked lint --- subwiz/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/subwiz/model.py b/subwiz/model.py index 730ae9d..ddcade2 100644 --- a/subwiz/model.py +++ b/subwiz/model.py @@ -451,9 +451,9 @@ def generate( num_initial_pad_tokens = (idx == self.pad_token).sum().item() sequences = idx.unsqueeze(0) - apex_padded_position = ( - sequences == self.delim_token - ).nonzero(as_tuple=True)[1][0] + apex_padded_position = (sequences == self.delim_token).nonzero(as_tuple=True)[ + 1 + ][0] apex_unpadded_position = 1 + apex_padded_position - num_initial_pad_tokens From 7b490babe41f546b51fc56d27b61feae74f428cd Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: 
Wed, 5 Nov 2025 16:43:22 +0100 Subject: [PATCH 10/16] linting --- subwiz/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subwiz/main.py b/subwiz/main.py index e826262..2bbe5f2 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -39,6 +39,7 @@ CONFIG_FILE = "config.json" REVISION = "9a2c505d0312ad6938b27d9b4338020fe37883e8" + def get_model_and_tokenizer( force_download: bool, device: str, @@ -57,7 +58,6 @@ def get_model_and_tokenizer( if quiet: disable_progress_bars() - model_path = hf_hub_download( repo_id=MODEL_REPO, filename=MODEL_FILE, @@ -323,7 +323,7 @@ def run( model_version = "weights_v2" if pkg_version >= "0.5.0" else "weights_v1" except Exception: model_version = "weights_v1" # default to old module version - + gpt_model, tokenizer = get_model_and_tokenizer( force_download, device=device, quiet=quiet, version=model_version ) From 50e388727132678e1e5b5a017c9a6fc1e6f45fe5 Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:46:51 +0100 Subject: [PATCH 11/16] New language tests --- tests/test_results.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_results.py b/tests/test_results.py index 901699a..cd86e8c 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -19,8 +19,13 @@ def test_languages(): no_resolve=True, ) print(results) - assert "english.hadrian.io" in results - + assert { + "english.hadrian.io", + "french.hadrian.io", + "spanish.hadrian.io", + "portuguese.hadrian.io", + "dutch.hadrian.io", + } | results def test_numbers(): """Test that numeric subdomain patterns are generated correctly. 
From 3bf7778e3dec5e0526e09e06f7163ce0d91f8e13 Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:48:11 +0100 Subject: [PATCH 12/16] Intersect the set --- tests/test_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_results.py b/tests/test_results.py index cd86e8c..b1edca6 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -25,7 +25,7 @@ def test_languages(): "spanish.hadrian.io", "portuguese.hadrian.io", "dutch.hadrian.io", - } | results + } & set(results) def test_numbers(): """Test that numeric subdomain patterns are generated correctly. From bab673ff2dad58f914281475f59cafbe8139390b Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:49:09 +0100 Subject: [PATCH 13/16] add couple comments --- subwiz/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subwiz/main.py b/subwiz/main.py index 2bbe5f2..b3da07f 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -32,11 +32,12 @@ concurrency_type, ) - +# v2 stands for the module version not the model weights version MODEL_REPO = "HadrianSecurity/subwiz" MODEL_FILE = "model_v2.pt" TOKENIZER_FILE = "tokenizer_v2.json" CONFIG_FILE = "config.json" +# The revision changes with every new model weights version REVISION = "9a2c505d0312ad6938b27d9b4338020fe37883e8" From c359cb2bcef752a931804fd6b5f0e75974b7fc2e Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 16:50:42 +0100 Subject: [PATCH 14/16] linting problems never end --- tests/test_results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_results.py b/tests/test_results.py index b1edca6..9cda550 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -27,6 +27,7 @@ def test_languages(): "dutch.hadrian.io", } & set(results) + def test_numbers(): """Test that numeric subdomain patterns are generated correctly. 
From 761dbbda1a00b22125ae04e9b54a8f86a63221a5 Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 17:07:31 +0100 Subject: [PATCH 15/16] no need to fetch model version --- subwiz/main.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/subwiz/main.py b/subwiz/main.py index b3da07f..38ccfec 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -45,7 +45,6 @@ def get_model_and_tokenizer( force_download: bool, device: str, quiet: bool, - version: str = "weights_v2", ) -> tuple[GPT, PreTrainedTokenizerFast]: """Download files from HuggingFace to run subwiz. Caches in local file system. @@ -318,15 +317,8 @@ def run( "Use the --multi-apex flag to process them all." ) - # Auto-detect version from package - try: - pkg_version = version("subwiz") - model_version = "weights_v2" if pkg_version >= "0.5.0" else "weights_v1" - except Exception: - model_version = "weights_v1" # default to old module version - gpt_model, tokenizer = get_model_and_tokenizer( - force_download, device=device, quiet=quiet, version=model_version + force_download, device=device, quiet=quiet ) found_domains = set() From 295c5a88fba12b39f9814c3eff39d2567646b65e Mon Sep 17 00:00:00 2001 From: EduardoTerres Date: Wed, 5 Nov 2025 17:13:21 +0100 Subject: [PATCH 16/16] nit --- subwiz/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subwiz/main.py b/subwiz/main.py index 38ccfec..e200d59 100644 --- a/subwiz/main.py +++ b/subwiz/main.py @@ -32,12 +32,12 @@ concurrency_type, ) -# v2 stands for the module version not the model weights version + MODEL_REPO = "HadrianSecurity/subwiz" MODEL_FILE = "model_v2.pt" TOKENIZER_FILE = "tokenizer_v2.json" CONFIG_FILE = "config.json" -# The revision changes with every new model weights version +# Change revision when releasing new weights REVISION = "9a2c505d0312ad6938b27d9b4338020fe37883e8"