From a4dd1e3049002183f43e29b937ad90b50bf7d5ca Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 27 Feb 2026 08:57:39 +0000 Subject: [PATCH] fix(transform): use scheme as domain for file:// and other non-web URLs When splitting URLs, file://, about:, and other URLs without a netloc produced an empty string for $domain. This caused all such events to cluster together as a single empty entry in "Top Browser Domains". Now falls back to using the URL scheme (e.g. "file", "about") as the domain when netloc is empty. This groups local file activity under a visible "file" domain label instead of an invisible empty string. Fixes #67 --- aw_transform/split_url_events.py | 15 ++++++++++----- tests/test_transforms.py | 12 +++++++++++- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/aw_transform/split_url_events.py b/aw_transform/split_url_events.py index 78ec93f4..8a4466f6 100644 --- a/aw_transform/split_url_events.py +++ b/aw_transform/split_url_events.py @@ -14,11 +14,16 @@ def split_url_events(events: List[Event]) -> List[Event]: url = event.data["url"] parsed_url = urlparse(url) event.data["$protocol"] = parsed_url.scheme - event.data["$domain"] = ( - parsed_url.netloc[4:] - if parsed_url.netloc[:4] == "www." - else parsed_url.netloc - ) + netloc = parsed_url.netloc + if netloc: + domain = netloc[4:] if netloc[:4] == "www." else netloc + elif parsed_url.scheme: + # For URLs without a domain (e.g. file://, about:), + # use the scheme as domain so they don't all cluster as empty. + domain = parsed_url.scheme + else: + domain = "" + event.data["$domain"] = domain event.data["$path"] = parsed_url.path event.data["$params"] = parsed_url.params event.data["$options"] = parsed_url.query diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 1d06e396..3aa119ca 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -345,12 +345,22 @@ def test_url_parse_event(): result = split_url_events([e3]) print(result) assert result[0].data["$protocol"] == "file" - assert result[0].data["$domain"] == "" + assert result[0].data["$domain"] == "file" assert result[0].data["$path"] == "/home/johan/myfile.txt" assert result[0].data["$params"] == "" assert result[0].data["$options"] == "" assert result[0].data["$identifier"] == "" + # Test about: URLs + e4 = Event( + data={"url": "about:blank"}, + timestamp=now, + duration=timedelta(seconds=1), + ) + result = split_url_events([e4]) + assert result[0].data["$protocol"] == "about" + assert result[0].data["$domain"] == "about" + def test_union(): now = datetime.now(timezone.utc)