From c76d2b0b37eb4a7684e3381e4dcfe0f316321ba4 Mon Sep 17 00:00:00 2001 From: Henry Post Date: Fri, 13 Feb 2026 12:54:51 -0600 Subject: [PATCH 1/4] add powerbi unflatten util --- scripts/PowerBI-unflatten.py | 135 +++++ scripts/snaffler-schema.json | 1007 ++++++++++++++++++++++++++++++++++ 2 files changed, 1142 insertions(+) create mode 100644 scripts/PowerBI-unflatten.py create mode 100644 scripts/snaffler-schema.json diff --git a/scripts/PowerBI-unflatten.py b/scripts/PowerBI-unflatten.py new file mode 100644 index 00000000..e93e048f --- /dev/null +++ b/scripts/PowerBI-unflatten.py @@ -0,0 +1,135 @@ +''' +1. print all json properties (generate a schema) for the native snaffler json output stored at ./snaffledShares.json +2. Save the schema to ./snafflerSchema.tmp.json +3. read the schema and understand it +4. create a function to mutate the schema so it's nested properly (i.e. no more entries.eventProperties.Green, but rather, entries.event.severity, etc) +5. use the function to mutate ./snaffledShares.json into ./snaffledShares.powerbi.json +6. read the mutated json file and confirm it's valid json and has similar data. + +NOTE: You can remove 'entries.[].rawEventProperties' in PowerBI when you import, if it causes issues. +Add that removal as a transform step. +''' + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, Iterable, List, Set + + +# NOTE USER PROVIDED ARGUMENTS: Change directories here. 
# All input/output files live next to this script.
BASE_DIR = Path(__file__).parent
SOURCE_PATH = BASE_DIR / "snaffledShares.json"
SCHEMA_PATH = BASE_DIR / "snafflerSchema.tmp.json"
OUTPUT_PATH = BASE_DIR / "snaffledShares.powerbi.json"


def load_source() -> Dict[str, Any]:
    """Load the raw snaffler output.

    Returns:
        The JSON document parsed from SOURCE_PATH.

    Raises:
        OSError: if SOURCE_PATH cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with SOURCE_PATH.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def _type_name(value: Any) -> str:
    """Return the JSON-style type name for a decoded JSON value."""
    if value is None:
        return "null"
    # bool must be tested before int: bool is a subclass of int in Python.
    if isinstance(value, bool):
        return "boolean"
    if isinstance(value, int):
        return "integer"
    if isinstance(value, float):
        return "number"
    if isinstance(value, str):
        return "string"
    if isinstance(value, list):
        return "array"
    if isinstance(value, dict):
        return "object"
    # Not a JSON type; fall back to the Python class name.
    return type(value).__name__


def _walk_schema(node: Any, path: str, found: Dict[str, Set[str]]) -> None:
    """Collect a lightweight path->types schema.

    Args:
        node: the value currently being visited.
        path: dotted path of ``node``; the document root is ``""`` and list
            items share the path of their list suffixed with ``[]``.
        found: accumulator mapping each path to the set of type names seen
            there. It is updated in place.
    """
    found.setdefault(path, set()).add(_type_name(node))

    if isinstance(node, dict):
        for key, value in node.items():
            next_path = f"{path}.{key}" if path else key
            _walk_schema(value, next_path, found)
    elif isinstance(node, list):
        # Every element of a list is recorded under the same "[]" path.
        next_path = f"{path}[]" if path else "[]"
        for item in node:
            _walk_schema(item, next_path, found)


def build_schema(document: Dict[str, Any]) -> Dict[str, List[str]]:
    """Return a simple schema mapping JSON paths to observed types.

    Both the paths and the type names listed for each path are sorted, so the
    result is deterministic for a given document.
    """
    paths: Dict[str, Set[str]] = {}
    _walk_schema(document, "", paths)
    return {k: sorted(v) for k, v in sorted(paths.items())}


def normalize_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten eventProperties.* color buckets into a common event object.

    Example:
        {"eventProperties": {"Green": {"DateTime": "...", ...}}}
    becomes
        {"event": {"severity": "Green", "DateTime": "...", ...}}

    Only the first bucket of ``eventProperties`` is promoted to ``event``.
    The complete original mapping is kept under ``rawEventProperties`` so
    that nothing is lost. The input ``entry`` is not modified.

    Args:
        entry: one element of the document's ``entries`` list.

    Returns:
        A new dict with every key of ``entry`` except ``eventProperties``,
        plus ``event`` / ``rawEventProperties`` when a severity bucket exists.
    """
    event_props = entry.get("eventProperties") or {}

    # Copy base fields to avoid mutating the original dict.
    normalized: Dict[str, Any] = {
        key: value
        for key, value in entry.items()
        if key != "eventProperties"
    }

    if not isinstance(event_props, dict):
        # Unexpected shape (e.g. a list or string): there is no severity
        # bucket to unflatten, but keep the value instead of crashing on
        # .items() or silently dropping it.
        normalized["rawEventProperties"] = event_props
        return normalized

    # Take the first severity bucket, but keep the raw payload if present.
    severity, payload = next(iter(event_props.items()), (None, None))

    if severity:
        # Carry original message-level data plus normalized event block.
        event: Dict[str, Any] = {"severity": severity}
        if isinstance(payload, dict):
            event = {"severity": severity, **payload}
        # A non-object payload (e.g. null) cannot be spread into the event;
        # it is still available through rawEventProperties below.
        normalized["event"] = event
        normalized["rawEventProperties"] = event_props  # keep original for traceability

    return normalized


def transform(document: Dict[str, Any]) -> Dict[str, Any]:
    """Apply normalization to every entry.

    Only the ``entries`` list is carried into the result; a missing or null
    ``entries`` value is treated as an empty list.
    """
    entries = document.get("entries") or []
    transformed = [normalize_entry(entry) for entry in entries]
    return {"entries": transformed}


def write_json(path: Path, payload: Any) -> None:
    """Write ``payload`` to ``path`` as indented UTF-8 JSON.

    Missing parent directories are created, non-ASCII characters are written
    unescaped, and the file ends with a trailing newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)
        handle.write("\n")


def main() -> None:
    """Generate the schema file and the PowerBI-friendly JSON file."""
    raw = load_source()

    # Step 1 & 2: derive a path/type schema and persist it.
    schema = build_schema(raw)
    write_json(SCHEMA_PATH, schema)

    # Step 4 & 5: normalize and save PowerBI-friendly version.
    transformed = transform(raw)
    write_json(OUTPUT_PATH, transformed)

    # Step 6: lightweight validation parity checks.
+ raw_count = len(raw.get("entries", [])) + new_count = len(transformed.get("entries", [])) + assert raw_count == new_count, ( + f"entry count changed during transform: {raw_count} -> {new_count}" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/snaffler-schema.json b/scripts/snaffler-schema.json new file mode 100644 index 00000000..8f35625e --- /dev/null +++ b/scripts/snaffler-schema.json @@ -0,0 +1,1007 @@ +{ + "": [ + "object" + ], + "entries": [ + "array" + ], + "entries[]": [ + "object" + ], + "entries[].eventProperties": [ + "object" + ], + "entries[].eventProperties.Black": [ + "object" + ], + "entries[].eventProperties.Black.DateTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult": [ + "object" + ], + "entries[].eventProperties.Black.FileResult.FileInfo": [ + "object" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Attributes": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.CreationTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory": [ + "object" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Attributes": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.CreationTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Exists": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Extension": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.FullName": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.LastAccessTimeUtc": [ + "string" + ], + 
"entries[].eventProperties.Black.FileResult.FileInfo.Directory.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Name": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent": [ + "object" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.Attributes": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.CreationTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.Exists": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.Extension": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.FullName": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.Name": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.Parent": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Parent.Root": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root": [ + "object" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.Attributes": [ + "string" + ], + 
"entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.CreationTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.Exists": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.Extension": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.FullName": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.Name": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Directory.Root.Root": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.DirectoryName": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Exists": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Extension": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.FullName": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.IsReadOnly": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.FileInfo.Length": [ + "integer" + ], + 
"entries[].eventProperties.Black.FileResult.FileInfo.Name": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule": [ + "object" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.Description": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.EnumerationScope": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.MatchAction": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.MatchLength": [ + "integer" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.MatchLocation": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.Regexes": [ + "array" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.Regexes[]": [ + "object" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.Regexes[].MatchTimeout": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.Regexes[].Options": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.Regexes[].RightToLeft": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.RuleName": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.Triage": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.WordList": [ + "array" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.WordListType": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.MatchedRule.WordList[]": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.RwStatus": [ + "object" + ], + "entries[].eventProperties.Black.FileResult.RwStatus.CanModify": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.RwStatus.CanRead": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.RwStatus.CanWrite": [ + "boolean" + ], + "entries[].eventProperties.Black.FileResult.TextResult": [ + "object" + ], + 
"entries[].eventProperties.Black.FileResult.TextResult.MatchContext": [ + "string" + ], + "entries[].eventProperties.Black.FileResult.TextResult.MatchedStrings": [ + "array" + ], + "entries[].eventProperties.Black.FileResult.TextResult.MatchedStrings[]": [ + "string" + ], + "entries[].eventProperties.Black.Type": [ + "string" + ], + "entries[].eventProperties.Green": [ + "object" + ], + "entries[].eventProperties.Green.DateTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.FileInfo": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Attributes": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.CreationTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Attributes": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.CreationTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Exists": [ + "boolean" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Extension": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.FullName": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Name": [ + 
"string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.Attributes": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.CreationTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.Exists": [ + "boolean" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.Extension": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.FullName": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.Name": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.Parent": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Parent.Root": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.Attributes": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.CreationTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.Exists": [ + "boolean" + ], + 
"entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.Extension": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.FullName": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.Name": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Directory.Root.Root": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.DirectoryName": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Exists": [ + "boolean" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Extension": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.FullName": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.IsReadOnly": [ + "boolean" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Length": [ + "integer" + ], + "entries[].eventProperties.Green.FileResult.FileInfo.Name": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.Description": [ + "string" + ], + 
"entries[].eventProperties.Green.FileResult.MatchedRule.EnumerationScope": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.MatchAction": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.MatchLength": [ + "integer" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.MatchLocation": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.Regexes": [ + "array" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.Regexes[]": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.Regexes[].MatchTimeout": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.Regexes[].Options": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.Regexes[].RightToLeft": [ + "boolean" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.RuleName": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.Triage": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.WordList": [ + "array" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.WordListType": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.MatchedRule.WordList[]": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.RwStatus": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.RwStatus.CanModify": [ + "boolean" + ], + "entries[].eventProperties.Green.FileResult.RwStatus.CanRead": [ + "boolean" + ], + "entries[].eventProperties.Green.FileResult.RwStatus.CanWrite": [ + "boolean" + ], + "entries[].eventProperties.Green.FileResult.TextResult": [ + "object" + ], + "entries[].eventProperties.Green.FileResult.TextResult.MatchContext": [ + "string" + ], + "entries[].eventProperties.Green.FileResult.TextResult.MatchedStrings": [ + "array" + ], + "entries[].eventProperties.Green.FileResult.TextResult.MatchedStrings[]": [ + "string" + ], + "entries[].eventProperties.Green.Type": 
[ + "string" + ], + "entries[].eventProperties.Red": [ + "object" + ], + "entries[].eventProperties.Red.DateTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.FileInfo": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Attributes": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.CreationTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Attributes": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.CreationTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Exists": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Extension": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.FullName": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Name": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.Attributes": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.CreationTime": [ + "string" + ], + 
"entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.Exists": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.Extension": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.FullName": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.Name": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.Parent": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Parent.Root": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.Attributes": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.CreationTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.Exists": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.Extension": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.FullName": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.LastAccessTimeUtc": [ + "string" 
+ ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.Name": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Directory.Root.Root": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.DirectoryName": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Exists": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Extension": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.FullName": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.IsReadOnly": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Length": [ + "integer" + ], + "entries[].eventProperties.Red.FileResult.FileInfo.Name": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.Description": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.EnumerationScope": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.MatchAction": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.MatchLength": [ + "integer" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.MatchLocation": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.Regexes": [ + "array" + ], + 
"entries[].eventProperties.Red.FileResult.MatchedRule.Regexes[]": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.Regexes[].MatchTimeout": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.Regexes[].Options": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.Regexes[].RightToLeft": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.RuleName": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.Triage": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.WordList": [ + "array" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.WordListType": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.MatchedRule.WordList[]": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.RwStatus": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.RwStatus.CanModify": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.RwStatus.CanRead": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.RwStatus.CanWrite": [ + "boolean" + ], + "entries[].eventProperties.Red.FileResult.TextResult": [ + "object" + ], + "entries[].eventProperties.Red.FileResult.TextResult.MatchContext": [ + "string" + ], + "entries[].eventProperties.Red.FileResult.TextResult.MatchedStrings": [ + "array" + ], + "entries[].eventProperties.Red.FileResult.TextResult.MatchedStrings[]": [ + "string" + ], + "entries[].eventProperties.Red.Type": [ + "string" + ], + "entries[].eventProperties.Yellow": [ + "object" + ], + "entries[].eventProperties.Yellow.DateTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Attributes": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.CreationTime": [ + "string" + ], + 
"entries[].eventProperties.Yellow.FileResult.FileInfo.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Attributes": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.CreationTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Exists": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Extension": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.FullName": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Name": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.Attributes": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.CreationTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.Exists": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.Extension": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.FullName": [ + "string" + ], + 
"entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.Name": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.Parent": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Parent.Root": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.Attributes": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.CreationTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.CreationTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.Exists": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.Extension": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.FullName": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.Name": [ + "string" + ], + 
"entries[].eventProperties.Yellow.FileResult.FileInfo.Directory.Root.Root": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.DirectoryName": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Exists": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Extension": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.FullName": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.IsReadOnly": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.LastAccessTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.LastAccessTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.LastWriteTime": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.LastWriteTimeUtc": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Length": [ + "integer" + ], + "entries[].eventProperties.Yellow.FileResult.FileInfo.Name": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.Description": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.EnumerationScope": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.MatchAction": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.MatchLength": [ + "integer" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.MatchLocation": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.Regexes": [ + "array" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.Regexes[]": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.Regexes[].MatchTimeout": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.Regexes[].Options": [ + "string" + ], + 
"entries[].eventProperties.Yellow.FileResult.MatchedRule.Regexes[].RightToLeft": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.RuleName": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.Triage": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.WordList": [ + "array" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.WordListType": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.MatchedRule.WordList[]": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.RwStatus": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.RwStatus.CanModify": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.RwStatus.CanRead": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.RwStatus.CanWrite": [ + "boolean" + ], + "entries[].eventProperties.Yellow.FileResult.TextResult": [ + "object" + ], + "entries[].eventProperties.Yellow.FileResult.TextResult.MatchContext": [ + "string" + ], + "entries[].eventProperties.Yellow.FileResult.TextResult.MatchedStrings": [ + "array" + ], + "entries[].eventProperties.Yellow.FileResult.TextResult.MatchedStrings[]": [ + "string" + ], + "entries[].eventProperties.Yellow.Type": [ + "string" + ], + "entries[].level": [ + "string" + ], + "entries[].message": [ + "string" + ], + "entries[].time": [ + "string" + ] +} From af4f59da6a2b15b373c603c3d20bd5f8c1768ccf Mon Sep 17 00:00:00 2001 From: Henry Post <131814724+henrypost@users.noreply.github.com> Date: Fri, 13 Feb 2026 13:10:58 -0600 Subject: [PATCH 2/4] Add toggle to exclude raw event properties --- scripts/PowerBI-unflatten.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/PowerBI-unflatten.py b/scripts/PowerBI-unflatten.py index e93e048f..d03fee18 100644 --- a/scripts/PowerBI-unflatten.py +++ b/scripts/PowerBI-unflatten.py @@ -22,6 +22,7 @@ SOURCE_PATH = BASE_DIR / "snaffledShares.json" SCHEMA_PATH = 
BASE_DIR / "snafflerSchema.tmp.json" OUTPUT_PATH = BASE_DIR / "snaffledShares.powerbi.json" +SHOULD_EXCLUDE_RAW_EVENT_PROPERTIES = True def load_source() -> Dict[str, Any]: @@ -93,7 +94,8 @@ def normalize_entry(entry: Dict[str, Any]) -> Dict[str, Any]: if severity: # Carry original message-level data plus normalized event block. normalized["event"] = {"severity": severity, **payload} - normalized["rawEventProperties"] = event_props # keep original for traceability + if not SHOULD_EXCLUDE_RAW_EVENT_PROPERTIES: + normalized["rawEventProperties"] = event_props # keep original for traceability return normalized From 0b7d1d6a78b59dc5b506a244d5f4780b04d2bf3c Mon Sep 17 00:00:00 2001 From: Henry Post <131814724+henrypost@users.noreply.github.com> Date: Thu, 26 Feb 2026 09:56:16 -0600 Subject: [PATCH 3/4] Refactor PowerBI-unflatten.py for better structure Refactor PowerBI unflattening script to improve structure and add argument parsing. --- scripts/PowerBI-unflatten.py | 285 +++++++++++++++++++++++------------ 1 file changed, 186 insertions(+), 99 deletions(-) diff --git a/scripts/PowerBI-unflatten.py b/scripts/PowerBI-unflatten.py index d03fee18..d7953436 100644 --- a/scripts/PowerBI-unflatten.py +++ b/scripts/PowerBI-unflatten.py @@ -1,137 +1,224 @@ -''' -1. print all json properties (generate a schema) for the native snaffler json output stored at ./snaffledShares.json -2. Save the schema to ./snafflerSchema.tmp.json -3. read the schema and understand it -4. create a function to mutate the schema so it's nested properly (i.e. no more entries.eventProperties.Green, but rather, enteries.event.severity, etc) -5. use the function to mutate ./snaffledShares.json into ./snaffledShares.powerbi.json -6. read the mutated json file and confirm it's valid json and has similar data. - -NOTE: You can remove 'entries.[].rawEventProperties' in PowerBI when you import, if it causes issues. -Add that removal as a transform step. 
-''' - from __future__ import annotations +import argparse import json +import sys from pathlib import Path -from typing import Any, Dict, Iterable, List, Set +from typing import Any, Dict, List -# NOTE USER PROVIDED ARGUMENTS: Change directories here. -BASE_DIR = Path(__file__).parent -SOURCE_PATH = BASE_DIR / "snaffledShares.json" -SCHEMA_PATH = BASE_DIR / "snafflerSchema.tmp.json" -OUTPUT_PATH = BASE_DIR / "snaffledShares.powerbi.json" -SHOULD_EXCLUDE_RAW_EVENT_PROPERTIES = True +def load_json(path: Path) -> Any: + """Load JSON from a file.""" + with path.open("r", encoding="utf-8") as handle: + return json.load(handle) -def load_source() -> Dict[str, Any]: - """Load the raw snaffler output.""" - with SOURCE_PATH.open("r", encoding="utf-8") as handle: - return json.load(handle) +def write_json(path: Path, payload: Any) -> None: + """Write JSON to a file (pretty printed).""" + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, ensure_ascii=False) + handle.write("\n") -def _type_name(value: Any) -> str: - if value is None: - return "null" - if isinstance(value, bool): - return "boolean" - if isinstance(value, int) and not isinstance(value, bool): - return "integer" - if isinstance(value, float): - return "number" - if isinstance(value, str): - return "string" - if isinstance(value, list): - return "array" - if isinstance(value, dict): - return "object" - return type(value).__name__ - - -def _walk_schema(node: Any, path: str, found: Dict[str, Set[str]]) -> None: - """Collect a lightweight path->types schema.""" - found.setdefault(path, set()).add(_type_name(node)) - - if isinstance(node, dict): - for key, value in node.items(): - next_path = f"{path}.{key}" if path else key - _walk_schema(value, next_path, found) - elif isinstance(node, list): - next_path = f"{path}[]" if path else "[]" - for item in node: - _walk_schema(item, next_path, found) - - -def build_schema(document: 
Dict[str, Any]) -> Dict[str, List[str]]: - """Return a simple schema mapping JSON paths to observed types.""" - paths: Dict[str, Set[str]] = {} - _walk_schema(document, "", paths) - return {k: sorted(v) for k, v in sorted(paths.items())} - - -def normalize_entry(entry: Dict[str, Any]) -> Dict[str, Any]: +def normalize_entry( + entry: Dict[str, Any], + *, + keep_raw_event_properties: bool, + drop_raw_event_properties_field: bool, +) -> Dict[str, Any]: """ Flatten eventProperties.* color buckets into a common event object. Example: - {"eventProperties": {"Green": {"DateTime": "...", ...}}} - becomes - {"event": {"severity": "Green", "DateTime": "...", ...}} + {"eventProperties": {"Green": {"DateTime": "...", ...}}} + becomes: + {"event": {"severity": "Green", "DateTime": "...", ...}} + + Also optionally drops: + entries[].rawEventProperties """ event_props = entry.get("eventProperties") or {} - # Take the first severity bucket, but keep the raw payload if present. + # Take the first severity bucket. severity, payload = (next(iter(event_props.items())) if event_props else (None, None)) # Copy base fields to avoid mutating the original dict. - normalized: Dict[str, Any] = { - key: value - for key, value in entry.items() - if key != "eventProperties" - } + excluded_keys = {"eventProperties"} + if drop_raw_event_properties_field: + excluded_keys.add("rawEventProperties") + + normalized: Dict[str, Any] = {k: v for k, v in entry.items() if k not in excluded_keys} if severity: - # Carry original message-level data plus normalized event block. - normalized["event"] = {"severity": severity, **payload} - if not SHOULD_EXCLUDE_RAW_EVENT_PROPERTIES: - normalized["rawEventProperties"] = event_props # keep original for traceability + normalized["event"] = {"severity": severity, **(payload or {})} + if keep_raw_event_properties: + # Keep original buckets for traceability if requested. 
+ normalized["rawEventProperties"] = event_props return normalized -def transform(document: Dict[str, Any]) -> Dict[str, Any]: - """Apply normalization to every entry.""" - entries = document.get("entries", []) - transformed = [normalize_entry(entry) for entry in entries] +def transform_document( + document: Any, + *, + keep_raw_event_properties: bool, + drop_raw_event_properties_field: bool, +) -> Any: + """ + Apply normalization to every entry if this looks like snaffler output. + If document isn't the expected shape, return it unchanged. + """ + if not isinstance(document, dict): + return document + + entries = document.get("entries") + if not isinstance(entries, list): + # Not a snaffler output (or not the format we expect); return unchanged. + return document + + transformed = [ + normalize_entry( + entry, + keep_raw_event_properties=keep_raw_event_properties, + drop_raw_event_properties_field=drop_raw_event_properties_field, + ) + if isinstance(entry, dict) + else entry + for entry in entries + ] + + # Matches your original: only output {"entries": ...} return {"entries": transformed} -def write_json(path: Path, payload: Any) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", encoding="utf-8") as handle: - json.dump(payload, handle, indent=2, ensure_ascii=False) - handle.write("\n") +def iter_input_files(input_dir: Path, pattern: str, recursive: bool) -> List[Path]: + if recursive: + return sorted(p for p in input_dir.rglob(pattern) if p.is_file()) + return sorted(p for p in input_dir.glob(pattern) if p.is_file()) -def main() -> None: - raw = load_source() +def make_output_path(input_file: Path, output_dir: Path, suffix: str) -> Path: + return output_dir / f"{input_file.stem}{suffix}" - # Step 1 & 2: derive a path/type schema and persist it. - schema = build_schema(raw) - write_json(SCHEMA_PATH, schema) - # Step 4 & 5: normalize and save PowerBI-friendly version. 
- transformed = transform(raw) - write_json(OUTPUT_PATH, transformed) +def parse_args(argv: List[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert Snaffler JSON output(s) into a PowerBI-friendly nested format." + ) + parser.add_argument( + "input_dir", + nargs="?", + default=".", + help="Folder containing .json files to convert (default: current directory).", + ) + parser.add_argument( + "-o", + "--output-dir", + default="converted", + help='Output folder (default: "converted").', + ) + parser.add_argument( + "--pattern", + default="*.json", + help='Glob pattern for input files (default: "*.json").', + ) + parser.add_argument( + "-r", + "--recursive", + action="store_true", + help="Search for input files recursively.", + ) + parser.add_argument( + "--suffix", + default=".snafflerconverted.json", + help='Suffix appended to each converted filename (default: ".snafflerconverted.json").', + ) - # Step 6: lightweight validation parity checks. - raw_count = len(raw.get("entries", [])) - new_count = len(transformed.get("entries", [])) - assert raw_count == new_count, ( - f"entry count changed during transform: {raw_count} -> {new_count}" + # Default behavior: drop entries[].rawEventProperties (PowerBI-friendly) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--keep-raw-event-properties", + action="store_true", + help="Keep entries[].rawEventProperties in output (not recommended for PowerBI import).", ) + group.add_argument( + "--drop-raw-event-properties", + action="store_true", + help="Explicitly drop entries[].rawEventProperties in output (default behavior).", + ) + + return parser.parse_args(argv) + + +def main(argv: List[str]) -> int: + args = parse_args(argv) + + input_dir = Path(args.input_dir).expanduser().resolve() + output_dir = Path(args.output_dir).expanduser().resolve() + + if not input_dir.exists() or not input_dir.is_dir(): + print(f"[!] 
input_dir is not a directory: {input_dir}", file=sys.stderr) + return 2 + + files = iter_input_files(input_dir, args.pattern, args.recursive) + if not files: + print(f"[!] No files matched pattern '{args.pattern}' in {input_dir}", file=sys.stderr) + return 1 + + # Default drop unless explicitly kept. + keep_raw = bool(args.keep_raw_event_properties) + drop_raw_field = not keep_raw + + converted_count = 0 + skipped_count = 0 + + for src in files: + # Avoid re-processing already converted outputs if user points input_dir at converted/ + if src.name.endswith(args.suffix): + skipped_count += 1 + continue + + try: + raw = load_json(src) + except json.JSONDecodeError as e: + print(f"[!] Skipping invalid JSON: {src} ({e})", file=sys.stderr) + skipped_count += 1 + continue + except Exception as e: + print(f"[!] Skipping unreadable file: {src} ({e})", file=sys.stderr) + skipped_count += 1 + continue + + transformed = transform_document( + raw, + keep_raw_event_properties=keep_raw, + drop_raw_event_properties_field=drop_raw_field, + ) + + out_path = make_output_path(src, output_dir, args.suffix) + write_json(out_path, transformed) + + # Lightweight validation parity check for snaffler-shaped docs. + if isinstance(raw, dict) and isinstance(raw.get("entries"), list): + raw_count = len(raw.get("entries", [])) + new_entries = transformed.get("entries") if isinstance(transformed, dict) else None + new_count = len(new_entries) if isinstance(new_entries, list) else None + if new_count is None or raw_count != new_count: + raise AssertionError( + f"entry count changed during transform for {src.name}: {raw_count} -> {new_count}" + ) + + # Confirm output is valid JSON by re-loading (cheap sanity check). 
+ _ = load_json(out_path) + + converted_count += 1 + + print(f"[+] Converted: {converted_count}") + print(f"[~] Skipped: {skipped_count}") + print(f"[+] Output dir: {output_dir}") + return 0 if __name__ == "__main__": - main() + raise SystemExit(main(sys.argv[1:])) From c3762a355692528cad84b3fbc8d0c26ef2391fa9 Mon Sep 17 00:00:00 2001 From: Henry Post <131814724+henrypost@users.noreply.github.com> Date: Fri, 27 Feb 2026 09:29:37 -0600 Subject: [PATCH 4/4] Add script to merge JSON files into one This script merges multiple JSON files from the specified input directory into a single JSON file, consolidating all entries under the 'entries' key. It includes error handling for invalid files and options for pretty-printing the output. --- scripts/PowerBI-merge-converted.py | 153 +++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 scripts/PowerBI-merge-converted.py diff --git a/scripts/PowerBI-merge-converted.py b/scripts/PowerBI-merge-converted.py new file mode 100644 index 00000000..851436e6 --- /dev/null +++ b/scripts/PowerBI-merge-converted.py @@ -0,0 +1,153 @@ +import argparse +import json +import sys +from pathlib import Path +from typing import Any, Dict, List, Optional + +''' +1. traverse ./converted/*.json +2. merge into 1 json in memory +3. output to SnafflerMerged.merged.json +''' + +def load_json(path: Path) -> Optional[Dict[str, Any]]: + """ + Load JSON from path. Returns dict on success, None on failure. + """ + try: + with path.open("r", encoding="utf-8") as f: + return json.load(f) + except json.JSONDecodeError as e: + print(f"[WARN] Skipping invalid JSON {path} ({e})", file=sys.stderr) + return None + except OSError as e: + print(f"[WARN] Skipping unreadable file: {path} ({e})", file=sys.stderr) + return None + + +def extract_entries(doc: Dict[str, Any], path: Path, strict: bool) -> List[Dict[str, Any]]: + """ + Extract entries list from a parsed JSON object. + If strict=True, abort on schema mismatch. 
+ Otherwise warn and skip bad files. + """ + if not isinstance(doc, dict): + msg = f"Top-level JSON is not an object in {path}" + if strict: + raise ValueError(msg) + print(f"[WARN] {msg}; skipping", file=sys.stderr) + return [] + + if "entries" not in doc: + msg = f"Missing 'entries' key in {path}" + if strict: + raise ValueError(msg) + print(f"[WARN] {msg}; skipping", file=sys.stderr) + return [] + + entries = doc["entries"] + if not isinstance(entries, list): + msg = f"'entries' is not a list in {path}" + if strict: + raise ValueError(msg) + print(f"[WARN] {msg}; skipping", file=sys.stderr) + return [] + + # Optional: ensure each entry is a dict; if not, keep it but warn/skip depending on strict + out: List[Dict[str, Any]] = [] + for i, item in enumerate(entries): + if isinstance(item, dict): + out.append(item) + else: + msg = f"entries[{i}] is not an object in {path}" + if strict: + raise ValueError(msg) + print(f"[WARN] {msg}; skipping item", file=sys.stderr) + + return out + + +def merge_entries(input_dir: Path, pattern: str, strict: bool) -> Dict[str, Any]: + """ + Traverse input_dir/pattern and merge all entries into {"entries": [...]} + """ + files = sorted(input_dir.glob(pattern)) + if not files: + print(f"[WARN] No files matched: {input_dir / pattern}", file=sys.stderr) + + merged: List[Dict[str, Any]] = [] + + for p in files: + doc = load_json(p) + if doc is None: + continue + merged.extend(extract_entries(doc, p, strict=strict)) + + return {"entries": merged} + + +def write_output(output_path: Path, data: Dict[str, Any], pretty: bool) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("w", encoding="utf-8") as f: + if pretty: + json.dump(data, f, indent=2, ensure_ascii=False) + else: + json.dump(data, f, separators=(",", ":"), ensure_ascii=False) + f.write("\n") + + +def parse_args(argv: List[str]) -> argparse.Namespace: + ap = argparse.ArgumentParser( + description="Merge Snaffler converted JSON files by 
concatenating all objects under the `entries` key." + ) + ap.add_argument( + "--input-dir", + default="./converted", + help="Directory containing converted JSON files (default: ./converted)", + ) + ap.add_argument( + "--pattern", + default="*.json", + help="Glob pattern within input-dir (default: *.json)", + ) + ap.add_argument( + "--output", + default="SnafflerMerged.merged.json", + help="Output file path (default: SnafflerMerged.merged.json)", + ) + ap.add_argument( + "--pretty", + action="store_true", + help="Pretty-print the output JSON (indent=2)", + ) + ap.add_argument( + "--strict", + action="store_true", + help="Fail fast if any file is missing/invalid schema instead of skipping.", + ) + return ap.parse_args(argv) + + +def main(argv: List[str]) -> int: + args = parse_args(argv) + + input_dir = Path(args.input_dir).expanduser().resolve() + output_path = Path(args.output).expanduser().resolve() + + if not input_dir.exists() or not input_dir.is_dir(): + print(f"[ERROR] input-dir does not exist or is not a directory: {input_dir}", file=sys.stderr) + return 2 + + try: + merged_doc = merge_entries(input_dir=input_dir, pattern=args.pattern, strict=args.strict) + write_output(output_path=output_path, data=merged_doc, pretty=args.pretty) + except Exception as e: + print(f"[ERROR] {e}", file=sys.stderr) + return 1 + + print(f"[OK] Wrote {len(merged_doc['entries'])} merged entries to {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:]))