diff --git a/SnaffCore/Checkpoint/CheckpointData.cs b/SnaffCore/Checkpoint/CheckpointData.cs
new file mode 100644
index 00000000..09f3e808
--- /dev/null
+++ b/SnaffCore/Checkpoint/CheckpointData.cs
@@ -0,0 +1,34 @@
+using System;
+using System.Collections.Generic;
+using System.Runtime.Serialization;
+
+namespace SnaffCore.Checkpoint
+{
+    /// <summary>
+    /// Serializable state snapshot for checkpoint/resume support.
+    /// Tracks which directories and computers have already been processed so
+    /// a resumed run can skip them entirely.
+    /// </summary>
+    [DataContract]
+    public class CheckpointData
+    {
+        /// <summary>When this checkpoint was written (UTC).</summary>
+        [DataMember]
+        public DateTime CheckpointTime { get; set; }
+
+        /// <summary>
+        /// Full UNC / local paths of every directory whose tree-walk has been
+        /// fully dispatched. On resume, any path in this set is skipped by TreeWalker.
+        /// </summary>
+        [DataMember]
+        public List<string> ScannedDirectories { get; set; } = new List<string>();
+
+        /// <summary>
+        /// Hostnames / IPs of every computer whose share-discovery has been
+        /// completed. On resume, any computer in this set is skipped by
+        /// ShareFinder.
+        /// </summary>
+        [DataMember]
+        public List<string> ScannedComputers { get; set; } = new List<string>();
+    }
+}
diff --git a/SnaffCore/Checkpoint/CheckpointManager.cs b/SnaffCore/Checkpoint/CheckpointManager.cs
new file mode 100644
index 00000000..0bb6e79b
--- /dev/null
+++ b/SnaffCore/Checkpoint/CheckpointManager.cs
@@ -0,0 +1,343 @@
+using System;
+using System.Collections.Concurrent;
+using System.IO;
+using System.Runtime.Serialization.Json;
+using System.Text;
+using SnaffCore.Concurrency;
+
+namespace SnaffCore.Checkpoint
+{
+    /// <summary>
+    /// Thread-safe singleton that tracks scan progress and handles periodic
+    /// checkpointing to disk. Use <see cref="Initialize"/> before starting a
+    /// scan and <see cref="GetInstance"/> everywhere else.
+    /// </summary>
+    public class CheckpointManager
+    {
+        // ------------------------------------------------------------------ //
+        //  Singleton plumbing                                                //
+        // ------------------------------------------------------------------ //
+
+        private static volatile CheckpointManager _instance;
+        private static readonly object _createLock = new object();
+
+        /// <summary>
+        /// Returns the current singleton, or <c>null</c> if
+        /// <see cref="Initialize"/> has not been called.
+        /// </summary>
+        public static CheckpointManager GetInstance() => _instance;
+
+        /// <summary>
+        /// Create (and optionally restore) the singleton.
+        /// Called once during <c>SnaffCon</c> construction; calling it again
+        /// replaces the previous instance.
+        /// </summary>
+        /// <param name="checkpointFilePath">
+        /// Path to write / read checkpoint JSON. If it names an existing
+        /// directory, a default file name inside that directory is used.
+        /// </param>
+        /// <returns>The newly created instance.</returns>
+        public static CheckpointManager Initialize(string checkpointFilePath)
+        {
+            lock (_createLock)
+            {
+                _instance = new CheckpointManager(checkpointFilePath);
+                return _instance;
+            }
+        }
+
+        // ------------------------------------------------------------------ //
+        //  State                                                             //
+        // ------------------------------------------------------------------ //
+
+        // Use ConcurrentDictionary<string, byte> as a thread-safe HashSet;
+        // the byte value is an unused placeholder.
+        private readonly ConcurrentDictionary<string, byte> _scannedDirectories
+            = new ConcurrentDictionary<string, byte>(StringComparer.OrdinalIgnoreCase);
+
+        private readonly ConcurrentDictionary<string, byte> _scannedComputers
+            = new ConcurrentDictionary<string, byte>(StringComparer.OrdinalIgnoreCase);
+
+        private readonly string _filePath;
+        private readonly object _saveLock = new object();
+
+        // ------------------------------------------------------------------ //
+        //  Public properties                                                 //
+        // ------------------------------------------------------------------ //
+
+        /// <summary>Resolved path of the checkpoint file.</summary>
+        public string FilePath => _filePath;
+
+        /// <summary>
+        /// True when an existing checkpoint file was loaded successfully at
+        /// construction time, i.e. this run resumes an earlier one.
+        /// </summary>
+        public bool IsRestoring { get; private set; }
+
+        /// <summary>
+        /// How many directories are currently recorded, including any that
+        /// were restored from a loaded checkpoint.
+        /// </summary>
+        public int ScannedDirectoryCount => _scannedDirectories.Count;
+
+        /// <summary>
+        /// How many computers are currently recorded, including any that
+        /// were restored from a loaded checkpoint.
+        /// </summary>
+        public int ScannedComputerCount => _scannedComputers.Count;
+
+        // ------------------------------------------------------------------ //
+        //  Constructor                                                       //
+        // ------------------------------------------------------------------ //
+
+        /// <summary>
+        /// Resolves the checkpoint file path and, if that file already
+        /// exists, restores the recorded state from it.
+        /// </summary>
+        private CheckpointManager(string filePath)
+        {
+            // If the caller supplied a directory path (e.g. "." or "C:\Logs"),
+            // automatically create a file inside it rather than trying to
+            // treat the directory itself as the checkpoint file.
+            if (Directory.Exists(filePath))
+                filePath = Path.Combine(filePath, "snaffler_checkpoint.json");
+
+            _filePath = filePath;
+
+            if (File.Exists(filePath))
+            {
+                TryLoad();
+            }
+        }
+
+        // ------------------------------------------------------------------ //
+        //  Directory tracking                                                //
+        // ------------------------------------------------------------------ //
+
+        /// <summary>
+        /// Returns true if this directory has already been processed in a
+        /// previous (or the current) session.
+        /// </summary>
+        /// <param name="path">Directory path; case and trailing separators are ignored.</param>
+        public bool IsDirectoryScanned(string path)
+        {
+            return !string.IsNullOrWhiteSpace(path) &&
+                   _scannedDirectories.ContainsKey(NormalisePath(path));
+        }
+
+        /// <summary>Mark a directory as having been entered / processed.</summary>
+        /// <param name="path">Directory path; null or blank values are ignored.</param>
+        public void MarkDirectoryScanned(string path)
+        {
+            if (!string.IsNullOrWhiteSpace(path))
+                _scannedDirectories.TryAdd(NormalisePath(path), 0);
+        }
+
+        // ------------------------------------------------------------------ //
+        //  Computer tracking                                                 //
+        // ------------------------------------------------------------------ //
+
+        /// <summary>
+        /// Returns true if this computer's shares have already been discovered
+        /// in a previous session.
+        /// </summary>
+        /// <param name="computer">Hostname or IP; case and surrounding whitespace are ignored.</param>
+        public bool IsComputerScanned(string computer)
+        {
+            return !string.IsNullOrWhiteSpace(computer) &&
+                   _scannedComputers.ContainsKey(NormaliseHost(computer));
+        }
+
+        /// <summary>Mark a computer as having had its shares discovered.</summary>
+        /// <param name="computer">Hostname or IP; null or blank values are ignored.</param>
+        public void MarkComputerScanned(string computer)
+        {
+            if (!string.IsNullOrWhiteSpace(computer))
+                _scannedComputers.TryAdd(NormaliseHost(computer), 0);
+        }
+
+        // ------------------------------------------------------------------ //
+        //  Persistence                                                       //
+        // ------------------------------------------------------------------ //
+
+        /// <summary>
+        /// Atomically write a checkpoint file to disk. Safe to call from any
+        /// thread – surplus concurrent calls are serialised by a lock so no
+        /// data is lost. Failures are logged and never thrown to the caller.
+        /// </summary>
+        public void SaveCheckpoint()
+        {
+            lock (_saveLock)
+            {
+                try
+                {
+                    var data = new CheckpointData
+                    {
+                        CheckpointTime = DateTime.UtcNow,
+                        ScannedDirectories = new System.Collections.Generic.List<string>(_scannedDirectories.Keys),
+                        ScannedComputers = new System.Collections.Generic.List<string>(_scannedComputers.Keys),
+                    };
+
+                    string json = Serialise(data);
+
+                    // Write to a temp file first, then atomically replace the
+                    // destination – avoids a corrupt checkpoint if the process
+                    // is killed mid-write.
+                    // File.Replace performs an atomic swap on NTFS and keeps a
+                    // .bak as a safety net. On the very first save the
+                    // destination does not yet exist, so File.Move is used
+                    // instead (also atomic on the same volume).
+                    string tmp = _filePath + ".tmp";
+                    File.WriteAllText(tmp, json, Encoding.UTF8);
+                    if (File.Exists(_filePath))
+                    {
+                        File.Replace(tmp, _filePath, _filePath + ".bak");
+                    }
+                    else
+                    {
+                        File.Move(tmp, _filePath);
+                    }
+
+                    BlockingMq.GetMq()?.Info(
+                        string.Format("[Checkpoint] Saved checkpoint ({0} dirs, {1} computers) → {2}",
+                            data.ScannedDirectories.Count,
+                            data.ScannedComputers.Count,
+                            _filePath));
+                }
+                catch (Exception ex)
+                {
+                    BlockingMq.GetMq()?.Error("[Checkpoint] Failed to save checkpoint: " + ex.Message);
+                }
+            }
+        }
+
+        // ------------------------------------------------------------------ //
+        //  Private helpers                                                   //
+        // ------------------------------------------------------------------ //
+
+        /// <summary>
+        /// Loads the checkpoint file into the in-memory sets and prunes
+        /// directory entries made redundant by a recorded ancestor. On any
+        /// failure the sets are cleared and the run starts fresh.
+        /// </summary>
+        private void TryLoad()
+        {
+            try
+            {
+                string json = File.ReadAllText(_filePath, Encoding.UTF8);
+                CheckpointData data = Deserialise(json);
+                if (data == null) return;
+
+                foreach (string d in data.ScannedDirectories ?? new System.Collections.Generic.List<string>())
+                    _scannedDirectories.TryAdd(NormalisePath(d), 0);
+
+                foreach (string c in data.ScannedComputers ?? new System.Collections.Generic.List<string>())
+                    _scannedComputers.TryAdd(NormaliseHost(c), 0);
+
+                // Deduplication: remove child-directory entries whose parent is
+                // also in the completed set. If a parent dir is marked complete,
+                // WalkTree will skip it entirely — the child entries are dead weight
+                // that will never be checked. Pruning them here keeps the in-memory
+                // set lean and speeds up future IsDirectoryScanned lookups.
+                // Example: if both \\srv\share AND \\srv\share\sub are present,
+                // \\srv\share\sub is redundant and can be removed.
+                //
+                // Algorithm: sort the keys so that every descendant of a path
+                // immediately follows it, then do a single linear pass.
+                // Naïve lexicographic order is NOT sufficient here because the
+                // path-separator character '\' (ASCII 92) sorts after digits and
+                // uppercase letters, which would interleave children with siblings
+                // (e.g. "SHARE2" < "SHARE\A" in ordinal order). To fix this, we
+                // sort by a transformed key where both separators are replaced with
+                // '\x01' (ASCII 1, lower than every printable character) so that
+                // child paths always follow their parent in the sorted sequence.
+                // The transformed keys are computed once per directory and
+                // sorted in lock-step with the directory array, instead of
+                // being rebuilt inside the comparer on every comparison.
+                string[] sortedDirs = new System.Collections.Generic.List<string>(_scannedDirectories.Keys).ToArray();
+                string[] sortKeys = new string[sortedDirs.Length];
+                for (int i = 0; i < sortedDirs.Length; i++)
+                {
+                    sortKeys[i] = sortedDirs[i].Replace('\\', '\x01').Replace('/', '\x01');
+                }
+                Array.Sort(sortKeys, sortedDirs, StringComparer.Ordinal);
+
+                var toRemove = new System.Collections.Generic.List<string>();
+                string lastKept = null;
+                foreach (string dir in sortedDirs)
+                {
+                    if (lastKept != null &&
+                        (dir.StartsWith(lastKept + "\\", StringComparison.OrdinalIgnoreCase) ||
+                         dir.StartsWith(lastKept + "/", StringComparison.OrdinalIgnoreCase)))
+                    {
+                        // dir is a descendant of lastKept – redundant.
+                        // Do NOT update lastKept so that deeper descendants
+                        // are still caught by the same ancestor check.
+                        toRemove.Add(dir);
+                    }
+                    else
+                    {
+                        lastKept = dir;
+                    }
+                }
+                foreach (string redundant in toRemove)
+                {
+                    byte dummy;
+                    _scannedDirectories.TryRemove(redundant, out dummy);
+                }
+
+                IsRestoring = true;
+
+                Console.WriteLine(string.Format(
+                    "[Checkpoint] Loaded checkpoint from {0} (written {1} UTC).",
+                    _filePath,
+                    data.CheckpointTime.ToString("u")));
+                Console.WriteLine(string.Format(
+                    "[Checkpoint] Resuming – will skip {0} directories and {1} computers ({2} redundant dir entries pruned).",
+                    _scannedDirectories.Count,
+                    _scannedComputers.Count,
+                    toRemove.Count));
+            }
+            catch (Exception ex)
+            {
+                Console.WriteLine("[Checkpoint] WARNING – could not load checkpoint (" + ex.Message + "). Starting fresh.");
+                _scannedDirectories.Clear();
+                _scannedComputers.Clear();
+                IsRestoring = false;
+            }
+        }
+
+        /// <summary>Canonical key for a directory: trailing separators removed, upper-cased.</summary>
+        private static string NormalisePath(string p) =>
+            p.TrimEnd('\\', '/').ToUpperInvariant();
+
+        /// <summary>Canonical key for a host: surrounding whitespace removed, lower-cased.</summary>
+        private static string NormaliseHost(string h) =>
+            h.Trim().ToLowerInvariant();
+
+        /// <summary>
+        /// Serialises a checkpoint snapshot to a JSON string.
+        /// Uses DataContractJsonSerializer – no extra NuGet dependency required.
+        /// </summary>
+        private static string Serialise(CheckpointData data)
+        {
+            var ser = new DataContractJsonSerializer(typeof(CheckpointData));
+            using (var ms = new MemoryStream())
+            {
+                ser.WriteObject(ms, data);
+                return Encoding.UTF8.GetString(ms.ToArray());
+            }
+        }
+
+        /// <summary>Parses checkpoint JSON produced by <see cref="Serialise"/>.</summary>
+        private static CheckpointData Deserialise(string json)
+        {
+            var ser = new DataContractJsonSerializer(typeof(CheckpointData));
+            byte[] bytes = Encoding.UTF8.GetBytes(json);
+            using (var ms = new MemoryStream(bytes))
+            {
+                return (CheckpointData)ser.ReadObject(ms);
+            }
+        }
+    }
+}
diff --git a/SnaffCore/Config/Options.cs b/SnaffCore/Config/Options.cs index 78c66282..5e018705 100644 --- a/SnaffCore/Config/Options.cs +++ b/SnaffCore/Config/Options.cs @@ -127,6 +127,15 @@ public partial class Options // Content processing options public int MatchContextBytes { get; set; } = 200; + // Checkpoint / Resume options + // Path to the checkpoint JSON file. When set, Snaffler writes a + // checkpoint every CheckpointIntervalMinutes. If the file already + // exists on startup the run is automatically resumed from it. + public string CheckpointFile { get; set; } + + // How many minutes between automatic checkpoint writes. Default 10. 
+ public int CheckpointIntervalMinutes { get; set; } = 10; + public Options() { //PrepareClassifiers(); diff --git a/SnaffCore/ShareFind/ShareFinder.cs b/SnaffCore/ShareFind/ShareFinder.cs index 4d1395c1..f228ba62 100644 --- a/SnaffCore/ShareFind/ShareFinder.cs +++ b/SnaffCore/ShareFind/ShareFinder.cs @@ -1,4 +1,5 @@ using SnaffCore.Classifiers; +using SnaffCore.Checkpoint; using SnaffCore.Concurrency; using SnaffCore.TreeWalk; using System; @@ -206,6 +207,9 @@ internal void GetComputerShares(string computer) } } } + + // Record this computer so it is skipped on any future resume. + CheckpointManager.GetInstance()?.MarkComputerScanned(computer); } internal bool IsShareReadable(string share) diff --git a/SnaffCore/SnaffCon.cs b/SnaffCore/SnaffCon.cs index 5334939d..9133b446 100644 --- a/SnaffCore/SnaffCon.cs +++ b/SnaffCore/SnaffCon.cs @@ -1,5 +1,6 @@ using SnaffCore.Classifiers; using SnaffCore.ActiveDirectory; +using SnaffCore.Checkpoint; using SnaffCore.Concurrency; using SnaffCore.Config; using SnaffCore.ShareFind; @@ -37,6 +38,8 @@ public class SnaffCon private DateTime StartTime { get; set; } + private Timer _checkpointTimer; + public SnaffCon(Options options) { MyOptions = options; @@ -53,6 +56,12 @@ public SnaffCon(Options options) FileScanner = new FileScanner(); TreeWalker = new TreeWalker(); ShareFinder = new ShareFinder(); + + // Initialise checkpoint manager if the user supplied a checkpoint file. + if (!string.IsNullOrWhiteSpace(MyOptions.CheckpointFile)) + { + CheckpointManager.Initialize(MyOptions.CheckpointFile); + } } public static ShareFinder GetShareFinder() @@ -91,6 +100,22 @@ public void Execute() statusUpdateTimer.Elapsed += TimedStatusUpdate; statusUpdateTimer.Start(); + // Start the periodic checkpoint timer if checkpointing is enabled. 
+ var checkpointMgr = CheckpointManager.GetInstance(); + if (checkpointMgr != null) + { + double intervalMs = TimeSpan.FromMinutes(MyOptions.CheckpointIntervalMinutes).TotalMilliseconds; + _checkpointTimer = new Timer(intervalMs) { AutoReset = true }; + _checkpointTimer.Elapsed += (s, e) => checkpointMgr.SaveCheckpoint(); + _checkpointTimer.Start(); + + string resumeMsg = checkpointMgr.IsRestoring + ? string.Format("Resuming from checkpoint – skipping {0} directories and {1} computers.", + checkpointMgr.ScannedDirectoryCount, checkpointMgr.ScannedComputerCount) + : "Checkpointing enabled (no prior checkpoint found). Starting fresh."; + Mq.Info("[Checkpoint] " + resumeMsg); + } + // If we want to hunt for user IDs, we need data from the running user's domain. // Future - walk trusts @@ -151,6 +176,14 @@ public void Execute() waitHandle.WaitOne(); + // Stop the checkpoint timer and write a final checkpoint. + if (_checkpointTimer != null) + { + _checkpointTimer.Stop(); + _checkpointTimer.Dispose(); + CheckpointManager.GetInstance()?.SaveCheckpoint(); + } + StatusUpdate(); DateTime finished = DateTime.Now; TimeSpan runSpan = finished.Subtract(StartTime); @@ -288,6 +321,7 @@ public void PrepDomainUserRules() private void ShareDiscovery(string[] computerTargets) { Mq.Info("Starting to look for readable shares..."); + var checkpointMgr = CheckpointManager.GetInstance(); foreach (string computer in computerTargets) { if (CheckExclusions(computer)) @@ -295,6 +329,7 @@ private void ShareDiscovery(string[] computerTargets) // skip any that are in the exclusion list continue; } + // Perform reverse lookup if the computer is an IP address var computerName = ""; if (isIP(computer)) @@ -318,6 +353,15 @@ private void ShareDiscovery(string[] computerTargets) // Use the provided computer name if it's not an IP address computerName = computer; } + + // Skip computers that were fully processed in a prior run. 
+ // Check is done after DNS resolution so that the resolved hostname + // matches what MarkComputerScanned recorded (avoids IP vs hostname mismatch). + if (checkpointMgr != null && checkpointMgr.IsComputerScanned(computerName)) + { + Mq.Info("[Checkpoint] Skipping already-scanned computer: " + computerName); + continue; + } // ShareFinder Task Creation - this kicks off the rest of the flow Mq.Trace("Creating a ShareFinder task for " + computerName); ShareTaskScheduler.New(() => @@ -390,8 +434,16 @@ private bool CheckExclusions(string computer) private void FileDiscovery(string[] pathTargets) { + var checkpointMgr = CheckpointManager.GetInstance(); foreach (string pathTarget in pathTargets) { + // Skip top-level path targets that were already fully scanned. + if (checkpointMgr != null && checkpointMgr.IsDirectoryScanned(pathTarget)) + { + Mq.Info("[Checkpoint] Skipping already-scanned path: " + pathTarget); + continue; + } + // TreeWalker Task Creation - this kicks off the rest of the flow Mq.Info("Creating a TreeWalker task for " + pathTarget); TreeTaskScheduler.New(() => diff --git a/SnaffCore/SnaffCore.csproj b/SnaffCore/SnaffCore.csproj index fa0fc804..4f5679fa 100644 --- a/SnaffCore/SnaffCore.csproj +++ b/SnaffCore/SnaffCore.csproj @@ -57,7 +57,17 @@ + + + + $(NuGetPackageRoot)nett/0.15.0/lib/net40/Nett.dll + True + + + $(NuGetPackageRoot)nett.coma/0.15.0/lib/net40/Nett.Coma.dll + True + @@ -65,6 +75,8 @@ + + @@ -90,6 +102,13 @@ + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + 0.15.0 + 0.15.0 diff --git a/SnaffCore/TreeWalk/TreeWalker.cs b/SnaffCore/TreeWalk/TreeWalker.cs index f4bf9c2b..fcdbf498 100644 --- a/SnaffCore/TreeWalk/TreeWalker.cs +++ b/SnaffCore/TreeWalk/TreeWalker.cs @@ -1,4 +1,5 @@ using SnaffCore.Classifiers; +using SnaffCore.Checkpoint; using SnaffCore.Concurrency; using SnaffCore.Config; using SnaffCore.FileScan; @@ -28,12 +29,31 @@ public TreeWalker() public void WalkTree(string currentDir) { // Walks a tree 
checking files and generating results as it goes. - + if (!Directory.Exists(currentDir)) { return; } + // If resuming from a checkpoint, skip directories we already processed. + var checkpointMgr = CheckpointManager.GetInstance(); + if (checkpointMgr != null && checkpointMgr.IsDirectoryScanned(currentDir)) + { + Mq.Trace("[Checkpoint] Skipping already-scanned directory: " + currentDir); + return; + } + + // NOTE: We mark this directory as scanned at the *end* of this method + // (after all file tasks and subdir tasks for this level have been queued), + // not on entry. This ensures that if the process is killed before the + // queued file tasks have a chance to run, the directory is not falsely + // recorded as complete and will be re-walked on resume. + // Trade-off: if the parent WalkTree for a path completes and is marked, + // but a child WalkTree that was dispatched async has not yet been marked, + // that child dir will not be re-discovered on resume (blocked by the + // marked parent). This is an inherent limitation of directory-level + // tracking without individual file-task persistence. + // SCCM ContentLib($) try { @@ -46,10 +66,12 @@ public void WalkTree(string currentDir) if (!Directory.Exists(dataLibDir)) { Mq.Error("SCCM content library found but no DataLib found: " + dataLibDir); + checkpointMgr?.MarkDirectoryScanned(currentDir); return; } Mq.Info("SCCM content library: Entering into datalib: " + dataLibDir); WalkSccmTree(dataLibDir, currentDir); // With base path name + checkpointMgr?.MarkDirectoryScanned(currentDir); return; } } @@ -173,6 +195,12 @@ public void WalkTree(string currentDir) Mq.Trace(e.ToString()); //continue; } + + // Mark this directory as fully walked (all direct file and subdir + // tasks have been queued for this level). Placed here — after all + // work for this directory is dispatched — so that a crash before + // file tasks execute does not falsely mark the dir as complete. 
+ checkpointMgr?.MarkDirectoryScanned(currentDir); } public void WalkSccmTree(string currentDir, string sccmBaseDir) { diff --git a/SnaffCore/UltraSnaffCore.csproj b/SnaffCore/UltraSnaffCore.csproj index 539d3311..2a1440fd 100644 --- a/SnaffCore/UltraSnaffCore.csproj +++ b/SnaffCore/UltraSnaffCore.csproj @@ -57,10 +57,13 @@ + + + diff --git a/Snaffler/Config.cs b/Snaffler/Config.cs index e9aedb21..9cfb5690 100644 --- a/Snaffler/Config.cs +++ b/Snaffler/Config.cs @@ -106,7 +106,13 @@ private static Options ParseImpl(string[] args) ValueArgument logType = new ValueArgument('t', "logtype", "Type of log you would like to output. Currently supported options are plain and JSON. Defaults to plain."); ValueArgument timeOutArg = new ValueArgument('e', "timeout", "Interval between status updates (in minutes) also acts as a timeout for AD data to be gathered via LDAP. Turn this knob up if you aren't getting any computers from AD when you run Snaffler through a proxy or other slow link. Default = 5"); - // list of letters i haven't used yet: gnqw + ValueArgument checkpointArg = new ValueArgument('g', "checkpoint", + "Path to a checkpoint file. Snaffler saves progress every --checkpointinterval minutes. " + + "If the file already exists the run is automatically resumed from it, skipping work already done. " + + "Example: -g snaffler-checkpoint.json"); + ValueArgument checkpointIntervalArg = new ValueArgument('w', "checkpointinterval", + "How many minutes between checkpoint saves. Defaults to 10. 
Only applies when --checkpoint is set."); + // list of letters i haven't used yet: q CommandLineParser.CommandLineParser parser = new CommandLineParser.CommandLineParser(); parser.Arguments.Add(timeOutArg); @@ -132,6 +138,8 @@ private static Options ParseImpl(string[] args) parser.Arguments.Add(ruleDirArg); parser.Arguments.Add(logType); parser.Arguments.Add(compExclusionArg); + parser.Arguments.Add(checkpointArg); + parser.Arguments.Add(checkpointIntervalArg); // extra check to handle builtin behaviour from cmd line arg parser if ((args.Contains("--help") || args.Contains("/?") || args.Contains("help") || args.Contains("-h") || args.Length == 0)) @@ -403,6 +411,23 @@ private static Options ParseImpl(string[] args) } } + if (checkpointArg.Parsed && !string.IsNullOrWhiteSpace(checkpointArg.Value)) + { + parsedConfig.CheckpointFile = checkpointArg.Value; + Mq.Info("Checkpointing enabled. File: " + parsedConfig.CheckpointFile); + } + + if (checkpointIntervalArg.Parsed) + { + if (checkpointIntervalArg.Value < 1) + { + Mq.Error("Checkpoint interval must be at least 1 minute (got " + checkpointIntervalArg.Value + ")."); + throw new ArgumentException("Checkpoint interval must be at least 1 minute."); + } + parsedConfig.CheckpointIntervalMinutes = checkpointIntervalArg.Value; + Mq.Info("Checkpoint interval set to " + parsedConfig.CheckpointIntervalMinutes + " minutes."); + } + if (!parsedConfig.LogToConsole && !parsedConfig.LogToFile) { Mq.Error( diff --git a/Snaffler/Snaffler.csproj b/Snaffler/Snaffler.csproj index f84fdba9..c48ae90d 100644 --- a/Snaffler/Snaffler.csproj +++ b/Snaffler/Snaffler.csproj @@ -64,6 +64,19 @@ + + + $(NuGetPackageRoot)commandlineargumentsparser/3.0.22/lib/net45/CommandLineArgumentsParser.dll + True + + + $(NuGetPackageRoot)nett/0.15.0/lib/net40/Nett.dll + True + + + $(NuGetPackageRoot)nlog/4.7.15/lib/net45/NLog.dll + True + @@ -103,6 +116,10 @@ runtime; build; native; contentfiles; analyzers; buildtransitive all + + all + runtime; build; native; 
contentfiles; analyzers; buildtransitive + 0.15.0 @@ -112,4 +129,7 @@ + + \ No newline at end of file diff --git a/Snaffler/bin/Release/Snaffler.exe b/Snaffler/bin/Release/Snaffler.exe new file mode 100644 index 00000000..4d58656e Binary files /dev/null and b/Snaffler/bin/Release/Snaffler.exe differ