From 1d6ebc3ab49bc2300a068885497bd5384763bbc8 Mon Sep 17 00:00:00 2001
From: ynotopec <49277296+ynotopec@users.noreply.github.com>
Date: Thu, 23 Oct 2025 22:52:30 +0200
Subject: [PATCH] Optimize dirDiff processing pipeline

---
 dirDiff.sh | 113 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 69 insertions(+), 44 deletions(-)

diff --git a/dirDiff.sh b/dirDiff.sh
index e88b4d5..855f71e 100755
--- a/dirDiff.sh
+++ b/dirDiff.sh
@@ -36,40 +36,54 @@ trap cleanup EXIT HUP INT TERM
 files_dir="$work_root/files"
 cache_dir="$work_root/files.cache"
 diff_dir="$work_root/diff"
+files_list="$work_root/files.list"
+cache_list="$work_root/cache.list"
 comm_file="$work_root/comm"
 stat_words="$work_root/statWords"
 stat_words_vars="$work_root/statWords.vars"
-tmp_file="$work_root/tmp"
 
 mkdir -p "$files_dir" "$cache_dir" "$diff_dir"
 
-# Copy files into the working directory.
-find "$input_dir" -mindepth 1 -maxdepth 1 -type f -print \
-  | while IFS= read -r file_name; do
-      cp -p "$file_name" "$files_dir/"
-    done
+# Copy files into the working directory, inflating gzip archives on the fly so
+# each file is processed only once.
+found_file=0
+for file_path in "$input_dir"/*; do
+  [ -f "$file_path" ] || continue
+  found_file=1
+  base_name=$(basename "$file_path")
+  if [ "${base_name##*.}" = "gz" ]; then
+    output_name="$files_dir/${base_name%.gz}"
+    gunzip -c "$file_path" >"$output_name"
+    touch -r "$file_path" "$output_name" 2>/dev/null || true
+  else
+    cp -p "$file_path" "$files_dir/"
+  fi
+done
+
+if [ "$found_file" -eq 0 ]; then
+  printf 'Error: "%s" does not contain any files to diff.\n' "$input_dir" >&2
+  exit 1
+fi
 
-if ! find "$files_dir" -mindepth 1 -maxdepth 1 -type f | read -r _; then
+# Snapshot the list of working files so subsequent stages can reuse it without
+# rescanning the directory tree.
+find "$files_dir" -type f >"$files_list"
+
+if [ ! -s "$files_list" ]; then
   printf 'Error: "%s" does not contain any files to diff.\n' "$input_dir" >&2
   exit 1
 fi
 
-# Decompress gzip archives so they can be diffed like regular files.
-find "$files_dir" -type f -name '*.gz' -print \
-  | while IFS= read -r gz_file; do
-      gunzip -f "$gz_file"
-    done
+file_count=$(wc -l <"$files_list" | tr -d '[:space:]')
+trigger_value=$(( file_count / 2 ))
 
 # Build the list of unique tokens per file.
-find "$files_dir" -type f -print \
-  | while IFS= read -r file_name; do
-      tr -c '[:alnum:]_' '[\n*]' <"$file_name" \
-        | grep -v '^\s*$' \
-        | sort -u
-    done >"$stat_words"
-
-file_count=$(find "$files_dir" -type f | wc -l | tr -d '[:space:]')
-trigger_value=$(( file_count / 2 ))
+: >"$stat_words"
+while IFS= read -r file_name; do
+  tr -c '[:alnum:]_' '[\n*]' <"$file_name" \
+    | grep -v '^\s*$' \
+    | sort -u >>"$stat_words"
+done <"$files_list"
 
 # Identify tokens that appear in more than half of the files and mask them so
 # the diffs focus on the outliers rather than the shared structure.
@@ -78,35 +92,46 @@ awk -v limit="$trigger_value" '{ count[$0]++ } END { for (word in count) if (cou
 
 # Copy the files so we can mask the less frequent tokens.
 cp -a "$files_dir/." "$cache_dir/"
+find "$cache_dir" -type f >"$cache_list"
 
 if [ -s "$stat_words_vars" ]; then
-  while IFS= read -r token; do
-    sed -i "s#\\b${token}\\b#\\$""{varMy}#g" "$cache_dir"/*
-  done <"$stat_words_vars"
+  mask_pattern=$(paste -sd '|' "$stat_words_vars")
+  while IFS= read -r file_name; do
+    perl -0pi -e 's/\b(?:'"$mask_pattern"')\b/\${varMy}/g' "$file_name"
+  done <"$cache_list"
 fi
 
 # Collect the common lines across all files.
-find "$cache_dir" -type f -print \ - | while IFS= read -r file_name; do - awk '!seen[$0]++' "$file_name" - done >"$comm_file" - -awk -v limit="$trigger_value" 'NR == FNR { count[$0]++; next } count[$0] > limit' \ - "$comm_file" "$comm_file" | awk '!seen[$0]++' >"${comm_file}.filtered" -mv "${comm_file}.filtered" "$comm_file" +tr '\n' '\0' <"$cache_list" | xargs -0 awk -v limit="$trigger_value" ' + FNR == 1 { delete seen } + { + if (!seen[$0]++) { + count[$0]++ + } + } + END { + for (line in count) { + if (count[line] > limit) { + print line + } + } + } + ' >"$comm_file" + +LC_ALL=C sort -u "$comm_file" -o "$comm_file" # Build a diff for each file, highlighting only the unique lines. -find "$cache_dir" -type f -print \ - | while IFS= read -r file_name; do - { - printf '== %s ==\n' "$(basename "$file_name")" - cat "$file_name" - printf '=== missing ===\n' - cat "$comm_file" - } >"$tmp_file" - - awk 'NR == FNR { count[$0]++; next } count[$0] == 1' "$tmp_file" "$tmp_file" \ - | tee "$diff_dir/$(basename "$file_name")" - done +while IFS= read -r file_name; do + base_name=$(basename "$file_name") + { + printf '== %s ==\n' "$base_name" + awk 'NR == FNR { common[$0] = 1; next } !common[$0] && !seen[$0]++' \ + "$comm_file" "$file_name" + printf '=== missing ===\n' + awk 'NR == FNR { present[$0] = 1; next } !present[$0] && !seen[$0]++' \ + "$file_name" "$comm_file" + } >"$diff_dir/$base_name" + touch -r "$file_name" "$diff_dir/$base_name" 2>/dev/null || true +done <"$cache_list" # The diff files are available in "$diff_dir" when the script exits.