From cc3abf2109c56c6ac4532b3ee603193c4fb5c37f Mon Sep 17 00:00:00 2001
From: caiot5 <caiot5@gmail.com>
Date: Tue, 10 Mar 2026 06:16:04 +0000
Subject: [PATCH] revert single-curl fetch, keep scanlinks optimizations

---
 warnick-2.1.4/warnick.sh | 217 +++++++++++++++++++--------------------
 1 file changed, 108 insertions(+), 109 deletions(-)

diff --git a/warnick-2.1.4/warnick.sh b/warnick-2.1.4/warnick.sh
index a095680..aadcb89 100644
--- a/warnick-2.1.4/warnick.sh
+++ b/warnick-2.1.4/warnick.sh
@@ -39,6 +39,7 @@
 #
 # SCRIPT PREREQUISITE PROGRAMS:
 # - bash (tested on version 5.0.3)
+# - wget (1.20.1 or newer)
 # - curl (7.64.0 or newer)
 # - tee, cut, grep, head, cat, date, tr
 #
@@ -250,10 +251,10 @@ function scanlinks {
   ((depth++))
 
   while IFS= read -u 4 -r line; do
-    byteoffset=$(echo $line |cut -d ':' -f1)
+    byteoffset="${line%%:*}"
 
     tag="${filecontent:$byteoffset:$searchbuffer}"
-    tag=$(echo -n "$tag" |tr '\n\r' ' ' |cut -d'>' -f1)">"
+    tag="${tag//$'\n'/ }"; tag="${tag//$'\r'/ }"; tag="${tag%%>*}>"
     # get offset of link parameter href, src... etc. add tag parameters here.
     log 4 "Debug: Tag found $tag"
     # if multiple matches, pick first one (head -n1)
@@ -266,21 +267,21 @@ function scanlinks {
       linktype=""
 
       # step 1 - parse tag
-      link="$(echo -n $tag |cut -b$linkoffset- |cut -d'=' -f2 |cut -d'>' -f1 |cut -d'?' -f1)" # "./dir\case32.html#anchor" ALT
+      link="${tag:$linkoffset}"; link="${link#*=}"; link="${link%%>*}"; link="${link%%[?]*}" # "./dir\case32.html#anchor" ALT
       log 4 "Debug: Link parsed step 1: $link"
 
       # Check if link starts in quotes. If so, then grab link inside quotes
-      if [[ $link =~ ^'"' ]]; then link="$(echo -n $link |cut -d'"' -f2)"; fi # ./dir\case32.html#anchor
+      if [[ $link == \"* ]]; then link="${link#\"}"; link="${link%%\"*}"; fi # ./dir\case32.html#anchor
       log 4 "Debug: Link parsed step 2: $link"
 
       # step 3 - remove spaces and quotes and anything else that we missed
-      link="$(echo -n $link |tr -d '\"' |cut -d' ' -f1)"                      # ./dir\case32.html#anchor
+      link="${link//\"/}"; link="${link%% *}"                                  # ./dir\case32.html#anchor
       # step 4 - remove ./ from beginning of link
-      link="$(echo -n $link |sed 's/^\.\///')"                                # /dir\case32.html#anchor
+      link="${link#./}"                                                        # /dir\case32.html#anchor
       # step 5 - convert invalid directory separator '\' to '/'
-      link=$(echo -n $link |tr '\\' '/')                                      # /dir/case32.html#anchor
+      link="${link//\\/\/}"                                                    # /dir/case32.html#anchor
       # step 6 - remove #anchors and ?params from the end of a link
-      link=$(echo -n $link |cut -d'#' -f1 |cut -d'?' -f1)                     # /dir/case32.html
+      link="${link%%#*}"; link="${link%%[?]*}"                                 # /dir/case32.html
       # step 7 - urldecode
       #link="$(echo -n $link |sed 's/%20/ /g')"                               # /dir/case32.html
 
@@ -296,7 +297,7 @@ function scanlinks {
       # Is this a link with protocol? Assume this is a direct link. Remove protocol.
       #   - http://www.hostname.com/path/to/file.html
       if [[ ${link:0:7} == "http://" ]] || [[ ${link:0:6} == "ftp://" ]]; then
-        directlink="$(echo $link |cut -s -d'/' -f3-)"
+        directlink="${link#*//}"
         linktype="direct"
       fi
 
@@ -309,7 +310,7 @@ function scanlinks {
         #   - ../file.html
         if [[ ${link:0:3} == "../" ]]; then
           # Count occurrences in relative URL
-          parents=$(echo -n "$link" |grep -o "\.\./" | wc -l)
+          tmp="${link//\.\.\//}"; parents=$(( (${#link} - ${#tmp}) / 3 ))
           ((parents++))
           directlink=$(echo -n $link |cut -d'/' -f${parents}-)
           ((parents++))
@@ -337,7 +338,7 @@ function scanlinks {
         skiplink=1
       else
         # Is this a link on another host?
-        linkhost=$(echo $directlink |cut -d'/' -f1)
+        linkhost="${directlink%%/*}"
         if [[ ! "${linkhost,,}" == "$host" ]]; then
           # Too many skipped link notifications - suppress printing a skipped link, and print a list at the end.
           #log 1 "Notice: Skipping link to another host: $directlink"
@@ -399,133 +400,129 @@ function geturl {
   # Set archive URL here
   archurl="https://web.archive.org/web/${datestring}id_/http://$link"
   log 3 "Debug: Trying $archurl"
+  # Get status code for next page to be archived.
+  sleep $cooldown
+  archresponse="$(curl -sI "$archurl" | tr -d '\r')" # tr is needed, as text parsing doesn't play well with odd characters
+  archstatus="$(echo -n "$archresponse" |head -n1 |cut -d' ' -f2)"
+  archmime="$(echo -n "$archresponse" |grep "content-type: " |cut -d' ' -f2 |cut -d';' -f1)"
+  log 3 "$link HTTP status code: $archstatus"
+  log 3 "$link HTTP content-type: $archmime"
 
-  local headertmp="$tempdir/hdr-$$"
-  local filetmp="$tempdir/dl-$$"
-  local curlmeta=
-  archfounddate=
-  archmime=
-  downloadedfile=
-
-  curlmeta=$(curl -sSL \
-    --keepalive --keepalive-time 60 \
-    --tcp-fastopen \
-    --max-redirs 8 \
-    --dump-header "$headertmp" \
-    -o "$filetmp" \
-    --connect-timeout 30 \
-    --max-time 180 \
-    --retry 3 --retry-delay 10 --retry-connrefused \
-    --write-out "%{http_code}|%{url_effective}" \
-    "$archurl" 2>&1)
-
-  archstatus=$(echo "$curlmeta" |cut -d'|' -f1)
-  effectiveurl=$(echo "$curlmeta" |cut -d'|' -f2)
-
-  if [[ -f "$headertmp" ]]; then
-    archfounddate=$(grep -i "x-archive-redirect-reason: found" "$headertmp" \
-      |tail -n1 |tr -d '\r' |rev |cut -d' ' -f1 |rev)
+  # 302 Page redirect
+  if [[ $archstatus == "302" ]]; then
+    # Redirect page - follow redirect and try getting header information again
+    archurl=$(echo -n "$archresponse" |grep -m1 "location: " |cut -s -d' ' -f2)
+    archfounddate="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: found" |rev |cut -d' ' -f1 |rev)"
     archfounddate="${archfounddate:0:14}"
-    archmime=$(grep -i "^content-type: " "$headertmp" \
-      |tail -n1 |tr -d '\r' |cut -d' ' -f2 |cut -d';' -f1)
-  fi
 
-  log 3 "$link HTTP status code: $archstatus"
-  log 3 "$link HTTP content-type: $archmime"
-  if [[ ! -z "$archfounddate" ]]; then
-    log 3 "$link Redirect resolved to datecode: $archfounddate"
+    counter=0
+    while [ $counter -le 3 ]; do
+      log 2 "$link 302 - Following redirect: ${archurl}"
+      sleep $cooldown
+      archresponse="$(curl -sI "$archurl" | tr -d '\r')" # tr is needed, as text parsing doesn't play well with odd characters
+      archstatus="$(echo -n "$archresponse" |head -n1 |cut -d' ' -f2)"
+      if [[ $archstatus == "302" ]]; then 
+        archfounddate="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: found" |rev |cut -d' ' -f1 |rev)"
+        archfounddate="${archfounddate:0:14}"
+        log 3 "$link 302 - Resource found on a different date ($archfounddate)"
+        archredirectreason="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: ")"
+        log 3 "$link 302 - $archredirectreason"
+        archurl=$(echo -n "$archresponse" |grep -m1 "location: " |cut -s -d' ' -f2)
+      else
+        log 3 "$link 302 - Ending redirect loop, encountered HTTP status code: $archstatus"
+        break
+      fi
+      ((counter++))
+    done
+
+
+    altfound=0
+    if [[ $archstatus == "200" ]]; then
+      archmime="$(echo -n "$archresponse" |grep "content-type: " |cut -d' ' -f2 |cut -d';' -f1)"
+      log 3 "$link 302 - HTTP content-type: $archmime"
+      if [[ $archmime == "text/html" ]]; then
+        # File is an HTML document. Only download if it is at or near target date.
+        minyear=$(expr ${datestring:0:4} - ${searchback})   # 2000 - 4 = 1996
+        maxyear=$(expr ${datestring:0:4} + ${searchahead})  # 2000 + 3 = 2003
+        if [[ "${archfounddate:0:4}" -ge $minyear ]] && [[ "${archfounddate:0:4}" -le $maxyear ]]; then
+          # Alternative within target search date
+          altfound=1
+        else
+          # Alternative not within target search date
+          archstatus="404"
+        fi
+      else
+        # File is NOT an HTML document. Expand search parameters.
+        log 3 "$link 302 - Alternative file is not an HTML document. Since it will not be parsed, can safely download alternative."
+        altfound=1
+      fi
+      if [[ "$altfound" == "1" ]]; then
+        log 2 "$link Alternative copy was found that is within target search range (datecode: ${archfounddate:0:8})"
+      else
+        log 1 "$link No resource found near target date. (${archfounddate:0:8})"
+      fi
+    else
+      # For some reason the redirected URL given from archive.org led to a page with a status code that is not 200.
+      # I don't know why that would happen, so log this as an error.
+      if [[ ! $archstatus == "404" ]]; then
+        log 1 "IA responded with an unexpected HTTP status code: $archstatus"
+      fi
+    fi
   fi
 
   # 403 Forbidden
-  if [[ $archstatus == "403" ]]; then
+  if [[ $archstatus == "403" ]]; then   # Forbidden
     log 1 "$link Warning: IA responded with HTTP status code: 403 - Forbidden"
   fi
 
+  # 404 Not Found
+  if [[ $archstatus == "404" ]]; then    # Not found
+    log 1 "$link 404 - Not Found.";
+  fi
+
   # 502 Bad Gateway
-  if [[ $archstatus == "502" ]]; then
+  if [[ $archstatus == "502" ]]; then   # Bad Gateway
     log 1 "$link Warning: IA responded with HTTP status code: 502 - Bad Gateway"
   fi
 
   if [[ $archstatus == "403" ]] || [[ $archstatus == "502" ]]; then
     log 1 "Warning: Encountered a status code, that could indicate server throttling. Cooling down for 30 seconds."
-    rm -f "$headertmp" "$filetmp"
     sleep 30
-    return
-  fi
-
-  altfound=0
-  if [[ ! -z "$archfounddate" ]]; then
-    if [[ $archmime == "text/html" ]]; then
-      # File is an HTML document. Only download if it is at or near target date.
-      minyear=$(( ${datestring:0:4} - searchback ))   # 2000 - 4 = 1996
-      maxyear=$(( ${datestring:0:4} + searchahead ))  # 2000 + 3 = 2003
-      if [[ "${archfounddate:0:4}" -ge $minyear ]] && [[ "${archfounddate:0:4}" -le $maxyear ]]; then
-        # Alternative within target search date
-        altfound=1
-      else
-        # Alternative not within target search date
-        archstatus="404"
-      fi
-    else
-      # File is NOT an HTML document. Expand search parameters.
-      log 3 "$link 302 - Alternative file is not an HTML document. Since it will not be parsed, can safely download alternative."
-      altfound=1
-    fi
-    if [[ "$altfound" == "1" ]]; then
-      log 2 "$link Alternative copy was found that is within target search range (datecode: ${archfounddate:0:8})"
-    else
-      log 1 "$link No resource found near target date. (${archfounddate:0:8})"
-    fi
-  fi
-
-  # 404 Not Found
-  if [[ $archstatus == "404" ]]; then
-    log 1 "$link 404 - Not Found.";
-    rm -f "$headertmp" "$filetmp"
-    return
   fi
 
   # 200 Page found and archived
   if [[ $archstatus == "200" ]]; then
     log 3 "$link 200 - Page found!"
-
-    # Try to recover filename from the effective URL if it wasn't in the original.
-    if [[ -z "$filename" ]] && [[ -f "$filetmp" ]]; then
-      local realpath=$(echo "$effectiveurl" \
-        |sed 's|https://web\.archive\.org/web/[^/]*/http[s]*://[^/]*/||')
-      local inferredname=$(echo "$realpath" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev)
-      if [[ ! -z "$inferredname" ]]; then
-        filename="$inferredname"
-        log 3 "$link Inferred filename from effective URL: $filename"
-      else
-        local cdisp=$(grep -i "^content-disposition:" "$headertmp" \
-          |tail -n1 |tr -d '\r' |grep -oi 'filename=[^;[:space:]]*' \
-          |cut -d'=' -f2 |tr -d '"')
-        if [[ ! -z "$cdisp" ]]; then
-          filename="$cdisp"
-          log 3 "$link Inferred filename from Content-Disposition: $filename"
-        fi
-      fi
-    fi
-
-    downloadedfile="$filetmp"
-    log 3 "Successfully downloaded $filetmp !"
+    sleep $cooldown
+    wget --quiet --max-redirect=0 -e robots=off -nH -nc -P $tempdir/web "$archurl" 2>&1 |tee -a $logfile
   fi
 
-  rm -f "$headertmp"
+  # Check to see if wget created the directory $tempdir/web
+  if [ -d "$tempdir/web" ]; then
+    # If the /web directory is found, assume the file is there, find the file and
+    # save the full path to the file in the variable $downloadedfile
+    downloadedfile=$(find $tempdir/web -type f)
+    log 3 "Successfully downloaded $downloadedfile !"
+    filename="$(echo -n $downloadedfile | rev | cut -d'/' -f1 | rev)"
+  else
+    # Otherwise clear $downloadedfile
+    downloadedfile=
+  fi
 
   # If a file was downloaded
-  if [[ -f "$downloadedfile" ]]; then
+  if [ -f "$downloadedfile" ]; then
+    # Create path for downloaded file
+    # URL decode "my%20dir/" into "my dir/"
     pathdecoded=$(urldecode "$path")
 
     # Move downloaded file to destination directory if it doesn't exist.
-    if [[ -f "./sites/$host/$pathdecoded$filename" ]]; then
+    if [ -f "./sites/$host/$pathdecoded$filename" ]; then
+      #log 2 "Notice: Skipping downloaded file $directlink, file is already downloaded."
       log 1 "$host/$path$filename Skipping document, destination already exists.    "
-      rm -f "$downloadedfile"
       downloadedfile=
     else
       mkdir -p "./sites/$host/$pathdecoded" 2>&1 |tee -a $logfile
-      mv -n "$downloadedfile" "./sites/$host/$pathdecoded/$filename" 2>&1 |tee -a $logfile
+      mv -n "$downloadedfile" "./sites/$host/$pathdecoded" 2>&1 |tee -a $logfile # move wget'ed file out of ./web
       if [[ -z "$archfounddate" ]]; then
         log 1 "$host/$path$filename OK!"
       elif [[ "${archfounddate:0:8}" == "${datestring:0:8}" ]]; then
@@ -533,9 +530,10 @@ function geturl {
       else
         log 1 "$host/$path$filename [alternate date: ${archfounddate:0:8}] OK!"
       fi
-      downloadedfile="./sites/$host/$pathdecoded/$filename"   # update path to final location
     fi
 
+    rm -r $tempdir/web 2>&1 |tee -a $logfile                     # remove ./web
+
     # Place cursor to be positioned correctly for the cooldown timer.
     printf "\b\b\b   \b\b"
     for (( c=1; c<=$cooldown; c++ )); do
@@ -669,11 +667,11 @@ while IFS="" read -r line || [ -n "$line" ]; do
                                                                            # Use tr to remove special characters, such as linefeeds.
   # Parse given URL to components
   # take last component of URL and get filename if exists
-  filename=$(echo "$link" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev)  # file.html
+  LC_ALL=C filename=$(echo "$link" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev)  # file.html
   if [[ -z $filename ]]; then
     path=$(echo "$link" |cut -s -d'/' -f2-)                                # path/to
   else
-    path=$(echo "$link" |rev |cut -s -d'/' -f2- |rev |cut -s -d'/' -f2-)   # path/to
+    LC_ALL=C path=$(echo "$link" |rev |cut -s -d'/' -f2- |rev |cut -s -d'/' -f2-)   # path/to
   fi
 
   if [[ ! -z "$path" ]] && [[ ! ${path: -1} == "/" ]]; then
@@ -688,6 +686,7 @@ while IFS="" read -r line || [ -n "$line" ]; do
   log 2 "Debug: Path:     \"$path\""
   log 2 "Debug: Filename: \"$filename\""
 
+  newlinksfound=0
   geturl "$link"
 
   # If a file was downloaded, only scan links if we are not beyond maxdepth...