From cc3abf2109c56c6ac4532b3ee603193c4fb5c37f Mon Sep 17 00:00:00 2001 From: caiot5 Date: Tue, 10 Mar 2026 06:16:04 +0000 Subject: [PATCH] revert single-curl fetch, keep scanlinks optimizations --- warnick-2.1.4/warnick.sh | 217 +++++++++++++++++++-------------------- 1 file changed, 108 insertions(+), 109 deletions(-) diff --git a/warnick-2.1.4/warnick.sh b/warnick-2.1.4/warnick.sh index a095680..aadcb89 100644 --- a/warnick-2.1.4/warnick.sh +++ b/warnick-2.1.4/warnick.sh @@ -39,6 +39,7 @@ # # SCRIPT PREREQUISITE PROGRAMS: # - bash (tested on version 5.0.3) +# - wget (1.20.1 or newer) # - curl (7.64.0 or newer) # - tee, cut, grep, head, cat, date, tr # @@ -250,10 +251,10 @@ function scanlinks { ((depth++)) while IFS= read -u 4 -r line; do - byteoffset=$(echo $line |cut -d ':' -f1) + byteoffset="${line%%:*}" tag="${filecontent:$byteoffset:$searchbuffer}" - tag=$(echo -n "$tag" |tr '\n\r' ' ' |cut -d'>' -f1)">" + tag="${tag//$'\n'/ }"; tag="${tag//$'\r'/ }"; tag="${tag%%>*}>" # get offset of link parameter href, src... etc. add tag parameters here. log 4 "Debug: Tag found $tag" # if multiple matches, pick first one (head -n1) @@ -266,21 +267,21 @@ function scanlinks { linktype="" # step 1 - parse tag - link="$(echo -n $tag |cut -b$linkoffset- |cut -d'=' -f2 |cut -d'>' -f1 |cut -d'?' -f1)" # "./dir\case32.html#anchor" ALT + link="${tag:$linkoffset}"; link="${link#*=}"; link="${link%%>*}"; link="${link%%[?]*}" # "./dir\case32.html#anchor" ALT log 4 "Debug: Link parsed step 1: $link" # Check if link starts in quotes. If so, then grab link inside quotes - if [[ $link =~ ^'"' ]]; then link="$(echo -n $link |cut -d'"' -f2)"; fi # ./dir\case32.html#anchor + if [[ $link == \"* ]]; then link="${link#\"}"; link="${link%%\"*}"; fi # ./dir\case32.html#anchor log 4 "Debug: Link parsed step 2: $link" # step 3 - remove spaces and quotes and anything else that we missed - link="$(echo -n $link |tr -d '\"' |cut -d' ' -f1)" # ./dir\case32.html#anchor + link="${link//\"/}"; link="${link%% *}" # ./dir\case32.html#anchor # step 4 - remove ./ from beginning of link - link="$(echo -n $link |sed 's/^\.\///')" # /dir\case32.html#anchor + link="${link#./}" # /dir\case32.html#anchor # step 5 - convert invalid directory separator '\' to '/' - link=$(echo -n $link |tr '\\' '/') # /dir/case32.html#anchor + link="${link//\\/\/}" # /dir/case32.html#anchor # step 6 - remove #anchors and ?params from the end of a link - link=$(echo -n $link |cut -d'#' -f1 |cut -d'?' -f1) # /dir/case32.html + link="${link%%#*}"; link="${link%%[?]*}" # /dir/case32.html # step 7 - urldecode #link="$(echo -n $link |sed 's/%20/ /g')" # /dir/case32.html @@ -296,7 +297,7 @@ function scanlinks { # Is this a link with protocol? Assume this is a direct link. Remove protocol. # - http://www.hostname.com/path/to/file.html if [[ ${link:0:7} == "http://" ]] || [[ ${link:0:6} == "ftp://" ]]; then - directlink="$(echo $link |cut -s -d'/' -f3-)" + directlink="${link#*//}" linktype="direct" fi @@ -309,7 +310,7 @@ function scanlinks { # - ../file.html if [[ ${link:0:3} == "../" ]]; then # Count occurrences in relative URL - parents=$(echo -n "$link" |grep -o "\.\./" | wc -l) + tmp="${link//\.\.\//}"; parents=$(( (${#link} - ${#tmp}) / 3 )) ((parents++)) directlink=$(echo -n $link |cut -d'/' -f${parents}-) ((parents++)) @@ -337,7 +338,7 @@ function scanlinks { skiplink=1 else # Is this a link on another host? - linkhost=$(echo $directlink |cut -d'/' -f1) + linkhost="${directlink%%/*}" if [[ ! "${linkhost,,}" == "$host" ]]; then # Too many skipped link notifications - suppress printing a skipped link, and print a list at the end. #log 1 "Notice: Skipping link to another host: $directlink" @@ -399,133 +400,129 @@ function geturl { # Set archive URL here archurl="https://web.archive.org/web/${datestring}id_/http://$link" log 3 "Debug: Trying $archurl" + # Get status code for next page to be archived. + sleep $cooldown + archresponse="$(curl -sI "$archurl" | tr -d '\r')" # tr is needed, as text parsing doesn't play well with odd characters + archstatus="$(echo -n "$archresponse" |head -n1 |cut -d' ' -f2)" + archmime="$(echo -n "$archresponse" |grep "content-type: " |cut -d' ' -f2 |cut -d';' -f1)" + log 3 "$link HTTP status code: $archstatus" + log 3 "$link HTTP content-type: $archmime" - local headertmp="$tempdir/hdr-$$" - local filetmp="$tempdir/dl-$$" - local curlmeta= - archfounddate= - archmime= - downloadedfile= - - curlmeta=$(curl -sSL \ - --keepalive --keepalive-time 60 \ - --tcp-fastopen \ - --max-redirs 8 \ - --dump-header "$headertmp" \ - -o "$filetmp" \ - --connect-timeout 30 \ - --max-time 180 \ - --retry 3 --retry-delay 10 --retry-connrefused \ - --write-out "%{http_code}|%{url_effective}" \ - "$archurl" 2>&1) - - archstatus=$(echo "$curlmeta" |cut -d'|' -f1) - effectiveurl=$(echo "$curlmeta" |cut -d'|' -f2) - - if [[ -f "$headertmp" ]]; then - archfounddate=$(grep -i "x-archive-redirect-reason: found" "$headertmp" \ - |tail -n1 |tr -d '\r' |rev |cut -d' ' -f1 |rev) + # 302 Page redirect + if [[ $archstatus == "302" ]]; then + # Redirect page - follow redirect and try getting header information again + archurl=$(echo -n "$archresponse" |grep -m1 "location: " |cut -s -d' ' -f2) + archfounddate="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: found" |rev |cut -d' ' -f1 |rev)" archfounddate="${archfounddate:0:14}" - archmime=$(grep -i "^content-type: " "$headertmp" \ - |tail -n1 |tr -d '\r' |cut -d' ' -f2 |cut -d';' -f1) - fi - log 3 "$link HTTP status code: $archstatus" - log 3 "$link HTTP content-type: $archmime" - if [[ ! -z "$archfounddate" ]]; then - log 3 "$link Redirect resolved to datecode: $archfounddate" + counter=0 + while [ $counter -le 3 ]; do + log 2 "$link 302 - Following redirect: ${archurl}" + sleep $cooldown + archresponse="$(curl -sI "$archurl" | tr -d '\r')" # tr is needed, as text parsing doesn't play well with odd characters + archstatus="$(echo -n "$archresponse" |head -n1 |cut -d' ' -f2)" + if [[ $archstatus == "302" ]]; then + archfounddate="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: found" |rev |cut -d' ' -f1 |rev)" + archfounddate="${archfounddate:0:14}" + log 3 "$link 302 - Resource found on a different date ($archfounddate)" + archredirectreason="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: ")" + log 3 "$link 302 - $archredirectreason" + archurl=$(echo -n "$archresponse" |grep -m1 "location: " |cut -s -d' ' -f2) + else + log 3 "$link 302 - Ending redirect loop, encountered HTTP status code: $archstatus" + break + fi + ((counter++)) + done + + + altfound=0 + if [[ $archstatus == "200" ]]; then + archmime="$(echo -n "$archresponse" |grep "content-type: " |cut -d' ' -f2 |cut -d';' -f1)" + log 3 "$link 302 - HTTP content-type: $archmime" + if [[ $archmime == "text/html" ]]; then + # File is an HTML document. Only download if it is at or near target date. + minyear=$(expr ${datestring:0:4} - ${searchback}) # 2000 - 4 = 1996 + maxyear=$(expr ${datestring:0:4} + ${searchahead}) # 2000 + 3 = 2003 + if [[ "${archfounddate:0:4}" -ge $minyear ]] && [[ "${archfounddate:0:4}" -le $maxyear ]]; then + # Alternative within target search date + altfound=1 + else + # Alternative not within target search date + archstatus="404" + fi + else + # File is NOT an HTML document. Expand search parameters. + log 3 "$link 302 - Alternative file is not an HTML document. Since it will not be parsed, can safely download alternative." + altfound=1 + fi + if [[ "$altfound" == "1" ]]; then + log 2 "$link Alternative copy was found that is within target search range (datecode: ${archfounddate:0:8})" + else + log 1 "$link No resource found near target date. (${archfounddate:0:8})" + fi + else + # For some reason the redirected URL given from archive.org lead to a page with a status code that is not 200. + # I don't know why that would happen, so log this as an error. + if [[ ! $archstatus == "404" ]]; then + log 1 "IA responded with an unexpected HTTP status code: $archstatus" + fi + fi fi # 403 Forbidden - if [[ $archstatus == "403" ]]; then + if [[ $archstatus == "403" ]]; then # Forbidden log 1 "$link Warning: IA responded with HTTP status code: 403 - Forbidden" fi + # 404 Not Found + if [[ $archstatus == "404" ]]; then # Not found + log 1 "$link 404 - Not Found."; + fi + # 502 Bad Gateway - if [[ $archstatus == "502" ]]; then + if [[ $archstatus == "502" ]]; then # Bad Gateway log 1 "$link Warning: IA responded with HTTP status code: 502 - Bad Gateway" fi if [[ $archstatus == "403" ]] || [[ $archstatus == "502" ]]; then log 1 "Warning: Encountered a status code, that could indicate server throttling. Cooling down for 30 seconds." - rm -f "$headertmp" "$filetmp" sleep 30 - return - fi - - altfound=0 - if [[ ! -z "$archfounddate" ]]; then - if [[ $archmime == "text/html" ]]; then - # File is an HTML document. Only download if it is at or near target date. - minyear=$(( ${datestring:0:4} - searchback )) # 2000 - 4 = 1996 - maxyear=$(( ${datestring:0:4} + searchahead )) # 2000 + 3 = 2003 - if [[ "${archfounddate:0:4}" -ge $minyear ]] && [[ "${archfounddate:0:4}" -le $maxyear ]]; then - # Alternative within target search date - altfound=1 - else - # Alternative not within target search date - archstatus="404" - fi - else - # File is NOT an HTML document. Expand search parameters. - log 3 "$link 302 - Alternative file is not an HTML document. Since it will not be parsed, can safely download alternative." - altfound=1 - fi - if [[ "$altfound" == "1" ]]; then - log 2 "$link Alternative copy was found that is within target search range (datecode: ${archfounddate:0:8})" - else - log 1 "$link No resource found near target date. (${archfounddate:0:8})" - fi - fi - - # 404 Not Found - if [[ $archstatus == "404" ]]; then - log 1 "$link 404 - Not Found."; - rm -f "$headertmp" "$filetmp" - return fi # 200 Page found and archived if [[ $archstatus == "200" ]]; then log 3 "$link 200 - Page found!" - - # Try to recover filename from the effective URL if it wasn't in the original. - if [[ -z "$filename" ]] && [[ -f "$filetmp" ]]; then - local realpath=$(echo "$effectiveurl" \ - |sed 's|https://web\.archive\.org/web/[^/]*/http[s]*://[^/]*/||') - local inferredname=$(echo "$realpath" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev) - if [[ ! -z "$inferredname" ]]; then - filename="$inferredname" - log 3 "$link Inferred filename from effective URL: $filename" - else - local cdisp=$(grep -i "^content-disposition:" "$headertmp" \ - |tail -n1 |tr -d '\r' |grep -oi 'filename=[^;[:space:]]*' \ - |cut -d'=' -f2 |tr -d '"') - if [[ ! -z "$cdisp" ]]; then - filename="$cdisp" - log 3 "$link Inferred filename from Content-Disposition: $filename" - fi - fi - fi - - downloadedfile="$filetmp" - log 3 "Successfully downloaded $filetmp !" + sleep $cooldown + wget --quiet --max-redirect=0 -e robots=off -nH -nc -P $tempdir/web "$archurl" 2>&1 |tee -a $logfile fi - rm -f "$headertmp" + # Check to see if wget created the directory ./sites/$host/web + if [ -d "$tempdir/web" ]; then + # if /web -directory found, assume the file is there, find the file and + # save full path to file in the variable $outputfile + downloadedfile=$(find $tempdir/web -type f) + log 3 "Successfully downloaded $downloadedfile !" + filename="$(echo -n $downloadedfile | rev | cut -d'/' -f1 | rev)" + else + # Otherwise clear $downloadedfile + downloadedfile= + fi # If a file was downloaded - if [[ -f "$downloadedfile" ]]; then + if [ -f "$downloadedfile" ]; then + # Create path for downloaded file + # URL decode "my%20dir/" into "my dir/" pathdecoded=$(urldecode "$path") # Move downloaded file to destination directory if it doesn't exist. - if [[ -f "./sites/$host/$pathdecoded$filename" ]]; then + if [ -f "./sites/$host/$pathdecoded$filename" ]; then + #log 2 "Notice: Skipping downloaded file $directlink, file is already downloaded." log 1 "$host/$path$filename Skipping document, destination already exists. " - rm -f "$downloadedfile" downloadedfile= else mkdir -p "./sites/$host/$pathdecoded" 2>&1 |tee -a $logfile - mv -n "$downloadedfile" "./sites/$host/$pathdecoded/$filename" 2>&1 |tee -a $logfile + mv -n "$downloadedfile" "./sites/$host/$pathdecoded" 2>&1 |tee -a $logfile # move wget'ed file out of ./web if [[ -z "$archfounddate" ]]; then log 1 "$host/$path$filename OK!" elif [[ "${archfounddate:0:8}" == "${datestring:0:8}" ]]; then @@ -533,9 +530,10 @@ function geturl { else log 1 "$host/$path$filename [alternate date: ${archfounddate:0:8}] OK!" fi - downloadedfile="./sites/$host/$pathdecoded/$filename" # update path to final location fi + rm -r $tempdir/web 2>&1 |tee -a $logfile # remove ./web + # Place cursor to be positioned correctly for the cooldown timer. printf "\b\b\b \b\b" for (( c=1; c<=$cooldown; c++ )); do @@ -669,11 +667,11 @@ while IFS="" read -r line || [ -n "$line" ]; do # Use tr to remove special characters, such as linefeeds. # Parse given URL to components # take last component of URL and get filename if exists - filename=$(echo "$link" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev) # file.html + LC_ALL=C filename=$(echo "$link" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev) # file.html if [[ -z $filename ]]; then path=$(echo "$link" |cut -s -d'/' -f2-) # path/to else - path=$(echo "$link" |rev |cut -s -d'/' -f2- |rev |cut -s -d'/' -f2-) # path/to + LC_ALL=C path=$(echo "$link" |rev |cut -s -d'/' -f2- |rev |cut -s -d'/' -f2-) # path/to fi if [[ ! -z "$path" ]] && [[ ! ${path: -1} == "/" ]]; then @@ -688,6 +686,7 @@ while IFS="" read -r line || [ -n "$line" ]; do log 2 "Debug: Path: \"$path\"" log 2 "Debug: Filename: \"$filename\"" + newlinksfound=0 geturl "$link" # If a file was downloaded, only scan links if we are not beyond maxdepth...