Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 108 additions & 109 deletions warnick-2.1.4/warnick.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#
# SCRIPT PREREQUISITE PROGRAMS:
# - bash (tested on version 5.0.3)
# - wget (1.20.1 or newer)
# - curl (7.64.0 or newer)
# - tee, cut, grep, head, cat, date, tr
#
Expand Down Expand Up @@ -250,10 +251,10 @@ function scanlinks {
((depth++))

while IFS= read -u 4 -r line; do
byteoffset=$(echo $line |cut -d ':' -f1)
byteoffset="${line%%:*}"

tag="${filecontent:$byteoffset:$searchbuffer}"
tag=$(echo -n "$tag" |tr '\n\r' ' ' |cut -d'>' -f1)">"
tag="${tag//$'\n'/ }"; tag="${tag//$'\r'/ }"; tag="${tag%%>*}>"
# Get the offset of the link parameter (href, src, etc.). Add further tag parameters here.
log 4 "Debug: Tag found $tag"
# if multiple matches, pick first one (head -n1)
Expand All @@ -266,21 +267,21 @@ function scanlinks {
linktype=""

# step 1 - parse tag
link="$(echo -n $tag |cut -b$linkoffset- |cut -d'=' -f2 |cut -d'>' -f1 |cut -d'?' -f1)" # "./dir\case32.html#anchor" ALT
link="${tag:$linkoffset}"; link="${link#*=}"; link="${link%%>*}"; link="${link%%[?]*}" # "./dir\case32.html#anchor" ALT
log 4 "Debug: Link parsed step 1: $link"

# Check if link starts in quotes. If so, then grab link inside quotes
if [[ $link =~ ^'"' ]]; then link="$(echo -n $link |cut -d'"' -f2)"; fi # ./dir\case32.html#anchor
if [[ $link == \"* ]]; then link="${link#\"}"; link="${link%%\"*}"; fi # ./dir\case32.html#anchor
log 4 "Debug: Link parsed step 2: $link"

# step 3 - remove spaces and quotes and anything else that we missed
link="$(echo -n $link |tr -d '\"' |cut -d' ' -f1)" # ./dir\case32.html#anchor
link="${link//\"/}"; link="${link%% *}" # ./dir\case32.html#anchor
# step 4 - remove ./ from beginning of link
link="$(echo -n $link |sed 's/^\.\///')" # /dir\case32.html#anchor
link="${link#./}" # /dir\case32.html#anchor
# step 5 - convert invalid directory separator '\' to '/'
link=$(echo -n $link |tr '\\' '/') # /dir/case32.html#anchor
link="${link//\\/\/}" # /dir/case32.html#anchor
# step 6 - remove #anchors and ?params from the end of a link
link=$(echo -n $link |cut -d'#' -f1 |cut -d'?' -f1) # /dir/case32.html
link="${link%%#*}"; link="${link%%[?]*}" # /dir/case32.html
# step 7 - urldecode
#link="$(echo -n $link |sed 's/%20/ /g')" # /dir/case32.html

Expand All @@ -296,7 +297,7 @@ function scanlinks {
# Is this a link with protocol? Assume this is a direct link. Remove protocol.
# - http://www.hostname.com/path/to/file.html
if [[ ${link:0:7} == "http://" ]] || [[ ${link:0:6} == "ftp://" ]]; then
directlink="$(echo $link |cut -s -d'/' -f3-)"
directlink="${link#*//}"
linktype="direct"
fi

Expand All @@ -309,7 +310,7 @@ function scanlinks {
# - ../file.html
if [[ ${link:0:3} == "../" ]]; then
# Count occurrences in relative URL
parents=$(echo -n "$link" |grep -o "\.\./" | wc -l)
tmp="${link//\.\.\//}"; parents=$(( (${#link} - ${#tmp}) / 3 ))
((parents++))
directlink=$(echo -n $link |cut -d'/' -f${parents}-)
((parents++))
Expand Down Expand Up @@ -337,7 +338,7 @@ function scanlinks {
skiplink=1
else
# Is this a link on another host?
linkhost=$(echo $directlink |cut -d'/' -f1)
linkhost="${directlink%%/*}"
if [[ ! "${linkhost,,}" == "$host" ]]; then
# Too many skipped link notifications - suppress printing a skipped link, and print a list at the end.
#log 1 "Notice: Skipping link to another host: $directlink"
Expand Down Expand Up @@ -399,143 +400,140 @@ function geturl {
# Set archive URL here
archurl="https://web.archive.org/web/${datestring}id_/http://$link"
log 3 "Debug: Trying $archurl"
# Get status code for next page to be archived.
sleep $cooldown
archresponse="$(curl -sI "$archurl" | tr -d '\r')" # tr is needed, as text parsing doesn't play well with odd characters
archstatus="$(echo -n "$archresponse" |head -n1 |cut -d' ' -f2)"
archmime="$(echo -n "$archresponse" |grep "content-type: " |cut -d' ' -f2 |cut -d';' -f1)"
log 3 "$link HTTP status code: $archstatus"
log 3 "$link HTTP content-type: $archmime"

local headertmp="$tempdir/hdr-$$"
local filetmp="$tempdir/dl-$$"
local curlmeta=
archfounddate=
archmime=
downloadedfile=

curlmeta=$(curl -sSL \
--keepalive --keepalive-time 60 \
--tcp-fastopen \
--max-redirs 8 \
--dump-header "$headertmp" \
-o "$filetmp" \
--connect-timeout 30 \
--max-time 180 \
--retry 3 --retry-delay 10 --retry-connrefused \
--write-out "%{http_code}|%{url_effective}" \
"$archurl" 2>&1)

archstatus=$(echo "$curlmeta" |cut -d'|' -f1)
effectiveurl=$(echo "$curlmeta" |cut -d'|' -f2)

if [[ -f "$headertmp" ]]; then
archfounddate=$(grep -i "x-archive-redirect-reason: found" "$headertmp" \
|tail -n1 |tr -d '\r' |rev |cut -d' ' -f1 |rev)
# 302 Page redirect
if [[ $archstatus == "302" ]]; then
# Redirect page - follow redirect and try getting header information again
archurl=$(echo -n "$archresponse" |grep -m1 "location: " |cut -s -d' ' -f2)
archfounddate="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: found" |rev |cut -d' ' -f1 |rev)"
archfounddate="${archfounddate:0:14}"
archmime=$(grep -i "^content-type: " "$headertmp" \
|tail -n1 |tr -d '\r' |cut -d' ' -f2 |cut -d';' -f1)
fi

log 3 "$link HTTP status code: $archstatus"
log 3 "$link HTTP content-type: $archmime"
if [[ ! -z "$archfounddate" ]]; then
log 3 "$link Redirect resolved to datecode: $archfounddate"
counter=0
while [ $counter -le 3 ]; do
log 2 "$link 302 - Following redirect: ${archurl}"
sleep $cooldown
archresponse="$(curl -sI "$archurl" | tr -d '\r')" # tr is needed, as text parsing doesn't play well with odd characters
archstatus="$(echo -n "$archresponse" |head -n1 |cut -d' ' -f2)"
if [[ $archstatus == "302" ]]; then
archfounddate="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: found" |rev |cut -d' ' -f1 |rev)"
archfounddate="${archfounddate:0:14}"
log 3 "$link 302 - Resource found on a different date ($archfounddate)"
archredirectreason="$(echo -n "$archresponse" |grep "x-archive-redirect-reason: ")"
log 3 "$link 302 - $archredirectreason"
archurl=$(echo -n "$archresponse" |grep -m1 "location: " |cut -s -d' ' -f2)
else
log 3 "$link 302 - Ending redirect loop, encountered HTTP status code: $archstatus"
break
fi
((counter++))
done


altfound=0
if [[ $archstatus == "200" ]]; then
archmime="$(echo -n "$archresponse" |grep "content-type: " |cut -d' ' -f2 |cut -d';' -f1)"
log 3 "$link 302 - HTTP content-type: $archmime"
if [[ $archmime == "text/html" ]]; then
# File is an HTML document. Only download if it is at or near target date.
minyear=$(expr ${datestring:0:4} - ${searchback}) # 2000 - 4 = 1996
maxyear=$(expr ${datestring:0:4} + ${searchahead}) # 2000 + 3 = 2003
if [[ "${archfounddate:0:4}" -ge $minyear ]] && [[ "${archfounddate:0:4}" -le $maxyear ]]; then
# Alternative within target search date
altfound=1
else
# Alternative not within target search date
archstatus="404"
fi
else
# File is NOT an HTML document. Expand search parameters.
log 3 "$link 302 - Alternative file is not an HTML document. Since it will not be parsed, can safely download alternative."
altfound=1
fi
if [[ "$altfound" == "1" ]]; then
log 2 "$link Alternative copy was found that is within target search range (datecode: ${archfounddate:0:8})"
else
log 1 "$link No resource found near target date. (${archfounddate:0:8})"
fi
else
# For some reason the redirected URL given by archive.org led to a page with a status code that is not 200.
# I don't know why that would happen, so log this as an error.
if [[ ! $archstatus == "404" ]]; then
log 1 "IA responded with an unexpected HTTP status code: $archstatus"
fi
fi
fi

# 403 Forbidden
if [[ $archstatus == "403" ]]; then
if [[ $archstatus == "403" ]]; then # Forbidden
log 1 "$link Warning: IA responded with HTTP status code: 403 - Forbidden"
fi

# 404 Not Found
if [[ $archstatus == "404" ]]; then # Not found
log 1 "$link 404 - Not Found.";
fi

# 502 Bad Gateway
if [[ $archstatus == "502" ]]; then
if [[ $archstatus == "502" ]]; then # Bad Gateway
log 1 "$link Warning: IA responded with HTTP status code: 502 - Bad Gateway"
fi

if [[ $archstatus == "403" ]] || [[ $archstatus == "502" ]]; then
log 1 "Warning: Encountered a status code, that could indicate server throttling. Cooling down for 30 seconds."
rm -f "$headertmp" "$filetmp"
sleep 30
return
fi

altfound=0
if [[ ! -z "$archfounddate" ]]; then
if [[ $archmime == "text/html" ]]; then
# File is an HTML document. Only download if it is at or near target date.
minyear=$(( ${datestring:0:4} - searchback )) # 2000 - 4 = 1996
maxyear=$(( ${datestring:0:4} + searchahead )) # 2000 + 3 = 2003
if [[ "${archfounddate:0:4}" -ge $minyear ]] && [[ "${archfounddate:0:4}" -le $maxyear ]]; then
# Alternative within target search date
altfound=1
else
# Alternative not within target search date
archstatus="404"
fi
else
# File is NOT an HTML document. Expand search parameters.
log 3 "$link 302 - Alternative file is not an HTML document. Since it will not be parsed, can safely download alternative."
altfound=1
fi
if [[ "$altfound" == "1" ]]; then
log 2 "$link Alternative copy was found that is within target search range (datecode: ${archfounddate:0:8})"
else
log 1 "$link No resource found near target date. (${archfounddate:0:8})"
fi
fi

# 404 Not Found
if [[ $archstatus == "404" ]]; then
log 1 "$link 404 - Not Found.";
rm -f "$headertmp" "$filetmp"
return
fi

# 200 Page found and archived
if [[ $archstatus == "200" ]]; then
log 3 "$link 200 - Page found!"

# Try to recover filename from the effective URL if it wasn't in the original.
if [[ -z "$filename" ]] && [[ -f "$filetmp" ]]; then
local realpath=$(echo "$effectiveurl" \
|sed 's|https://web\.archive\.org/web/[^/]*/http[s]*://[^/]*/||')
local inferredname=$(echo "$realpath" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev)
if [[ ! -z "$inferredname" ]]; then
filename="$inferredname"
log 3 "$link Inferred filename from effective URL: $filename"
else
local cdisp=$(grep -i "^content-disposition:" "$headertmp" \
|tail -n1 |tr -d '\r' |grep -oi 'filename=[^;[:space:]]*' \
|cut -d'=' -f2 |tr -d '"')
if [[ ! -z "$cdisp" ]]; then
filename="$cdisp"
log 3 "$link Inferred filename from Content-Disposition: $filename"
fi
fi
fi

downloadedfile="$filetmp"
log 3 "Successfully downloaded $filetmp !"
sleep $cooldown
wget --quiet --max-redirect=0 -e robots=off -nH -nc -P $tempdir/web "$archurl" 2>&1 |tee -a $logfile
fi

rm -f "$headertmp"
# Check to see if wget created the directory ./sites/$host/web
if [ -d "$tempdir/web" ]; then
# If the /web directory is found, assume the file is there, find the file and
# save the full path to the file in the variable $downloadedfile
downloadedfile=$(find $tempdir/web -type f)
log 3 "Successfully downloaded $downloadedfile !"
filename="$(echo -n $downloadedfile | rev | cut -d'/' -f1 | rev)"
else
# Otherwise clear $downloadedfile
downloadedfile=
fi

# If a file was downloaded
if [[ -f "$downloadedfile" ]]; then
if [ -f "$downloadedfile" ]; then
# Create path for downloaded file
# URL decode "my%20dir/" into "my dir/"
pathdecoded=$(urldecode "$path")

# Move downloaded file to destination directory if it doesn't exist.
if [[ -f "./sites/$host/$pathdecoded$filename" ]]; then
if [ -f "./sites/$host/$pathdecoded$filename" ]; then
#log 2 "Notice: Skipping downloaded file $directlink, file is already downloaded."
log 1 "$host/$path$filename Skipping document, destination already exists. "
rm -f "$downloadedfile"
downloadedfile=
else
mkdir -p "./sites/$host/$pathdecoded" 2>&1 |tee -a $logfile
mv -n "$downloadedfile" "./sites/$host/$pathdecoded/$filename" 2>&1 |tee -a $logfile
mv -n "$downloadedfile" "./sites/$host/$pathdecoded" 2>&1 |tee -a $logfile # move wget'ed file out of ./web
if [[ -z "$archfounddate" ]]; then
log 1 "$host/$path$filename OK!"
elif [[ "${archfounddate:0:8}" == "${datestring:0:8}" ]]; then
log 1 "$host/$path$filename OK!"
else
log 1 "$host/$path$filename [alternate date: ${archfounddate:0:8}] OK!"
fi
downloadedfile="./sites/$host/$pathdecoded/$filename" # update path to final location
fi

rm -r $tempdir/web 2>&1 |tee -a $logfile # remove ./web

# Position the cursor correctly for the cooldown timer.
printf "\b\b\b \b\b"
for (( c=1; c<=$cooldown; c++ )); do
Expand Down Expand Up @@ -669,11 +667,11 @@ while IFS="" read -r line || [ -n "$line" ]; do
# Use tr to remove special characters, such as linefeeds.
# Parse given URL to components
# take last component of URL and get filename if exists
filename=$(echo "$link" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev) # file.html
LC_ALL=C filename=$(echo "$link" |rev |cut -s -d'/' -f1 |cut -s -d'.' -f1- |rev) # file.html
if [[ -z $filename ]]; then
path=$(echo "$link" |cut -s -d'/' -f2-) # path/to
else
path=$(echo "$link" |rev |cut -s -d'/' -f2- |rev |cut -s -d'/' -f2-) # path/to
LC_ALL=C path=$(echo "$link" |rev |cut -s -d'/' -f2- |rev |cut -s -d'/' -f2-) # path/to
fi

if [[ ! -z "$path" ]] && [[ ! ${path: -1} == "/" ]]; then
Expand All @@ -688,6 +686,7 @@ while IFS="" read -r line || [ -n "$line" ]; do
log 2 "Debug: Path: \"$path\""
log 2 "Debug: Filename: \"$filename\""

newlinksfound=0
geturl "$link"

# If a file was downloaded, only scan links if we are not beyond maxdepth...
Expand Down