diff --git a/search_engine_results/.gitignore b/search_engine_results/.gitignore
new file mode 100644
index 0000000..382ddaa
--- /dev/null
+++ b/search_engine_results/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+config.py
diff --git a/search_engine_results/wayback_urls/add_wayback_urls.py b/search_engine_results/wayback_urls/add_wayback_urls.py
new file mode 100644
index 0000000..4e35d4b
--- /dev/null
+++ b/search_engine_results/wayback_urls/add_wayback_urls.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import time
+import argparse
+import logging
+import csv
+import requests
+import urllib.parse
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.')
+    parser.add_argument('-i', help='Input directory with metadata files')
+    parser.add_argument('-w', help='Location of file with wayback URLs')
+    parser.add_argument('-o', help='Directory to save modified files (if blank or same as input directory, will overwrite)')
+
+    args = parser.parse_args()
+
+    # Make a list of the files that we are going to be editing (skip those already edited)
+    files = pathlib.Path(args.i).glob('**/*.json')
+    wayback_dict = load_wayback_dict(args.w)
+    for fn in files:
+        write_wayback_to_file(fn, args.o, wayback_dict)
+
+
+def write_wayback_to_file(filename, out_dir, wayback_dict):
+    with open(filename, 'r') as f:
+        j_obj = json.load(f)
+    query_url = urlencode_url(j_obj['link'])
+    try:
+        wayback_url = wayback_dict[query_url]
+        j_obj['wayback_url'] = wayback_url
+    except KeyError:
+        logging.error(f"Should have an entry for {query_url}")
+        logging.error(wayback_dict.keys())
+        j_obj['wayback_url'] = ''
+    for link_obj in j_obj['linkElements']:
+        link_url = urlencode_url(link_obj['href'])
+        if link_url != '':  # Only look up non-empty URLs
+            try:
+                wayback_url = wayback_dict[link_url]
+                link_obj['wayback_url'] = wayback_url
+            except KeyError:
+                logging.info(f'No WB URL for {link_url}')
+                link_obj['wayback_url'] = ''
+    outfile = get_out_path(filename, out_dir)
+    with open(outfile, 'w') as f:
+        json.dump(j_obj, f)
+
+
+def get_out_path(fp, out_dir):
+    '''Assumes that we want to keep the directory and the file name'''
+    if out_dir is None:
+        return fp
+    else:
+        new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:])
+        if not new_path.parent.exists():
+            logging.warning(f"Creating new path at {new_path}")
+            new_path.parent.mkdir(parents=True)
+        return new_path
+
+
+def load_wayback_dict(fn):
+    '''Loads the wayback URL file as a dictionary of {orig_url: wb_url}. Currently ignores
+    the timestamp, overwriting older WB URLs with newer ones'''
+    result = {}
+    if pathlib.Path(fn).exists():
+        with open(fn, 'r') as f_obj:
+            f = csv.reader(f_obj)
+            for row in f:
+                result[row[1]] = row[2]
+    return result
+
+
+def urlencode_url(url):
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()
diff --git a/search_engine_results/wayback_urls/archive_urls.py b/search_engine_results/wayback_urls/archive_urls.py
new file mode 100644
index 0000000..3870a36
--- /dev/null
+++ b/search_engine_results/wayback_urls/archive_urls.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import requests
+import time
+import config
+import argparse
+import logging
+import urllib.parse
+import csv
+import datetime
+# raise_for_status() raises requests' HTTPError, not urllib's
+from requests.exceptions import HTTPError, ConnectionError
+
+
+ENDPT = 'https://web.archive.org/save/'
+UA_STRING = config.UA_STRING
+ACCESS_KEY = config.ACCESS_KEY
+SECRET_KEY = config.SECRET_KEY
+HEADERS = {'Accept': 'application/json',
+           'User-Agent': UA_STRING,
+           'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}
+IF_NOT_ARCHIVED_WITHIN = '20h'  # If an archive has been made within this long, don't make another one
+CHUNK_SIZE = 15  # Get this many URLs at a time
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Creates job ids from to_archive csv file')
+    parser.add_argument('-i', help='Location of to_archive CSV file')
+    parser.add_argument('-o', help='Location to save wayback URL file')
+    args = parser.parse_args()
+
+    def get_job_ids(urls, capture_outlinks):
+        for url in urls:
+            if url not in completed_urls:
+                job_id = archive_url(url,
+                                     capture_outlinks=capture_outlinks)
+                # Just put them into the job_id_tuples
+                job_id_tuples.append((url, job_id))
+            else:
+                logging.debug(f'{url} was in completed')
+
+    def get_wayback_urls(out_file):
+        for url, job_id in job_id_tuples:
+            try:
+                wb_url, timestamp = get_wayback_url(job_id)
+                write_wayback(out_file, url, wb_url, timestamp)
+            except ConnectionError:
+                logging.warning(f'{url} with job id {job_id} failed with a ConnectionError')
+            except TypeError:
+                logging.warning(f'{url} with job id {job_id} did not get a WB URL')
+            except HTTPError as e:
+                logging.warning(f'{url} with job id {job_id} failed with an uncaught HTTP Error: {e}')
+            except Exception as e:
+                logging.warning(f'{url} with job id {job_id} failed with an uncaught Exception: {e}')
+
+    completed_urls = get_completed(args.o, time_string=IF_NOT_ARCHIVED_WITHIN)
+    to_archive = load_urls(args.i)
+    # Do query URLs first, since for them we'll capture outlinks
+    query_urls = [x for x in to_archive if to_archive[x] == 'query']
+    link_urls = [x for x in to_archive if to_archive[x] == 'link']
+
+    with open(args.o, 'a') as out_file:
+        out = csv.writer(out_file)
+        logging.info("Now retrieving query urls")
+        for q_chunk in chunk_list(query_urls, CHUNK_SIZE):
+            job_id_tuples = []  # Stores which urls need to be retrieved (populated by get_job_ids)
+            get_job_ids(q_chunk, capture_outlinks=1)
+            get_wayback_urls(out)
+
+        logging.info("Now retrieving link urls")
+        for chunk in chunk_list(link_urls, CHUNK_SIZE):
+            job_id_tuples = []
+            get_job_ids(chunk, capture_outlinks=0)
+            get_wayback_urls(out)
+
+
+def chunk_list(l, size):
+    for i in range(0, len(l), size):
+        logging.info(f'Now getting items {i} through {min(len(l), i + size)} of {len(l)}')
+        yield l[i:i + size]
+
+
+def load_urls(url_fn):
+    result = {}
+    with open(url_fn, 'r') as fn:
+        f = csv.reader(fn)
+        for row in f:
+            result[row[0]] = row[1]
+    return result
+
+
+def write_wayback(f, url, wayback_url, timestamp):
+    '''Takes a CSV writer object, a url, and a wayback_url, and writes
+    them out'''
+    f.writerow([timestamp, url, wayback_url])
+
+
+def get_completed(csv_file, time_string):
+    '''Loads all of the completed URLs from the csv file. Takes in a time string like '20h',
+    strips the last character, and assumes that it refers to the number of hours.
+    Does not load any URLs older than that.
+    '''
+    delta_hours = int(time_string[:-1])
+    result = {}
+    if pathlib.Path(csv_file).exists():
+        with open(csv_file, 'r') as fn:
+            f = csv.reader(fn)
+            for row in f:
+                dt = datetime.datetime.strptime(row[0], '%Y%m%d%H%M%S')
+                if datetime.datetime.now() - dt > datetime.timedelta(hours=delta_hours):
+                    continue
+                else:
+                    result[row[1]] = row[2]
+    return result
+
+
+def urlencode_url(url):
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+
+def archive_url(url,
+                wait=6,
+                capture_outlinks=0  # Whether to capture outlinks (default is no)
+                ):
+
+    logging.info(f'Sending archive call for {url}')
+    payload = {'url': url,
+               'if_not_archived_within': IF_NOT_ARCHIVED_WITHIN,
+               #'capture_screenshot': capture_screenshot,
+               'capture_outlinks': capture_outlinks
+               }
+    r = requests.post(ENDPT, headers=HEADERS, data=payload)
+    logging.debug(r.content)
+
+    if r.status_code == 429:
+        logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')
+        time.sleep(wait)
+        return archive_url(url=url,
+                           wait=wait * 1.2,
+                           capture_outlinks=capture_outlinks)
+    if r.status_code in [104, 401, 404, 443, 502, 503, 504]:
+        logging.warning(url)
+        logging.warning(r.text)
+        if r.status_code in [104, 401, 443]:
+            logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.')
+            return None
+        logging.warning('404, 502, 503, or 504 status received; waiting 30 seconds')
+        time.sleep(30)
+        return archive_url(url=url,
+                           capture_outlinks=capture_outlinks)
+
+    r.raise_for_status()
+    try:
+        return r.json()['job_id']
+    except KeyError:
+        logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\n {r.content}')
+
+
+def get_wayback_url(job_id):
+
+    def call_status_url(
+            wait=6,      # Initial wait time
+            max_wait=12  # Stop when wait time between calls hits max_wait
+    ):
+        '''Helper function to handle the call to the status API'''
+        if job_id is None:
+            return None
+        s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS)
+        if s.status_code == 200:
+            s_json = s.json()
+            if s_json['status'] == 'pending':
+                if wait > max_wait:
+                    logging.debug(s_json)
+                    logging.warning(f"The call to get the status of job id {job_id} failed. Skipping")
+                    return None
+                logging.info(f'Pending, now waiting for {wait:.2f} seconds')
+                time.sleep(wait)
+                return call_status_url(wait=wait + 1)
+            if s_json['status'] == 'success':
+                return s_json
+            if s_json['status'] == 'error':
+                logging.error('Could not get status, with error: {}'.format(s_json["message"]))
+                return None
+            else:
+                logging.warning(s_json)
+                raise ValueError("Status was unexpected")
+        if s.status_code == 429:
+            logging.info(f'Hit rate limit, now waiting for {wait} seconds')
+            time.sleep(wait)
+            return call_status_url(wait=wait * 1.2)  # Backoff
+        if s.status_code in [104, 401, 404, 443, 502, 503, 504]:
+            # These likely mean something's wrong; wait and then try again
+            if s.status_code in [104, 401, 443]:
+                logging.warning(f'104, 401, or 443 received when checking status of job id {job_id}. Giving up.')
+                return None
+            logging.warning('404, 502, 503, or 504 status received; waiting 30 seconds')
+            logging.warning(s.text)
+            time.sleep(30)
+            return call_status_url()
+        else:
+            s.raise_for_status()
+
+    logging.info(f"Getting wayback URL for job id {job_id}")
+    s_json = call_status_url()
+    if s_json is None:
+        return None
+    try:
+        wayback_url = 'http://web.archive.org/web/{}/{}'.format(s_json['timestamp'],
+                                                                s_json['original_url'])
+        return (wayback_url, s_json['timestamp'])
+    except KeyError:
+        logging.error(f"Missing timestamp or original URL for {job_id}")
+        return None
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()
diff --git a/search_engine_results/wayback_urls/get_urls.py b/search_engine_results/wayback_urls/get_urls.py
new file mode 100644
index 0000000..fdf27d7
--- /dev/null
+++ b/search_engine_results/wayback_urls/get_urls.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import requests
+import time
+import config
+import argparse
+import logging
+import urllib.parse
+import csv
+import datetime
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Gets URLs to archive from SERP metadata files')
+    parser.add_argument('-i', help='Input directory with metadata files')
+    parser.add_argument('-o', help='Location to save URL list')
+    ## TODO: Maybe switch this so default is to ignore?
+    parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',
+                        action='store_true')
+
+    args = parser.parse_args()
+
+    # Make a list of the files that we are going to be editing (skip those already edited)
+    files = pathlib.Path(args.i).glob('**/*.json')
+    ## FOR TESTING ONLY!!!
+    #files = list(files)[10:11]
+    get_urls_from_files(files, args.o, args.ignore_self_links)
+
+
+def get_urls_from_files(files,
+                        output_file,
+                        ignore_self_links,
+                        remove_cache=True):
+
+    def get_urls(fn):
+        '''Takes a file, extracts the urls to archive, and returns the query URL
+        along with the filtered link URLs'''
+        with open(fn, 'r') as f:
+            j_obj = json.load(f)
+        # Get the URLs from the file
+        query_url, link_urls = get_urls_from_json(j_obj)
+        # Filter out the self links and search engine cache urls
+        link_urls = filter_link_urls(query_url, link_urls)
+        return (query_url, link_urls)
+
+    def filter_link_urls(query_url,
+                         urls):
+        '''
+        Takes link urls and filters them in four ways:
+        1. (Optionally) Ignores urls from the two caches:
+            webcache.googleusercontent.com
+            cc.bingj.com
+        2. Filters out those which are already in the to_archive dictionary
+        3. (Optionally) Identifies URLs which have the same domain as the query URL.
+            Checks the skipped_urls set to see if the URL already appears there. If so, we assume
+            that we want it archived and move it from skipped_urls to the result set
+        4. Filters out URLs that appear more than once in this list
+        '''
+        if ignore_self_links:
+            domain = get_domain(query_url)
+        else:
+            domain = None
+        cache_regex = r'https?://webcache\.googleusercontent\.com|https?://cc\.bingj\.com'
+
+        result = set()
+        for url in urls:
+            if url in to_archive:
+                continue
+
+            if remove_cache:
+                if re.match(cache_regex, url):
+                    continue
+
+            if ignore_self_links and re.match(fr'https?://\w*\.?{domain}', url):
+                # If it matches, check if it's in skipped URLs
+                # If so, remove it from there, and add it to the result set
+                if url in skipped_urls:
+                    result.add(url)
+                    skipped_urls.remove(url)
+                # Else, add it to the skipped urls (and skip it)
+                else:
+                    skipped_urls.add(url)
+            else:
+                result.add(url)
+        return result
+
+    skipped_urls = set()
+    to_archive = {}
+    for fn in files:
+        q_url, link_urls = get_urls(fn)
+        to_archive[q_url] = 'query'
+        for url in link_urls:
+            # Prioritize query urls - if it's already there,
+            # then don't overwrite
+            if url not in to_archive:
+                to_archive[url] = 'link'
+    write_urls(to_archive, output_file)
+
+
+def write_urls(url_dict, fn):
+    with open(fn, 'w') as out_file:
+        f = csv.writer(out_file)
+        for url, link_type in url_dict.items():
+            f.writerow([url, link_type])
+
+
+def get_domain(url):
+    match = re.search(r'^https://www\.(\w+\.\w+)', url)
+    if not match:
+        raise ValueError(f"Can't find a domain in {url}")
+    return match.groups()[0]
+
+
+def get_urls_from_json(j_obj):
+    '''Takes a JSON object and extracts the correct URLs; returns the query URL and a set of link URLs.'''
+    query_url = urlencode_url(j_obj['link'])
+    link_urls = set()
+
+    for x in j_obj['linkElements']:
+        url = x['href']
+        if re.match('javascript', url) or url == '':
+            continue
+        link_urls.add(urlencode_url(url))
+    return (query_url, link_urls)
+
+
+def urlencode_url(url):
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/search_engine_results/wayback_urls/wayback_urls.csv b/search_engine_results/wayback_urls/wayback_urls.csv
new file mode 100644
index 0000000..e69de29
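Taken together, the scripts are meant to run in sequence: get_urls.py builds the to_archive CSV from the SERP metadata, archive_urls.py submits those URLs to the Wayback Machine and records the resulting wayback URLs in a second CSV, and add_wayback_urls.py writes those wayback URLs back into the metadata files. For orientation, below is a minimal sketch (not part of the diff) of the Save Page Now save/status round trip that archive_urls.py wraps. The helper name archive_once and the example URL are illustrative; the sketch assumes a config.py that defines UA_STRING, ACCESS_KEY, and SECRET_KEY, and it omits the chunking, backoff, and CSV bookkeeping that the full script handles.

# sketch_archive_once.py -- illustrative only; assumes config.py defines
# UA_STRING, ACCESS_KEY, and SECRET_KEY as the scripts above expect.
import time
import requests
import config

ENDPT = 'https://web.archive.org/save/'
HEADERS = {'Accept': 'application/json',
           'User-Agent': config.UA_STRING,
           'Authorization': f'LOW {config.ACCESS_KEY}:{config.SECRET_KEY}'}


def archive_once(url):
    # Ask Save Page Now to capture the page, unless it was captured in the last 20 hours
    r = requests.post(ENDPT, headers=HEADERS,
                      data={'url': url,
                            'if_not_archived_within': '20h',
                            'capture_outlinks': 0})
    r.raise_for_status()
    job_id = r.json()['job_id']

    # Poll the status endpoint until the capture job resolves
    while True:
        s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS)
        s.raise_for_status()
        status = s.json()
        if status['status'] == 'success':
            return 'http://web.archive.org/web/{}/{}'.format(status['timestamp'],
                                                             status['original_url'])
        if status['status'] == 'error':
            raise RuntimeError(status.get('message', 'capture failed'))
        time.sleep(6)  # status is 'pending': wait and try again


if __name__ == '__main__':
    print(archive_once('https://example.com/'))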