diff --git a/search_engine_results/.gitignore b/search_engine_results/.gitignore
new file mode 100644
index 0000000..382ddaa
--- /dev/null
+++ b/search_engine_results/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+config.py
diff --git a/search_engine_results/wayback_urls/add_wayback_urls.py b/search_engine_results/wayback_urls/add_wayback_urls.py
new file mode 100644
index 0000000..4e35d4b
--- /dev/null
+++ b/search_engine_results/wayback_urls/add_wayback_urls.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import time
+import argparse
+import logging
+import csv
+import requests
+import urllib.parse
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Add wayback URLs to SERP metadata.')
+    parser.add_argument('-i', help='Input directory with metadata files')
+    parser.add_argument('-w', help='Location of file with wayback URLs')
+    parser.add_argument('-o', help='Directory to save modified files (if blank or same as input directory, will overwrite)')
+
+    args = parser.parse_args()
+
+    # Make a list of the files that we are going to be editing (skip those already edited)
+    files = pathlib.Path(args.i).glob('**/*.json')
+    wayback_dict = load_wayback_dict(args.w)
+    for fn in files:
+        write_wayback_to_file(fn, args.o, wayback_dict)
+
+
+def write_wayback_to_file(filename, out_dir, wayback_dict):
+    with open(filename, 'r') as f:
+        j_obj = json.load(f)
+    query_url = urlencode_url(j_obj['link'])
+    try:
+        wayback_url = wayback_dict[query_url]
+        j_obj['wayback_url'] = wayback_url
+    except KeyError:
+        logging.error(f"Should have an entry for {query_url}")
+        logging.error(wayback_dict.keys())
+        j_obj['wayback_url'] = ''
+    for link_obj in j_obj['linkElements']:
+        link_url = urlencode_url(link_obj['href'])
+        if link_url != '':  # Only look up non-empty URLs
+            try:
+                wayback_url = wayback_dict[link_url]
+                link_obj['wayback_url'] = wayback_url
+            except KeyError:
+                logging.info(f'No WB URL for {link_url}')
+                link_obj['wayback_url'] = ''
+    outfile = get_out_path(filename, out_dir)
+    with open(outfile, 'w') as f:
+        json.dump(j_obj, f)
+
+
+def get_out_path(fp, out_dir):
+    '''Assumes that we want to keep the directory and the file name'''
+    if out_dir is None:
+        return fp
+    else:
+        new_path = pathlib.Path(out_dir).joinpath(*fp.parts[-2:])
+        if not new_path.parent.exists():
+            logging.warning(f"Creating new path at {new_path}")
+            new_path.parent.mkdir(parents=True)
+        return new_path
+
+
+def load_wayback_dict(fn):
+    '''Loads the wayback URL file as a dictionary of {orig_url: wb_url}. Currently ignores
+    the timestamp, overwriting older WB URLs with newer ones'''
+    result = {}
+    if pathlib.Path(fn).exists():
+        with open(fn, 'r') as f_obj:
+            f = csv.reader(f_obj)
+            for row in f:
+                result[row[1]] = row[2]
+    return result
+
+
+def urlencode_url(url):
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()
diff --git a/search_engine_results/wayback_urls/archive_urls.py b/search_engine_results/wayback_urls/archive_urls.py
new file mode 100644
index 0000000..3870a36
--- /dev/null
+++ b/search_engine_results/wayback_urls/archive_urls.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import requests
+import time
+import config
+import argparse
+import logging
+import urllib.parse
+import csv
+import datetime
+# raise_for_status() raises requests' HTTPError, not urllib's
+from requests.exceptions import HTTPError, ConnectionError
+
+
+ENDPT = 'https://web.archive.org/save/'
+UA_STRING = config.UA_STRING
+ACCESS_KEY = config.ACCESS_KEY
+SECRET_KEY = config.SECRET_KEY
+HEADERS = {'Accept': 'application/json',
+           'User-Agent': UA_STRING,
+           'Authorization': f'LOW {ACCESS_KEY}:{SECRET_KEY}'}
+IF_NOT_ARCHIVED_WITHIN = '20h'  # If an archive has been made within this long, don't make another one
+CHUNK_SIZE = 15  # Get this many URLs at a time
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Creates job ids from to_archive csv file')
+    parser.add_argument('-i', help='Location of to_archive CSV file')
+    parser.add_argument('-o', help='Location to save wayback URL file')
+    args = parser.parse_args()
+
+    def get_job_ids(urls, capture_outlinks):
+        for url in urls:
+            if url not in completed_urls:
+                job_id = archive_url(url,
+                                     capture_outlinks=capture_outlinks)
+                # Just put them into the job_id_tuples
+                job_id_tuples.append((url, job_id))
+            else:
+                logging.debug(f'{url} was in completed')
+
+    def get_wayback_urls(out_file):
+        for url, job_id in job_id_tuples:
+            try:
+                wb_url, timestamp = get_wayback_url(job_id)
+                write_wayback(out_file, url, wb_url, timestamp)
+            except ConnectionError:
+                logging.warning(f'{url} with job id {job_id} failed with a ConnectionError')
+            except TypeError:
+                logging.warning(f'{url} with job id {job_id} did not get a WB URL')
+            except HTTPError as e:
+                logging.warning(f'{url} with job id {job_id} failed with an uncaught HTTP Error: {e}')
+            except Exception as e:
+                logging.warning(f'{url} with job id {job_id} failed with an uncaught Exception: {e}')
+
+    completed_urls = get_completed(args.o, time_string=IF_NOT_ARCHIVED_WITHIN)
+    to_archive = load_urls(args.i)
+    # Do query URLs first, since for them we'll capture outlinks
+    query_urls = [x for x in to_archive if to_archive[x] == 'query']
+    link_urls = [x for x in to_archive if to_archive[x] == 'link']
+
+    with open(args.o, 'a') as out_file:
+        out = csv.writer(out_file)
+        logging.info("Now retrieving query urls")
+        for q_chunk in chunk_list(query_urls, CHUNK_SIZE):
+            job_id_tuples = []  # Stores which urls need to be retrieved (populated by get_job_ids)
+            get_job_ids(q_chunk, capture_outlinks=1)
+            get_wayback_urls(out)
+
+        logging.info("Now retrieving link urls")
+        for chunk in chunk_list(link_urls, CHUNK_SIZE):
+            job_id_tuples = []
+            get_job_ids(chunk, capture_outlinks=0)
+            get_wayback_urls(out)
+
+
+def chunk_list(l, size):
+    for i in range(0, len(l), size):
+        logging.info(f'Now getting items {i} through {min(len(l), i + size)} of {len(l)}')
+        yield l[i:i + size]
+
+
+def load_urls(url_fn):
+    result = {}
+    with open(url_fn, 'r') as fn:
+        f = csv.reader(fn)
+        for row in f:
+            result[row[0]] = row[1]
+    return result
+
+
+def write_wayback(f, url, wayback_url, timestamp):
+    '''Takes a CSV writer object, a url, and a wayback_url, and writes
+    them out'''
+    f.writerow([timestamp, url, wayback_url])
+
+
+def get_completed(csv_file, time_string):
+    '''Loads all of the completed URLs from the csv file. Takes in a time string like '20h',
+    strips the last character, and assumes that it refers to the number of hours.
+    Does not load any URLs older than that.
+    '''
+    delta_hours = int(time_string[:-1])
+    result = {}
+    if pathlib.Path(csv_file).exists():
+        with open(csv_file, 'r') as fn:
+            f = csv.reader(fn)
+            for row in f:
+                dt = datetime.datetime.strptime(row[0], '%Y%m%d%H%M%S')
+                if datetime.datetime.now() - dt > datetime.timedelta(hours=delta_hours):
+                    continue
+                else:
+                    result[row[1]] = row[2]
+    return result
+
+
+def urlencode_url(url):
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+
+def archive_url(url,
+                wait=6,
+                capture_outlinks=0  # Whether to capture outlinks (default is no)
+                ):
+
+    logging.info(f'Sending archive call for {url}')
+    payload = {'url': url,
+               'if_not_archived_within': IF_NOT_ARCHIVED_WITHIN,
+               #'capture_screenshot': capture_screenshot,
+               'capture_outlinks': capture_outlinks
+               }
+    r = requests.post(ENDPT, headers=HEADERS, data=payload)
+    logging.debug(r.content)
+
+    if r.status_code == 429:
+        logging.info(f'Hit rate limit, now waiting for {wait:.2f} seconds')
+        time.sleep(wait)
+        return archive_url(url=url,
+                           wait=wait * 1.2,
+                           capture_outlinks=capture_outlinks)
+    if r.status_code in [104, 401, 404, 443, 502, 503, 504]:
+        logging.warning(url)
+        logging.warning(r.text)
+        if r.status_code in [104, 401, 443]:
+            logging.warning(f'104, 401, or 443 received when archiving {url}. Giving up.')
+            return None
+        logging.warning('404, 502, 503, or 504 status received; waiting 30 seconds')
+        time.sleep(30)
+        return archive_url(url=url,
+                           capture_outlinks=capture_outlinks)
+
+    r.raise_for_status()
+    try:
+        return r.json()['job_id']
+    except KeyError:
+        logging.warning(f'Should have a valid job id for {url}. Instead, this was returned:\n {r.content}')
+
+
+def get_wayback_url(job_id):
+
+    def call_status_url(
+            wait=6,      # Initial wait time
+            max_wait=12  # Stop when wait time between calls hits max_wait
+    ):
+        '''Helper function to handle the call to the status API'''
+        if job_id is None:
+            return None
+        s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS)
+        if s.status_code == 200:
+            s_json = s.json()
+            if s_json['status'] == 'pending':
+                if wait > max_wait:
+                    logging.debug(s_json)
+                    logging.warning(f"The call to get the status of job id {job_id} failed. Skipping")
+                    return None
+                logging.info(f'Pending, now waiting for {wait:.2f} seconds')
+                time.sleep(wait)
+                return call_status_url(wait=wait + 1)
+            if s_json['status'] == 'success':
+                return s_json
+            if s_json['status'] == 'error':
+                logging.error('Could not get status, with error: {}'.format(s_json["message"]))
+                return None
+            else:
+                logging.warning(s_json)
+                raise ValueError("Status was unexpected")
+        if s.status_code == 429:
+            logging.info(f'Hit rate limit, now waiting for {wait} seconds')
+            time.sleep(wait)
+            return call_status_url(wait=wait * 1.2)  # Backoff
+        if s.status_code in [104, 401, 404, 443, 502, 503, 504]:
+            # These likely mean something's wrong; wait and then try again
+            if s.status_code in [104, 401, 443]:
+                logging.warning(f'104, 401, or 443 received when checking status of job id {job_id}. Giving up.')
+                return None
+            logging.warning('404, 502, 503, or 504 status received; waiting 30 seconds')
+            logging.warning(s.text)
+            time.sleep(30)
+            return call_status_url()
+        else:
+            s.raise_for_status()
+
+    logging.info(f"Getting wayback URL for job id {job_id}")
+    s_json = call_status_url()
+    if s_json is None:
+        return None
+    try:
+        wayback_url = 'http://web.archive.org/web/{}/{}'.format(s_json['timestamp'],
+                                                                s_json['original_url'])
+        return (wayback_url, s_json['timestamp'])
+    except KeyError:
+        logging.error(f"Missing timestamp or original URL for {job_id}")
+        return None
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()
diff --git a/search_engine_results/wayback_urls/get_urls.py b/search_engine_results/wayback_urls/get_urls.py
new file mode 100644
index 0000000..fdf27d7
--- /dev/null
+++ b/search_engine_results/wayback_urls/get_urls.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import pathlib
+import json
+import re
+import requests
+import time
+import config
+import argparse
+import logging
+import urllib.parse
+import csv
+import datetime
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Gets URLs to archive from SERP metadata files')
+    parser.add_argument('-i', help='Input directory with metadata files')
+    parser.add_argument('-o', help='Location to save URL list')
+    ## TODO: Maybe switch this so default is to ignore?
+    parser.add_argument('--ignore_self_links', help='Whether to ignore links from the same domain as the query',
+                        action='store_true')
+
+    args = parser.parse_args()
+
+    # Make a list of the files that we are going to be editing (skip those already edited)
+    files = pathlib.Path(args.i).glob('**/*.json')
+    ## FOR TESTING ONLY!!!
+    #files = list(files)[10:11]
+    get_urls_from_files(files, args.o, args.ignore_self_links)
+
+
+def get_urls_from_files(files,
+                        output_file,
+                        ignore_self_links,
+                        remove_cache=True):
+
+    def get_urls(fn):
+        '''Takes a file, extracts the urls to archive, and returns the query URL
+        along with the filtered link URLs'''
+        with open(fn, 'r') as f:
+            j_obj = json.load(f)
+        # Get the URLs from the file
+        query_url, link_urls = get_urls_from_json(j_obj)
+        # Filter out the self links and search engine cache urls
+        link_urls = filter_link_urls(query_url, link_urls)
+        return (query_url, link_urls)
+
+    def filter_link_urls(query_url,
+                         urls):
+        '''
+        Takes link urls and filters them in four ways:
+        1. (Optionally) Ignores urls from the two caches:
+            webcache.googleusercontent.com
+            cc.bingj.com
+        2. Filters out those which are already in the to_archive dictionary
+        3. (Optionally) Identifies URLs which have the same domain as the query URL.
+            Checks the skipped_urls set to see if the URL already appears there. If so, we assume
+            that we want it archived and move it from skipped_urls to the result set
+        4. Filters out URLs that appear more than once in this list
+        '''
+        if ignore_self_links:
+            domain = get_domain(query_url)
+        else:
+            domain = None
+        cache_regex = r'https?://webcache\.googleusercontent\.com|https?://cc\.bingj\.com'
+
+        result = set()
+        for url in urls:
+            if url in to_archive:
+                continue
+
+            if remove_cache:
+                if re.match(cache_regex, url):
+                    continue
+
+            if ignore_self_links and re.match(fr'https?://\w*\.?{domain}', url):
+                # If it matches, check if it's in skipped URLs
+                # If so, remove it from there, and add it to the result set
+                if url in skipped_urls:
+                    result.add(url)
+                    skipped_urls.remove(url)
+                # Else, add it to the skipped urls (and skip it)
+                else:
+                    skipped_urls.add(url)
+            else:
+                result.add(url)
+        return result
+
+    skipped_urls = set()
+    to_archive = {}
+    for fn in files:
+        q_url, link_urls = get_urls(fn)
+        to_archive[q_url] = 'query'
+        for url in link_urls:
+            # Prioritize query urls - if it's already there,
+            # then don't overwrite
+            if url not in to_archive:
+                to_archive[url] = 'link'
+    write_urls(to_archive, output_file)
+
+
+def write_urls(url_dict, fn):
+    with open(fn, 'w') as out_file:
+        f = csv.writer(out_file)
+        for url, link_type in url_dict.items():
+            f.writerow([url, link_type])
+
+
+def get_domain(url):
+    match = re.search(r'^https://www\.(\w+\.\w+)', url)
+    if not match:
+        raise ValueError(f"Can't find a domain in {url}")
+    return match.groups()[0]
+
+
+def get_urls_from_json(j_obj):
+    '''Takes a JSON object and extracts the correct URLs; returns the query URL and a set of link URLs.'''
+    query_url = urlencode_url(j_obj['link'])
+    link_urls = set()
+
+    for x in j_obj['linkElements']:
+        url = x['href']
+        if re.match('javascript', url) or url == '':
+            continue
+        link_urls.add(urlencode_url(url))
+    return (query_url, link_urls)
+
+
+def urlencode_url(url):
+    return requests.utils.requote_uri(urllib.parse.unquote_plus(url))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/search_engine_results/wayback_urls/wayback_urls.csv b/search_engine_results/wayback_urls/wayback_urls.csv
new file mode 100644
index 0000000..e69de29
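Taken together, the scripts are meant to run in sequence: get_urls.py builds the to_archive CSV from the SERP metadata, archive_urls.py submits those URLs to the Wayback Machine and records the resulting wayback URLs in a second CSV, and add_wayback_urls.py writes those wayback URLs back into the metadata files. For orientation, below is a minimal sketch (not part of the diff) of the Save Page Now save/status round trip that archive_urls.py wraps. The helper name archive_once and the example URL are illustrative; the sketch assumes a config.py that defines UA_STRING, ACCESS_KEY, and SECRET_KEY, and it omits the chunking, backoff, and CSV bookkeeping that the full script handles.

# sketch_archive_once.py -- illustrative only; assumes config.py defines
# UA_STRING, ACCESS_KEY, and SECRET_KEY as the scripts above expect.
import time
import requests
import config

ENDPT = 'https://web.archive.org/save/'
HEADERS = {'Accept': 'application/json',
           'User-Agent': config.UA_STRING,
           'Authorization': f'LOW {config.ACCESS_KEY}:{config.SECRET_KEY}'}


def archive_once(url):
    # Ask Save Page Now to capture the page, unless it was captured in the last 20 hours
    r = requests.post(ENDPT, headers=HEADERS,
                      data={'url': url,
                            'if_not_archived_within': '20h',
                            'capture_outlinks': 0})
    r.raise_for_status()
    job_id = r.json()['job_id']

    # Poll the status endpoint until the capture job resolves
    while True:
        s = requests.get(ENDPT + 'status/' + job_id, headers=HEADERS)
        s.raise_for_status()
        status = s.json()
        if status['status'] == 'success':
            return 'http://web.archive.org/web/{}/{}'.format(status['timestamp'],
                                                             status['original_url'])
        if status['status'] == 'error':
            raise RuntimeError(status.get('message', 'capture failed'))
        time.sleep(6)  # status is 'pending': wait and try again


if __name__ == '__main__':
    print(archive_once('https://example.com/'))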