-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathwayback_api.py
More file actions
49 lines (39 loc) · 1.55 KB
/
wayback_api.py
File metadata and controls
49 lines (39 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
Module for accessing the Internet Archive's Wayback Machine (https://archive.org/web/)
"""
from typing import List, Tuple
import requests
# URL template for requesting a new capture; the `{url}` placeholder is filled
# in with the page to save.
# undocumented API scraped from the 'save page' form here: https://archive.org/web/
SAVE_PAGE_API = 'http://web.archive.org/save/{url}'
# Endpoint queried (with a `url` query parameter) to look up an existing capture.
# see: https://archive.org/help/wayback_api.php
CHECK_PAGE_API = 'http://archive.org/wayback/available'
def check_url(url: str) -> str:
    """
    Look up the closest archived snapshot of a URL

    :param url: the URL to look up
    :return: the snapshot URL reported by the availability API, or None if
        the response does not contain one
    """
    reply = requests.get(CHECK_PAGE_API, params={'url': url})
    # Walk the JSON payload one level at a time; a missing level falls back to
    # an empty dict so the final lookup simply yields None.
    snapshots = reply.json().get('archived_snapshots', {})
    closest = snapshots.get('closest', {})
    return closest.get('url')
def archive_url(url: str) -> str:
    """
    Archive a URL

    Requests a capture through the save endpoint and, when that request
    answers with HTTP 200, looks the snapshot up again via check_url.

    :param url: the URL to archive
    :return: the newly archived URL, or None if there was a problem archiving it
    """
    save_url = SAVE_PAGE_API.format(url=url)
    response = requests.get(save_url)
    # Compare by value, not identity: `is not 200` only worked because CPython
    # happens to cache small ints, and it triggers a SyntaxWarning on 3.8+.
    if response.status_code != 200:
        return None  # TODO: return the error to be handled later
    return check_url(url)
def check_and_archive_url(url: str) -> str:
    """
    Return the archived copy of a URL, creating one first if none exists yet.

    :param url: the URL to check (and archive if not already)
    :return: the archived URL, or None if there was a problem archiving it
    """
    existing = check_url(url)
    # Only ask for a new capture when the lookup found nothing.
    return archive_url(url) if existing is None else existing