-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathHTMLParser.py
More file actions
139 lines (128 loc) · 5.05 KB
/
HTMLParser.py
File metadata and controls
139 lines (128 loc) · 5.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import re
import json
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
class HTMLParser:
    """Scrape visible text from NGO web pages and their intra-domain links.

    Typical use is ``scrape_all_from_file``, which augments JSON records of
    NGOs (each carrying a ``url`` field) with a ``text`` field containing the
    scraped page text.
    """

    # Seconds before any single HTTP request is abandoned, so one dead host
    # cannot hang the entire scrape.
    REQUEST_TIMEOUT = 30

    def scrape_all_from_file(self, infile, outfile):
        """
        This is the only function of the class you should really use.
        This function takes a file containing JSON records of NGO's, which must
        include a 'url' field, pulls all text from each URL & recursively from
        URLs in the same domain, and generates new records with the text in an
        additional text field. It saves these new records in a new JSON file.
        Parameters:
            infile - string: filename of file containing JSON records of NGOs.
                this JSON file must have a "projects" key which contains a list
                of NGO records, which each must have a "url" field.
            outfile - string: filename of a new file to write augmented NGO
                records with additional 'text' field.
        Returns:
            None
        """
        with open(infile, "r") as input_file:
            input_data = json.load(input_file)
        scraping_data = {"projects": []}
        for project in input_data["projects"]:
            url = project.get("url")
            if url:
                project["text"] = self.scrape_url(url)
            # Records without a url are passed through unchanged.
            scraping_data["projects"].append(project)
        with open(outfile, "w") as output_file:
            json.dump(scraping_data, output_file)

    def scrape_url(self, url: str) -> str:
        """
        scrape_url can be used as a blackbox with a given url.
        It will RETURN a string with text parsed from:
        1) The original site
        2) Associated links
        Parameters:
            url - string: url of page to scrape
        Returns:
            text - string: text of original site specified by 'url' along with
                text from associated links on the page.
        """
        try:
            # FIX: timeout added; RequestException (not just ConnectionError)
            # caught so timeouts and HTTP transport errors also degrade to "".
            request = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            print(e)
            return ""
        soup = BeautifulSoup(request.text, "html.parser")
        other_links = self.get_other_links(soup, url)
        # Accumulate pieces and join once at the end instead of quadratic +=.
        # find_all(string=True) is the modern spelling of findAll(text=True).
        pieces = [" ".join(self.filter_text(soup.find_all(string=True)))]
        for link in other_links:
            try:
                # FIX: child requests were previously unprotected, so a single
                # bad link crashed the whole scrape.
                child_html_doc = requests.get(
                    link, timeout=self.REQUEST_TIMEOUT
                ).text
            except requests.exceptions.RequestException as e:
                print(e)
                continue
            child_soup = BeautifulSoup(child_html_doc, "html.parser")
            pieces.append(
                " ".join(self.filter_text(child_soup.find_all(string=True)))
            )
        return " ".join(pieces)

    def get_other_links(self, soup, url):
        """
        This function gets associated links from a webpage.
        The links should be within the same domain, and not stylesheets.
        Parameters:
            soup - BeautifulSoup object: should be an html parser
            url - string: url of page to get associated links from.
        Returns:
            links - set: set of intra-domain urls found on 'url' page.
        """
        links = set()
        # Normalise the base once instead of mutating ``url`` mid-loop.
        base = url[:-1] if url.endswith("/") else url
        # FIX: re.escape so metacharacters in the URL ("." and "?") are
        # matched literally rather than as regex operators.
        regex = re.compile("^" + re.escape(url))
        # Hoisted out of the loop; links nested in these tags are noise.
        bad_parents = {"head", "video", "script"}
        for tag in soup.find_all(href=True):
            if tag.parent.name in bad_parents:
                continue
            sub_url = tag.get("href")
            # Skip stylesheets and documents.
            if ".css" in sub_url or ".pdf" in sub_url:
                continue
            if regex.match(sub_url):
                # Already an absolute intra-domain URL.
                links.add(sub_url)
            else:
                if tag.get("data-target") == "#" or sub_url.startswith("./"):
                    if sub_url.startswith("./"):
                        sub_url = sub_url[2:]
                    if sub_url.startswith("/"):
                        sub_url = sub_url[1:]
                links.add(base + "/" + sub_url)
        # Safety check: never revisit the page we started from.
        links.discard(url)
        links.discard(base)
        links.discard(base + "/")
        return links

    def filter_text(self, texts):
        """
        This function filters out tags/scripts from HTML.
        Parameters:
            texts - list of strings: list of strings from some URL that should
                be filtered.
        Returns:
            filtered_text - list of strings: non-empty, stripped text nodes
                that are not comments and not nested in style/script/head/meta.
        """
        filtered_text = []
        skip_parents = {"style", "script", "head", "meta"}
        for text in texts:
            if isinstance(text, Comment):
                continue
            if text.parent.name in skip_parents:
                continue
            stripped_text = text.strip()
            if stripped_text:
                filtered_text.append(stripped_text)
        return filtered_text