-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
72 lines (62 loc) · 2.26 KB
/
scraper.py
File metadata and controls
72 lines (62 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import include
'''
Scrapes the site https://www.footballnews.net/
legally to obtain football news headlines
'''
# Page requested by scrape(); headlines are read from its 'news-content' list.
URL = 'https://www.footballnews.net/'
def scrape():
    """Scrape headlines from ``URL`` and drop those from blacklisted sources.

    Steps:
      1. Read the source blacklist from ``blacklist.txt`` (one entry per line).
      2. Request ``URL`` and parse the response with ``include.bs``.
      3. Collect the text of every ``<a>`` and every ``<span class="source">``
         inside ``#content > div.content-data > ul.news-content``.
      4. Pair headlines with sources in document order and discard pairs
         whose source is blacklisted.
      5. Write a ``{source: index}`` mapping of the non-blacklisted sources
         to ``jsons/sources.json``.

    Returns:
        A list of ``(headline, source)`` tuples (both lower-cased; sources
        also have spaces and hyphens removed), or ``None`` when the blacklist
        is empty, the request fails, or the expected markup/data is missing.

    Raises:
        OSError: if ``blacklist.txt`` cannot be read or
            ``jsons/sources.json`` cannot be written.
    """
    # open blacklist (a set gives O(1) membership tests further down)
    with open('blacklist.txt', 'r') as f:
        blacklist = {line.strip('\n') for line in f}
    if not blacklist:
        print("Failed to pull blacklist")
        return None

    # Get request
    try:
        page = include.rq.get(URL)
    except Exception:
        # Without a response there is nothing to parse. Bail out instead of
        # falling through to a NameError on the unbound ``page``.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        print("error")
        return None

    # Setting variables and getting data based on the html
    print("Parsing HTML")
    soup = include.bs(page.content, "html.parser")
    # Each lookup can come back as None when the site layout changes, so the
    # chain is walked step by step rather than dereferenced blindly.
    content = soup.find(id='content')
    news = content.find("div", class_='content-data') if content is not None else None
    news_list = news.find("ul", class_="news-content") if news is not None else None
    if news_list is None:
        print("Failed to scrape")
        return None

    html_links = news_list.find_all("a")
    html_sources = news_list.find_all("span", class_="source")

    # Normalise the text. Sources keep their document order (duplicates
    # included) so that position i still belongs to headline i.
    # NOTE(review): assumes each news item has exactly one <a> and one
    # <span class="source"> - confirm against the live markup.
    links = [link.text.strip().lower() for link in html_links]
    sources = [
        source.text.strip().replace("-", '').replace(' ', '').lower()
        for source in html_sources
    ]

    # checks if something exists
    if links and sources:
        print("Successfully scraped site, Filtering Data")
    else:
        print("Failed to scrape")
        return None

    # Pair every headline with its own source and drop blacklisted sources
    print("Filtering for blacklist")
    zipped_file = list(zip(links, sources))
    print("Original Data entered")
    final = [
        (headline, source)
        for headline, source in zipped_file
        if source not in blacklist
    ]
    print("Filtering Data complete")

    # Filters sources and creates the dictionary.
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # indices written to disk are stable from run to run (a plain set is not).
    print("Creating Sources Dictionary")
    unique_sources = list(dict.fromkeys(sources))
    sources_dict = {
        source: num
        for num, source in enumerate(unique_sources)
        if source not in blacklist
    }

    # Creates JSON of sources
    print("Saving JSON back up of sources mapping")
    with open('jsons/sources.json', 'w') as file:
        include.json.dump(sources_dict, file)
    print("Complete")
    return final