forked from rjstyles/StackOverflow-Crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
62 lines (53 loc) · 1.91 KB
/
main.py
File metadata and controls
62 lines (53 loc) · 1.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import operator
import os
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Module-level tally: maps a tag's text to the number of tag links carrying
# that text that have been seen on crawled question pages so far.
Tag_Rank = {}
def tag_crawler(url):
    """Fetch one question page and tally its tags into ``Tag_Rank``.

    Every ``<a>`` found inside a ``<div class="post-taglist">`` element is
    treated as a tag link. For each such link the counter stored under the
    link's text in the module-level ``Tag_Rank`` dict is incremented by one.

    Args:
        url: Address of the question page to download.

    Raises:
        requests.exceptions.RequestException: If the page cannot be
            downloaded, including when the server does not respond within
            the timeout.
    """
    # requests waits indefinitely unless a timeout is given, which would let
    # a single unresponsive page stall the whole crawl.
    source_code = requests.get(url, timeout=30).text
    soup = BeautifulSoup(source_code, 'html.parser')
    for tag_div in soup.find_all('div', {'class': 'post-taglist'}):
        for tag_link in tag_div.find_all('a'):
            tag = tag_link.string
            if tag is None:
                # No single text node could be resolved for this link, so
                # there is no tag name to count.
                continue
            # Store a plain str: a NavigableString key would keep this page's
            # entire parse tree referenced for as long as Tag_Rank lives.
            tag = str(tag)
            Tag_Rank[tag] = Tag_Rank.get(tag, 0) + 1
def ques_links_crawler(base_url, end_url, page_limit):
    """Crawl question listing pages and tally the tags of every question.

    Listing pages ``1 .. page_limit`` are fetched from
    ``base_url + str(page_no) + end_url``. Every
    ``<a class="question-hyperlink">`` on a listing page is followed with
    ``tag_crawler`` while a ``#`` progress bar with a trailing percentage is
    drawn on stdout for that page.

    Args:
        base_url: Listing URL up to (not including) the page number.
        end_url: Remainder of the listing URL after the page number.
        page_limit: Number of the last listing page to crawl; nothing is
            fetched when it is less than 1.

    Raises:
        requests.exceptions.RequestException: If a listing page cannot be
            downloaded, including when the server does not respond within
            the timeout. Errors raised by ``tag_crawler`` propagate as well.
    """
    page_no = 1
    while page_no <= page_limit:
        page_url = base_url + str(page_no) + end_url
        # A timeout keeps one unresponsive listing page from hanging the run.
        source_code = requests.get(page_url, timeout=30).text
        soup = BeautifulSoup(source_code, 'html.parser')

        if page_no == 1:
            os.system('clear')
        else:
            # Start a fresh line so this page's bar does not run on from the
            # end of the previous page's bar.
            print()
        print('crawling page ' + str(page_no) + ': [', end='')

        # Collect the targets first so the percentage can be based on how
        # many questions this page really lists; anchors without an href
        # cannot be followed and are left out.
        hrefs = []
        for ques_link in soup.find_all('a', {'class': 'question-hyperlink'}):
            href = ques_link.get('href')
            if href:
                hrefs.append(href)
        total = len(hrefs)

        prev_len = 0  # width of the "] (NN%)" suffix currently on screen
        for q_no, href in enumerate(hrefs, start=1):
            # urljoin copes with both site-relative and absolute hrefs and
            # avoids producing a doubled slash.
            url = urljoin('http://stackoverflow.com/', href)
            tag_crawler(url)

            percent = '] (' + str(q_no * 100 // total) + '%)'
            # Backspace over the previous suffix, extend the bar by one '#',
            # then redraw the suffix after it.
            print('\b' * prev_len + '#' + percent, end='', flush=True)
            prev_len = len(percent)

        page_no += 1
def start():
    """Run the interactive crawl and save the tag frequencies to a file.

    Asks on stdin how many listing pages to crawl, crawls the newest
    questions via ``ques_links_crawler``, then writes one ``tag : count``
    line per entry of ``Tag_Rank`` to ``Tags_frequency3.txt`` in descending
    order of count.

    Raises:
        ValueError: If the text entered at the prompt is not an integer.
    """
    page_limit = int(input('Enter no. of pages to crawl : '))
    os.system('clear')
    print('starting crawling...')
    ques_links_crawler('http://stackoverflow.com/questions?page=', '&sort=newest', page_limit)

    # Single source of truth for the file name so the confirmation message
    # always names the file that was actually written.
    out_path = 'Tags_frequency3.txt'
    ranked = sorted(Tag_Rank.items(), key=operator.itemgetter(1), reverse=True)

    # The context manager guarantees the file is flushed and closed even if
    # a write fails part-way through.
    with open(out_path, 'w', encoding='utf-8') as fw:
        for key, value in ranked:
            if not isinstance(key, str):
                # Keys that are not text (e.g. None) cannot be written as a
                # tag name; skip them.
                continue
            fw.write(key + " : " + str(value) + "\n")

    print('\nResult saved to file ' + out_path)
# Script entry point: the interactive crawl is launched by this top-level
# call whenever the module's code is executed.
start()