-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpyScraper.py
More file actions
54 lines (51 loc) · 2.13 KB
/
pyScraper.py
File metadata and controls
54 lines (51 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#-------------------------------------------------------------------------------
# Name: pyScraper
# Purpose: Web scraping, supporting JavaScript, proxies and cookies.
#
# Author: Akisato Kimura <akisato@ieee.org>
#
# Created: April 24, 2014
# Updated: November 29, 2017
# Copyright: (c) Akisato Kimura 2014-
# Licence: All rights reserved
#-------------------------------------------------------------------------------
from __future__ import print_function
import json
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import codecs
def scraping(url, proxy_url, cookie_file, log_file, output_file,
             phantomjs_path='/usr/local/bin/phantomjs'):
    """Render a page with PhantomJS and save its title/description as JSON.

    The page is loaded through Selenium's PhantomJS driver, the resulting
    HTML is parsed with BeautifulSoup (lxml parser), and the text of
    ``<head><title>`` plus the ``content`` attribute of
    ``<meta name="description">`` are written to ``output_file``.

    Args:
        url: Address of the page to load.
        proxy_url: Proxy passed to PhantomJS via ``--proxy``; ``None``
            disables the option.
        cookie_file: Path passed to PhantomJS via ``--cookies-file``;
            ``None`` disables the option.
        log_file: Path handed to the driver as ``service_log_path``.
        output_file: Path of the UTF-8 JSON file to write. It holds an object
            with the keys ``"title"`` and ``"description"``; a value is
            ``null`` when the corresponding element is absent from the page.
        phantomjs_path: Location of the PhantomJS executable. Defaults to
            the path that used to be hard-coded.

    Side effects:
        Prints two booleans to stdout telling whether the title and the
        description were extracted as text strings.
    """
    # PhantomJS command-line options
    service_args = []
    if proxy_url is not None:
        service_args.append('--proxy=' + proxy_url)
    if cookie_file is not None:
        # PhantomJS names this option "--cookies-file"; the previous
        # spelling "--cookie_file" is not a recognised option.
        service_args.append('--cookies-file={}'.format(cookie_file))

    driver = webdriver.PhantomJS(service_args=service_args,
                                 service_log_path=log_file,
                                 executable_path=phantomjs_path)
    try:
        # get the rendered HTML
        driver.get(url)
        # page_source is already a Unicode string. Hand it to the parser
        # as-is: encoding it to bytes would make BeautifulSoup re-detect the
        # encoding and possibly trust an outdated <meta charset> declaration.
        html = driver.page_source
    finally:
        # Always shut the browser process down, even if loading failed.
        driver.quit()

    # parse the response
    soup = BeautifulSoup(html, 'lxml')

    ##### Modify the following part according to your objectives. #####
    ##### Here, we extract
    #####   1. The text of <title> inside <head>
    #####   2. The content of <meta name="description"> inside <head>
    title = None
    description_content = None
    header = soup.find("head")
    if header is not None:
        ## title
        title_tag = header.find("title")
        if title_tag is not None:
            title = title_tag.text
        ## description
        description = header.find("meta", attrs={"name": "description"})
        if description is not None:
            # .get() yields None when the tag has no "content" attribute
            description_content = description.get('content')

    # output
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # diagnostic below works on both (the bare name ``unicode`` does not
    # exist on Python 3).
    text_type = type(u'')
    print(isinstance(title, text_type))
    print(isinstance(description_content, text_type))
    output = {"title": title, "description": description_content}

    # write the output as a json file
    with codecs.open(output_file, 'w', 'utf-8') as fout:
        json.dump(output, fout, indent=4, sort_keys=True, ensure_ascii=False)