# data.py -- fetch Craigslist listings, parse each post, and store the
# posts that have not been saved yet.
from models import *
from craigsuck import craigslist
import re
import time
from datetime import datetime
# Craigslist SF Bay listing URLs, one per category.
# NOTE(review): the original list ended with the 'wri' URL twice, which made
# that feed get fetched twice per run; the duplicate is dropped here.  A
# different category may have been intended for the last slot -- confirm.
listings = [
    'http://sfbay.craigslist.org/acc/',
    'http://sfbay.craigslist.org/ofc/',
    'http://sfbay.craigslist.org/egr/',
    'http://sfbay.craigslist.org/med/',
    'http://sfbay.craigslist.org/sci/',
    'http://sfbay.craigslist.org/bus/',
    'http://sfbay.craigslist.org/csr/',
    'http://sfbay.craigslist.org/edu/',
    'http://sfbay.craigslist.org/fbh/',
    'http://sfbay.craigslist.org/lab/',
    'http://sfbay.craigslist.org/gov/',
    'http://sfbay.craigslist.org/hum/',
    'http://sfbay.craigslist.org/eng/',
    'http://sfbay.craigslist.org/lgl/',
    'http://sfbay.craigslist.org/mnu/',
    'http://sfbay.craigslist.org/mar/',
    'http://sfbay.craigslist.org/hea/',
    'http://sfbay.craigslist.org/npo/',
    'http://sfbay.craigslist.org/rej/',
    'http://sfbay.craigslist.org/ret/',
    'http://sfbay.craigslist.org/sls/',
    'http://sfbay.craigslist.org/spa/',
    'http://sfbay.craigslist.org/sec/',
    'http://sfbay.craigslist.org/trd/',
    'http://sfbay.craigslist.org/sof/',
    'http://sfbay.craigslist.org/sad/',
    'http://sfbay.craigslist.org/tch/',
    'http://sfbay.craigslist.org/trp/',
    'http://sfbay.craigslist.org/tfr/',
    'http://sfbay.craigslist.org/web/',
    'http://sfbay.craigslist.org/wri/',
]
def parse_craigslist_post(post):
    """Add fields derived from a post's title, link and date.

    Reads ``post['title']``, ``post['link']`` and ``post['date']`` and, for
    each one that is non-empty, stores derived values in the same dict:

    * ``neighborhood`` -- the text after the last ``(`` in the title, with
      trailing ``)`` characters removed.
    * ``source``, ``city``, ``post_type``, ``post_id`` -- segments 0-3 of
      the link split on ``/`` after the URL scheme and a trailing ``.html``
      are removed.  An empty segment leaves its key unset.
    * ``post_date`` -- a naive ``datetime`` parsed from the date string with
      its last six characters dropped.

    Args:
        post: dict with ``title``, ``link`` and ``date`` keys.

    Returns:
        The same dict, modified in place.

    Raises:
        KeyError: if ``title``, ``link`` or ``date`` is missing.
        IndexError: if a non-empty link has fewer than four segments.
        ValueError: if the trimmed date does not match
            ``%Y-%m-%dT%H:%M:%S``.
    """
    title = post['title']
    if title:
        # NOTE(review): a title without "(" yields the whole title as the
        # neighborhood (unchanged from the original) -- confirm intended.
        post['neighborhood'] = title.split('(')[-1].rstrip(')')

    link = post['link']
    if link:
        # Remove the scheme and extension as whole strings.  str.lstrip and
        # str.rstrip treat their argument as a set of characters, so
        # lstrip('http://') also ate leading h/t/p characters of the host
        # (e.g. "portland" -> "ortland") and broke on "https://".
        for scheme in ('http://', 'https://'):
            if link.startswith(scheme):
                link = link[len(scheme):]
                break
        if link.endswith('.html'):
            link = link[:-len('.html')]
        link_parts = link.split('/')
        # Index 3 is read first so that a too-short link raises IndexError
        # before any link-derived key has been written.
        for index, key in ((3, 'post_id'), (2, 'post_type'),
                           (1, 'city'), (0, 'source')):
            if link_parts[index]:
                post[key] = link_parts[index]

    stamp = post['date']
    if stamp:
        # The last six characters are discarded before parsing
        # (presumably a UTC offset such as "-07:00" -- verify against feed).
        # Parsing straight to datetime avoids the round trip through
        # time.mktime, which depends on the machine's local timezone.
        post['post_date'] = datetime.strptime(stamp[:-6], '%Y-%m-%dT%H:%M:%S')

    return post
def save_post(post):
    """Store a parsed post unless one with the same ``post_id`` exists.

    Args:
        post: dict carrying ``post_id``, ``post_date``, ``title``, ``link``,
            ``source``, ``city``, ``neighborhood`` and ``post_type``.

    Returns:
        True when a new ``Post`` was added and committed, False when a row
        with that ``post_id`` was already present.
    """
    already_stored = Post.query.filter_by(post_id=post['post_id']).first()
    if already_stored is not None:
        return False

    record = Post(post['post_id'], post['post_date'], post['title'], post['link'])
    # Source, city and neighborhood are all looked up / created as regions.
    record.source = save_region(post['source'])
    record.region = save_region(post['city'])
    record.section = save_region(post['neighborhood'])
    record.post_type = save_post_type(post['post_type'])

    db.session.add(record)
    db.session.commit()
    return True
def save_post_type(short):
    """Return the ``PostType`` whose ``short`` matches, creating it if absent.

    A newly created ``PostType`` is added to the session and committed
    before being returned.
    """
    found = PostType.query.filter_by(short=short).first()
    if found is not None:
        return found

    created = PostType(short)
    db.session.add(created)
    db.session.commit()
    return created
def save_region(short):
    """Return the ``Region`` whose ``short`` matches, creating it if absent.

    A newly created ``Region`` is added to the session and committed before
    being returned.
    """
    found = Region.query.filter_by(short=short).first()
    if found is not None:
        return found

    created = Region(short)
    db.session.add(created)
    db.session.commit()
    return created
def query_craigslist(listings, verbose=False):
    """Fetch, parse and store the posts for the given listing URLs.

    Every post returned by ``craigslist.fetch_all(listings)`` is run through
    ``parse_craigslist_post`` and handed to ``save_post``.

    Args:
        listings: listing URLs passed straight to ``craigslist.fetch_all``.
        verbose: when true, print ``title//post_type`` for each newly stored
            post and a final count of new posts.

    Returns:
        None.
    """
    count = 0
    for post in craigslist.fetch_all(listings):
        parsed = parse_craigslist_post(post)
        if not save_post(parsed):
            continue
        count += 1
        if verbose:
            # Parenthesised single-argument print: same output under the
            # Python 2 print statement, and valid as a Python 3 call.
            print(parsed['title'] + "//" + parsed['post_type'])
    if verbose:
        print(str(count) + " new posts added")