# This script crawls information and repositories from GitHub using the GitHub REST API (https://developer.github.com/v3/search/).
#
# Given a query, the script downloads the ZIP file of each repository returned by the query.
# In addition, it generates a CSV file containing the list of repositories queried.
# For each query, GitHub returns a JSON document, which this script processes to extract information about the repositories.
#
# The GitHub API limits queries to 100 elements per page and up to 1,000 elements in total.
# To get more than 1,000 elements, the main query must be split into multiple subqueries over different time windows
# (this script builds one subquery per month in the main loop; see also the commented-out SUBQUERIES constant below).
#
# As an example, the constants are set to search for 'leetcode' repositories on GitHub.
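#
# Requirements (assumed setup, not part of the original notes): Python 3 and the
# third-party packages pycurl, simplejson and wget, e.g. installed with
#   pip install pycurl simplejson wget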
#############
# Libraries #
#############
import calendar
import csv
import math
import time
from collections import OrderedDict
from datetime import datetime, timedelta
from io import BytesIO  # Python 3 replacement for Python 2's StringIO

import pycurl
import simplejson
import wget
#############
# Constants #
#############
URL = "https://api.github.com/search/repositories?q="  # Base URL of the GitHub search API
QUERY = "leetcode"  # The search query (a keyword; could also be e.g. "user:rsain")
# Example queries with a time window appended:
#   https://api.github.com/search/repositories?q=leetcode+created%3A2015-04-30..2016-07-04
#   https://api.github.com/search/repositories?q=leetcode+created%3A2013-01-01..2013-07-04
#SUBQUERIES = ["+created%3A<%3D2013-12-30", "+created%3A>%3D2014-01-01"]  # Alternative fixed subqueries if you need to collect more than 1,000 elements
PARAMETERS = "&per_page=100"  # Additional parameters for the query (100 items per page, the API maximum)
DELAY_BETWEEN_QUERIES = 5  # Seconds to wait between queries to GitHub (to avoid being rate limited)
OUTPUT_FOLDER = "./"  # Folder where ZIP files will be stored
OUTPUT_CSV_FILE = "./repositories.csv"  # Path to the CSV file generated as output
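# With the constants above, a fully assembled request for one monthly time window
# (as built in the main loop below) looks like:
#   https://api.github.com/search/repositories?q=leetcode+created%3A2015-04-01..2015-04-30+license:mit&per_page=100&page=1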
#############
# Functions #
#############
def getUrl(url):
    '''Given a URL, return the response body as a string.'''
    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, buffer)
    # The GitHub API rejects requests that do not send a User-Agent header.
    c.setopt(c.USERAGENT, "github-crawler-script")
    c.perform()
    c.close()
    return buffer.getvalue().decode("utf-8")
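
# A minimal usage sketch of getUrl (illustrative query; the fields shown are the
# ones consumed in the main loop below):
#   data = simplejson.loads(getUrl(URL + "leetcode" + "&per_page=1"))
#   data['total_count']                 # number of repositories matching the query
#   data['items'][0]['owner']['login']  # owner of the first repository returned
#   data['items'][0]['clone_url']       # e.g. "https://github.com/<owner>/<repo>.git"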
########
# MAIN #
########
#To save the number of repositories processed
countOfRepositories = 0

#Output CSV file which will contain information about repositories
csvfile = open(OUTPUT_CSV_FILE, 'w', newline='')
repositories = csv.writer(csvfile, delimiter=',')
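# Write a header row naming the columns emitted for each repository
# (the labels are chosen here for readability; they are not API field names)
repositories.writerow(["month", "user", "repository", "clone_url", "zip_url", "license"])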
# Precomputed list of "YYYY-MM" labels between two dates (not used below; the
# year/month loops generate the same months directly)
dates = ["2013-01-01", "2020-12-01"]
start, end = [datetime.strptime(_, "%Y-%m-%d") for _ in dates]
dateRange = list(OrderedDict(((start + timedelta(_)).strftime(r"%Y-%m"), None) for _ in range((end - start).days)))
# License qualifiers appended to the query; one subquery is issued per entry.
# MIT is kept separate, presumably so that each subquery stays under the 1,000-result cap.
goodLicense = {
    "+license:afl-3.0+license:artistic-2.0+license:bsl-1.0+license:bsd-2-clause+license:bsd-3-clause+license:mpl-2.0+license:unlicense+license:apache-2.0+license:cc+license:wtfpl+license:epl-1.0+license:agpl-3.0+license:gpl+license:lgpl+license:isc+license:ms-pl+license:osl-3.0+license:ncsa+license:postgresql+license:zlib",
    "+license:mit",
}
#Run queries to get information in json format and download ZIP file for each repository
for year in range(2013, 2021):  # years 2013..2020 (range end is exclusive)
    for month in range(1, 13):  # months 1..12
        for lic in goodLicense:
            curMonth = str(year) + "-" + format(month, '02d')
            lastDay = format(calendar.monthrange(year, month)[1], '02d')  # last day of the month
            SUB_QUERY = "+created%3A" + curMonth + "-01.." + curMonth + "-" + lastDay + lic

            #Obtain the number of pages for the current subquery (each page contains up to 100 items)
            url = URL + QUERY + SUB_QUERY + PARAMETERS
            print("Processing " + url)
            dataRead = simplejson.loads(getUrl(url))
            if dataRead.get('total_count') is None:
                #The API returned an error instead of results (e.g. rate limiting); report it and skip this subquery
                print(dataRead)
                time.sleep(DELAY_BETWEEN_QUERIES)
                continue
            numberOfPages = int(math.ceil(dataRead.get('total_count') / 100.0))
            numberOfPages = min(numberOfPages, 10)  # the search API serves at most 1,000 results (10 pages of 100)

            #A delay between different queries
            time.sleep(DELAY_BETWEEN_QUERIES)

            #Results are in different pages
            for currentPage in range(1, numberOfPages + 1):
                url = URL + QUERY + SUB_QUERY + PARAMETERS + "&page=" + str(currentPage)
                print("SUB Processing " + url + " ... page " + str(currentPage) + " of " + str(numberOfPages))
                dataRead = simplejson.loads(getUrl(url))

                #Iteration over all the repositories in the current json content page
                for item in dataRead.get('items', []):
                    #Obtain user and repository names
                    user = item['owner']['login']
                    repository = item['name']
                    licenseKey = (item.get('license') or {}).get('key', '')  # 'license' can be null in the API response

                    #Update repositories counter
                    countOfRepositories += 1

                    #Build the URL of the ZIP file of the current project
                    print("'%s' from user '%s' ... '%s'" % (repository, user, licenseKey))
                    url = item['clone_url']
                    fileToDownload = url[:-len(".git")] + "/archive/master.zip"  # assumes the default branch is 'master'
                    fileName = item['full_name'].replace("/", "_") + ".zip"
                    #wget.download(fileToDownload, out=OUTPUT_FOLDER + fileName)
                    repositories.writerow([curMonth, user, repository, url, fileToDownload, licenseKey])

            #A delay between different subqueries
            time.sleep(DELAY_BETWEEN_QUERIES)

print("DONE! " + str(countOfRepositories) + " repositories have been processed.")
csvfile.close()