From c45d50df4eeb9d82af578fe9cb5ad2bc5529af6e Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 19 Apr 2014 16:08:56 -0300 Subject: [PATCH] Saca los requisitos de los cursos --- crawler.py | 88 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 29 deletions(-) diff --git a/crawler.py b/crawler.py index a83d0e3..103145b 100644 --- a/crawler.py +++ b/crawler.py @@ -6,13 +6,16 @@ #aa='http://dsrd.uc.cl/dara/libcursos/periodo21/ua6_0.html' N_cursos = 100 -url_root = 'http://dsrd.uc.cl/dara/libcursos/periodo22/' +url_root = 'http://dsrd.uc.cl/dara/libcursos/periodo21/' file_root = 'ua' titles = ['N','sigla','seccion','creditos','nombre','min','opt','ofg','profesores','horario','actividad','salas','campus','titulos'] +titlesFueraPeriodo = ['N','sigla','seccion','creditos','nombre','vac', 'profesores', 'horario', 'actividad', 'salas', 'campus', 'titulos'] #titles = ['N','sigla','seccion','creditos','nombre','vac','profesores','horario','actividad','salas','campus','titulos'] multiples = ['profesores','horario','actividad','salas'] otherUA = []#['bachhu','bachcs'] courses = [] +requisitos = {} +esTemporadaInscripcion = False def get_course(sigla): for c in courses: @@ -22,7 +25,6 @@ def get_course(sigla): def start(): for ua in otherUA + range(1,N_cursos + 1): - page = 0 aux = 0 if(ua == 9): @@ -31,7 +33,6 @@ def start(): else: offset = 0 lastN = 30 - if(ua in otherUA): offset = -1 @@ -40,10 +41,10 @@ def start(): url = '%s%s%s'%(url_root,ua,'.html') else: url = '%s%s%s%s%s%s'%(url_root,file_root,ua,'_',page,'.html') - + f = urllib.urlopen(url) response_code = f.getcode() - + print url if(response_code != 200 or lastN < 25): break @@ -61,13 +62,11 @@ def start(): def parse(html,offset): soup = bs4.BeautifulSoup(html) - f = soup.find_all('tr') - if(len(f) < 11): + courses = soup.find_all('tr', recursive=False)[:-3] + if(len(courses) < 11): return 0 - table = f[10+offset] - courses = table.find_all('tr')[1:] i = 0 - for course in courses[1:]: + for course in courses: parseCourse(course) i = i + 1 return i @@ -77,21 +76,26 @@ def parseCourse(soup): i = 0 aux = {} raw = {} + title_array = titles + if(esTemporadaInscripcion == False): + title_array = titlesFueraPeriodo for td in soup.find_all('td'): data = td.text - if(titles[i] in multiples): + if(title_array[i] in multiples): data = td.find_all(text=True) - aux[titles[i]] = data + aux[title_array[i]] = data i = i + 1 continue - raw[titles[i]] = data + raw[title_array[i]] = data i = i + 1 #print(raw) seccion = {} - #seccion['vac'] = raw['vac'] - seccion['min'] = raw['min'] - seccion['opt'] = raw['opt'] - seccion['ofg'] = raw['ofg'] + if(esTemporadaInscripcion): + seccion['min'] = raw['min'] + seccion['opt'] = raw['opt'] + seccion['ofg'] = raw['ofg'] + else: + seccion['vac'] = raw['vac'] seccion['seccion'] = raw['seccion'] seccion['campus'] = raw['campus'] @@ -152,7 +156,7 @@ def startDesc(): curso = {} curso['sigla'] = sigla courses.append(curso) - curso['descripcion'] = str(p) + curso['descripcion'] = p.text #cursos.append(curso) #print('encontrados %d cursos')%(len(cursos)) i = i + 1 @@ -162,15 +166,40 @@ def startDesc(): ####################################################### def startReq(): - f = open('req/output2.json').read() - reqs = json.loads(f) + f = open('output2.json').read() + courses = json.loads(f) + print "Procesando requisitos, esto puede tomar varios minutos..." + root_url = "https://www2.puc.cl/ControlPrerrequisitos/jsp/RequisitosAsign.jsp?SIGLA=" + i = 0 + for c in courses: + if i % 100 == 0: + print str(i) + "/" + str(len(courses)) + " cursos procesados." + url = '%s%s'%(root_url, c['sigla']) + html = urllib.urlopen(url).read() + soup = bs4.BeautifulSoup(html) + tds = soup.find_all("td", {"class" : "td"}) + course_reqs = [] + for td in tds: + if (td.contents[0]['class'][0] != u"html_tipo_requisito"): + continue + curr_req = {} + tipo_requisito = td.find_all("font", {"class" : "html_tipo_requisito"})[0].text + if tipo_requisito == "Requisitos que todos los alumnos deben cumplir.": + tipo_requisito = "Todos" + curr_req["alumnos"] = tipo_requisito + texts = td.find_all("font", {"class" : "html_texto_azul"}) + curr_req["requisitos"] = texts[0].text + curr_req["requisitos_especiales"] = texts[-1].text + course_reqs.append(curr_req) + + requisitos[c['sigla']] = course_reqs + i = i + 1 + + +####################################################### +#####################Flujo Programa#################### +####################################################### - for r in reqs: - sigla = r['sigla'] - curso = get_course(sigla) - if curso == None: - continue - curso['requisitos'] = r['req'] start() print 'Total cursos: \t\t%s'%(len(courses)) @@ -182,11 +211,12 @@ def startReq(): print 'Total cursos: \t\t%s'%(len(courses)) f = open('output2.json','w') f.write(json.dumps(courses,sort_keys=True)) -startReq() f.close() + +startReq() print 'Total cursos: \t\t%s'%(len(courses)) -f = open('output3.json','w') -f.write(json.dumps(courses,sort_keys=True)) +f = open('requisitos.json','w') +f.write(json.dumps(requisitos,sort_keys=True)) f.close() f = open('output3.json','r')