# Patch summary: replaces the NotImplemented stub in crawler.js with an
# observable-based crawler, adds a Python prototype (crawler.py), and declares
# the new dependencies (package.json, requirements.txt).
#
# Line breaks and indentation were reconstructed so that every hunk matches the
# line counts in its "@@" header. Blank context lines, payload indentation and
# the removed '@return {Promise.}' line are inferred -- verify them against the
# base file before applying.
#
# Review notes on the payload (content left unchanged):
#  * crawler.js: the exported Promise is built with a resolve callback only, so
#    it can never reject; a crawl failure is not reported to the caller.
#  * crawler.js: findMin seeds the reduce with 'zzzzzzzz', so a value that sorts
#    after that literal is never selected as the minimum.
#  * crawler.py: 'links > 0' compares the findAll() result with an integer
#    instead of testing for emptiness, and 'links' is extended while the loop
#    is iterating over it.
diff --git a/crawler.js b/crawler.js
index b248dbc..fd5e72a 100644
--- a/crawler.js
+++ b/crawler.js
@@ -4,15 +4,51 @@
 
 'use strict'
 
-/**
- * Crawls a website using a start {url}, and returns the lexicographically smallest string.
- * @param url
- * @return {Promise.}
- */
-module.exports = url =>
-  new Promise((resolve, reject) => {
-    /**
-     * TODO: Write your high performance code here.
-     */
-    reject(new Error('NotImplemented'))
-  })
+const O = require('observable-air')
+const promiseRetry = require('promise-retry')
+const axios = require('axios')
+const {JSDOM} = require('jsdom')
+const R = require('ramda')
+
+const requestRetry = url => promiseRetry(retry => axios.get(url).catch(retry))
+
+const request$ = url => O.fromPromise(() => requestRetry(url))
+const extractDOM = R.compose(
+  R.path(['window', 'document']),
+  R.construct(JSDOM),
+  R.prop('data')
+)
+const querySelectorAll = R.curry((selector, doc) =>
+  Array.from(doc.querySelectorAll(selector))
+)
+const extractCodes = R.compose(
+  O.fromArray,
+  R.pluck('innerHTML'),
+  querySelectorAll('h1')
+)
+const extractLinks = R.compose(
+  O.fromArray,
+  R.pluck('href'),
+  querySelectorAll('a')
+)
+
+const crawl = R.curry((base, unique, url) => {
+  const response$ = O.multicast(request$(url))
+  const document$ = O.map(extractDOM, response$)
+  const code$ = O.flatMap(extractCodes, document$)
+  const link$ = O.map(R.concat(base), O.flatMap(extractLinks, document$))
+  return O.merge(code$, O.flatMap(crawl(base, unique), unique(link$)))
+})
+
+const findMin = source =>
+  O.reduce(
+    (last, current) => (current < last ? current : last),
+    'zzzzzzzz',
+    source
+  )
+
+const main = url => findMin(crawl(url, O.uniqueWith(new Set()), url))
+
+module.exports = url => {
+  return new Promise(resolve => O.forEach(result => resolve(result), main(url)))
+}
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..f0e3e25
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,28 @@
+import time
+import requests
+from BeautifulSoup import BeautifulSoup
+
+word_list = []
+link_list = set()
+url = 'http://localhost:8080'
+
+# def get_links(html):
+start = time.time()
+def recursiveLinks(route):
+    nurl = url+route
+    page = requests.get(nurl)
+    html = BeautifulSoup(page.content)
+    words = [ h1.text for h1 in html.findAll('h1') if h1.text]
+    word_list.extend(words)
+    links = html.findAll('a')
+    if links > 0:
+        for link in links:
+            if not link['href'] in link_list:
+                link_list.add(link['href'])
+                links.extend(recursiveLinks(link['href']))
+    return links
+
+# print(start)
+recursiveLinks('')
+print(min(word_list))
+print(time.time() - start)
diff --git a/package.json b/package.json
index 0631069..2c4b1cf 100644
--- a/package.json
+++ b/package.json
@@ -10,10 +10,15 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
+    "axios": "^0.16.2",
     "express": "^4.15.4",
     "express-rate-limit": "^2.9.0",
+    "jsdom": "^11.2.0",
     "mocha": "^3.5.3",
     "nodemon": "^1.12.0",
-    "pug": "^2.0.0-rc.4"
+    "observable-air": "^7.4.0",
+    "promise-retry": "^1.1.1",
+    "pug": "^2.0.0-rc.4",
+    "ramda": "^0.24.1"
   }
 }
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2aa9911
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+BeautifulSoup==3.2.1
+requests==2.18.4