-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.js
More file actions
70 lines (66 loc) · 1.81 KB
/
utils.js
File metadata and controls
70 lines (66 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
const axios = require('axios');
const https = require('https');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
/**
 * Fetch a page and list the link targets found in its anchor elements.
 *
 * The page is retrieved with getResponse and its body is handed, together
 * with the page address, to parseURLs.
 * @param {string} url - address of the page to download
 * @returns {Promise<string[]>} the urls array produced by parseURLs
 */
module.exports.extractUrls = async (url) => {
  const { data: html } = await getResponse(url);
  const urls = await parseURLs(url, html);
  return urls;
};
/**
 * Report the status code obtained for a page (200, 301, 404, 502, etc).
 *
 * The code is read from the `status` field of whatever getResponse returns
 * for the given address.
 * @param {string} url - address of the page to check
 * @returns {Promise<{url: string, codeStatus: number}>} the url paired with its status code
 */
module.exports.getPageStatus = async (url) => {
  const { status } = await getResponse(url);
  return { url, codeStatus: status };
};
/**
 * Issue a GET request for `url` through axios and return a response-like object.
 *
 * Outcomes:
 *  - the request succeeds: the axios response is returned;
 *  - axios throws an error carrying a `response`: that response is returned,
 *    so callers can still read its status;
 *  - any other failure: `url -> error.code` is logged and a bare
 *    `{ status: 301 }` (no `data`) is returned.
 *
 * The request uses an https agent created with `rejectUnauthorized: false`,
 * so TLS certificate errors do not fail the request.
 * NOTE(review): reporting failures without a response as 301 looks like a
 * placeholder; confirm with callers before changing it.
 * @param {string} url - address to request
 * @returns {Promise<Object>} axios response, error response, or `{ status: 301 }`
 */
async function getResponse(url) {
  try {
    return await axios.get(url, {
      httpsAgent: new https.Agent({ rejectUnauthorized: false }),
    });
  } catch (failure) {
    // A failure that carries a response is passed through unchanged.
    if (failure.response) {
      return failure.response;
    }
    console.log(`${url} -> ${failure.code}`);
    return { status: 301 };
  }
}
/**
 * Parse HTML and return the link targets of its anchor (`a`) elements.
 *
 * Two kinds of href are kept, everything else is skipped:
 *  - absolute http(s) URLs, returned as written with surrounding whitespace trimmed;
 *  - hrefs starting with "/", resolved against originUrl. The origin's port is
 *    preserved and protocol-relative hrefs ("//host/path") resolve to that host.
 * Document-relative paths, "#fragment", "mailto:" and similar hrefs are ignored.
 *
 * @param {string} originUrl - absolute URL of the page the HTML was fetched from
 * @param {string} html - markup to scan; passed as-is to JSDOM
 * @returns {Promise<string[]>} urls in document order (duplicates are not removed)
 * @throws {TypeError} when a "/"-prefixed href must be resolved and originUrl
 *   is not a valid absolute URL
 */
async function parseURLs(originUrl, html) {
  const urls = [];
  const dom = new JSDOM(html);
  // Parsed lazily so originUrl is only validated when a "/" href needs it.
  let base;

  // `const` keeps the loop variable local; assigning to an undeclared name
  // leaks a global and throws in strict mode.
  for (const anchor of dom.window.document.querySelectorAll('a')) {
    const href = anchor.getAttribute('href');
    if (href == null) {
      continue;
    }
    const link = href.trim();

    // Require a real scheme prefix; a substring match would also accept
    // hrefs such as "/docs/http-basics".
    if (/^https?:\/\//i.test(link)) {
      urls.push(link);
    } else if (link.startsWith('/')) {
      if (base === undefined) {
        base = new URL(originUrl);
      }
      try {
        urls.push(new URL(link, base).href);
      } catch (error) {
        // An href like "//" has no usable host: report it and keep scanning
        // instead of failing the whole page.
        console.log(`${link} -> ${error.code}`);
      }
    }
  }
  return urls;
}