Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 90 additions & 70 deletions core/crawler.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,90 @@
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

class LinkParser(HTMLParser):
    """HTML parser that collects absolute URLs from ``<a href=...>`` tags."""

    def __init__(self):
        super().__init__()
        self.baseUrl = ""   # base used to resolve relative hrefs
        self.links = []     # absolute URLs found on the current page

    def handle_starttag(self, tag, attrs):
        """Process HTML start tags to find anchor tags and extract href URLs.

        Args:
            tag (str): HTML tag name
            attrs (list): List of (attribute, value) tuples
        """
        if tag == 'a':
            for (key, value) in attrs:
                # A bare <a href> yields value=None; urljoin would raise on it.
                if key == 'href' and value is not None:
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links.append(newUrl)

    def getLinks(self, url):
        """Fetch webpage and extract all links found.

        Args:
            url (str): URL to fetch and parse

        Returns:
            tuple: (html_content, links_found)
                - html_content (str): Page HTML content ("" if not HTML)
                - links_found (list): List of absolute URLs found
        """
        self.links = []
        self.baseUrl = url
        # Context manager closes the connection; the original leaked the socket.
        with urlopen(url) as response:
            content_type = response.getheader('Content-Type') or ''
            # Real servers send "text/html; charset=utf-8", so the original
            # exact == 'text/html' comparison rejected most HTML pages.
            if not content_type.startswith('text/html'):
                return "", []
            htmlBytes = response.read()
            # errors="replace": do not crash on pages that lie about encoding.
            htmlString = htmlBytes.decode("utf-8", errors="replace")
        self.feed(htmlString)
        return htmlString, self.links

def spider(url, maxPages):
    """Web crawler that visits pages and collects links.

    Args:
        url (str): Starting URL to begin crawl
        maxPages (int): Maximum number of pages to visit

    Returns:
        list: All unique links discovered during crawl
    """
    allLinks = []          # accumulated across ALL pages (original kept only the last page's)
    seen = set()           # links already recorded, for de-duplication
    visited = set()        # pages already fetched, so cycles don't re-fetch
    pagesToVisit = [url]
    numberVisited = 0
    while numberVisited < maxPages and pagesToVisit:
        page = pagesToVisit.pop(0)  # FIFO -> breadth-first crawl
        if page in visited:
            continue
        visited.add(page)
        numberVisited += 1
        try:
            parser = LinkParser()
            _, links = parser.getLinks(page)
        except Exception:
            # Best-effort crawl: skip pages that fail to fetch or parse,
            # but never use a bare except (it would swallow KeyboardInterrupt).
            continue
        for link in links:
            if link not in seen:
                seen.add(link)
                allLinks.append(link)
            if link not in visited:
                pagesToVisit.append(link)
    return allLinks

print(spider("http://vulnweb.com", 10))
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
from urllib import parse
from typing import Optional


class LinkParser(HTMLParser):
    """HTML parser that collects absolute URLs from ``<a href=...>`` tags."""

    def __init__(self, baseUrl: str) -> None:
        super().__init__()
        self.baseUrl = baseUrl
        self.links: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> None:
        """Process HTML start tags to find anchor tags and extract href URLs.

        Args:
            tag (str): HTML tag name
            attrs (list): List of (attribute, value) tuples

        """
        if tag == 'a':
            for (key, value) in attrs:
                # A bare <a href> yields value=None; skip it.
                if key == 'href' and value is not None:
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links.append(newUrl)

    def getLinks(self, url: str) -> tuple[str, list[str]]:
        """Fetch webpage and extract all links found.

        Args:
            url (str): URL to fetch and parse

        Returns:
            tuple: (html_content, links_found)
                - html_content (str): Page HTML content ("" on error / non-HTML)
                - links_found (list): List of absolute URLs found
        """
        self.links = []
        self.baseUrl = url
        try:
            response = urlopen(url)
        except (URLError, HTTPError):
            # Unreachable / erroring pages are reported as "no content",
            # not raised — spider() treats them as best-effort misses.
            return "", []

        # Context manager closes the connection on BOTH exits; the original
        # leaked the socket (and bound the exception to an unused `exc`).
        with response:
            content_type = response.getheader("Content-Type", "") or ""
            if not content_type.startswith("text/html"):
                return "", []
            htmlBytes = response.read()
            htmlString = htmlBytes.decode("utf-8", errors="replace")
        self.feed(htmlString)
        return htmlString, self.links


def spider(url: str, maxPages: int) -> list[str]:
    """Web crawler that visits pages up to maxPages.

    Args:
        url (str): Starting URL
        maxPages (int): Maximum number of pages to crawl

    Returns:
        list[str]: All unique URLs discovered across all crawled pages
    """
    pagesToVisit: list[str] = [url]
    visited: set[str] = set()
    seen_links: set[str] = set()   # de-dup guard so the return really is unique
    all_links: list[str] = []
    numberVisited = 0

    while numberVisited < maxPages and pagesToVisit:
        url = pagesToVisit.pop(0)  # FIFO -> breadth-first order

        if url in visited:
            continue
        visited.add(url)
        # Count only pages actually fetched; the original incremented before
        # the visited check, burning the maxPages budget on skipped duplicates.
        numberVisited += 1

        try:
            _, links = LinkParser(url).getLinks(url)
        except (URLError, HTTPError, ValueError):
            # Best-effort crawl: skip pages that fail to fetch or parse.
            continue

        for link in links:
            if link not in visited:
                pagesToVisit.append(link)
            if link not in seen_links:
                # The original could append the same URL many times (once per
                # page linking to it), violating the documented uniqueness.
                seen_links.add(link)
                all_links.append(link)

    return all_links