24 changes: 16 additions & 8 deletions src/crawl/HesitantCrawler.py
@@ -2,6 +2,7 @@
 import time
 import logging
 import re
+import validators
 
 import numpy as np
 from urllib.parse import urlparse, urljoin
@@ -11,7 +12,7 @@
 from fetch import HTMLFetcher
 from util import setup
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class HesitantCrawler(BaseCrawler):
@@ -94,7 +95,7 @@ def skip_this_url(self, url: str) -> bool:
         if url in self._visited:
             logging.debug(f"Skip {url}, because we have visited it before")
             return True  # skip
-            return False
+        return False
 
     def find_urls(self, url: str, html: str) -> str:
         """
@@ -145,6 +146,10 @@ def find_target(self, parsed: str) -> str:
     def process_url(self, url: str, parent_url: str, from_sitemap: bool = False):
         """check url for target and then add to results and queue"""
 
+        if not validators.url(url):
+            logging.debug(f"Invalid url: {url}")
+            return
+
         if url in self._istargeted:
             return
@@ -246,14 +251,17 @@ def crawl(self):
                 continue
 
             # Fetch from visiting URL; the Fetcher class checks whether robots.txt allows it
-            visiting_html = self._fetcher.fetch(url=visiting_url)
-            self._visited[visiting_url] = visiting_html  # even if nothing found, keep track of what we have tried
-            if len(visiting_html) == 0:  # Nothing returned
+            try:
+                visiting_html = self._fetcher.fetch(url=visiting_url)
+                self._visited[visiting_url] = visiting_html  # even if nothing found, keep track of what we have tried
+                if len(visiting_html) == 0:  # Nothing returned
+                    continue
+
+                for found_url in self.find_urls(url=visiting_url, html=visiting_html):
+                    self.process_url(url=found_url, parent_url=visiting_url)
+            except Exception:
                 continue
-
-            for found_url in self.find_urls(url=visiting_url, html=visiting_html):
-                self.process_url(url=found_url, parent_url=visiting_url)
 
             # At the end, measure how long we've been busy so far
             duration = time.time() - start_time
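The new guard in process_url relies on the third-party validators package: validators.url() returns a truthy True for a well-formed absolute URL and a falsy failure object (not an exception) for anything else, which is why it can sit directly in an if. A minimal sketch of that behavior, with made-up example URLs:

    import validators

    # Truthy True for valid URLs, a falsy failure object otherwise
    # (named ValidationFailure or ValidationError depending on the version).
    for candidate in ("https://example.com/page", "not-a-url", "http://"):
        if not validators.url(candidate):
            print(f"Invalid url: {candidate}")  # same branch as the new guard
        else:
            print(f"OK: {candidate}")

The blanket try/except around the fetch is a deliberate trade: a single bad URL or network error no longer kills the crawl loop, at the cost of silently skipping the failure.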
2 changes: 1 addition & 1 deletion src/fetch/HTML.py
@@ -9,7 +9,7 @@
 from util import setup
 from .base import IFetcher
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class HTMLFetcher(IFetcher):
2 changes: 1 addition & 1 deletion src/fetch/Robots.py
@@ -6,7 +6,7 @@
 from util import setup
 from .base import IFetcher
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class RobotsFetcher(IFetcher):
2 changes: 1 addition & 1 deletion src/main.py
@@ -8,7 +8,7 @@
 from scrape import build_webfocusedscraper
 
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 def main():
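Every setup("../config/config.yaml") call site becomes setup("config/config.yaml"). Assuming setup() opens the path as given (its body is not part of this diff), a bare relative path resolves against the process's current working directory, not against the module that calls it; the old value therefore only worked when the program was launched from inside src/, while the new one expects a launch from the repository root. A hedged illustration of that resolution:

    from pathlib import Path

    # Relative paths resolve against the current working directory,
    # not against the file containing the call.
    print(Path("config/config.yaml").resolve())
    # -> <cwd>/config/config.yaml, wherever the process was started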
2 changes: 1 addition & 1 deletion src/parse/HTML.py
@@ -48,7 +48,7 @@ def parse(self, html: str) -> str:
             return text
         except Exception as e:
             # Handle exceptions
-            logging.debug(f"Parsing HTML failed for {url}. Error: {e}")
+            logging.debug(f"Parsing HTML failed. Error: {e}")
 
 
 if __name__ == "__main__":
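Dropping {url} fixes a latent NameError: parse(self, html) has no url in scope, so evaluating the old f-string would itself raise inside the except block instead of logging the original parsing error. An illustrative reduction (not the repository's code):

    def parse(html: str) -> str:
        try:
            raise ValueError("boom")  # stands in for a real parsing failure
        except Exception as e:
            # `url` is undefined here, so this line raises NameError
            # instead of logging the ValueError we caught.
            print(f"Parsing HTML failed for {url}. Error: {e}")

    parse("<html></html>")  # NameError: name 'url' is not defined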
2 changes: 1 addition & 1 deletion src/scrape/__init__.py
@@ -2,7 +2,7 @@
 from scrape.base import IScraper, Scraper
 from util import setup
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 def build_webfocusedscraper(user_agent: str) -> IScraper:
4 changes: 2 additions & 2 deletions src/scrape/base.py
@@ -12,7 +12,7 @@
 from crawl import ICrawler
 from parse import IHTMLParser
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class IScraper(ABC):
@@ -108,7 +108,7 @@ def scrape(self):
             logging.debug("Delay has passed")
 
             content = self._htmlparser.parse(html=html)
-            if len(content) > 0:
+            if content is not None and len(content) > 0:
                 if content in seen_content:  # No duplicates
                     logging.debug(f"Content from {crawlresult.url} is a duplicate, not added to output")
                     continue
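The extra is not None check lines up with the parser change above: when parse() hits an exception it logs and falls through, implicitly returning None, and len(None) raises TypeError. A minimal sketch of the failure mode the new condition avoids:

    content = None  # what parse() yields after an exception

    # Old check: len(None) raises TypeError and would crash scrape().
    # if len(content) > 0: ...

    # New check: `is not None` short-circuits before len() is called.
    if content is not None and len(content) > 0:
        print("usable content")
    else:
        print("skipping empty or failed parse")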