diff --git a/src/crawl/HesitantCrawler.py b/src/crawl/HesitantCrawler.py
index 77ba55b..62b6834 100644
--- a/src/crawl/HesitantCrawler.py
+++ b/src/crawl/HesitantCrawler.py
@@ -2,6 +2,7 @@
 import time
 import logging
 import re
+import validators
 import numpy as np
 
 from urllib.parse import urlparse, urljoin
@@ -11,7 +12,7 @@
 from fetch import HTMLFetcher
 from util import setup
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class HesitantCrawler(BaseCrawler):
@@ -94,7 +95,7 @@ def skip_this_url(self, url: str) -> bool:
         if url in self._visited:
             logging.debug(f"Skip {url}, because we have visited it before")
             return True  # skip
-        return False
+        return False
 
     def find_urls(self, url: str, html: str) -> str:
         """
@@ -145,6 +146,10 @@ def find_target(self, parsed: str) -> str:
     def process_url(self, url: str, parent_url: str, from_sitemap: bool = False):
         """check url for target and then add to results and queue"""
 
+        if not validators.url(url):
+            logging.debug(f"Invalid url: {url}")
+            return
+
         if url in self._istargeted:
             return
 
@@ -246,14 +251,17 @@ def crawl(self):
                 continue
 
             # Fetch from visting URL, will check robots if it is allowed (as part of Fetcher class)
-            visiting_html = self._fetcher.fetch(url=visiting_url)
-            self._visited[visiting_url] = visiting_html  # even if nothing found, keep track of what we have tried
-            if len(visiting_html) == 0:  # Nothing returned
+            try:
+                visiting_html = self._fetcher.fetch(url=visiting_url)
+                self._visited[visiting_url] = visiting_html  # even if nothing found, keep track of what we have tried
+                if len(visiting_html) == 0:  # Nothing returned
+                    continue
+
+                for found_url in self.find_urls(url=visiting_url, html=visiting_html):
+                    self.process_url(url=found_url, parent_url=visiting_url)
+            except Exception:
                 continue
 
-            for found_url in self.find_urls(url=visiting_url, html=visiting_html):
-                self.process_url(url=found_url, parent_url=visiting_url)
-
             # At the end, measure how long we've been busy so far
             duration = time.time() - start_time
 
diff --git a/src/fetch/HTML.py b/src/fetch/HTML.py
index 663a74a..36bcf26 100644
--- a/src/fetch/HTML.py
+++ b/src/fetch/HTML.py
@@ -9,7 +9,7 @@
 from util import setup
 from .base import IFetcher
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class HTMLFetcher(IFetcher):
diff --git a/src/fetch/Robots.py b/src/fetch/Robots.py
index e429ad8..24d18c5 100644
--- a/src/fetch/Robots.py
+++ b/src/fetch/Robots.py
@@ -6,7 +6,7 @@
 from util import setup
 from .base import IFetcher
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class RobotsFetcher(IFetcher):
diff --git a/src/main.py b/src/main.py
index 72fd8eb..73c20b2 100644
--- a/src/main.py
+++ b/src/main.py
@@ -8,7 +8,7 @@
 from scrape import build_webfocusedscraper
 
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 def main():
diff --git a/src/parse/HTML.py b/src/parse/HTML.py
index 0127caa..a352f54 100644
--- a/src/parse/HTML.py
+++ b/src/parse/HTML.py
@@ -48,7 +48,7 @@ def parse(self, html: str) -> str:
             return text
         except Exception as e:
             # Handle exceptions
-            logging.debug(f"Parsing HTML failed for {url}. Error: {e}")
+            logging.debug(f"Parsing HTML failed. Error: {e}")
 
 
 if __name__ == "__main__":
diff --git a/src/scrape/__init__.py b/src/scrape/__init__.py
index 9b7d442..ded790d 100644
--- a/src/scrape/__init__.py
+++ b/src/scrape/__init__.py
@@ -2,7 +2,7 @@
 from scrape.base import IScraper, Scraper
 from util import setup
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 def build_webfocusedscraper(user_agent: str) -> IScraper:
diff --git a/src/scrape/base.py b/src/scrape/base.py
index 026bc9f..fcfab69 100644
--- a/src/scrape/base.py
+++ b/src/scrape/base.py
@@ -12,7 +12,7 @@
 from crawl import ICrawler
 from parse import IHTMLParser
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class IScraper(ABC):
@@ -108,7 +108,7 @@ def scrape(self):
                 logging.debug("Delay has passed")
 
             content = self._htmlparser.parse(html=html)
-            if len(content) > 0:
+            if content is not None and len(content) > 0:
                 if content in seen_content:  # No dupliactes
                     logging.debug(f"Content from {crawlresult.url} is a duplicate, not added to output")
                     continue