24 changes: 16 additions & 8 deletions src/crawl/HesitantCrawler.py
@@ -2,6 +2,7 @@
 import time
 import logging
 import re
+import validators
 
 import numpy as np
 from urllib.parse import urlparse, urljoin
@@ -11,7 +12,7 @@
 from fetch import HTMLFetcher
 from util import setup
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class HesitantCrawler(BaseCrawler):
@@ -94,7 +95,7 @@ def skip_this_url(self, url: str) -> bool:
         if url in self._visited:
             logging.debug(f"Skip {url}, because we have visited it before")
             return True  # skip
-            return False
+        return False
 
     def find_urls(self, url: str, html: str) -> str:
         """
@@ -145,6 +146,10 @@ def find_target(self, parsed: str) -> str:
     def process_url(self, url: str, parent_url: str, from_sitemap: bool = False):
         """check url for target and then add to results and queue"""
 
+        if not validators.url(url):
+            logging.debug(f"Invalid url: {url}")
+            return
+
         if url in self._istargeted:
             return
@@ -246,14 +251,17 @@ def crawl(self):
                 continue
 
             # Fetch from visiting URL; the Fetcher class checks whether robots.txt allows it
-            visiting_html = self._fetcher.fetch(url=visiting_url)
-            self._visited[visiting_url] = visiting_html  # even if nothing found, keep track of what we have tried
-            if len(visiting_html) == 0:  # Nothing returned
+            try:
+                visiting_html = self._fetcher.fetch(url=visiting_url)
+                self._visited[visiting_url] = visiting_html  # even if nothing found, keep track of what we have tried
+                if len(visiting_html) == 0:  # Nothing returned
+                    continue
+
+                for found_url in self.find_urls(url=visiting_url, html=visiting_html):
+                    self.process_url(url=found_url, parent_url=visiting_url)
+            except Exception:
                 continue
-
-            for found_url in self.find_urls(url=visiting_url, html=visiting_html):
-                self.process_url(url=found_url, parent_url=visiting_url)
 
             # At the end, measure how long we've been busy so far
             duration = time.time() - start_time
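The new guard in process_url relies on the third-party validators package: validators.url() returns a truthy True for a well-formed absolute URL and a falsy failure object (not an exception) for anything else, which is why it can sit directly in an if. A minimal sketch of that behavior, with made-up example URLs:

    import validators

    # Truthy True for valid URLs, a falsy failure object otherwise
    # (named ValidationFailure or ValidationError depending on the version).
    for candidate in ("https://example.com/page", "not-a-url", "http://"):
        if not validators.url(candidate):
            print(f"Invalid url: {candidate}")  # same branch as the new guard
        else:
            print(f"OK: {candidate}")

The blanket try/except around the fetch is a deliberate trade: a single bad URL or network error no longer kills the crawl loop, at the cost of silently skipping the failure.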
2 changes: 1 addition & 1 deletion src/fetch/HTML.py
@@ -9,7 +9,7 @@
 from util import setup
 from .base import IFetcher
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class HTMLFetcher(IFetcher):
2 changes: 1 addition & 1 deletion src/fetch/Robots.py
@@ -6,7 +6,7 @@
 from util import setup
 from .base import IFetcher
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class RobotsFetcher(IFetcher):
2 changes: 1 addition & 1 deletion src/main.py
@@ -8,7 +8,7 @@
 from scrape import build_webfocusedscraper
 
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 def main():
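Every setup("../config/config.yaml") call site becomes setup("config/config.yaml"). Assuming setup() opens the path as given (its body is not part of this diff), a bare relative path resolves against the process's current working directory, not against the module that calls it; the old value therefore only worked when the program was launched from inside src/, while the new one expects a launch from the repository root. A hedged illustration of that resolution:

    from pathlib import Path

    # Relative paths resolve against the current working directory,
    # not against the file containing the call.
    print(Path("config/config.yaml").resolve())
    # -> <cwd>/config/config.yaml, wherever the process was started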
2 changes: 1 addition & 1 deletion src/parse/HTML.py
@@ -48,7 +48,7 @@ def parse(self, html: str) -> str:
             return text
         except Exception as e:
             # Handle exceptions
-            logging.debug(f"Parsing HTML failed for {url}. Error: {e}")
+            logging.debug(f"Parsing HTML failed. Error: {e}")
 
 
 if __name__ == "__main__":
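Dropping {url} fixes a latent NameError: parse(self, html) has no url in scope, so evaluating the old f-string would itself raise inside the except block instead of logging the original parsing error. An illustrative reduction (not the repository's code):

    def parse(html: str) -> str:
        try:
            raise ValueError("boom")  # stands in for a real parsing failure
        except Exception as e:
            # `url` is undefined here, so this line raises NameError
            # instead of logging the ValueError we caught.
            print(f"Parsing HTML failed for {url}. Error: {e}")

    parse("<html></html>")  # NameError: name 'url' is not defined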
2 changes: 1 addition & 1 deletion src/scrape/__init__.py
@@ -2,7 +2,7 @@
 from scrape.base import IScraper, Scraper
 from util import setup
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 def build_webfocusedscraper(user_agent: str) -> IScraper:
4 changes: 2 additions & 2 deletions src/scrape/base.py
@@ -12,7 +12,7 @@
 from crawl import ICrawler
 from parse import IHTMLParser
 
-CONFIG = setup("../config/config.yaml")
+CONFIG = setup("config/config.yaml")
 
 
 class IScraper(ABC):
@@ -108,7 +108,7 @@ def scrape(self):
             logging.debug("Delay has passed")
 
             content = self._htmlparser.parse(html=html)
-            if len(content) > 0:
+            if content is not None and len(content) > 0:
                 if content in seen_content:  # No duplicates
                     logging.debug(f"Content from {crawlresult.url} is a duplicate, not added to output")
                     continue
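The extra is not None check lines up with the parser change above: when parse() hits an exception it logs and falls through, implicitly returning None, and len(None) raises TypeError. A minimal sketch of the failure mode the new condition avoids:

    content = None  # what parse() yields after an exception

    # Old check: len(None) raises TypeError and would crash scrape().
    # if len(content) > 0: ...

    # New check: `is not None` short-circuits before len() is called.
    if content is not None and len(content) > 0:
        print("usable content")
    else:
        print("skipping empty or failed parse")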