Crawler/html_cleanup.py at master · EmilySun621/Crawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import re
from bs4 import BeautifulSoup, Comment
from urllib.parse import urlparse
import re
from collections import Counter
from url_info import *
from database import DataBase


#E
def clean_html_text(html_content: bytes) -> str:
    """
    Convert raw HTML into a single-line, space-delimited string for tokenization.

    HTML comments and <script>/<style> elements are removed, the remaining
    text nodes are joined with a space so that words from adjacent elements
    (e.g. ``<p>a</p><p>b</p>``) do not fuse into one token, and every run of
    whitespace is collapsed to a single space.

    Args:
        html_content: Raw HTML markup. Empty or ``None`` input yields "".

    Returns:
        The extracted text with normalized whitespace, or "" if there is none.
    """
    # Nothing to parse; also avoids handing None to the parser.
    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove comments (`string=` replaces the deprecated `text=` keyword).
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()

    # Remove <script> and <style> elements together with their content.
    for element in soup.find_all(['script', 'style']):
        element.decompose()

    # Join text nodes with a space so neighbouring elements stay separate
    # tokens, then collapse all whitespace runs.
    webtext = soup.get_text(separator=" ")
    return re.sub(r'\s+', ' ', webtext).strip()

#C
def filter_extreme_large_small_files(url, DataBase, text, resp, lowerbound, upperbound):
    """
    Decide whether a fetched page is worth keeping, blacklisting it if not.

    A page is rejected (and recorded in ``DataBase.blacklistURL`` with a
    reason string) when any of these holds, checked in this order:
    1. The raw response is larger than 1,000,000 bytes but the extracted
       text is shorter than 500 characters.
    2. The extracted text is shorter than ``lowerbound`` characters.
    3. The extracted text is longer than ``upperbound`` characters.
    4. The URL path contains a ``/page/<n>`` segment with ``n`` above 50.

    Args:
        url: URL of the page; used as the blacklist key and for the
            pagination check.
        DataBase: Object exposing a ``blacklistURL`` mapping of url -> reason.
        text: Extracted text of the page.
        resp: Response object; ``resp.raw_response.content`` must be sized.
        lowerbound: Minimum accepted text length, in characters.
        upperbound: Maximum accepted text length, in characters.

    Returns:
        True if the page passes every check, False if it should be skipped.
    """
    content_size = len(resp.raw_response.content)
    text_length = len(text)

    # Very large file but low content -> suspicious
    if content_size > 1_000_000 and text_length < 500:
        DataBase.blacklistURL[url] = "Large File With Low Content"
        return False

    # Content too small
    if text_length < lowerbound:
        print(f"[SKIP] Content too short: {text_length} chars (min: {lowerbound})")
        DataBase.blacklistURL[url] = "Content Too Short"
        return False

    # Content too large
    if text_length > upperbound:
        print(f"[SKIP] Content too long: {text_length} chars (max: {upperbound})")
        DataBase.blacklistURL[url] = "Content Too Long"
        return False

    # Deep pagination: reject page numbers above 50. The page segment may be
    # followed by a slash or end the path (/page/51/ and /page/51 both count).
    pagination_match = re.search(r'/page/(\d+)(?:/|$)', urlparse(url).path)
    if pagination_match and int(pagination_match.group(1)) > 50:
        DataBase.blacklistURL[url] = "Page More Than 50"
        return False

    return True


#A
def is_low_information_path(url, db, depth=3):
    """
    Flag URLs whose leading path segments have already been recorded.

    The key is built from the first `depth` segments of the URL path
    (fewer if the path is shorter). The first URL producing a given key
    registers it in `db.visited_path`; any later URL with the same key is
    written to `db.blacklistURL` and reported as low-information.

    Returns True when the key was seen before, False otherwise.
    """
    segments = urlparse(url).path.strip("/").split("/")
    # Slicing already copes with paths that have fewer than `depth` segments.
    path_key = "/".join(segments[:depth])

    # First sighting: remember the key and let the URL through.
    if path_key not in db.visited_path:
        db.visited_path.add(path_key)
        return False

    # Repeat sighting: log it and blacklist this URL.
    print(f"[SKIP] Repeated low-info path structure: {path_key}")
    db.blacklistURL[url] = "Low Information Path"
    return True