crawler.py
# Note: This code was (partially) generated with the help of AI tools (Copilot Agent).
# Please review carefully and test before using it in production!
# OK Lab Köln
# pip3 install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup
import json
import os
# Request the target URL and crawl its navigation links
def crawler():
    """Initialize an empty GeoJSON results file, then crawl the index page."""
    file_path = 'json_ld_results.json'
    empty_geojson = []
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(empty_geojson, json_file, ensure_ascii=False, indent=4)
    print(f"JSON file has been cleared: {file_path}")

    # Fetch the index page and parse the response directly
    response = requests.get("https://codeforcologne.github.io/linked-distributed-data/index.html")
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links in the nav tag with id 'resources'
    nav_links = soup.select('#resources a')
    # Print the href attribute of each link and crawl them
    for link in nav_links:
        print(link.get('href'))
    crawl_links(nav_links)
def crawl_links(links):
    for link in links:
        href = link.get('href')
        if href:
            try:
                link_response = requests.get(href)
                link_response.raise_for_status()
                print(f"Successfully crawled: {href}")
                # Parse the HTML content of the crawled link
                link_soup = BeautifulSoup(link_response.text, 'html.parser')
                # Find all links in the nav tag with id 'resources'
                nested_nav_links = link_soup.select('#resources a')
                # Print the href attribute of each nested link
                for nested_link in nested_nav_links:
                    print(nested_link.get('href'))
                # Recursively crawl nested links
                crawl_websites(nested_nav_links)
            except requests.RequestException as e:
                print(f"Failed to crawl: {href} with error: {e}")
# Crawl the JSON-LD of the website
def crawl_json_ld(soup):
    json_ld = soup.find_all('script', type='application/ld+json')
    json_ld_data = []
    for item in json_ld:
        try:
            # Parse the JSON-LD content as a string
            json_data = json.loads(item.string)
            json_ld_data.append(json_data)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON-LD: {e}")
    print(json_ld_data)
    append_to_json_file('json_ld_results.json', json_ld_data)
def append_to_json_file(file_path, new_data):
    """Append new JSON data to an existing JSON file."""
    if os.path.exists(file_path):
        # Load existing data
        with open(file_path, 'r', encoding='utf-8') as json_file:
            try:
                existing_data = json.load(json_file)
            except json.JSONDecodeError:
                existing_data = []
    else:
        existing_data = []
    # Append new data
    existing_data.extend(new_data)
    # Save updated data back to the file
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(existing_data, json_file, ensure_ascii=False, indent=4)
    print(f"Appended JSON-LD data to {file_path}")
def crawl_websites(links):
    """Recursively crawl nested links."""
    for link in links:
        href = link.get('href')
        if href:
            try:
                link_response = requests.get(href)
                link_response.raise_for_status()
                print(f"Recursively crawled: {href}")
                # Parse the HTML content of the crawled link
                link_soup = BeautifulSoup(link_response.text, 'html.parser')
                # Optionally, extract JSON-LD data
                crawl_json_ld(link_soup)
            except requests.RequestException as e:
                print(f"Failed to recursively crawl: {href} with error: {e}")
# Execute the crawler when the script is run directly
if __name__ == "__main__":
    crawler()