Python-script-for-downloading/comics_downloader.py at master · windweb/Python-script-for-downloading · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
import os
import threading
from tqdm import tqdm


# --- User configuration ---------------------------------------------------
# Comic slug. It is inserted into the page URL, so it must match the one used
# on the site, e.g. https://www.gocomics.com/garfield/
comics_name = 'garfield'
# IMPORTANT: the script has only been tested on periods of up to 2 months.
# Requesting a longer period may get you blocked by the server.
# NOTE(review): the original comment said "more than 60 months"; presumably
# "60 days" (about 2 months) was meant - confirm.
comics_date_start = "2023/01/29"  # first day of the period (inclusive), format YEAR/MONTH/DAY ('%Y/%m/%d')
comics_date_end = "2023/01/31"  # last day of the period (inclusive), format YEAR/MONTH/DAY ('%Y/%m/%d')
download_folder = "garfield_comics"  # folder where downloaded files are stored
format_img = 'jpg'  # extension appended to saved file names (png, gif, jpg, etc.)
num_of_threads = 5  # maximum number of simultaneous downloads (threads started per batch)


def download_comic_error(key, url):
    """
    Make one more attempt to download a comic strip whose first download failed.

    The file name is built from ``key`` (the date string) by replacing the
    characters ``/``, ``:`` and ``\\`` with ``_`` and appending the configured
    ``format_img`` extension. The file is written into ``download_folder``.
    Unlike ``download_comic`` this function does not check whether the file
    already exists and does not touch the progress bar.

    :param key: date string identifying the comic, e.g. ``"2023/01/29"``
    :param url: direct URL of the comic image
    :return: None; the outcome is reported on the console
    """
    safe_stem = key
    for forbidden in ('/', ':', '\\'):
        safe_stem = safe_stem.replace(forbidden, '_')
    filename = f'{safe_stem}.{format_img}'
    full_path = os.path.join(download_folder, filename)

    response = requests.get(url)
    if response.status_code != 200:
        # Second failure: report both the status and the URL, then give up.
        print(f"🆘Error downloading {filename}, status code: {response.status_code}🆘")
        print(f"🆘Error downloading {filename}, url: {url}🆘")
        return

    with open(full_path, "wb") as image_file:
        image_file.write(response.content)
    print(f"✅{filename} has been downloaded successfully✅")


def date_range(start, end):
    """
    Map every date of a period to the image URL of that day's comic strip.

    For each day from ``start`` to ``end`` (both inclusive) the function
    requests ``https://www.gocomics.com/<comics_name>/<YEAR/MONTH/DAY>``,
    parses the returned HTML with BeautifulSoup and reads the ``src``
    attribute of the ``img`` tag with classes ``lazyload img-fluid`` located
    inside the ``div`` with class ``comic__container``.

    A message is printed for every date: either the image URL that was found,
    or a warning that no image could be located. Dates without an image are
    skipped instead of aborting the whole run, so the URLs collected so far
    are not lost.

    :param start: first day of the period, format ``'%Y/%m/%d'``
    :param end: last day of the period, format ``'%Y/%m/%d'``
    :return: dict whose keys are date strings (``'%Y/%m/%d'``) and whose
             values are the image URLs; empty when ``start`` is after ``end``
    :raises ValueError: if ``start`` or ``end`` does not match the format
    """
    start_date = datetime.strptime(start, '%Y/%m/%d')
    end_date = datetime.strptime(end, '%Y/%m/%d')
    current_date = start_date
    dictionary_data = {}
    while current_date <= end_date:
        key = current_date.strftime('%Y/%m/%d')
        page_url = f"https://www.gocomics.com/{comics_name}/{key}"
        res = requests.get(page_url)
        soup = BeautifulSoup(res.text, "html.parser")

        # find() returns None when the element is missing (error page, no strip
        # published that day, changed markup), so every step is guarded.
        container = soup.find("div", class_="comic__container")
        img = None
        if container is not None:
            img = container.find("img", class_="lazyload img-fluid")
        src = img.get("src") if img is not None else None

        if src:
            dictionary_data[key] = src
            print(f'For date {key} matches the image by url={src}')
        else:
            print(f"🆘No comic image found for date {key}, "
                  f"status code: {res.status_code}, url: {page_url}, skipping🆘")
        current_date += timedelta(days=1)
    return dictionary_data


def download_comic(key, url, bar):
    """
    Download the comic strip for one date unless it is already on disk.

    The file name is built from ``key`` by replacing ``/``, ``:`` and ``\\``
    with ``_`` and appending the configured ``format_img`` extension
    (e.g. ``2023_01_29.jpg``); the file is stored in ``download_folder``.

    * If the file already exists, the download is skipped.
    * If the server answers with status 200, the image bytes are written to
      the file and the progress bar is advanced by one.
    * Otherwise the error is reported and the user is asked on the console
      whether to retry; on ``y`` the retry is delegated to
      ``download_comic_error``.

    :param key: date string identifying the comic, e.g. ``"2023/01/29"``
    :param url: direct URL of the comic image
    :param bar: progress bar object exposing ``update(n)`` (a tqdm instance)
    :return: None
    """
    filename = key.replace('/', '_').replace(':', '_').replace('\\', '_') + f'.{format_img}'
    full_path = os.path.join(download_folder, filename)

    if os.path.exists(full_path):
        print(f"♻{filename} already exists, skipping download♻")
        return

    response = requests.get(url)
    if response.status_code == 200:
        with open(full_path, "wb") as f:
            f.write(response.content)
        # This call used to be misspelled `rint`, which raised NameError right
        # after the file was written and prevented the progress bar update.
        print(f"✅{filename} has been downloaded successfully✅")
        bar.update(1)
        return

    print(f"🆘Error downloading {filename}, status code: {response.status_code}🆘")
    print(f'‼!!!!!!!!!!!!!!!!!!!! DATE = {key} URL = {url} !!!!!!!!!!!!!!!!!!!!‼')
    print("Do you want to try downloading files that you couldn't download before? (y/n)?")
    choose = str(input('TYPE (y) OR (n) 🥖>>> ')).lower()
    if choose == 'y':
        return download_comic_error(key, url)
    print('go next step')


def download_threads(threads, batch_size=None):
    """
    Start and join threads in consecutive batches.

    At most ``batch_size`` threads are started at a time; the function waits
    for every thread of the current batch to finish before starting the next
    batch.

    :param threads: list of not-yet-started ``threading.Thread`` objects
    :param batch_size: number of threads per batch; when omitted, the
                       module-level ``num_of_threads`` setting is used
    :raises ValueError: if the batch size is smaller than 1 (a negative value
                        would otherwise silently start no threads at all)
    """
    if batch_size is None:
        batch_size = num_of_threads
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    for first in range(0, len(threads), batch_size):
        batch = threads[first:first + batch_size]
        for thread in batch:
            thread.start()
        # Join only after the whole batch was started so its threads overlap.
        for thread in batch:
            thread.join()


def main():
    """
    Tie everything together: collect the image URLs and download them.

    1. Builds the date -> image URL dictionary for the configured period with
       ``date_range``.
    2. Makes sure ``download_folder`` exists.
    3. Creates one thread per comic running ``download_comic`` and hands the
       threads to ``download_threads``, sharing a single tqdm progress bar.
    4. Prints a final message once all threads have finished. The message is
       printed regardless of whether individual downloads failed.
    """
    dictionary = date_range(comics_date_start, comics_date_end)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_folder, exist_ok=True)

    # The context manager closes the bar so its last state is flushed before
    # the final message is printed.
    with tqdm(total=len(dictionary)) as bar:
        threads = [
            threading.Thread(target=download_comic, args=(key, value, bar))
            for key, value in dictionary.items()
        ]
        download_threads(threads)
    print("✔🏁All files have been downloaded successfully🏁✔")


# Script entry point: run the downloader only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

"""
The construction if __name__ == "__main__":
is a way to ensure that a Python script is only executed as the main program and not imported as a module into another script.

When a Python script is run, the __name__ special attribute is automatically set to "__main__".
Therefore, if the script is being executed as the main program, __name__ will be equal to "__main__".
If the script is being imported as a module into another script, __name__ will be set to the name of the module.

So, in this code, the main() function is only run if the script is executed as the main program, and not imported as a module.
"""