# Source: Python-Webscraper/WebScraper.py at main · nacbotics5/Python-Webscraper · GitHub
import requests,re
from bs4 import BeautifulSoup

class WebCrawler(object):
    """Thin convenience wrapper around a ``requests`` session and BeautifulSoup.

    Every GET issued by this class sends the ``self.user_agent`` headers and
    does not follow redirects (``allow_redirects=False``).
    """

    # Number of bytes pulled per iteration while streaming a download to disk.
    CHUNK_SIZE = 64 * 1024

    def __init__(self):
        # One shared session so cookies set by earlier requests are reused.
        self.browser = requests.session()
        self.user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'}

    def open_url(self, url):
        """Fetch *url* with a GET request and return the raw response body."""
        response = self.browser.get(
            url, headers=self.user_agent, allow_redirects=False
        )
        return response.content

    def gather_text(self, url):
        """Return the text of the page at *url*, one non-empty chunk per line.

        ``<script>``, ``<style>`` and ``<head>`` elements are removed before
        the text is extracted. Each line is stripped, split on double spaces,
        and empty pieces are dropped.
        """
        html = self.open_url(url)
        soup = BeautifulSoup(html, features="html.parser")
        for tag in soup(["script", "style", "head"]):
            tag.extract()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return '\n'.join(chunk for chunk in chunks if chunk)

    def get_image(self, url):
        """Print the ``src`` of every ``<img>`` on the page and return them.

        Returns:
            A list with one entry per ``<img>`` tag, in document order. An
            entry is ``None`` when the tag has no ``src`` attribute.
        """
        html = self.open_url(url)
        soup = BeautifulSoup(html, "html.parser")
        sources = [image.get('src') for image in soup.find_all('img')]
        for src in sources:
            print(src)
        return sources

    def Login(self, username, password, url=None):
        """Post *username* and *password* to a login URL.

        Args:
            username: Value sent as the ``username`` form field.
            password: Value sent as the ``password`` form field.
            url: Login endpoint. When omitted, ``self.url`` is used if the
                caller has set that attribute on the instance.

        Returns:
            The HTTP status code of the POST response.

        Raises:
            ValueError: If no URL was passed and ``self.url`` is not set.
        """
        target = url if url is not None else getattr(self, "url", None)
        if target is None:
            raise ValueError("no login URL: pass url= or set self.url first")
        # Load the page first through the shared session, as the original
        # flow did, so any cookies it sets are sent with the POST.
        self.open_url(target)
        login_data = {"username": username, "password": password}
        response = self.browser.post(target, data=login_data)
        return response.status_code

    def download(self, url, id):
        """Stream the resource at *url* into the file ``<id>.jpg``.

        Args:
            url: Address of the resource to fetch.
            id: Integer used to build the output file name (``"%i.jpg"``).

        Returns:
            ``"Done downloading!"`` on success, otherwise a message starting
            with ``"sorry couldn't download file because"``.
        """
        filename = "%i.jpg" % id
        # Issue the request before touching the filesystem so a failed
        # request does not leave an empty file behind.
        response = self.browser.get(
            url, stream=True, headers=self.user_agent, allow_redirects=False
        )
        try:
            # Content-Length is optional (e.g. chunked responses), so it is
            # only used for the size report and never for the transfer itself.
            try:
                total_bytes = int(response.headers.get('Content-Length'))
            except (TypeError, ValueError):
                total_bytes = None

            print("FILENAME: %i.jpg" % id)
            if total_bytes is None:
                print("FILESIZE : unknown")
            else:
                print("FILESIZE : %f" % round(total_bytes / 10**6, 2))

            with open(filename, 'wb') as filem:
                try:
                    for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                        filem.write(chunk)
                except Exception as e:
                    # Best-effort contract: report the failure as a message
                    # instead of raising, as callers of this method expect.
                    return "sorry couldn't download file because %s" % str(e)
            return "Done downloading!"
        finally:
            response.close()

    def crawl_link(self, url, pattern=""):
        """Return the ``href`` of every ``<a>`` on the page matching *pattern*.

        Args:
            url: Page to scan.
            pattern: Regular expression applied to each ``href`` value. The
                default empty pattern matches every link that has an ``href``.

        Returns:
            A list of ``href`` strings in document order.
        """
        html = self.open_url(url)
        soup = BeautifulSoup(html, "html.parser")
        href_filter = re.compile(pattern)
        return [link.attrs['href'] for link in soup.find_all("a", href=href_filter)]