Python_Web_Crawler/Python_Web_crawler.py at master · vipulyadav150/Python_Web_Crawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from bs4 import BeautifulSoup


def spider_build(url='http://www.india.com/travel/articles/10-of-the-best-travel-agencies-in-india-that-will-make-trip-planning-a-cake-walk-for-you/', timeout=30):
    """Crawl a listing page and follow each of its ``rel="nofollow"`` links.

    The page at *url* is fetched and parsed; for every ``<a rel="nofollow">``
    element that carries an ``href``, the link text and target are printed
    and the target is handed to ``brochure_download_link``.

    Args:
        url: Page to crawl. Defaults to the article this script was
            originally written for.
        timeout: Seconds to wait for the listing page before giving up.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. Failures on individual outbound links are reported
            and skipped instead.
    """
    print(url)
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    for link in soup.find_all('a', {'rel': 'nofollow'}):
        href = link.get('href')
        if not href:
            # An anchor without a target gives us nothing to follow.
            continue

        # ``.string`` is None when the anchor wraps nested markup, so fall
        # back to the concatenated text to keep the print below from failing.
        title = link.string or link.get_text()
        print('Title : ' + title)
        print(href)

        try:
            brochure_download_link(href)
        except requests.RequestException as exc:
            # One dead or malformed outbound link must not abort the crawl.
            print('Skipping ' + href + ' : ' + str(exc))


def brochure_download_link(main_url, timeout=30):
    """Print the brochure link(s) advertised on the page at *main_url*.

    The page is fetched and parsed. Inside every
    ``<div class="customer-services">``, each ``<ul>`` is inspected and the
    first ``<a>`` found in its first ``<li>`` is treated as the brochure
    link. The link is resolved against *main_url* and printed.

    Args:
        main_url: URL of the page to inspect.
        timeout: Seconds to wait for the page before giving up.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    # Imported locally so the function does not depend on the file's
    # top-level import block.
    from urllib.parse import urljoin

    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(main_url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    for section in soup.find_all('div', {'class': 'customer-services'}):
        for un_list in section.find_all('ul'):
            first_item = un_list.find('li')
            if first_item is None:
                # Empty list: nothing to report.
                continue

            anchor = first_item.find('a')
            if anchor is None:
                continue

            href = anchor.get('href')
            if not href:
                continue

            # urljoin copes with relative, root-relative and absolute hrefs;
            # plain concatenation produced doubled slashes or glued two
            # absolute URLs together.
            href_new = urljoin(str(main_url), href)
            print("Download Brochure from : " + href_new)


# Script entry point: the crawl starts as soon as this module is executed
# (note that it also runs when the module is imported).
spider_build()