From c3b2b015763fb68ad15962a996803ae601719a41 Mon Sep 17 00:00:00 2001 From: Doomfires Date: Wed, 9 Aug 2017 22:22:24 -0700 Subject: [PATCH] Committing small changes and new file --- info.txt | 1 + quotesbot/spiders/toscrape-css-0.py | 22 ++++++++++++++++++++++ quotesbot/spiders/toscrape-css.py | 2 +- 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 info.txt create mode 100644 quotesbot/spiders/toscrape-css-0.py diff --git a/info.txt b/info.txt new file mode 100644 index 0000000..eade7ed --- /dev/null +++ b/info.txt @@ -0,0 +1 @@ +This will be my spider army to find all the lyrics to songs. diff --git a/quotesbot/spiders/toscrape-css-0.py b/quotesbot/spiders/toscrape-css-0.py new file mode 100644 index 0000000..900923d --- /dev/null +++ b/quotesbot/spiders/toscrape-css-0.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +import scrapy + + +class ToScrapeCSSSpider(scrapy.Spider): + name = "toscrape-css" + start_urls = [ + 'http://quotes.toscrape.com/', 'http://soundcloud.com/' + ] + + def parse(self, response): + for quote in response.css("div.quote"): + yield { + 'text': quote.css("span.text::text").extract_first(), + 'author': quote.css("small.author::text").extract_first(), + 'tags': quote.css("div.tags > a.tag::text").extract() + } + + next_page_url = response.css("li.next > a::attr(href)").extract_first() + if next_page_url is not None: + yield scrapy.Request(response.urljoin(next_page_url)) + diff --git a/quotesbot/spiders/toscrape-css.py b/quotesbot/spiders/toscrape-css.py index 555e204..900923d 100644 --- a/quotesbot/spiders/toscrape-css.py +++ b/quotesbot/spiders/toscrape-css.py @@ -5,7 +5,7 @@ class ToScrapeCSSSpider(scrapy.Spider): name = "toscrape-css" start_urls = [ - 'http://quotes.toscrape.com/', + 'http://quotes.toscrape.com/', 'http://soundcloud.com/' ] def parse(self, response):