From 4a3a5c49748d996eafa9c89f70b12bc3164bb375 Mon Sep 17 00:00:00 2001 From: Nickolai Date: Thu, 30 Jun 2022 21:34:41 +0300 Subject: [PATCH 1/5] [Iteration 1] One-shot command-line RSS reader. --- MikalaiSidarevich/rss_reader/README.md | 116 +++++++++ .../rss_reader/engine/__init__.py | 0 .../rss_reader/engine/argparser.py | 41 ++++ .../rss_reader/engine/converter.py | 51 ++++ .../rss_reader/engine/rssparser.py | 228 ++++++++++++++++++ .../rss_reader/engine/rssreader.py | 35 +++ MikalaiSidarevich/rss_reader/requirements.txt | 4 + MikalaiSidarevich/rss_reader/rss_reader.py | 62 +++++ MikalaiSidarevich/rss_reader/setup.py | 9 + .../rss_reader/tests/__init__.py | 0 .../rss_reader/tests/test_argparser.py | 26 ++ .../rss_reader/tests/test_converter.py | 33 +++ .../rss_reader/tests/test_rssparser.py | 24 ++ .../rss_reader/tests/test_rssreader.py | 33 +++ 14 files changed, 662 insertions(+) create mode 100644 MikalaiSidarevich/rss_reader/README.md create mode 100644 MikalaiSidarevich/rss_reader/engine/__init__.py create mode 100644 MikalaiSidarevich/rss_reader/engine/argparser.py create mode 100644 MikalaiSidarevich/rss_reader/engine/converter.py create mode 100644 MikalaiSidarevich/rss_reader/engine/rssparser.py create mode 100644 MikalaiSidarevich/rss_reader/engine/rssreader.py create mode 100644 MikalaiSidarevich/rss_reader/requirements.txt create mode 100644 MikalaiSidarevich/rss_reader/rss_reader.py create mode 100644 MikalaiSidarevich/rss_reader/setup.py create mode 100644 MikalaiSidarevich/rss_reader/tests/__init__.py create mode 100644 MikalaiSidarevich/rss_reader/tests/test_argparser.py create mode 100644 MikalaiSidarevich/rss_reader/tests/test_converter.py create mode 100644 MikalaiSidarevich/rss_reader/tests/test_rssparser.py create mode 100644 MikalaiSidarevich/rss_reader/tests/test_rssreader.py diff --git a/MikalaiSidarevich/rss_reader/README.md b/MikalaiSidarevich/rss_reader/README.md new file mode 100644 index 00000000..2eabf33a --- /dev/null +++ 
b/MikalaiSidarevich/rss_reader/README.md @@ -0,0 +1,116 @@ +# RSS reader + +Pure Python command-line RSS reader. + +## Requirements + +The utility requires [**`python 3.9`**](https://www.python.org/downloads/) interpreter with [**`pip`**](https://pypi.org/project/pip/) installing tool. + +On the command line the interpreter can be typed as `python`, `python3`, `python3.9` (depending on OS, version, etc.). + +For consistency, this README uses the name `python` throughout. + +## Dependencies + +All extra packages are listed in `requirements.txt`: + +- [**`beautifulsoup4`**](https://pypi.org/project/beautifulsoup4/) `4.11.1` — Screen-scraping library +- [**`coverage`**](https://pypi.org/project/coverage/) `6.2` — Code coverage measurement for Python +- [**`lxml`**](https://pypi.org/project/lxml/) `4.8.0` — Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API +- [**`requests`**](https://pypi.org/project/requests/) `2.26.0` — Python HTTP for Humans + +To install extra packages automatically set the working directory to the project root `rss_reader/` and execute*: + +```sh +> python -m pip install -r requirements.txt +``` + +**Super user privileges may be required to install extra packages. If so, then use* `sudo` *command on Linux or run terminal as administrator on Windows.* + +## Usage + +The utility can handle multiple arguments. + +To show help message below use `-h/--help` argument. + +```sh +usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [source] + +Pure Python command-line RSS reader. 
+ +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --version Print version info + --json Print result as JSON in stdout + --verbose Outputs verbose status messages + --limit LIMIT Limit news topics if this parameter provided +``` + +## Examples: + +Set the working directory to the project root `rss_reader/` and execute: + +- Show utility version: + ```sh + > python rss_reader.py --version + Version 1.1 + ``` + +- Read 1 news entry from [Yahoo](https://news.yahoo.com/) source: + ```sh + > python rss_reader.py https://news.yahoo.com/rss/ --limit 1 + + Feed: Yahoo News - Latest News & Headlines + + Title: WNBA star Brittney Griner ordered to trial Friday in Russia + Date: 2022-06-27T07:41:55Z + Link: https://news.yahoo.com/us-basketball-star-griner-due-074155275.html + + Shackled and looking wary, WNBA star Brittney Griner was ordered Monday to stand trial by a court near Moscow on cannabis possession charges, about 4 1/2 months after her arrest at an airport while returning to play for a Russian team. The Phoenix Mercury center and two-time U.S. Olympic gold medalist also was ordered to remain in custody for the duration of her criminal trial, which was to begin Friday. Griner could face 10 years in prison if convicted on charges of large-scale transportation of drugs. + + + Links: + [1]: https://news.yahoo.com/us-basketball-star-griner-due-074155275.html (link) + [2]: https://s.yimg.com/ny/api/res/1.2/utfVa4Ach8UgXMMZREmJhg--/YXBwaWQ9aGlnaGxhbmRlcjt3PTEyMDA7aD04MDA-/https://s.yimg.com/uu/api/res/1.2/1THMVZDeZ0z7PVXchxklYw--~B/aD00MDAwO3c9NjAwMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/ap.org/9f3f122d6d91ff94f9613b5d97409f0f (image) + + + ``` + +## JSON format + +The utility can export the feed into JSON format for console output. 
+The structure is shown below: + +```python +{ + "channel": "Channel title", + "url": "Channel URL", + "entries": [ + { + "title": "Entry title", + "date": "Entry publish date", + "link": "Entry link", + "description": "Entry description", + "image_link": "Entry image link" + } + ] +} +``` + +## Running tests + +To run tests set the working directory to the project root `rss_reader/` and execute: + +```sh +> python -m unittest +``` + +To run test coverage checking set the working directory to the project root `rss_reader/` and execute: + +```sh +> python -m coverage run -m unittest +> python -m coverage report --include=engine/* +``` diff --git a/MikalaiSidarevich/rss_reader/engine/__init__.py b/MikalaiSidarevich/rss_reader/engine/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/MikalaiSidarevich/rss_reader/engine/argparser.py b/MikalaiSidarevich/rss_reader/engine/argparser.py new file mode 100644 index 00000000..0bb1bd03 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/argparser.py @@ -0,0 +1,41 @@ +""" +ArgParser - parser for CLI arguments. +""" + +import argparse +import sys + + +class ArgParser: + """Parser for CLI arguments based on `argparse.ArgumentParser` parser.""" + + def __init__(self): + """ + Initialize parser with configured settings. + """ + description = "Pure Python command-line RSS reader." + self._parser = argparse.ArgumentParser(description=description) + self._configure() + + def _configure(self): + """ + Configure parser - add CLI arguments. 
+ """ + self._parser.add_argument("source", nargs='?', help="RSS URL") + self._parser.add_argument("--version", action='store_true', help="Print version info") + self._parser.add_argument("--json", action='store_true', help="Print result as JSON in stdout") + self._parser.add_argument("--verbose", action='store_true', help="Outputs verbose status messages") + self._parser.add_argument("--limit", help="Limit news topics if this parameter provided") + + @property + def args(self): + """ + Get CLI arguments in a dictionary format. + """ + return vars(self._parser.parse_args(sys.argv[1:])) + + def print_help(self): + """ + Print help message to console. + """ + self._parser.print_help() diff --git a/MikalaiSidarevich/rss_reader/engine/converter.py b/MikalaiSidarevich/rss_reader/engine/converter.py new file mode 100644 index 00000000..f6a07428 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/converter.py @@ -0,0 +1,51 @@ +""" +Converter - RSS data converter to target formats. +""" + +import json + + +class Converter: + """RSS data converter to target formats.""" + + @classmethod + def to_text(cls, feed): + """ + Convert channel's `feed` data to text format for console output. 
+ """ + # List of channel data blocks + channel_data = [] + + # Add channel title + channel_data.append(f"\nFeed: {feed['channel']}\n\n") + + for entry in feed['entries']: + # Add enty title block + channel_data.append(f"Title: {entry['title']}\n") + + # Add entry date block + if entry['date'] is not None: + channel_data.append(f"Date: {entry['date']}\n") + + # Add entry link block + channel_data.append(f"Link: {entry['link']}\n") + + # Add entry description block + if entry['description'] is not None: + channel_data.append(f"\n{entry['description']}\n\n") + + # Add links list block + channel_data.append(f"\nLinks:\n[1]: {entry['link']} (link)\n") + if entry['image_link'] is not None: + channel_data.append(f"[2]: {entry['image_link']} (image)\n") + + channel_data.append("\n") + + return ''.join(channel_data) + + @classmethod + def to_json(cls, feed): + """ + Convert channel's `feed` data to json format. + """ + return json.dumps(feed, ensure_ascii=False, indent=4) diff --git a/MikalaiSidarevich/rss_reader/engine/rssparser.py b/MikalaiSidarevich/rss_reader/engine/rssparser.py new file mode 100644 index 00000000..0e5dd439 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/rssparser.py @@ -0,0 +1,228 @@ +""" +RssParser - parser for RSS data. +""" + +import threading +import xml.dom.minidom +from html import unescape +from time import sleep + +import requests +from bs4 import BeautifulSoup + + +class RequestError(Exception): + """ + Request URL exception. + """ + pass + + +class XmlParseError(Exception): + """ + Parse XML document exception. + """ + pass + + +class HtmlParseError(Exception): + """ + Parse HTML document exception. + """ + pass + + +class RssParser: + """Parser for RSS data.""" + + def __init__(self, verbose=False): + """ + Initialize parser with `verbose` ability and setup feed storage. + """ + self._verbose = verbose + self._feed = {} + + def feed(self, url, limit): + """ + Get parsed RSS feed. 
+ """ + self._url = url + self._limit = limit + + # Get XML document + self._document = self._request_url(url).strip() + + # Parse XML document into the internal storage + self._parse_feed() + + return self._feed + + def _request_url(self, url): + """ + Get content by the `url`. + """ + headers = {'User-Agent': 'Mozilla/5.0'} + timeout = 10 + + try: + response = requests.get(url, headers=headers, timeout=timeout) + except Exception: + raise RequestError(f"Unable to get content by URL '{url}'") + + return response.content + + def _parse_feed(self): + """ + Parse XML document into the internal storage. + """ + # Get XML DOM structure + try: + dom = xml.dom.minidom.parseString(self._document) + except Exception: + raise XmlParseError(f"Invalid XML document by URL '{self._url}'") + + # Get feed channel title + try: + title_dom = dom.getElementsByTagName('title').item(0) + if title_dom.firstChild: + title = title_dom.firstChild.nodeValue + else: + title = self._url + except Exception: + raise XmlParseError(f"RSS channel '{self._url}' has no title") + + self._feed['channel'] = unescape(title) + + # Add channel url + self._feed['url'] = self._url + + # Get list of RSS entries + for tag in ['item', 'entry']: + item_list = dom.getElementsByTagName(tag) + if item_list: + break + + self._feed['entries'] = [] + + # Items limit depends on user limit-parameter provided + if self._limit is None: + limit = item_list.length + else: + limit = min(self._limit, item_list.length) + + # Get RSS entries data + for i in range(limit): + item = item_list[i] + thread = threading.Thread(target=self._get_entry, + args=(i, item)) + thread.start() + + # Prevent too fast requests + sleep(0.2) + + # Join spawned threads + for thread in threading.enumerate(): + # Main thread should be skipped + if thread is threading.main_thread(): + continue + thread.join() + + if self._verbose: + print(f"Total {len(self._feed['entries'])} items processed", flush=True) + + def _get_entry(self, n, item): + """ + 
Request `n`-th entry attributes: + get title, date, link from XML, description, image by entry's url. + """ + if self._verbose: + print(f"Entry #{n} requested", flush=True) + + entry = {'title': None, + 'date': None, + 'link': None, + 'description': None, + 'image_link': None} + + # Get entry title + try: + entry_title_dom = item.getElementsByTagName('title').item(0) + + # Search non-empty node + for node in entry_title_dom.childNodes: + if node.nodeValue.strip(): + entry['title'] = unescape(node.nodeValue) + break + except Exception: + if self._verbose: + print(f"Entry #{n} has no title", flush=True) + # Title must be present + return + + # Get entry link + try: + for tag in ['link', 'id']: + entry_link_dom = item.getElementsByTagName(tag).item(0) + if entry_link_dom.firstChild: + entry['link'] = entry_link_dom.firstChild.nodeValue + entry['link'] = entry['link'].lower() + break + except Exception: + if self._verbose: + print(f"Entry #{n} has no link", flush=True) + # Link must be present + return + + # Get entry published date + try: + for tag in ['pubDate', 'published']: + entry_date_dom = item.getElementsByTagName(tag).item(0) + if entry_date_dom: + entry['date'] = entry_date_dom.firstChild.nodeValue + break + except Exception: + if self._verbose: + print(f"Entry #{n} has no date published", flush=True) + + # Get entry description + try: + attrs = {'name': 'description', + 'property': 'og:description', + 'itemprop': 'description'} + entry['description'] = self._get_meta_tag(entry['link'], **attrs) + entry['description'] = unescape(entry['description']).strip() + except Exception as e: + if self._verbose: + print(f"Entry #{n} description: {e}", flush=True) + + # Get entry image link + try: + attrs = {'property': 'og:image'} + entry['image_link'] = self._get_meta_tag(entry['link'], **attrs) + except Exception as e: + if self._verbose: + print(f"Entry #{n} image: {e}", flush=True) + + # Add current entry + self._feed['entries'].append(entry) + + if self._verbose: 
+ print(f"Entry #{n} received", flush=True) + + def _get_meta_tag(self, url, **kwargs): + """ + Extract meta tags content from `url` specified. + Meta tag attributes specified by `kwargs`. + """ + raw_html = self._request_url(url) + parsed_html = BeautifulSoup(raw_html.decode('utf-8', 'ignore'), features='html.parser') + + try: + for attr, value in kwargs.items(): + meta = parsed_html.find('meta', {attr: value}) + if meta: + return meta['content'] + else: + raise Exception + except Exception: + raise HtmlParseError(f"Couldn't parse data by URL '{url}'") diff --git a/MikalaiSidarevich/rss_reader/engine/rssreader.py b/MikalaiSidarevich/rss_reader/engine/rssreader.py new file mode 100644 index 00000000..db356529 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/rssreader.py @@ -0,0 +1,35 @@ +""" +RssReader - pure Python command-line RSS reader. +""" + +from engine.converter import Converter +from engine.rssparser import RssParser + + +class RssReader: + """Pure Python command-line RSS reader.""" + + def __init__(self, verbose): + """ + Initialize reader with `verbose` ability. + """ + self._verbose = verbose + + def read_rss(self, url, limit, json): + """ + Get RSS entries from `url` and output them to the stdout. + Limit number of entries with `limit`. + Output all the entries if `limit` is not specified. 
+ """ + try: + parser = RssParser(self._verbose) + feed = parser.feed(url, limit) + except Exception: + raise + + if json: + rss_content = Converter.to_json(feed) + else: + rss_content = Converter.to_text(feed) + + return rss_content diff --git a/MikalaiSidarevich/rss_reader/requirements.txt b/MikalaiSidarevich/rss_reader/requirements.txt new file mode 100644 index 00000000..3cbf33f2 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/requirements.txt @@ -0,0 +1,4 @@ +beautifulsoup4==4.11.1 +coverage==6.2 +lxml==4.8.0 +requests==2.26.0 diff --git a/MikalaiSidarevich/rss_reader/rss_reader.py b/MikalaiSidarevich/rss_reader/rss_reader.py new file mode 100644 index 00000000..8bded285 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/rss_reader.py @@ -0,0 +1,62 @@ +""" +main() - entry point: parse CLI arguments & run RSS reading. +""" + +import sys + +from engine.argparser import ArgParser +from engine.rssreader import RssReader + +version = '1.1' + + +def main(): + """ + Entry point - get CLI arguments and start process. 
+ """ + # Set console encoding to UTF-8 + sys.stdout.reconfigure(encoding='utf-8') + + arg_parser = ArgParser() + args = arg_parser.args + + try: + if args['version']: + print(f"Version {version}") + exit(0) + + json = False + if args['json']: + json = args['json'] + + verbose = False + if args['verbose']: + verbose = args['verbose'] + + limit = None + if args['limit'] is not None: + limit = int(args['limit']) + + url = None + if args['source'] is not None: + url = args['source'] + + except Exception: + print("Invalid argument value", flush=True) + exit(1) + + if url: + if verbose: + print(f"URL is set: '{url}'", flush=True) + try: + rss = RssReader(verbose).read_rss(url, limit, json) + print(rss, flush=True) + except Exception as e: + print(f"{type(e).__name__}: {e}", flush=True) + exit(1) + else: + arg_parser.print_help() + + +if __name__ == '__main__': + main() diff --git a/MikalaiSidarevich/rss_reader/setup.py b/MikalaiSidarevich/rss_reader/setup.py new file mode 100644 index 00000000..58a4ab95 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/setup.py @@ -0,0 +1,9 @@ +""" +Install required packages for RSS reader. +""" + +import os +import sys + +cmd = f"{sys.executable} -m pip install -r requirements.txt" +os.system(cmd) diff --git a/MikalaiSidarevich/rss_reader/tests/__init__.py b/MikalaiSidarevich/rss_reader/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/MikalaiSidarevich/rss_reader/tests/test_argparser.py b/MikalaiSidarevich/rss_reader/tests/test_argparser.py new file mode 100644 index 00000000..a3703c87 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/tests/test_argparser.py @@ -0,0 +1,26 @@ +"""Test ArgParser class.""" + +from unittest import TestCase + +from engine.argparser import ArgParser + + +class TestArgParser(TestCase): + """Testcase for ArgParser class.""" + + def setUp(self): + """ + Prepare test fixture. + """ + self.parser = ArgParser() + + def test_args(self): + """ + args-property test. 
+ """ + expected = {'source': None, + 'version': False, + 'json': False, + 'verbose': False, + 'limit': None} + self.assertDictEqual(self.parser.args, expected) diff --git a/MikalaiSidarevich/rss_reader/tests/test_converter.py b/MikalaiSidarevich/rss_reader/tests/test_converter.py new file mode 100644 index 00000000..7b48ac38 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/tests/test_converter.py @@ -0,0 +1,33 @@ +"""Test Converter class.""" + +from unittest import TestCase + +from engine.converter import Converter + + +class TestConverter(TestCase): + """Testcase for Converter class.""" + + def test_to_text(self): + """ + to_text() test. + """ + feed = {'channel': "", + 'entries': [{'title': "", + 'date': "", + 'link': "", + 'description': "", + 'image_link': ""}]} + self.assertIsNotNone(Converter.to_text(feed)) + + def test_to_json(self): + """ + to_json() test. + """ + feed = {'channel': "", + 'entries': [{'title': "", + 'date': "", + 'link': "", + 'description': "", + 'image_link': ""}]} + self.assertIsNotNone(Converter.to_json(feed)) diff --git a/MikalaiSidarevich/rss_reader/tests/test_rssparser.py b/MikalaiSidarevich/rss_reader/tests/test_rssparser.py new file mode 100644 index 00000000..24d80862 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/tests/test_rssparser.py @@ -0,0 +1,24 @@ +"""Test RssParser class.""" + +from unittest import TestCase + +from engine.rssparser import RequestError, RssParser + + +class TestRssParser(TestCase): + """Testcase for RssParser class.""" + + def setUp(self): + """ + Prepare test fixture. + """ + self.parser = RssParser(verbose=False) + + def test_feed(self): + """ + feed() test. 
+ """ + with self.assertRaises(RequestError): + self.parser.feed("", 1) + + self.assertIsNotNone(self.parser.feed("https://news.yahoo.com/rss/", 1)) diff --git a/MikalaiSidarevich/rss_reader/tests/test_rssreader.py b/MikalaiSidarevich/rss_reader/tests/test_rssreader.py new file mode 100644 index 00000000..eb585859 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/tests/test_rssreader.py @@ -0,0 +1,33 @@ +"""Test RssReader class.""" + +from unittest import TestCase +from unittest.mock import patch + +from engine.rssreader import RssReader + + +class TestRssReader(TestCase): + """Testcase for RssReader class.""" + + def setUp(self): + """ + Prepare test fixture. + """ + self.reader = RssReader(verbose=False) + + @patch('engine.rssparser.RssParser.feed') + def test_read_rss(self, mock_parser_feed): + """ + read_rss() test. + """ + mock_parser_feed.return_value = {'channel': "", + 'entries': [{'title': "", + 'date': "", + 'link': "", + 'description': "", + 'image_link': ""}]} + + url = "https://news.yahoo.com/rss/" + limit = 1 + self.assertIsNotNone(self.reader.read_rss(url, limit, False)) + self.assertIsNotNone(self.reader.read_rss(url, limit, True)) From 2f0b573802fa1676ca892cb1a9adf23a7e6e9039 Mon Sep 17 00:00:00 2001 From: Nickolai Date: Thu, 30 Jun 2022 21:57:16 +0300 Subject: [PATCH 2/5] [Iteration 2] Distribution. 
--- MikalaiSidarevich/rss_reader/README.md | 16 +++++- MikalaiSidarevich/rss_reader/engine/main.py | 62 +++++++++++++++++++++ MikalaiSidarevich/rss_reader/pyproject.toml | 3 + MikalaiSidarevich/rss_reader/rss_reader.py | 58 +------------------ MikalaiSidarevich/rss_reader/setup.cfg | 15 +++++ MikalaiSidarevich/rss_reader/setup.py | 8 +-- 6 files changed, 99 insertions(+), 63 deletions(-) create mode 100644 MikalaiSidarevich/rss_reader/engine/main.py create mode 100644 MikalaiSidarevich/rss_reader/pyproject.toml create mode 100644 MikalaiSidarevich/rss_reader/setup.cfg diff --git a/MikalaiSidarevich/rss_reader/README.md b/MikalaiSidarevich/rss_reader/README.md index 2eabf33a..463aed77 100644 --- a/MikalaiSidarevich/rss_reader/README.md +++ b/MikalaiSidarevich/rss_reader/README.md @@ -4,7 +4,7 @@ Pure Python command-line RSS reader. ## Requirements -The utility requires [**`python 3.9`**](https://www.python.org/downloads/) interpreter with [**`pip`**](https://pypi.org/project/pip/) installing tool. +The utility requires the [**`python 3.9`**](https://www.python.org/downloads/) interpreter with the [**`pip`**](https://pypi.org/project/pip/) and [**`setuptools`**](https://pypi.org/project/setuptools/) installation tools. On the command line the interpreter can be typed as `python`, `python3`, `python3.9` (depending on OS, version, etc.). @@ -25,6 +25,12 @@ To install extra packages automatically set the working directory to the project > python -m pip install -r requirements.txt ``` +It's also possible to set up the package via **`setuptools`**: + +```sh +> python setup.py install +``` + **Super user privileges may be required to install extra packages. 
If so, then use* `sudo` *command on Linux or run terminal as administrator on Windows.* ## Usage @@ -56,7 +62,13 @@ Set the working directory to the project root `rss_reader/` and execute: - Show utility version: ```sh > python rss_reader.py --version - Version 1.1 + Version 1.2 + ``` + +- Show utility version using CLI utility installed: + ```sh + > rss_reader --version + Version 1.2 ``` - Read 1 news entry from [Yahoo](https://news.yahoo.com/) source: diff --git a/MikalaiSidarevich/rss_reader/engine/main.py b/MikalaiSidarevich/rss_reader/engine/main.py new file mode 100644 index 00000000..bac61581 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/main.py @@ -0,0 +1,62 @@ +""" +main() - entry point: parse CLI arguments & run RSS reading. +""" + +import sys + +from engine.argparser import ArgParser +from engine.rssreader import RssReader + +version = '1.2' + + +def main(): + """ + Entry point - get CLI arguments and start process. + """ + # Set console encoding to UTF-8 + sys.stdout.reconfigure(encoding='utf-8') + + arg_parser = ArgParser() + args = arg_parser.args + + try: + if args['version']: + print(f"Version {version}") + exit(0) + + json = False + if args['json']: + json = args['json'] + + verbose = False + if args['verbose']: + verbose = args['verbose'] + + limit = None + if args['limit'] is not None: + limit = int(args['limit']) + + url = None + if args['source'] is not None: + url = args['source'] + + except Exception: + print("Invalid argument value", flush=True) + exit(1) + + if url: + if verbose: + print(f"URL is set: '{url}'", flush=True) + try: + rss = RssReader(verbose).read_rss(url, limit, json) + print(rss, flush=True) + except Exception as e: + print(f"{type(e).__name__}: {e}", flush=True) + exit(1) + else: + arg_parser.print_help() + + +if __name__ == '__main__': + main() diff --git a/MikalaiSidarevich/rss_reader/pyproject.toml b/MikalaiSidarevich/rss_reader/pyproject.toml new file mode 100644 index 00000000..bf912968 --- /dev/null +++ 
b/MikalaiSidarevich/rss_reader/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools", "wheel"] diff --git a/MikalaiSidarevich/rss_reader/rss_reader.py b/MikalaiSidarevich/rss_reader/rss_reader.py index 8bded285..d7a02bf1 100644 --- a/MikalaiSidarevich/rss_reader/rss_reader.py +++ b/MikalaiSidarevich/rss_reader/rss_reader.py @@ -1,62 +1,8 @@ """ -main() - entry point: parse CLI arguments & run RSS reading. +Pure Python command-line RSS reader. """ -import sys - -from engine.argparser import ArgParser -from engine.rssreader import RssReader - -version = '1.1' - - -def main(): - """ - Entry point - get CLI arguments and start process. - """ - # Set console encoding to UTF-8 - sys.stdout.reconfigure(encoding='utf-8') - - arg_parser = ArgParser() - args = arg_parser.args - - try: - if args['version']: - print(f"Version {version}") - exit(0) - - json = False - if args['json']: - json = args['json'] - - verbose = False - if args['verbose']: - verbose = args['verbose'] - - limit = None - if args['limit'] is not None: - limit = int(args['limit']) - - url = None - if args['source'] is not None: - url = args['source'] - - except Exception: - print("Invalid argument value", flush=True) - exit(1) - - if url: - if verbose: - print(f"URL is set: '{url}'", flush=True) - try: - rss = RssReader(verbose).read_rss(url, limit, json) - print(rss, flush=True) - except Exception as e: - print(f"{type(e).__name__}: {e}", flush=True) - exit(1) - else: - arg_parser.print_help() - +from engine.main import main if __name__ == '__main__': main() diff --git a/MikalaiSidarevich/rss_reader/setup.cfg b/MikalaiSidarevich/rss_reader/setup.cfg new file mode 100644 index 00000000..95ccd91a --- /dev/null +++ b/MikalaiSidarevich/rss_reader/setup.cfg @@ -0,0 +1,15 @@ +[metadata] +name = rss-reader +version = 1.2 + +[options] +packages = find: +install_requires = + beautifulsoup4==4.11.1 + coverage==6.2 + lxml==4.8.0 + requests==2.26.0 + 
+[options.entry_points] +console_scripts = + rss_reader = engine.main:main diff --git a/MikalaiSidarevich/rss_reader/setup.py b/MikalaiSidarevich/rss_reader/setup.py index 58a4ab95..699b270c 100644 --- a/MikalaiSidarevich/rss_reader/setup.py +++ b/MikalaiSidarevich/rss_reader/setup.py @@ -1,9 +1,7 @@ """ -Install required packages for RSS reader. +Entry point to install package. """ -import os -import sys +from setuptools import setup -cmd = f"{sys.executable} -m pip install -r requirements.txt" -os.system(cmd) +setup() From 400985fb5e2d94de18e5f93b02bc19ac2c52e9fd Mon Sep 17 00:00:00 2001 From: Nickolai Date: Thu, 30 Jun 2022 22:00:26 +0300 Subject: [PATCH 3/5] [Iteration 3] News caching. --- MikalaiSidarevich/rss_reader/README.md | 83 ++++-- .../rss_reader/engine/argparser.py | 1 + .../rss_reader/engine/converter.py | 65 +++-- MikalaiSidarevich/rss_reader/engine/main.py | 13 +- .../rss_reader/engine/rsscacher.py | 239 ++++++++++++++++++ .../rss_reader/engine/rssparser.py | 81 +++++- .../rss_reader/engine/rssreader.py | 22 +- MikalaiSidarevich/rss_reader/setup.cfg | 3 +- .../rss_reader/tests/test_argparser.py | 3 +- .../rss_reader/tests/test_converter.py | 27 +- .../rss_reader/tests/test_rsscacher.py | 33 +++ .../rss_reader/tests/test_rssparser.py | 16 +- .../rss_reader/tests/test_rssreader.py | 19 +- 13 files changed, 507 insertions(+), 98 deletions(-) create mode 100644 MikalaiSidarevich/rss_reader/engine/rsscacher.py create mode 100644 MikalaiSidarevich/rss_reader/tests/test_rsscacher.py diff --git a/MikalaiSidarevich/rss_reader/README.md b/MikalaiSidarevich/rss_reader/README.md index 463aed77..c9de2b5a 100644 --- a/MikalaiSidarevich/rss_reader/README.md +++ b/MikalaiSidarevich/rss_reader/README.md @@ -40,7 +40,7 @@ The utility can handle multiple arguments. To show help message below use `-h/--help` argument. 
```sh -usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [source] +usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [source] Pure Python command-line RSS reader. @@ -53,6 +53,7 @@ optional arguments: --json Print result as JSON in stdout --verbose Outputs verbose status messages --limit LIMIT Limit news topics if this parameter provided + --date DATE Read cached news by date specified like '%Y%m%d' ``` ## Examples: @@ -62,13 +63,13 @@ Set the working directory to the project root `rss_reader/` and execute: - Show utility version: ```sh > python rss_reader.py --version - Version 1.2 + Version 1.3 ``` - Show utility version using CLI utility installed: ```sh > rss_reader --version - Version 1.2 + Version 1.3 ``` - Read 1 news entry from [Yahoo](https://news.yahoo.com/) source: @@ -89,6 +90,26 @@ Set the working directory to the project root `rss_reader/` and execute: [2]: https://s.yimg.com/ny/api/res/1.2/utfVa4Ach8UgXMMZREmJhg--/YXBwaWQ9aGlnaGxhbmRlcjt3PTEyMDA7aD04MDA-/https://s.yimg.com/uu/api/res/1.2/1THMVZDeZ0z7PVXchxklYw--~B/aD00MDAwO3c9NjAwMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/ap.org/9f3f122d6d91ff94f9613b5d97409f0f (image) + ``` + +- Read from cache 1 news entry for the date `'20220628'` (requires previously stored data): + ```sh + > python rss_reader.py --date 20220628 --limit 1 + + Feed: Yahoo News - Latest News & Headlines + + Title: Spit, 'disrespect' arrive at Wimbledon as tennis turns ugly + Date: 2022-06-28T22:01:51Z + Link: https://news.yahoo.com/spit-disrespect-arrive-wimbledon-tennis-220151441.html + + This is not what one thinks of when pondering the supposedly genteel roots of tennis, and the purportedly proper atmosphere at dates-to-the-1800s Wimbledon, a country club sport being contested at a place officially called the All England Lawn Tennis Club: a player, Nick Kyrgios, capping a first-round victory Tuesday by spitting in the direction of a spectator he said was 
hassling him. Like, he literally came to the match to literally just not even support anyone, really. During the match, which filled the stands at 1,980-seat Court No. 3 — and attracted lengthy lines of folks hoping to eventually be let in, likely owing to the popularity of the anything-can-happen Kyrgios, a 27-year-old from Australia, and the involvement of a local player — Kyrgios asked, without success, to have the fan removed for cursing and sending other verbal abuse his way. + + + Links: + [1]: https://news.yahoo.com/spit-disrespect-arrive-wimbledon-tennis-220151441.html (link) + [2]: https://s.yimg.com/ny/api/res/1.2/7Oybi_h9sBCC7gjex3GADQ--/YXBwaWQ9aGlnaGxhbmRlcjt3PTEyMDA7aD04MDA-/https://s.yimg.com/uu/api/res/1.2/Kn3F_gIJwe0a3uIOU.Tb2w--~B/aD0yMzgxO3c9MzU3MTthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/ap.org/4a35cff443aaabc2b49d94a5e7672369 (image) + + ``` ## JSON format @@ -97,21 +118,51 @@ The utility can export the feed into JSON format for console output. The structure is shown below: ```python -{ - "channel": "Channel title", - "url": "Channel URL", - "entries": [ - { - "title": "Entry title", - "date": "Entry publish date", - "link": "Entry link", - "description": "Entry description", - "image_link": "Entry image link" - } - ] -} +[ + { + "channel": "Channel title", + "url": "Channel URL", + "entries": [ + { + "title": "Entry title", + "date": "Entry publish date", + "link": "Entry link", + "description": "Entry description", + "image_link": "Entry image link" + } + ] + } +] ``` +## Feed cache + +RSS feed is cached while reading. + +Cache storage is the SQLite3 database, it contains 2 data tables: `channels` and `entries`. 
+ +- Table `channels` schema: + + | column | type | + | ------- | ------ | + | id | INT PK | + | channel | TEXT | + | url | TEXT | + +- Table `entries` schema: + + | column | type | + | ----------- | ------ | + | id | INT PK | + | title | TEXT | + | link | TEXT | + | date | TEXT | + | date_fmt | TEXT | + | description | TEXT | + | image_link | TEXT | + | image_data | BLOB | + | channel_id | INT FK | + ## Running tests To run tests set the working directory to the project root `rss_reader/` and execute: diff --git a/MikalaiSidarevich/rss_reader/engine/argparser.py b/MikalaiSidarevich/rss_reader/engine/argparser.py index 0bb1bd03..186736a9 100644 --- a/MikalaiSidarevich/rss_reader/engine/argparser.py +++ b/MikalaiSidarevich/rss_reader/engine/argparser.py @@ -26,6 +26,7 @@ def _configure(self): self._parser.add_argument("--json", action='store_true', help="Print result as JSON in stdout") self._parser.add_argument("--verbose", action='store_true', help="Outputs verbose status messages") self._parser.add_argument("--limit", help="Limit news topics if this parameter provided") + self._parser.add_argument("--date", help="Read cached news by date specified like '%%Y%%m%%d'") @property def args(self): diff --git a/MikalaiSidarevich/rss_reader/engine/converter.py b/MikalaiSidarevich/rss_reader/engine/converter.py index f6a07428..1137adde 100644 --- a/MikalaiSidarevich/rss_reader/engine/converter.py +++ b/MikalaiSidarevich/rss_reader/engine/converter.py @@ -9,43 +9,56 @@ class Converter: """RSS data converter to target formats.""" @classmethod - def to_text(cls, feed): + def to_text(cls, feed_list): """ - Convert channel's `feed` data to text format for console output. + Convert `feed_list` data to text format for console output. 
""" - # List of channel data blocks - channel_data = [] + # RSS channels list + channel_list = [] - # Add channel title - channel_data.append(f"\nFeed: {feed['channel']}\n\n") + for feed in feed_list: + # List of channel data blocks + channel_data = [] - for entry in feed['entries']: - # Add enty title block - channel_data.append(f"Title: {entry['title']}\n") + # Add channel title + channel_data.append(f"\nFeed: {feed['channel']}\n\n") - # Add entry date block - if entry['date'] is not None: - channel_data.append(f"Date: {entry['date']}\n") + for entry in feed['entries']: + # Add enty title block + channel_data.append(f"Title: {entry['title']}\n") - # Add entry link block - channel_data.append(f"Link: {entry['link']}\n") + # Add entry date block + if entry['date'] is not None: + channel_data.append(f"Date: {entry['date']}\n") - # Add entry description block - if entry['description'] is not None: - channel_data.append(f"\n{entry['description']}\n\n") + # Add entry link block + channel_data.append(f"Link: {entry['link']}\n") - # Add links list block - channel_data.append(f"\nLinks:\n[1]: {entry['link']} (link)\n") - if entry['image_link'] is not None: - channel_data.append(f"[2]: {entry['image_link']} (image)\n") + # Add entry description block + if entry['description'] is not None: + channel_data.append(f"\n{entry['description']}\n\n") - channel_data.append("\n") + # Add links list block + channel_data.append(f"\nLinks:\n[1]: {entry['link']} (link)\n") + if entry['image_link'] is not None: + channel_data.append(f"[2]: {entry['image_link']} (image)\n") - return ''.join(channel_data) + channel_data.append("\n") + + # Merge channel blocks + channel_list.append(''.join(channel_data)) + + return ''.join(channel_list) @classmethod - def to_json(cls, feed): + def to_json(cls, feed_list): """ - Convert channel's `feed` data to json format. + Convert `feed_list` data to json format. 
""" - return json.dumps(feed, ensure_ascii=False, indent=4) + # Image data & formatted date shouldn't go to json + for feed in feed_list: + for entry in feed['entries']: + del entry['date_fmt'] + del entry['image_data'] + + return json.dumps(feed_list, ensure_ascii=False, indent=4) diff --git a/MikalaiSidarevich/rss_reader/engine/main.py b/MikalaiSidarevich/rss_reader/engine/main.py index bac61581..0a7db834 100644 --- a/MikalaiSidarevich/rss_reader/engine/main.py +++ b/MikalaiSidarevich/rss_reader/engine/main.py @@ -7,7 +7,8 @@ from engine.argparser import ArgParser from engine.rssreader import RssReader -version = '1.2' +version = '1.3' +db = "storage.db" def main(): @@ -41,15 +42,19 @@ def main(): if args['source'] is not None: url = args['source'] + date = None + if args['date'] is not None: + date = args['date'] + except Exception: print("Invalid argument value", flush=True) exit(1) - if url: + if url or date: if verbose: - print(f"URL is set: '{url}'", flush=True) + print(f"URL is set: '{url}', read from {'cache' if date else 'URL'}", flush=True) try: - rss = RssReader(verbose).read_rss(url, limit, json) + rss = RssReader(verbose, db).read_rss(url, limit, json, date) print(rss, flush=True) except Exception as e: print(f"{type(e).__name__}: {e}", flush=True) diff --git a/MikalaiSidarevich/rss_reader/engine/rsscacher.py b/MikalaiSidarevich/rss_reader/engine/rsscacher.py new file mode 100644 index 00000000..3f450d5a --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/rsscacher.py @@ -0,0 +1,239 @@ +""" +RssCacher - SQLite3 database handler, work with cached RSS entries. +""" + +import sqlite3 +import threading + +# Define the lock globally +lock = threading.Lock() + + +class StorageError(Exception): + """ + Cache storage exception. + """ + pass + + +class QueryError(Exception): + """ + Database query exception. 
+ """ + pass + + +class RssCacher: + """SQLite3 database handler, work with cached RSS entries.""" + + def __init__(self, dbname, verbose=False): + """ + Initialize database name and verbosity. + """ + self._dbname = dbname + self._verbose = verbose + + def __enter__(self): + """ + Context manager entry point. + Create tables if they're not exist. + """ + self._connect() + self._create_tables() + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Context manager exit point. + """ + if exc_type: + raise + self._close() + + def _connect(self): + """ + Connect to database & set the cursor. + """ + try: + self._conn = sqlite3.connect(self._dbname, check_same_thread=False) + self._conn.row_factory = sqlite3.Row + self._cur = self._conn.cursor() + except Exception: + raise StorageError(f"Unable to open '{self._dbname}' database") + + def _close(self): + """ + Close database connection. + """ + self._conn.close() + + def _create_tables(self): + """ + Create tables to store RSS data in local storage. + """ + try: + query = """CREATE TABLE IF NOT EXISTS channels ( + id INTEGER PRIMARY KEY, + channel TEXT, + url TEXT)""" + self._cur.execute(query) + + query = """CREATE TABLE IF NOT EXISTS entries ( + id INTEGER PRIMARY KEY, + title TEXT, + link TEXT, + date TEXT, + date_fmt TEXT, + description TEXT, + image_link TEXT, + image_data BLOB, + channel_id INTEGER, + FOREIGN KEY (channel_id) REFERENCES channels (id))""" + self._cur.execute(query) + except Exception as e: + raise StorageError(f"Unable to create tables in '{self._dbname}' database ({e})") + + def store_channel(self, feed): + """ + Save RSS channel data into the `channels` table. + """ + try: + # Check channel duplicate + query = ("SELECT id FROM channels WHERE url=?") + self._cur.execute(query, (feed['url'],)) + row = self._cur.fetchone() + + # Store channel data + if row: + query = "UPDATE channels SET channel=? WHERE url=?" 
+ else: + query = "INSERT INTO channels (channel, url) VALUES (?,?)" + + self._cur.execute(query, (feed['channel'], feed['url'])) + self._conn.commit() + except Exception as e: + raise StorageError(f"Unable to store channel data into '{self._dbname}' ({e})") + + def store_entry(self, entry, channel_id): + """ + Save RSS entry info into the `entries` table. + """ + try: + lock.acquire(True) + + # Check entry duplicate + query = ("SELECT id FROM entries WHERE `link`=?") + self._cur.execute(query, (entry['link'],)) + row = self._cur.fetchone() + + values = [entry['title'], + entry['link'], + entry['date'], + entry['date_fmt'], + entry['description'], + entry['image_link'], + entry['image_data'], + channel_id] + + # Store entry + if row: + query = """UPDATE entries SET + title=?, + link=?, + date=?, + date_fmt=?, + description=?, + image_link=?, + image_data=?, + channel_id=? WHERE link=?""" + values += (entry['link'],) + else: + query = """INSERT INTO entries ( + title, + link, + date, + date_fmt, + description, + image_link, + image_data, + channel_id + ) VALUES (?,?,?,?,?,?,?,?)""" + + self._cur.execute(query, values) + self._conn.commit() + except Exception as e: + raise StorageError(f"Unable to store entry data into '{self._dbname}' ({e})") + finally: + lock.release() + + def get_channel_id(self, url): + """ + Get channel id by channel `url`. + """ + query = ("SELECT id FROM channels WHERE url=?") + self._cur.execute(query, (url,)) + return self._cur.fetchone()['id'] + + def feed(self, url, limit, date): + """ + Get list of RSS feeds from storage. + `url` sets RSS channel, + `limit` sets total number of entries, + `date` sets entries date filter. 
+ """ + query = """SELECT + entries.title as title, + entries.link as link, + entries.date as date, + entries.date_fmt as date_fmt, + entries.description as description, + entries.image_link as image_link, + entries.image_data as image_data, + channels.channel as channel, + channels.url as url + FROM entries JOIN channels ON entries.channel_id=channels.id + WHERE entries.date_fmt=?""" + + # Set url filter + if url is not None: + url = url.lower().rstrip('/') + query += f" AND channels.url='{url}'" + + # Set limit filter + if limit is not None: + query += f" LIMIT {limit}" + + self._cur.execute(query, (date, )) + db_entries_list = self._cur.fetchall() + + # Create feed list + if db_entries_list: + feed_list = [] + + # Place entries by channels + for db_entry in db_entries_list: + entry = {'title': db_entry['title'], + 'link': db_entry['link'], + 'date': db_entry['date'], + 'date_fmt': db_entry['date_fmt'], + 'description': db_entry['description'], + 'image_link': db_entry['image_link'], + 'image_data': db_entry['image_data']} + + for feed in feed_list: + # Add entry to its channel + if db_entry['channel'] in feed.values(): + feed['entries'].append(entry) + break + else: + # Set channel at first occurence + feed = {'channel': db_entry['channel'], + 'url': db_entry['url'], + 'entries': [entry]} + + # Add new feed structure into the result list + feed_list.append(feed) + else: + raise QueryError(f"There are no entries for the date '{date}'") + + return feed_list diff --git a/MikalaiSidarevich/rss_reader/engine/rssparser.py b/MikalaiSidarevich/rss_reader/engine/rssparser.py index 0e5dd439..25211664 100644 --- a/MikalaiSidarevich/rss_reader/engine/rssparser.py +++ b/MikalaiSidarevich/rss_reader/engine/rssparser.py @@ -5,7 +5,7 @@ import threading import xml.dom.minidom from html import unescape -from time import sleep +from time import sleep, strftime, strptime import requests from bs4 import BeautifulSoup @@ -32,21 +32,36 @@ class HtmlParseError(Exception): pass +class 
DateError(Exception): + """ + Date format conversion exception. + """ + pass + + +class ImageError(Exception): + """ + Download image exception. + """ + pass + + class RssParser: """Parser for RSS data.""" - def __init__(self, verbose=False): + def __init__(self, db, verbose=False): """ - Initialize parser with `verbose` ability and setup feed storage. + Initialize parser with `verbose` ability and setup feed storages. """ self._verbose = verbose self._feed = {} + self._db = db def feed(self, url, limit): """ Get parsed RSS feed. """ - self._url = url + self._url = url.lower() self._limit = limit # Get XML document @@ -55,7 +70,7 @@ def feed(self, url, limit): # Parse XML document into the internal storage self._parse_feed() - return self._feed + return [self._feed] def _request_url(self, url): """ @@ -96,6 +111,12 @@ def _parse_feed(self): # Add channel url self._feed['url'] = self._url + # Store channel into db + self._db.store_channel(self._feed) + + # Get channel id to store entries + channel_id = self._db.get_channel_id(self._feed['url']) + # Get list of RSS entries for tag in ['item', 'entry']: item_list = dom.getElementsByTagName(tag) @@ -114,7 +135,7 @@ def _parse_feed(self): for i in range(limit): item = item_list[i] thread = threading.Thread(target=self._get_entry, - args=(i, item)) + args=(i, item, channel_id)) thread.start() # Prevent too fast requests @@ -130,7 +151,7 @@ def _parse_feed(self): if self._verbose: print(f"Total {len(self._feed['entries'])} items processed", flush=True) - def _get_entry(self, n, item): + def _get_entry(self, n, item, channel_id): """ Request `n`-th entry attributes: get title, date, link from XML, description, image by entry's url. 
@@ -140,9 +161,11 @@ def _get_entry(self, n, item): entry = {'title': None, 'date': None, + 'date_fmt': None, 'link': None, 'description': None, - 'image_link': None} + 'image_link': None, + 'image_data': None} # Get entry title try: @@ -164,8 +187,8 @@ def _get_entry(self, n, item): for tag in ['link', 'id']: entry_link_dom = item.getElementsByTagName(tag).item(0) if entry_link_dom.firstChild: - entry['link'] = entry_link_dom.firstChild.nodeValue - entry['link'] = entry['link'].lower() + link = entry_link_dom.firstChild.nodeValue + entry['link'] = link.lower().rstrip('/') break except Exception: if self._verbose: @@ -179,6 +202,7 @@ def _get_entry(self, n, item): entry_date_dom = item.getElementsByTagName(tag).item(0) if entry_date_dom: entry['date'] = entry_date_dom.firstChild.nodeValue + entry['date_fmt'] = self._convert_date(entry['date']) break except Exception: if self._verbose: @@ -189,7 +213,7 @@ def _get_entry(self, n, item): attrs = {'name': 'description', 'property': 'og:description', 'itemprop': 'description'} - entry['description'] = self._get_meta_tag(entry['link'], **attrs) + entry['description'] = self._get_meta_tag(link, **attrs) entry['description'] = unescape(entry['description']).strip() except Exception as e: if self._verbose: @@ -198,7 +222,8 @@ def _get_entry(self, n, item): # Get entry image link try: attrs = {'property': 'og:image'} - entry['image_link'] = self._get_meta_tag(entry['link'], **attrs) + entry['image_link'] = self._get_meta_tag(link, **attrs) + entry['image_data'] = self._download_image(entry['image_link']) except Exception as e: if self._verbose: print(f"Entry #{n} image: {e}", flush=True) @@ -209,6 +234,12 @@ def _get_entry(self, n, item): if self._verbose: print(f"Entry #{n} received", flush=True) + # Store entry into db + self._db.store_entry(entry, channel_id) + + if self._verbose: + print(f"Entry #{n} stored", flush=True) + def _get_meta_tag(self, url, **kwargs): """ Extract meta tags content from `url` specified. 
@@ -226,3 +257,29 @@ def _get_meta_tag(self, url, **kwargs): raise Exception except Exception: raise HtmlParseError(f"Couldn't parse data by URL '{url}'") + + def _convert_date(self, date): + """ + Convert raw input `date` into '%Y%m%d' format. + Recognize 2 formats: "Fri, 17 Jun 2022 02:50:00 GMT"-like and "2022-05-26T00:00:00Z"-like. + """ + try: + # "Fri, 17 Jun 2022 02:50:00 GMT" + date = strptime(date.rsplit(" ", maxsplit=1)[0], "%a, %d %b %Y %X") + except Exception: + try: + # "2022-05-26T00:00:00Z" + date = strptime(date.split('T')[0], "%Y-%m-%d") + except Exception: + raise DateError(f"Unable to extract date from '{date}'") + + return strftime('%Y%m%d', date) + + def _download_image(self, url): + """ + Download image content by `url`. + """ + try: + return self._request_url(url) + except RequestError: + raise ImageError(f"Unable to get image by URL '{url}'") diff --git a/MikalaiSidarevich/rss_reader/engine/rssreader.py b/MikalaiSidarevich/rss_reader/engine/rssreader.py index db356529..603cc807 100644 --- a/MikalaiSidarevich/rss_reader/engine/rssreader.py +++ b/MikalaiSidarevich/rss_reader/engine/rssreader.py @@ -3,33 +3,41 @@ """ from engine.converter import Converter +from engine.rsscacher import RssCacher from engine.rssparser import RssParser class RssReader: """Pure Python command-line RSS reader.""" - def __init__(self, verbose): + def __init__(self, verbose=False, dbname="storage.db"): """ - Initialize reader with `verbose` ability. + Initialize reader with `verbose` ability, storage `dbname`. """ self._verbose = verbose + self._dbname = dbname - def read_rss(self, url, limit, json): + def read_rss(self, url, limit, json, date): """ Get RSS entries from `url` and output them to the stdout. Limit number of entries with `limit`. Output all the entries if `limit` is not specified. + Read news from cache by `date` if specified. 
""" try: - parser = RssParser(self._verbose) - feed = parser.feed(url, limit) + with RssCacher(self._dbname, self._verbose) as cacher: + parser = RssParser(cacher, self._verbose) + + if date is None: + feed_list = parser.feed(url, limit) + else: + feed_list = cacher.feed(url, limit, date) except Exception: raise if json: - rss_content = Converter.to_json(feed) + rss_content = Converter.to_json(feed_list) else: - rss_content = Converter.to_text(feed) + rss_content = Converter.to_text(feed_list) return rss_content diff --git a/MikalaiSidarevich/rss_reader/setup.cfg b/MikalaiSidarevich/rss_reader/setup.cfg index 95ccd91a..166adf42 100644 --- a/MikalaiSidarevich/rss_reader/setup.cfg +++ b/MikalaiSidarevich/rss_reader/setup.cfg @@ -1,11 +1,12 @@ [metadata] name = rss-reader -version = 1.2 +version = 1.3 [options] packages = find: install_requires = beautifulsoup4==4.11.1 + EbookLib==0.17.1 coverage==6.2 lxml==4.8.0 requests==2.26.0 diff --git a/MikalaiSidarevich/rss_reader/tests/test_argparser.py b/MikalaiSidarevich/rss_reader/tests/test_argparser.py index a3703c87..b51b46b5 100644 --- a/MikalaiSidarevich/rss_reader/tests/test_argparser.py +++ b/MikalaiSidarevich/rss_reader/tests/test_argparser.py @@ -22,5 +22,6 @@ def test_args(self): 'version': False, 'json': False, 'verbose': False, - 'limit': None} + 'limit': None, + 'date': None} self.assertDictEqual(self.parser.args, expected) diff --git a/MikalaiSidarevich/rss_reader/tests/test_converter.py b/MikalaiSidarevich/rss_reader/tests/test_converter.py index 7b48ac38..fe2385f9 100644 --- a/MikalaiSidarevich/rss_reader/tests/test_converter.py +++ b/MikalaiSidarevich/rss_reader/tests/test_converter.py @@ -12,22 +12,21 @@ def test_to_text(self): """ to_text() test. 
""" - feed = {'channel': "", - 'entries': [{'title': "", - 'date': "", - 'link': "", - 'description': "", - 'image_link': ""}]} - self.assertIsNotNone(Converter.to_text(feed)) + feed_list = [{'channel': "", + 'entries': [{'title': "", + 'date': "", + 'link': "", + 'description': "", + 'image_link': ""}]}] + self.assertIsNotNone(Converter.to_text(feed_list)) def test_to_json(self): """ to_json() test. """ - feed = {'channel': "", - 'entries': [{'title': "", - 'date': "", - 'link': "", - 'description': "", - 'image_link': ""}]} - self.assertIsNotNone(Converter.to_json(feed)) + feed_list = [{'channel': "", + 'entries': [{'title': "", + 'link': "", + 'date_fmt': "", + 'image_data': ""}]}] + self.assertIsNotNone(Converter.to_json(feed_list)) diff --git a/MikalaiSidarevich/rss_reader/tests/test_rsscacher.py b/MikalaiSidarevich/rss_reader/tests/test_rsscacher.py new file mode 100644 index 00000000..146918c6 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/tests/test_rsscacher.py @@ -0,0 +1,33 @@ +"""Test RssCacher class.""" + +from unittest import TestCase + +from engine.rsscacher import QueryError, RssCacher, StorageError + + +class TestRssCacher(TestCase): + """Testcase for RssCacher class.""" + + def test_store_channel(self): + """ + store_channel() test. + """ + with RssCacher(':memory:') as db: + with self.assertRaises(StorageError): + db.store_channel({}) + + def test_store_entry(self): + """ + store_entry() test. + """ + with RssCacher(':memory:') as db: + with self.assertRaises(StorageError): + db.store_entry({}, 0) + + def test_feed(self): + """ + feed() test. 
+ """ + with RssCacher(':memory:') as db: + with self.assertRaises(QueryError): + db.feed("", 1, None) diff --git a/MikalaiSidarevich/rss_reader/tests/test_rssparser.py b/MikalaiSidarevich/rss_reader/tests/test_rssparser.py index 24d80862..6424b3ba 100644 --- a/MikalaiSidarevich/rss_reader/tests/test_rssparser.py +++ b/MikalaiSidarevich/rss_reader/tests/test_rssparser.py @@ -2,23 +2,21 @@ from unittest import TestCase +from engine.rsscacher import RssCacher from engine.rssparser import RequestError, RssParser class TestRssParser(TestCase): """Testcase for RssParser class.""" - def setUp(self): - """ - Prepare test fixture. - """ - self.parser = RssParser(verbose=False) - def test_feed(self): """ feed() test. """ - with self.assertRaises(RequestError): - self.parser.feed("", 1) + with RssCacher(":memory:", verbose=False) as db: + self.parser = RssParser(db, verbose=False) + + with self.assertRaises(RequestError): + self.parser.feed("", 1) - self.assertIsNotNone(self.parser.feed("https://news.yahoo.com/rss/", 1)) + self.assertIsNotNone(self.parser.feed("https://news.yahoo.com/rss/", 1)) diff --git a/MikalaiSidarevich/rss_reader/tests/test_rssreader.py b/MikalaiSidarevich/rss_reader/tests/test_rssreader.py index eb585859..9d2924f1 100644 --- a/MikalaiSidarevich/rss_reader/tests/test_rssreader.py +++ b/MikalaiSidarevich/rss_reader/tests/test_rssreader.py @@ -20,14 +20,17 @@ def test_read_rss(self, mock_parser_feed): """ read_rss() test. 
""" - mock_parser_feed.return_value = {'channel': "", - 'entries': [{'title': "", - 'date': "", - 'link': "", - 'description': "", - 'image_link': ""}]} + mock_parser_feed.return_value = [{'channel': "", + 'entries': [{'title': "", + 'date': "", + 'date_fmt': "", + 'link': "", + 'description': "", + 'image_link': "", + 'image_data': ""}]}] url = "https://news.yahoo.com/rss/" limit = 1 - self.assertIsNotNone(self.reader.read_rss(url, limit, False)) - self.assertIsNotNone(self.reader.read_rss(url, limit, True)) + date = None + self.assertIsNotNone(self.reader.read_rss(url, limit, False, date)) + self.assertIsNotNone(self.reader.read_rss(url, limit, True, date)) From 6380a5df4fd179abc2ac928c15968a6bff81eab6 Mon Sep 17 00:00:00 2001 From: Nickolai Date: Thu, 30 Jun 2022 22:07:45 +0300 Subject: [PATCH 4/5] [Iteration 4] Format converter. --- MikalaiSidarevich/rss_reader/README.md | 52 +++++-- .../rss_reader/engine/argparser.py | 2 + .../rss_reader/engine/converter.py | 127 ++++++++++++++++++ MikalaiSidarevich/rss_reader/engine/main.py | 12 +- .../rss_reader/engine/rssreader.py | 11 +- MikalaiSidarevich/rss_reader/requirements.txt | 1 + MikalaiSidarevich/rss_reader/setup.cfg | 2 +- .../rss_reader/tests/test_argparser.py | 4 +- .../rss_reader/tests/test_converter.py | 35 +++++ .../rss_reader/tests/test_rssreader.py | 14 +- 10 files changed, 242 insertions(+), 18 deletions(-) diff --git a/MikalaiSidarevich/rss_reader/README.md b/MikalaiSidarevich/rss_reader/README.md index c9de2b5a..9618828c 100644 --- a/MikalaiSidarevich/rss_reader/README.md +++ b/MikalaiSidarevich/rss_reader/README.md @@ -15,6 +15,7 @@ To be specific this readme has decided to use the name `python`. 
All extra packages listed in the `requirements.txt`: - [**`beautifulsoup4`**](https://pypi.org/project/beautifulsoup4/) `4.11.1` — Screen-scraping library +- [**`EbookLib`**](https://pypi.org/project/EbookLib/) `0.17.1` — Ebook library which can handle EPUB2/EPUB3 and Kindle format - [**`coverage`**](https://pypi.org/project/coverage/) `6.2` — Code coverage measurement for Python - [**`lxml`**](https://pypi.org/project/lxml/) `4.8.0` — Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API - [**`requests`**](https://pypi.org/project/requests/) `2.26.0` — Python HTTP for Humans @@ -40,20 +41,22 @@ The utility can handle multiple arguments. To show help message below use `-h/--help` argument. ```sh -usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [source] +usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [--to-epub PATH] [--to-html PATH] [source] Pure Python command-line RSS reader. 
positional arguments: - source RSS URL + source RSS URL optional arguments: - -h, --help show this help message and exit - --version Print version info - --json Print result as JSON in stdout - --verbose Outputs verbose status messages - --limit LIMIT Limit news topics if this parameter provided - --date DATE Read cached news by date specified like '%Y%m%d' + -h, --help show this help message and exit + --version Print version info + --json Print result as JSON in stdout + --verbose Outputs verbose status messages + --limit LIMIT Limit news topics if this parameter provided + --date DATE Read cached news by date specified like '%Y%m%d' + --to-epub PATH Convert news to epub format + --to-html PATH Convert news to HTML format ``` ## Examples: @@ -63,13 +66,13 @@ Set the working directory to the project root `rss_reader/` and execute: - Show utility version: ```sh > python rss_reader.py --version - Version 1.3 + Version 1.4 ``` - Show utility version using CLI utility installed: ```sh > rss_reader --version - Version 1.3 + Version 1.4 ``` - Read 1 news entry from [Yahoo](https://news.yahoo.com/) source: @@ -111,6 +114,11 @@ Set the working directory to the project root `rss_reader/` and execute: ``` + +- Export all the news entries from [Yahoo](https://news.yahoo.com/) source to the `yahoo.epub` file in the working directory: + ```sh + > python rss_reader.py https://news.yahoo.com/rss/ --to-epub yahoo.epub + ``` ## JSON format @@ -135,6 +143,30 @@ The structure is shown below: ] ``` +## Export formats + +The utility can export the feed into HTML and Epub formats, use the `--to-html` or `--to-epub` options, respectively. + +### HTML format + +When export to HTML format the utility saves HTML page into the filepath specified and pictures data into a separate folder named like HTML filepath with suffix ` - images`. 
+ +- Produce `yahoo.html` file and `yahoo.html - images/` directory with images in the `~/rss/` directory: + + ```sh + > python rss_reader.py https://news.yahoo.com/rss/ --to-html ~/rss/yahoo.html + ``` + +### Epub format + +When exporting to Epub format the utility saves Epub file into the filepath specified with pictures data encapsulated. + +- Produce `yahoo.epub` in the `~/rss/` directory: + + ```sh + > python rss_reader.py https://news.yahoo.com/rss/ --to-epub ~/rss/yahoo.epub + ``` + +## Feed cache RSS feed is cached while reading. diff --git a/MikalaiSidarevich/rss_reader/engine/argparser.py b/MikalaiSidarevich/rss_reader/engine/argparser.py index 186736a9..ae4085db 100644 --- a/MikalaiSidarevich/rss_reader/engine/argparser.py +++ b/MikalaiSidarevich/rss_reader/engine/argparser.py @@ -27,6 +27,8 @@ def _configure(self): self._parser.add_argument("--verbose", action='store_true', help="Outputs verbose status messages") self._parser.add_argument("--limit", help="Limit news topics if this parameter provided") self._parser.add_argument("--date", help="Read cached news by date specified like '%%Y%%m%%d'") + self._parser.add_argument("--to-epub", metavar="PATH", help="Convert news to epub format") + self._parser.add_argument("--to-html", metavar="PATH", help="Convert news to HTML format") @property def args(self): diff --git a/MikalaiSidarevich/rss_reader/engine/converter.py b/MikalaiSidarevich/rss_reader/engine/converter.py index 1137adde..c0442e48 100644 --- a/MikalaiSidarevich/rss_reader/engine/converter.py +++ b/MikalaiSidarevich/rss_reader/engine/converter.py @@ -3,6 +3,16 @@ """ import json +import os + +from ebooklib import epub + + +class FileError(Exception): + """ + Save file exception. + """ + pass class Converter: @@ -62,3 +72,120 @@ def to_json(cls, feed_list): del entry['image_data'] return json.dumps(feed_list, ensure_ascii=False, indent=4) + + @classmethod + def save_html(cls, feeds, path): + """ + Save `feeds` data into html file with images. 
+ """ + # Separate path and filename + dir, fname = os.path.split(path) + + # Set dirname for images + img_dname = f"{fname} - images" + + try: + # Create directory for images + if not os.path.exists(os.path.join(dir, img_dname)): + os.mkdir(os.path.join(dir, img_dname)) + + with open(path, "w", encoding='utf-8') as f: + content = "" + for feed in feeds: + # Add channel title + content += f"""

{feed['channel']}

""" + + for entry in feed['entries']: + # Set entry title & link + header = f"""

{entry['title']}

""" + + if entry['date']: + header += f"

{entry['date']}

" + + # Set entry description + decription = "" + + if entry['description']: + decription = f"""

{entry['description']}

""" + + image_fname = str(hash(entry['image_link'])) + + # Save image + image = "" + + if entry['image_data']: + with open(os.path.join(dir, img_dname, image_fname), "wb") as img: + img.write(entry['image_data']) + image = f"""""" + + # Merge components + content += f"{header}{image}{decription}" + + # Add channels delimiter + content += "
" + + html_template = """ + {} + {} + """ + + f.write(html_template.strip().format("RSS news", content)) + + except PermissionError: + raise FileError(f"Permission denied for '{path}'") + except Exception: + raise FileError(f"Unable to save html into '{path}'") + + @classmethod + def save_epub(cls, feeds, path): + """ + Save `feeds` data into epub file. + """ + book = epub.EpubBook() + book.set_title('RSS news') + + chapters = [] + for i, feed in enumerate(feeds): + # Set channel title & url + chapter = epub.EpubHtml(title=f"Channel {i}", file_name=f"ch_{i}.xhtml") + chapter.content = f"""

{feed['channel']}

""" + + for entry in feed['entries']: + # Set entry title & link + header = f"""

{entry['title']}

""" + + if entry['date']: + header += f"

{entry['date']}

" + + # Set entry description + decription = "" + + if entry['description']: + decription = f"""

{entry['description']}

""" + + # Add image + image = "" + + if entry['image_data']: + image_fname = str(hash(entry['image_link'])) + img = epub.EpubItem(file_name=image_fname, media_type="ITEM_IMAGE") + img.content = entry['image_data'] + book.add_item(img) + image = f"""
""" + + # Merge components + chapter.content += f"{header}{image}{decription}" + + chapters.append(chapter) + book.add_item(chapter) + + book.spine = chapters + + try: + writer = epub.EpubWriter(path, book, {}) + writer.process() + writer.write() + except PermissionError: + raise FileError(f"Permission denied for '{path}'") + except Exception: + raise FileError(f"Unable to save epub into '{path}'") diff --git a/MikalaiSidarevich/rss_reader/engine/main.py b/MikalaiSidarevich/rss_reader/engine/main.py index 0a7db834..805124f7 100644 --- a/MikalaiSidarevich/rss_reader/engine/main.py +++ b/MikalaiSidarevich/rss_reader/engine/main.py @@ -7,7 +7,7 @@ from engine.argparser import ArgParser from engine.rssreader import RssReader -version = '1.3' +version = '1.4' db = "storage.db" @@ -46,6 +46,14 @@ def main(): if args['date'] is not None: date = args['date'] + html_path = None + if args['to_html'] is not None: + html_path = args['to_html'] + + epub_path = None + if args['to_epub'] is not None: + epub_path = args['to_epub'] + except Exception: print("Invalid argument value", flush=True) exit(1) @@ -54,7 +62,7 @@ def main(): if verbose: print(f"URL is set: '{url}', read from {'cache' if date else 'URL'}", flush=True) try: - rss = RssReader(verbose, db).read_rss(url, limit, json, date) + rss = RssReader(verbose, db).read_rss(url, limit, json, date, html_path, epub_path) print(rss, flush=True) except Exception as e: print(f"{type(e).__name__}: {e}", flush=True) diff --git a/MikalaiSidarevich/rss_reader/engine/rssreader.py b/MikalaiSidarevich/rss_reader/engine/rssreader.py index 603cc807..50ca3606 100644 --- a/MikalaiSidarevich/rss_reader/engine/rssreader.py +++ b/MikalaiSidarevich/rss_reader/engine/rssreader.py @@ -17,7 +17,7 @@ def __init__(self, verbose=False, dbname="storage.db"): self._verbose = verbose self._dbname = dbname - def read_rss(self, url, limit, json, date): + def read_rss(self, url, limit, json, date, html_path, epub_path): """ Get RSS entries from `url` and 
output them to the stdout. Limit number of entries with `limit`. @@ -35,6 +35,15 @@ def read_rss(self, url, limit, json, date): except Exception: raise + if html_path: + Converter.save_html(feed_list, html_path) + + if epub_path: + Converter.save_epub(feed_list, epub_path) + + # Conversions that require the full feed structure should be done before here + # Json conversion makes removes from feeds + if json: rss_content = Converter.to_json(feed_list) else: diff --git a/MikalaiSidarevich/rss_reader/requirements.txt b/MikalaiSidarevich/rss_reader/requirements.txt index 3cbf33f2..15b12c51 100644 --- a/MikalaiSidarevich/rss_reader/requirements.txt +++ b/MikalaiSidarevich/rss_reader/requirements.txt @@ -1,4 +1,5 @@ beautifulsoup4==4.11.1 +EbookLib==0.17.1 coverage==6.2 lxml==4.8.0 requests==2.26.0 diff --git a/MikalaiSidarevich/rss_reader/setup.cfg b/MikalaiSidarevich/rss_reader/setup.cfg index 166adf42..0d64e66a 100644 --- a/MikalaiSidarevich/rss_reader/setup.cfg +++ b/MikalaiSidarevich/rss_reader/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = rss-reader -version = 1.3 +version = 1.4 [options] packages = find: diff --git a/MikalaiSidarevich/rss_reader/tests/test_argparser.py b/MikalaiSidarevich/rss_reader/tests/test_argparser.py index b51b46b5..8a13d954 100644 --- a/MikalaiSidarevich/rss_reader/tests/test_argparser.py +++ b/MikalaiSidarevich/rss_reader/tests/test_argparser.py @@ -23,5 +23,7 @@ def test_args(self): 'json': False, 'verbose': False, 'limit': None, - 'date': None} + 'date': None, + 'to_html': None, + 'to_epub': None} self.assertDictEqual(self.parser.args, expected) diff --git a/MikalaiSidarevich/rss_reader/tests/test_converter.py b/MikalaiSidarevich/rss_reader/tests/test_converter.py index fe2385f9..e62512a8 100644 --- a/MikalaiSidarevich/rss_reader/tests/test_converter.py +++ b/MikalaiSidarevich/rss_reader/tests/test_converter.py @@ -1,6 +1,7 @@ """Test Converter class.""" from unittest import TestCase +from unittest.mock import patch from engine.converter 
import Converter @@ -30,3 +31,37 @@ def test_to_json(self): 'date_fmt': "", 'image_data': ""}]}] self.assertIsNotNone(Converter.to_json(feed_list)) + + @patch('builtins.open') + def test_save_html(self, mock_open): + """ + save_html() test. + """ + feeds = [{'channel': "", + 'url': "", + 'entries': [{'title': "", + 'date': "", + 'date_fmt': "", + 'link': "", + 'description': "", + 'image_link': "", + 'image_data': b""}]}] + with self.assertRaises(Exception): + Converter.save_html(feeds, "1/") + + @patch('ebooklib.epub.EpubWriter.write') + def test_save_epub(self, mock_write): + """ + save_epub() test. + """ + feeds = [{'channel': "", + 'url': "", + 'entries': [{'title': "", + 'date': "", + 'date_fmt': "", + 'link': "", + 'description': "", + 'image_link': "", + 'image_data': b""}]}] + Converter.save_epub(feeds, "") + mock_write.assert_called_once() diff --git a/MikalaiSidarevich/rss_reader/tests/test_rssreader.py b/MikalaiSidarevich/rss_reader/tests/test_rssreader.py index 9d2924f1..0d0acd78 100644 --- a/MikalaiSidarevich/rss_reader/tests/test_rssreader.py +++ b/MikalaiSidarevich/rss_reader/tests/test_rssreader.py @@ -13,7 +13,7 @@ def setUp(self): """ Prepare test fixture. 
""" - self.reader = RssReader(verbose=False) + self.reader = RssReader(verbose=False, dbname=':memory:') @patch('engine.rssparser.RssParser.feed') def test_read_rss(self, mock_parser_feed): @@ -32,5 +32,13 @@ def test_read_rss(self, mock_parser_feed): url = "https://news.yahoo.com/rss/" limit = 1 date = None - self.assertIsNotNone(self.reader.read_rss(url, limit, False, date)) - self.assertIsNotNone(self.reader.read_rss(url, limit, True, date)) + html_path = None + epub_path = None + + self.assertIsNotNone(self.reader.read_rss(url, limit, False, date, html_path, epub_path)) + self.assertIsNotNone(self.reader.read_rss(url, limit, True, date, html_path, epub_path)) + + with self.assertRaises(Exception): + self.reader.read_rss(url, limit, False, date, '/', epub_path) + with self.assertRaises(Exception): + self.reader.read_rss(url, limit, False, date, html_path, '/') From 506cdcd551a9de3a5bea113f928b5cb67e53c1ef Mon Sep 17 00:00:00 2001 From: Nickolai Date: Thu, 30 Jun 2022 22:16:37 +0300 Subject: [PATCH 5/5] [Iteration 5] Output colorization. 
--- MikalaiSidarevich/rss_reader/.colorize.conf | 11 ++++++++++ MikalaiSidarevich/rss_reader/README.md | 22 ++++++++++++++++--- .../rss_reader/engine/argparser.py | 1 + MikalaiSidarevich/rss_reader/engine/main.py | 16 +++++++++++++- MikalaiSidarevich/rss_reader/requirements.txt | 1 + MikalaiSidarevich/rss_reader/setup.cfg | 3 ++- .../rss_reader/tests/test_argparser.py | 3 ++- 7 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 MikalaiSidarevich/rss_reader/.colorize.conf diff --git a/MikalaiSidarevich/rss_reader/.colorize.conf b/MikalaiSidarevich/rss_reader/.colorize.conf new file mode 100644 index 00000000..c60a149e --- /dev/null +++ b/MikalaiSidarevich/rss_reader/.colorize.conf @@ -0,0 +1,11 @@ +"(?<=^Feed: ).*$", 1, magenta, +"(?<=""channel"": "")[^""]*", 1, magenta, +"(?<=^Title: ).*$", 1, green, +"(?<=""title"": "")[^""]*", 1, green, +"(?<=^Date: ).*$", 0, brown, +"(?<=""raw_date"": "")[^""]*", 0, brown, +"http[s]?\:\/\/[^\s,'""]*", 0, blue, +"http[s]?\:\/\/[\S]*", 0, blue, +"^\w*(Error|Exception)", 0, white, red +"^Entry #\d+ (description|image): .* ", 0, red, +"^Entry #\d+ has no .+", 0, red, diff --git a/MikalaiSidarevich/rss_reader/README.md b/MikalaiSidarevich/rss_reader/README.md index 9618828c..f539f5f7 100644 --- a/MikalaiSidarevich/rss_reader/README.md +++ b/MikalaiSidarevich/rss_reader/README.md @@ -16,6 +16,7 @@ All extra packages listed in the `requirements.txt`: - [**`beautifulsoup4`**](https://pypi.org/project/beautifulsoup4/) `4.11.1` — Screen-scraping library - [**`EbookLib`**](https://pypi.org/project/EbookLib/) `0.17.1` — Ebook library which can handle EPUB2/EPUB3 and Kindle format +- [**`colorize`**](https://pypi.org/project/colorize/) `1.1.0` — Command line utility to colorize other commands output - [**`coverage`**](https://pypi.org/project/coverage/) `6.2` — Code coverage measurement for Python - [**`lxml`**](https://pypi.org/project/lxml/) `4.8.0` — Powerful and Pythonic XML processing library combining libxml2/libxslt 
with the ElementTree API - [**`requests`**](https://pypi.org/project/requests/) `2.26.0` — Python HTTP for Humans @@ -41,7 +42,7 @@ The utility can handle multiple arguments. To show help message below use `-h/--help` argument. ```sh -usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [--to-epub PATH] [--to-html PATH] [source] +usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [--to-epub PATH] [--to-html PATH] [--colorize] [source] Pure Python command-line RSS reader. @@ -57,6 +58,7 @@ optional arguments: --date DATE Read cached news by date specified like '%Y%m%d' --to-epub PATH Convert news to epub format --to-html PATH Convert news to HTML format + --colorize Colorize console output ``` ## Examples: @@ -66,13 +68,13 @@ Set the working directory to the project root `rss_reader/` and execute: - Show utility version: ```sh > python rss_reader.py --version - Version 1.4 + Version 1.5 ``` - Show utility version using CLI utility installed: ```sh > rss_reader --version - Version 1.4 + Version 1.5 ``` - Read 1 news entry from [Yahoo](https://news.yahoo.com/) source: @@ -195,6 +197,20 @@ Cache storage is the SQLite3 database, it contains 2 data tables: `channels` and | image_data | BLOB | | channel_id | INT FK | +## Colorized output* + +The utility provides colorized console output with `--colorize` option set. 
+ +| entity | color | | +| ----------- | ------- | ----------------------------------------------------------- | +| feed title | magenta | ![FF0090](https://via.placeholder.com/15/FF0090/FF0090.png) | +| entry title | cyan | ![00FFFF](https://via.placeholder.com/15/00FFFF/00FFFF.png) | +| entry date | yellow | ![FFFF00](https://via.placeholder.com/15/FFFF00/FFFF00.png) | +| links | blue | ![0000FF](https://via.placeholder.com/15/0000FF/0000FF.png) | +| errors | red | ![FF0000](https://via.placeholder.com/15/FF0000/FF0000.png) | + +**Colorized mode is enabled on Linux only.* + ## Running tests To run tests set the working directory to the project root `rss_reader/` and execute: diff --git a/MikalaiSidarevich/rss_reader/engine/argparser.py b/MikalaiSidarevich/rss_reader/engine/argparser.py index ae4085db..7bc3d175 100644 --- a/MikalaiSidarevich/rss_reader/engine/argparser.py +++ b/MikalaiSidarevich/rss_reader/engine/argparser.py @@ -29,6 +29,7 @@ def _configure(self): self._parser.add_argument("--date", help="Read cached news by date specified like '%%Y%%m%%d'") self._parser.add_argument("--to-epub", metavar="PATH", help="Convert news to epub format") self._parser.add_argument("--to-html", metavar="PATH", help="Convert news to HTML format") + self._parser.add_argument("--colorize", action='store_true', help="Colorize console output") @property def args(self): diff --git a/MikalaiSidarevich/rss_reader/engine/main.py b/MikalaiSidarevich/rss_reader/engine/main.py index 805124f7..9666d010 100644 --- a/MikalaiSidarevich/rss_reader/engine/main.py +++ b/MikalaiSidarevich/rss_reader/engine/main.py @@ -2,12 +2,13 @@ main() - entry point: parse CLI arguments & run RSS reading. 
""" +import subprocess import sys from engine.argparser import ArgParser from engine.rssreader import RssReader -version = '1.4' +version = '1.5' db = "storage.db" @@ -54,6 +55,16 @@ def main(): if args['to_epub'] is not None: epub_path = args['to_epub'] + colorizer = None + if args['colorize']: + if sys.platform == "linux": + colorizer = subprocess.Popen("colorize", + stdin=subprocess.PIPE, + encoding='utf-8') + sys.stdout = colorizer.stdin + else: + print("Unable to set colorized mode, set to normal mode.", flush=True) + except Exception: print("Invalid argument value", flush=True) exit(1) @@ -67,6 +78,9 @@ def main(): except Exception as e: print(f"{type(e).__name__}: {e}", flush=True) exit(1) + finally: + if colorizer: + colorizer.communicate() else: arg_parser.print_help() diff --git a/MikalaiSidarevich/rss_reader/requirements.txt b/MikalaiSidarevich/rss_reader/requirements.txt index 15b12c51..403be712 100644 --- a/MikalaiSidarevich/rss_reader/requirements.txt +++ b/MikalaiSidarevich/rss_reader/requirements.txt @@ -1,5 +1,6 @@ beautifulsoup4==4.11.1 EbookLib==0.17.1 +colorize==1.1.0 coverage==6.2 lxml==4.8.0 requests==2.26.0 diff --git a/MikalaiSidarevich/rss_reader/setup.cfg b/MikalaiSidarevich/rss_reader/setup.cfg index 0d64e66a..5db3e7e4 100644 --- a/MikalaiSidarevich/rss_reader/setup.cfg +++ b/MikalaiSidarevich/rss_reader/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = rss-reader -version = 1.4 +version = 1.5 [options] packages = find: @@ -8,6 +8,7 @@ install_requires = beautifulsoup4==4.11.1 EbookLib==0.17.1 coverage==6.2 + colorize==1.1.0 lxml==4.8.0 requests==2.26.0 diff --git a/MikalaiSidarevich/rss_reader/tests/test_argparser.py b/MikalaiSidarevich/rss_reader/tests/test_argparser.py index 8a13d954..a0ee8aa7 100644 --- a/MikalaiSidarevich/rss_reader/tests/test_argparser.py +++ b/MikalaiSidarevich/rss_reader/tests/test_argparser.py @@ -25,5 +25,6 @@ def test_args(self): 'limit': None, 'date': None, 'to_html': None, - 'to_epub': None} + 'to_epub': None, + 
'colorize': False} self.assertDictEqual(self.parser.args, expected)