diff --git a/MikalaiSidarevich/rss_reader/.colorize.conf b/MikalaiSidarevich/rss_reader/.colorize.conf new file mode 100644 index 00000000..c60a149e --- /dev/null +++ b/MikalaiSidarevich/rss_reader/.colorize.conf @@ -0,0 +1,11 @@ +"(?<=^Feed: ).*$", 1, magenta, +"(?<=""channel"": "")[^""]*", 1, magenta, +"(?<=^Title: ).*$", 1, green, +"(?<=""title"": "")[^""]*", 1, green, +"(?<=^Date: ).*$", 0, brown, +"(?<=""raw_date"": "")[^""]*", 0, brown, +"http[s]?\:\/\/[^\s,'""]*", 0, blue, +"http[s]?\:\/\/[\S]*", 0, blue, +"^\w*(Error|Exception)", 0, white, red +"^Entry #\d+ (description|image): .* ", 0, red, +"^Entry #\d+ has no .+", 0, red, diff --git a/MikalaiSidarevich/rss_reader/README.md b/MikalaiSidarevich/rss_reader/README.md new file mode 100644 index 00000000..f539f5f7 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/README.md @@ -0,0 +1,227 @@ +# RSS reader + +Pure Python command-line RSS reader. + +## Requirements + +The utility requires [**`python 3.9`**](https://www.python.org/downloads/) interpreter with [**`pip`**](https://pypi.org/project/pip/) installing tool, [**`setuptools`**](https://pypi.org/project/setuptools/) installing tool. + +On the command line the interpreter can be typed as `python`, `python3`, `python3.9` (depending on OS, version, etc.). + +To be specific this readme has decided to use the name `python`. 
+ +## Dependencies + +All extra packages listed in the `requirements.txt`: + +- [**`beautifulsoup4`**](https://pypi.org/project/beautifulsoup4/) `4.11.1` — Screen-scraping library +- [**`EbookLib`**](https://pypi.org/project/EbookLib/) `0.17.1` — Ebook library which can handle EPUB2/EPUB3 and Kindle format +- [**`colorize`**](https://pypi.org/project/colorize/) `1.1.0` — Command line utility to colorize other commands output +- [**`coverage`**](https://pypi.org/project/coverage/) `6.2` — Code coverage measurement for Python +- [**`lxml`**](https://pypi.org/project/lxml/) `4.8.0` — Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API +- [**`requests`**](https://pypi.org/project/requests/) `2.26.0` — Python HTTP for Humans + +To install extra packages automatically set the working directory to the project root `rss_reader/` and execute*: + +```sh +> python -m pip install -r requirements.txt +``` + +It's also possible to setup package via **`setuptools`**: + +```sh +> python setup.py install +``` + +**Super user privileges may be required to install extra packages. If so, then use* `sudo` *command on Linux or run terminal as administrator on Windows.* + +## Usage + +The utility can handle multiple arguments. + +To show help message below use `-h/--help` argument. + +```sh +usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] [--date DATE] [--to-epub PATH] [--to-html PATH] [--colorize] [source] + +Pure Python command-line RSS reader. 
+ +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --version Print version info + --json Print result as JSON in stdout + --verbose Outputs verbose status messages + --limit LIMIT Limit news topics if this parameter provided + --date DATE Read cached news by date specified like '%Y%m%d' + --to-epub PATH Convert news to epub format + --to-html PATH Convert news to HTML format + --colorize Colorize console output +``` + +## Examples: + +Set the working directory to the project root `rss_reader/` and execute: + +- Show utility version: + ```sh + > python rss_reader.py --version + Version 1.5 + ``` + +- Show utility version using CLI utility installed: + ```sh + > rss_reader --version + Version 1.5 + ``` + +- Read 1 news entry from [Yahoo](https://news.yahoo.com/) source: + ```sh + > python rss_reader.py https://news.yahoo.com/rss/ --limit 1 + + Feed: Yahoo News - Latest News & Headlines + + Title: WNBA star Brittney Griner ordered to trial Friday in Russia + Date: 2022-06-27T07:41:55Z + Link: https://news.yahoo.com/us-basketball-star-griner-due-074155275.html + + Shackled and looking wary, WNBA star Brittney Griner was ordered Monday to stand trial by a court near Moscow on cannabis possession charges, about 4 1/2 months after her arrest at an airport while returning to play for a Russian team. The Phoenix Mercury center and two-time U.S. Olympic gold medalist also was ordered to remain in custody for the duration of her criminal trial, which was to begin Friday. Griner could face 10 years in prison if convicted on charges of large-scale transportation of drugs. 
+ + + Links: + [1]: https://news.yahoo.com/us-basketball-star-griner-due-074155275.html (link) + [2]: https://s.yimg.com/ny/api/res/1.2/utfVa4Ach8UgXMMZREmJhg--/YXBwaWQ9aGlnaGxhbmRlcjt3PTEyMDA7aD04MDA-/https://s.yimg.com/uu/api/res/1.2/1THMVZDeZ0z7PVXchxklYw--~B/aD00MDAwO3c9NjAwMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/ap.org/9f3f122d6d91ff94f9613b5d97409f0f (image) + + + ``` + +- Read from cache 1 news entry for the date `'20220628'` (requires previously stored data): + ```sh + > python rss_reader.py --date 20220628 --limit 1 + + Feed: Yahoo News - Latest News & Headlines + + Title: Spit, 'disrespect' arrive at Wimbledon as tennis turns ugly + Date: 2022-06-28T22:01:51Z + Link: https://news.yahoo.com/spit-disrespect-arrive-wimbledon-tennis-220151441.html + + This is not what one thinks of when pondering the supposedly genteel roots of tennis, and the purportedly proper atmosphere at dates-to-the-1800s Wimbledon, a country club sport being contested at a place officially called the All England Lawn Tennis Club: a player, Nick Kyrgios, capping a first-round victory Tuesday by spitting in the direction of a spectator he said was hassling him. Like, he literally came to the match to literally just not even support anyone, really. During the match, which filled the stands at 1,980-seat Court No. 3 — and attracted lengthy lines of folks hoping to eventually be let in, likely owing to the popularity of the anything-can-happen Kyrgios, a 27-year-old from Australia, and the involvement of a local player — Kyrgios asked, without success, to have the fan removed for cursing and sending other verbal abuse his way. 
+ + + Links: + [1]: https://news.yahoo.com/spit-disrespect-arrive-wimbledon-tennis-220151441.html (link) + [2]: https://s.yimg.com/ny/api/res/1.2/7Oybi_h9sBCC7gjex3GADQ--/YXBwaWQ9aGlnaGxhbmRlcjt3PTEyMDA7aD04MDA-/https://s.yimg.com/uu/api/res/1.2/Kn3F_gIJwe0a3uIOU.Tb2w--~B/aD0yMzgxO3c9MzU3MTthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/ap.org/4a35cff443aaabc2b49d94a5e7672369 (image) + + + ``` + +- Export all the news entries from [Yahoo](https://news.yahoo.com/) source to the `yahoo.epub` file in the working directory: + ```sh + > python rss_reader.py https://news.yahoo.com/rss/ --to-epub yahoo.epub + ``` + +## JSON format + +The utility can export the feed into JSON format for console output. +The structure is shown below: + +```python +[ + { + "channel": "Channel title", + "url": "Channel URL", + "entries": [ + { + "title": "Entry title", + "date": "Entry publish date", + "link": "Entry link", + "description": "Entry description", + "image_link": "Entry image link" + } + ] + } +] +``` + +## Export formats + +The utility can export the feed into HTML and Epub formats, use the `--to-html` or `--to-epub` options, respectively. + +### HTML format + +When export to HTML format the utility saves HTML page into the filepath specified and pictures data into a separate folder named like HTML filepath with suffix ` - images`. + +- Produce `yahoo.html` file and `yahoo.html - images/` directory with images in the `~/rss/` directory: + + ```sh + > python rss_reader.py https://news.yahoo.com/rss/ --to-html ~/rss/yahoo.html + ``` + +### Epub format + +When export to Epub format the utility saves Epub file into the filepath specified with pictures data incapsulated. + +- Produce `yahoo.epub` in the `~/rss/` directory: + + ```sh + > python rss_reader.py https://news.yahoo.com/rss/ --to-html ~/rss/yahoo.html + ``` + +## Feed cache + +RSS feed is cached while reading. + +Cache storage is the SQLite3 database, it contains 2 data tables: `channels` and `entries`. 
+ +- Table `channels` schema: + + | column | type | + | ------- | ------ | + | id | INT PK | + | channel | TEXT | + | url | TEXT | + +- Table `entries` schema: + + | column | type | + | ----------- | ------ | + | id | INT PK | + | title | TEXT | + | link | TEXT | + | date | TEXT | + | date_fmt | TEXT | + | description | TEXT | + | image_link | TEXT | + | image_data | BLOB | + | channel_id | INT FK | + +## Colorized output* + +The utility provides colorized console output with `--colorize` option set. + +| entity | color | | +| ----------- | ------- | ----------------------------------------------------------- | +| feed title | magenta | ![FF0090](https://via.placeholder.com/15/FF0090/FF0090.png) | +| entry title | cyan | ![00FFFF](https://via.placeholder.com/15/00FFFF/00FFFF.png) | +| entry date | yellow | ![FFFF00](https://via.placeholder.com/15/FFFF00/FFFF00.png) | +| links | blue | ![0000FF](https://via.placeholder.com/15/0000FF/0000FF.png) | +| errors | red | ![FF0000](https://via.placeholder.com/15/FF0000/FF0000.png) | + +**Colorized mode is enabled on Linux only.* + +## Running tests + +To run tests set the working directory to the project root `rss_reader/` and execute: + +```sh +> python -m unittest +``` + +To run test coverage checking set the working directory to the project root `rss_reader/` and execute: + +```sh +> python -m coverage run -m unittest +> python -m coverage report --include=engine/* +``` diff --git a/MikalaiSidarevich/rss_reader/engine/__init__.py b/MikalaiSidarevich/rss_reader/engine/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/MikalaiSidarevich/rss_reader/engine/argparser.py b/MikalaiSidarevich/rss_reader/engine/argparser.py new file mode 100644 index 00000000..7bc3d175 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/argparser.py @@ -0,0 +1,45 @@ +""" +ArgParser - parser for CLI arguments. 
+""" + +import argparse +import sys + + +class ArgParser: + """Parser for CLI arguments based on `argparse.ArgumentParser` parser.""" + + def __init__(self): + """ + Initialize parser with configured settings. + """ + description = "Pure Python command-line RSS reader." + self._parser = argparse.ArgumentParser(description=description) + self._configure() + + def _configure(self): + """ + Configure parser - add CLI arguments. + """ + self._parser.add_argument("source", nargs='?', help="RSS URL") + self._parser.add_argument("--version", action='store_true', help="Print version info") + self._parser.add_argument("--json", action='store_true', help="Print result as JSON in stdout") + self._parser.add_argument("--verbose", action='store_true', help="Outputs verbose status messages") + self._parser.add_argument("--limit", help="Limit news topics if this parameter provided") + self._parser.add_argument("--date", help="Read cached news by date specified like '%%Y%%m%%d'") + self._parser.add_argument("--to-epub", metavar="PATH", help="Convert news to epub format") + self._parser.add_argument("--to-html", metavar="PATH", help="Convert news to HTML format") + self._parser.add_argument("--colorize", action='store_true', help="Colorize console output") + + @property + def args(self): + """ + Get CLI arguments in a dictionary format. + """ + return vars(self._parser.parse_args(sys.argv[1:])) + + def print_help(self): + """ + Print help message to console. + """ + self._parser.print_help() diff --git a/MikalaiSidarevich/rss_reader/engine/converter.py b/MikalaiSidarevich/rss_reader/engine/converter.py new file mode 100644 index 00000000..c0442e48 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/converter.py @@ -0,0 +1,191 @@ +""" +Converter - RSS data converter to target formats. +""" + +import json +import os + +from ebooklib import epub + + +class FileError(Exception): + """ + Save file exception. 
+ """ + pass + + +class Converter: + """RSS data converter to target formats.""" + + @classmethod + def to_text(cls, feed_list): + """ + Convert `feed_list` data to text format for console output. + """ + # RSS channels list + channel_list = [] + + for feed in feed_list: + # List of channel data blocks + channel_data = [] + + # Add channel title + channel_data.append(f"\nFeed: {feed['channel']}\n\n") + + for entry in feed['entries']: + # Add enty title block + channel_data.append(f"Title: {entry['title']}\n") + + # Add entry date block + if entry['date'] is not None: + channel_data.append(f"Date: {entry['date']}\n") + + # Add entry link block + channel_data.append(f"Link: {entry['link']}\n") + + # Add entry description block + if entry['description'] is not None: + channel_data.append(f"\n{entry['description']}\n\n") + + # Add links list block + channel_data.append(f"\nLinks:\n[1]: {entry['link']} (link)\n") + if entry['image_link'] is not None: + channel_data.append(f"[2]: {entry['image_link']} (image)\n") + + channel_data.append("\n") + + # Merge channel blocks + channel_list.append(''.join(channel_data)) + + return ''.join(channel_list) + + @classmethod + def to_json(cls, feed_list): + """ + Convert `feed_list` data to json format. + """ + # Image data & formatted date shouldn't go to json + for feed in feed_list: + for entry in feed['entries']: + del entry['date_fmt'] + del entry['image_data'] + + return json.dumps(feed_list, ensure_ascii=False, indent=4) + + @classmethod + def save_html(cls, feeds, path): + """ + Save `feeds` data into html file with images. + """ + # Separate path and filename + dir, fname = os.path.split(path) + + # Set dirname for images + img_dname = f"{fname} - images" + + try: + # Create directory for images + if not os.path.exists(os.path.join(dir, img_dname)): + os.mkdir(os.path.join(dir, img_dname)) + + with open(path, "w", encoding='utf-8') as f: + content = "" + for feed in feeds: + # Add channel title + content += f"""

{feed['channel']}

""" + + for entry in feed['entries']: + # Set entry title & link + header = f"""

{entry['title']}

""" + + if entry['date']: + header += f"

{entry['date']}

" + + # Set entry description + decription = "" + + if entry['description']: + decription = f"""

{entry['description']}

""" + + image_fname = str(hash(entry['image_link'])) + + # Save image + image = "" + + if entry['image_data']: + with open(os.path.join(dir, img_dname, image_fname), "wb") as img: + img.write(entry['image_data']) + image = f"""""" + + # Merge components + content += f"{header}{image}{decription}" + + # Add channels delimiter + content += "
" + + html_template = """ + {} + {} + """ + + f.write(html_template.strip().format("RSS news", content)) + + except PermissionError: + raise FileError(f"Permission denied for '{path}'") + except Exception: + raise FileError(f"Unable to save html into '{path}'") + + @classmethod + def save_epub(cls, feeds, path): + """ + Save `feeds` data into epub file. + """ + book = epub.EpubBook() + book.set_title('RSS news') + + chapters = [] + for i, feed in enumerate(feeds): + # Set channel title & url + chapter = epub.EpubHtml(title=f"Channel {i}", file_name=f"ch_{i}.xhtml") + chapter.content = f"""

{feed['channel']}

""" + + for entry in feed['entries']: + # Set entry title & link + header = f"""

{entry['title']}

""" + + if entry['date']: + header += f"

{entry['date']}

" + + # Set entry description + decription = "" + + if entry['description']: + decription = f"""

{entry['description']}

""" + + # Add image + image = "" + + if entry['image_data']: + image_fname = str(hash(entry['image_link'])) + img = epub.EpubItem(file_name=image_fname, media_type="ITEM_IMAGE") + img.content = entry['image_data'] + book.add_item(img) + image = f"""
""" + + # Merge components + chapter.content += f"{header}{image}{decription}" + + chapters.append(chapter) + book.add_item(chapter) + + book.spine = chapters + + try: + writer = epub.EpubWriter(path, book, {}) + writer.process() + writer.write() + except PermissionError: + raise FileError(f"Permission denied for '{path}'") + except Exception: + raise FileError(f"Unable to save epub into '{path}'") diff --git a/MikalaiSidarevich/rss_reader/engine/main.py b/MikalaiSidarevich/rss_reader/engine/main.py new file mode 100644 index 00000000..9666d010 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/main.py @@ -0,0 +1,89 @@ +""" +main() - entry point: parse CLI arguments & run RSS reading. +""" + +import subprocess +import sys + +from engine.argparser import ArgParser +from engine.rssreader import RssReader + +version = '1.5' +db = "storage.db" + + +def main(): + """ + Entry point - get CLI arguments and start process. + """ + # Set console encoding to UTF-8 + sys.stdout.reconfigure(encoding='utf-8') + + arg_parser = ArgParser() + args = arg_parser.args + + try: + if args['version']: + print(f"Version {version}") + exit(0) + + json = False + if args['json']: + json = args['json'] + + verbose = False + if args['verbose']: + verbose = args['verbose'] + + limit = None + if args['limit'] is not None: + limit = int(args['limit']) + + url = None + if args['source'] is not None: + url = args['source'] + + date = None + if args['date'] is not None: + date = args['date'] + + html_path = None + if args['to_html'] is not None: + html_path = args['to_html'] + + epub_path = None + if args['to_epub'] is not None: + epub_path = args['to_epub'] + + colorizer = None + if args['colorize']: + if sys.platform == "linux": + colorizer = subprocess.Popen("colorize", + stdin=subprocess.PIPE, + encoding='utf-8') + sys.stdout = colorizer.stdin + else: + print("Unable to set colorized mode, set to normal mode.", flush=True) + + except Exception: + print("Invalid argument value", flush=True) 
+ exit(1) + + if url or date: + if verbose: + print(f"URL is set: '{url}', read from {'cache' if date else 'URL'}", flush=True) + try: + rss = RssReader(verbose, db).read_rss(url, limit, json, date, html_path, epub_path) + print(rss, flush=True) + except Exception as e: + print(f"{type(e).__name__}: {e}", flush=True) + exit(1) + finally: + if colorizer: + colorizer.communicate() + else: + arg_parser.print_help() + + +if __name__ == '__main__': + main() diff --git a/MikalaiSidarevich/rss_reader/engine/rsscacher.py b/MikalaiSidarevich/rss_reader/engine/rsscacher.py new file mode 100644 index 00000000..3f450d5a --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/rsscacher.py @@ -0,0 +1,239 @@ +""" +RssCacher - SQLite3 database handler, work with cached RSS entries. +""" + +import sqlite3 +import threading + +# Define the lock globally +lock = threading.Lock() + + +class StorageError(Exception): + """ + Cache storage exception. + """ + pass + + +class QueryError(Exception): + """ + Database query exception. + """ + pass + + +class RssCacher: + """SQLite3 database handler, work with cached RSS entries.""" + + def __init__(self, dbname, verbose=False): + """ + Initialize database name and verbosity. + """ + self._dbname = dbname + self._verbose = verbose + + def __enter__(self): + """ + Context manager entry point. + Create tables if they're not exist. + """ + self._connect() + self._create_tables() + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Context manager exit point. + """ + if exc_type: + raise + self._close() + + def _connect(self): + """ + Connect to database & set the cursor. + """ + try: + self._conn = sqlite3.connect(self._dbname, check_same_thread=False) + self._conn.row_factory = sqlite3.Row + self._cur = self._conn.cursor() + except Exception: + raise StorageError(f"Unable to open '{self._dbname}' database") + + def _close(self): + """ + Close database connection. 
+ """ + self._conn.close() + + def _create_tables(self): + """ + Create tables to store RSS data in local storage. + """ + try: + query = """CREATE TABLE IF NOT EXISTS channels ( + id INTEGER PRIMARY KEY, + channel TEXT, + url TEXT)""" + self._cur.execute(query) + + query = """CREATE TABLE IF NOT EXISTS entries ( + id INTEGER PRIMARY KEY, + title TEXT, + link TEXT, + date TEXT, + date_fmt TEXT, + description TEXT, + image_link TEXT, + image_data BLOB, + channel_id INTEGER, + FOREIGN KEY (channel_id) REFERENCES channels (id))""" + self._cur.execute(query) + except Exception as e: + raise StorageError(f"Unable to create tables in '{self._dbname}' database ({e})") + + def store_channel(self, feed): + """ + Save RSS channel data into the `channels` table. + """ + try: + # Check channel duplicate + query = ("SELECT id FROM channels WHERE url=?") + self._cur.execute(query, (feed['url'],)) + row = self._cur.fetchone() + + # Store channel data + if row: + query = "UPDATE channels SET channel=? WHERE url=?" + else: + query = "INSERT INTO channels (channel, url) VALUES (?,?)" + + self._cur.execute(query, (feed['channel'], feed['url'])) + self._conn.commit() + except Exception as e: + raise StorageError(f"Unable to store channel data into '{self._dbname}' ({e})") + + def store_entry(self, entry, channel_id): + """ + Save RSS entry info into the `entries` table. + """ + try: + lock.acquire(True) + + # Check entry duplicate + query = ("SELECT id FROM entries WHERE `link`=?") + self._cur.execute(query, (entry['link'],)) + row = self._cur.fetchone() + + values = [entry['title'], + entry['link'], + entry['date'], + entry['date_fmt'], + entry['description'], + entry['image_link'], + entry['image_data'], + channel_id] + + # Store entry + if row: + query = """UPDATE entries SET + title=?, + link=?, + date=?, + date_fmt=?, + description=?, + image_link=?, + image_data=?, + channel_id=? 
WHERE link=?""" + values += (entry['link'],) + else: + query = """INSERT INTO entries ( + title, + link, + date, + date_fmt, + description, + image_link, + image_data, + channel_id + ) VALUES (?,?,?,?,?,?,?,?)""" + + self._cur.execute(query, values) + self._conn.commit() + except Exception as e: + raise StorageError(f"Unable to store entry data into '{self._dbname}' ({e})") + finally: + lock.release() + + def get_channel_id(self, url): + """ + Get channel id by channel `url`. + """ + query = ("SELECT id FROM channels WHERE url=?") + self._cur.execute(query, (url,)) + return self._cur.fetchone()['id'] + + def feed(self, url, limit, date): + """ + Get list of RSS feeds from storage. + `url` sets RSS channel, + `limit` sets total number of entries, + `date` sets entries date filter. + """ + query = """SELECT + entries.title as title, + entries.link as link, + entries.date as date, + entries.date_fmt as date_fmt, + entries.description as description, + entries.image_link as image_link, + entries.image_data as image_data, + channels.channel as channel, + channels.url as url + FROM entries JOIN channels ON entries.channel_id=channels.id + WHERE entries.date_fmt=?""" + + # Set url filter + if url is not None: + url = url.lower().rstrip('/') + query += f" AND channels.url='{url}'" + + # Set limit filter + if limit is not None: + query += f" LIMIT {limit}" + + self._cur.execute(query, (date, )) + db_entries_list = self._cur.fetchall() + + # Create feed list + if db_entries_list: + feed_list = [] + + # Place entries by channels + for db_entry in db_entries_list: + entry = {'title': db_entry['title'], + 'link': db_entry['link'], + 'date': db_entry['date'], + 'date_fmt': db_entry['date_fmt'], + 'description': db_entry['description'], + 'image_link': db_entry['image_link'], + 'image_data': db_entry['image_data']} + + for feed in feed_list: + # Add entry to its channel + if db_entry['channel'] in feed.values(): + feed['entries'].append(entry) + break + else: + # Set channel at 
first occurence + feed = {'channel': db_entry['channel'], + 'url': db_entry['url'], + 'entries': [entry]} + + # Add new feed structure into the result list + feed_list.append(feed) + else: + raise QueryError(f"There are no entries for the date '{date}'") + + return feed_list diff --git a/MikalaiSidarevich/rss_reader/engine/rssparser.py b/MikalaiSidarevich/rss_reader/engine/rssparser.py new file mode 100644 index 00000000..25211664 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/rssparser.py @@ -0,0 +1,285 @@ +""" +RssParser - parser for RSS data. +""" + +import threading +import xml.dom.minidom +from html import unescape +from time import sleep, strftime, strptime + +import requests +from bs4 import BeautifulSoup + + +class RequestError(Exception): + """ + Request URL exception. + """ + pass + + +class XmlParseError(Exception): + """ + Parse XML document exception. + """ + pass + + +class HtmlParseError(Exception): + """ + Parse HTML document exception. + """ + pass + + +class DateError(Exception): + """ + Date format conversion exception. + """ + pass + + +class ImageError(Exception): + """ + Download image exception. + """ + pass + + +class RssParser: + """Parser for RSS data.""" + + def __init__(self, db, verbose=False): + """ + Initialize parser with `verbose` ability and setup feed storages. + """ + self._verbose = verbose + self._feed = {} + self._db = db + + def feed(self, url, limit): + """ + Get parsed RSS feed. + """ + self._url = url.lower() + self._limit = limit + + # Get XML document + self._document = self._request_url(url).strip() + + # Parse XML document into the internal storage + self._parse_feed() + + return [self._feed] + + def _request_url(self, url): + """ + Get content by the `url`. 
+ """ + headers = {'User-Agent': 'Mozilla/5.0'} + timeout = 10 + + try: + response = requests.get(url, headers=headers, timeout=timeout) + except Exception: + raise RequestError(f"Unable to get content by URL '{url}'") + + return response.content + + def _parse_feed(self): + """ + Parse XML document into the internal storage. + """ + # Get XML DOM structure + try: + dom = xml.dom.minidom.parseString(self._document) + except Exception: + raise XmlParseError(f"Invalid XML document by URL '{self._url}'") + + # Get feed channel title + try: + title_dom = dom.getElementsByTagName('title').item(0) + if title_dom.firstChild: + title = title_dom.firstChild.nodeValue + else: + title = self._url + except Exception: + raise XmlParseError(f"RSS channel '{self._url}' has no title") + + self._feed['channel'] = unescape(title) + + # Add channel url + self._feed['url'] = self._url + + # Store channel into db + self._db.store_channel(self._feed) + + # Get channel id to store entries + channel_id = self._db.get_channel_id(self._feed['url']) + + # Get list of RSS entries + for tag in ['item', 'entry']: + item_list = dom.getElementsByTagName(tag) + if item_list: + break + + self._feed['entries'] = [] + + # Items limit depends on user limit-parameter provided + if self._limit is None: + limit = item_list.length + else: + limit = min(self._limit, item_list.length) + + # Get RSS entries data + for i in range(limit): + item = item_list[i] + thread = threading.Thread(target=self._get_entry, + args=(i, item, channel_id)) + thread.start() + + # Prevent too fast requests + sleep(0.2) + + # Join spawned threads + for thread in threading.enumerate(): + # Main thread should be skipped + if thread is threading.main_thread(): + continue + thread.join() + + if self._verbose: + print(f"Total {len(self._feed['entries'])} items processed", flush=True) + + def _get_entry(self, n, item, channel_id): + """ + Request `n`-th entry attributes: + get title, date, link from XML, description, image by entry's 
url. + """ + if self._verbose: + print(f"Entry #{n} requested", flush=True) + + entry = {'title': None, + 'date': None, + 'date_fmt': None, + 'link': None, + 'description': None, + 'image_link': None, + 'image_data': None} + + # Get entry title + try: + entry_title_dom = item.getElementsByTagName('title').item(0) + + # Search non-empty node + for node in entry_title_dom.childNodes: + if node.nodeValue.strip(): + entry['title'] = unescape(node.nodeValue) + break + except Exception: + if self._verbose: + print(f"Entry #{n} has no title", flush=True) + # Title must be present + return + + # Get entry link + try: + for tag in ['link', 'id']: + entry_link_dom = item.getElementsByTagName(tag).item(0) + if entry_link_dom.firstChild: + link = entry_link_dom.firstChild.nodeValue + entry['link'] = link.lower().rstrip('/') + break + except Exception: + if self._verbose: + print(f"Entry #{n} has no link", flush=True) + # Link must be present + return + + # Get entry published date + try: + for tag in ['pubDate', 'published']: + entry_date_dom = item.getElementsByTagName(tag).item(0) + if entry_date_dom: + entry['date'] = entry_date_dom.firstChild.nodeValue + entry['date_fmt'] = self._convert_date(entry['date']) + break + except Exception: + if self._verbose: + print(f"Entry #{n} has no date published", flush=True) + + # Get entry description + try: + attrs = {'name': 'description', + 'property': 'og:description', + 'itemprop': 'description'} + entry['description'] = self._get_meta_tag(link, **attrs) + entry['description'] = unescape(entry['description']).strip() + except Exception as e: + if self._verbose: + print(f"Entry #{n} description: {e}", flush=True) + + # Get entry image link + try: + attrs = {'property': 'og:image'} + entry['image_link'] = self._get_meta_tag(link, **attrs) + entry['image_data'] = self._download_image(entry['image_link']) + except Exception as e: + if self._verbose: + print(f"Entry #{n} image: {e}", flush=True) + + # Add current entry + 
self._feed['entries'].append(entry) + + if self._verbose: + print(f"Entry #{n} received", flush=True) + + # Store entry into db + self._db.store_entry(entry, channel_id) + + if self._verbose: + print(f"Entry #{n} stored", flush=True) + + def _get_meta_tag(self, url, **kwargs): + """ + Extract meta tags content from `url` specified. + Meta tag attributes specified by `kwargs`. + """ + raw_html = self._request_url(url) + parsed_html = BeautifulSoup(raw_html.decode('utf-8', 'ignore'), features='html.parser') + + try: + for attr, value in kwargs.items(): + meta = parsed_html.find('meta', {attr: value}) + if meta: + return meta['content'] + else: + raise Exception + except Exception: + raise HtmlParseError(f"Couldn't parse data by URL '{url}'") + + def _convert_date(self, date): + """ + Convert raw input `date` into '%Y%m%d' format. + Recognize 2 formats: "Fri, 17 Jun 2022 02:50:00 GMT"-like and "2022-05-26T00:00:00Z"-like. + """ + try: + # "Fri, 17 Jun 2022 02:50:00 GMT" + date = strptime(date.rsplit(" ", maxsplit=1)[0], "%a, %d %b %Y %X") + except Exception: + try: + # "2022-05-26T00:00:00Z" + date = strptime(date.split('T')[0], "%Y-%m-%d") + except Exception: + raise DateError(f"Unable to extract date from '{date}'") + + return strftime('%Y%m%d', date) + + def _download_image(self, url): + """ + Download image content by `url`. + """ + try: + return self._request_url(url) + except RequestError: + raise ImageError(f"Unable to get image by URL '{url}'") diff --git a/MikalaiSidarevich/rss_reader/engine/rssreader.py b/MikalaiSidarevich/rss_reader/engine/rssreader.py new file mode 100644 index 00000000..50ca3606 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/engine/rssreader.py @@ -0,0 +1,52 @@ +""" +RssReader - pure Python command-line RSS reader. 
+""" + +from engine.converter import Converter +from engine.rsscacher import RssCacher +from engine.rssparser import RssParser + + +class RssReader: + """Pure Python command-line RSS reader.""" + + def __init__(self, verbose=False, dbname="storage.db"): + """ + Initialize reader with `verbose` ability, storage `dbname`. + """ + self._verbose = verbose + self._dbname = dbname + + def read_rss(self, url, limit, json, date, html_path, epub_path): + """ + Get RSS entries from `url` and output them to the stdout. + Limit number of entries with `limit`. + Output all the entries if `limit` is not specified. + Read news from cache by `date` if specified. + """ + try: + with RssCacher(self._dbname, self._verbose) as cacher: + parser = RssParser(cacher, self._verbose) + + if date is None: + feed_list = parser.feed(url, limit) + else: + feed_list = cacher.feed(url, limit, date) + except Exception: + raise + + if html_path: + Converter.save_html(feed_list, html_path) + + if epub_path: + Converter.save_epub(feed_list, epub_path) + + # Conversions that require the full feed structure should be done before here + # Json conversion makes removes from feeds + + if json: + rss_content = Converter.to_json(feed_list) + else: + rss_content = Converter.to_text(feed_list) + + return rss_content diff --git a/MikalaiSidarevich/rss_reader/pyproject.toml b/MikalaiSidarevich/rss_reader/pyproject.toml new file mode 100644 index 00000000..bf912968 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools", "wheel"] diff --git a/MikalaiSidarevich/rss_reader/requirements.txt b/MikalaiSidarevich/rss_reader/requirements.txt new file mode 100644 index 00000000..403be712 --- /dev/null +++ b/MikalaiSidarevich/rss_reader/requirements.txt @@ -0,0 +1,6 @@ +beautifulsoup4==4.11.1 +EbookLib==0.17.1 +colorize==1.1.0 +coverage==6.2 +lxml==4.8.0 +requests==2.26.0 diff --git 
# --- rss_reader.py ---
"""
Pure Python command-line RSS reader.
"""

from engine.main import main

if __name__ == '__main__':
    main()


# --- tests/test_argparser.py ---
"""Test ArgParser class."""

from unittest import TestCase

from engine.argparser import ArgParser


class TestArgParser(TestCase):
    """Testcase for ArgParser class."""

    # Defaults expected when parsing an empty command line.
    DEFAULT_ARGS = {
        'source': None,
        'version': False,
        'json': False,
        'verbose': False,
        'limit': None,
        'date': None,
        'to_html': None,
        'to_epub': None,
        'colorize': False,
    }

    def setUp(self):
        """Create a fresh parser for every test."""
        self.parser = ArgParser()

    def test_args(self):
        """args property yields the documented defaults."""
        self.assertDictEqual(self.parser.args, self.DEFAULT_ARGS)


# --- tests/test_converter.py ---
"""Test Converter class."""

from unittest import TestCase
from unittest.mock import patch

from engine.converter import Converter


class TestConverter(TestCase):
    """Testcase for Converter class."""

    @staticmethod
    def _feed(channel_extra, entry):
        """Build a one-channel, one-entry feed-list fixture."""
        feed = {'channel': ""}
        feed.update(channel_extra)
        feed['entries'] = [entry]
        return [feed]

    def test_to_text(self):
        """to_text() renders a minimal feed without failing."""
        entry = dict.fromkeys(
            ('title', 'date', 'link', 'description', 'image_link'), "")
        self.assertIsNotNone(Converter.to_text(self._feed({}, entry)))

    def test_to_json(self):
        """to_json() renders a minimal feed without failing."""
        entry = dict.fromkeys(('title', 'link', 'date_fmt', 'image_data'), "")
        self.assertIsNotNone(Converter.to_json(self._feed({}, entry)))

    @patch('builtins.open')
    def test_save_html(self, mock_open):
        """save_html() raises on an unwritable destination path."""
        entry = dict.fromkeys(
            ('title', 'date', 'date_fmt', 'link', 'description', 'image_link'), "")
        entry['image_data'] = b""
        with self.assertRaises(Exception):
            Converter.save_html(self._feed({'url': ""}, entry), "1/")

    @patch('ebooklib.epub.EpubWriter.write')
    def test_save_epub(self, mock_write):
        """save_epub() delegates exactly one write to EpubWriter."""
        entry = dict.fromkeys(
            ('title', 'date', 'date_fmt', 'link', 'description', 'image_link'), "")
        entry['image_data'] = b""
        Converter.save_epub(self._feed({'url': ""}, entry), "")
        mock_write.assert_called_once()
# --- tests/test_rsscacher.py ---
"""Test RssCacher class."""

from unittest import TestCase

from engine.rsscacher import QueryError, RssCacher, StorageError


class TestRssCacher(TestCase):
    """Testcase for RssCacher class."""

    @staticmethod
    def _open_db():
        """Open a throwaway in-memory cache."""
        return RssCacher(':memory:')

    def test_store_channel(self):
        """store_channel() rejects an empty channel dict."""
        with self._open_db() as db, self.assertRaises(StorageError):
            db.store_channel({})

    def test_store_entry(self):
        """store_entry() rejects an empty entry dict."""
        with self._open_db() as db, self.assertRaises(StorageError):
            db.store_entry({}, 0)

    def test_feed(self):
        """feed() raises QueryError for an unknown channel URL."""
        with self._open_db() as db, self.assertRaises(QueryError):
            db.feed("", 1, None)


# --- tests/test_rssparser.py ---
"""Test RssParser class."""

from unittest import TestCase

from engine.rsscacher import RssCacher
from engine.rssparser import RequestError, RssParser


class TestRssParser(TestCase):
    """Testcase for RssParser class."""

    def test_feed(self):
        """feed() rejects an empty URL and parses a real feed."""
        with RssCacher(":memory:", verbose=False) as db:
            self.parser = RssParser(db, verbose=False)

            with self.assertRaises(RequestError):
                self.parser.feed("", 1)

            # NOTE(review): hits the live network; consider mocking the
            # request layer to make this test hermetic.
            self.assertIsNotNone(self.parser.feed("https://news.yahoo.com/rss/", 1))


# --- tests/test_rssreader.py ---
"""Test RssReader class."""

from unittest import TestCase
from unittest.mock import patch

from engine.rssreader import RssReader


class TestRssReader(TestCase):
    """Testcase for RssReader class."""

    URL = "https://news.yahoo.com/rss/"
    LIMIT = 1

    def setUp(self):
        """Prepare test fixture."""
        self.reader = RssReader(verbose=False, dbname=':memory:')

    @patch('engine.rssparser.RssParser.feed')
    def test_read_rss(self, mock_parser_feed):
        """read_rss() renders mocked feeds and fails on bad save paths."""
        entry = dict.fromkeys(
            ('title', 'date', 'date_fmt', 'link',
             'description', 'image_link', 'image_data'), "")
        mock_parser_feed.return_value = [{'channel': "", 'entries': [entry]}]

        read = self.reader.read_rss
        self.assertIsNotNone(read(self.URL, self.LIMIT, False, None, None, None))
        self.assertIsNotNone(read(self.URL, self.LIMIT, True, None, None, None))

        with self.assertRaises(Exception):
            read(self.URL, self.LIMIT, False, None, '/', None)
        with self.assertRaises(Exception):
            read(self.URL, self.LIMIT, False, None, None, '/')