diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..67a60dfd --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +results +venv +.idea +rss_reader.egg-info +.pytest_cache diff --git a/README.md b/README.md index c86d1e65..df3d7b97 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,43 @@ -# How to create a PR with a homework task - -1. Create fork from the following repo: https://github.com/E-P-T/Homework. (Docs: https://docs.github.com/en/get-started/quickstart/fork-a-repo ) -2. Clone your forked repo in your local folder. -3. Create separate branches for each session.Example(`session_2`, `session_3` and so on) -4. Create folder with you First and Last name in you forked repo in the created session. -5. Add your task into created folder -6. Push finished session task in the appropriate branch in accordance with written above. - You should get the structure that looks something like that - -``` - Branch: Session_2 - DzmitryKolb - |___Task1.py - |___Task2.py - Branch: Session_3 - DzmitryKolb - |___Task1.py - |___Task2.py -``` - -7. When you finish your work on task you should create Pull request to the appropriate branch of the main repo https://github.com/E-P-T/Homework (Docs: https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork). -Please use the following instructions to prepare good description of the pull request: - - Pull request header should be: `Session - `. - Example: `Session 2 - Dzmitry Kolb` - - Pull request body: You should write here what tasks were implemented. - Example: `Finished: Task 1.2, Task 1.3, Task 1.6` +## Documentation +### Minimal requirements: +__Python 3.9__\ +On linux please add alias _python_ to _python3_. Look [here](https://askubuntu.com/questions/320996/how-to-make-python-program-command-execute-python-3). 
+ +### Setup: +#### Virtual Environment (Optional) +Creating Virtual Environment (from the root of the project)\ +Windows: `python -m venv ./venv`\ +Linux: `virtualenv venv` + +Activate Virtual Environment:\ +Windows: `./venv/Scripts/activate`\ +Linux: `source venv/bin/activate` + +*_On Windows you might need to give rights to execute commands from PowerShell via the following command (running as Administrator)_\ +`Set-ExecutionPolicy Unrestricted` + +*_If you want to exit Virtual Environment please run `deactivate`_ + +#### Required Steps +Update pip:\ +`python -m pip install --upgrade pip` + +Install requirements:\ +`pip install -r ./requirements.txt` + +### Run Application: +Run `python ./rss_parse/rss_reader.py --help` to find available options + +### Cache +Application stores RSS Feed in a local storage in a temp folder (and rss_reader sub-folder).\ +For more info on what is considered a temp directory please look [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir) + +### Run Tests: +Run `pytest ./tests` to run tests + +### Package distributive: +To create a distribution package please run\ +`pip install -e .`\ +You will be able to run `rss_reader` directly\ +Also you should run this command as it makes the required font available for fpdf library diff --git a/fonts/OpenSans.ttf b/fonts/OpenSans.ttf new file mode 100644 index 00000000..e21ff5f1 Binary files /dev/null and b/fonts/OpenSans.ttf differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..bbd900ae --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +-e . 
+pytest +pytest-mock diff --git a/rss_parse/__init__.py b/rss_parse/__init__.py new file mode 100644 index 00000000..4802e90f --- /dev/null +++ b/rss_parse/__init__.py @@ -0,0 +1 @@ +__version__ = "1.0" diff --git a/rss_parse/exceptions/__init__.py b/rss_parse/exceptions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_parse/exceptions/exceptions.py b/rss_parse/exceptions/exceptions.py new file mode 100644 index 00000000..87864e02 --- /dev/null +++ b/rss_parse/exceptions/exceptions.py @@ -0,0 +1,19 @@ +class ParsingException(Exception): + """ + An exception that could happen during RSS parsing + """ + pass + + +class CacheException(Exception): + """ + An exception that could happen during caching of RSS feed + """ + pass + + +class ProcessingException(Exception): + """ + An exception that could happen during RSS feed processing + """ + pass diff --git a/rss_parse/parse/__init__.py b/rss_parse/parse/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_parse/parse/params.py b/rss_parse/parse/params.py new file mode 100644 index 00000000..8c17048e --- /dev/null +++ b/rss_parse/parse/params.py @@ -0,0 +1,17 @@ +class Params: + """ + Stores parameters to run rss reader. 
+ """ + + def __init__(self, is_verbose, is_json, limit, source, pub_date, html_dir, pdf_dir): + self.is_verbose = is_verbose + self.is_json = is_json + self.limit = limit + self.source = source + self.pub_date = pub_date + self.html_dir = html_dir + self.pdf_dir = pdf_dir + + @staticmethod + def from_args(args): + return Params(args.verbose, args.json, args.limit, args.source, args.date, args.to_html, args.to_pdf) diff --git a/rss_parse/parse/rss_cache.py b/rss_parse/parse/rss_cache.py new file mode 100644 index 00000000..f211cdec --- /dev/null +++ b/rss_parse/parse/rss_cache.py @@ -0,0 +1,83 @@ +import os +import tempfile + +from rss_parse.exceptions.exceptions import CacheException, ParsingException +from rss_parse.parse.rss_feed import RssFeed +from rss_parse.parse.rss_mapper import RSS_FEED_JSON_MAPPER +from rss_parse.parse.rss_parser import RssJsonParser +from rss_parse.utils.collection_utils import group_by, merge_by_key +from rss_parse.utils.messaging_utils import MESSAGE_CONSUMER_NOOP + + +class TmpDirectoryCache: + """ + Class to store RSS Feed in a temporary directory + """ + __DATE_TO_FILE_NAME_PATTERN = '%Y%m%d' + + def __init__(self, rss_feed, mc=MESSAGE_CONSUMER_NOOP): + self.__rss_feed = rss_feed + self.__mc = mc + self.__base_dir = TmpDirectoryCache.get_cache_base_path() + + @staticmethod + def get_cache_base_path(): + """ + Returns the directory where all Cached files are stored + """ + return os.path.join(tempfile.gettempdir(), "rss_reader") + + @staticmethod + def get_cache_path(pub_date): + """ + Builds the path to the cache file based on a publication date + """ + return os.path.join(TmpDirectoryCache.get_cache_base_path(), + f"{pub_date.strftime(TmpDirectoryCache.__DATE_TO_FILE_NAME_PATTERN)}.json") + + def cache(self): + try: + if not os.path.exists(self.__base_dir): + os.mkdir(self.__base_dir) + except: + raise CacheException("Unable to create a directory for local cache") + + if not self.__rss_feed or not self.__rss_feed.rss_items: + 
return + + feed_by_date = group_by(self.__rss_feed.rss_items, + key=lambda x: x.publication_date.strftime( + TmpDirectoryCache.__DATE_TO_FILE_NAME_PATTERN)) + for pub_date, new_items in feed_by_date: + file_name = os.path.join(self.__base_dir, f"{pub_date}.json") + json_parser = RssJsonParser(file_name, self.__mc) + existing_items = json_parser.parse().rss_items + all_items = merge_by_key([*existing_items, *new_items], key=lambda x: x.key()) + all_feed = RssFeed(all_items) + rss_json = RSS_FEED_JSON_MAPPER.to_json(all_feed) + with open(file_name, "w", encoding="UTF-8") as f: + f.write(rss_json) + + +class CacheJsonParser(RssJsonParser): + """ + Class to read RSS Feed from a cached directory + """ + + def __init__(self, date, source, mc=None): + super().__init__(TmpDirectoryCache.get_cache_path(date), mc) + + self.__source = source + + def parse(self): + """ + Class to read RSS Feed from a cached directory. + Raises an exception if no news for the date found. + """ + rss_feed = super().parse() + items = rss_feed.rss_items + if self.__source: + items = [item for item in items if item.source == self.__source] + if not items: + raise ParsingException("No cached news for the date") + return RssFeed(items) diff --git a/rss_parse/parse/rss_feed.py b/rss_parse/parse/rss_feed.py new file mode 100644 index 00000000..234fa746 --- /dev/null +++ b/rss_parse/parse/rss_feed.py @@ -0,0 +1,28 @@ +from dataclasses import dataclass +from datetime import datetime +from typing import List + + +@dataclass +class RssItem: + """ + Data class to store information about RSS Item/News + """ + + title: str + description: str + publication_date: datetime + link: str + image_url: str + source: str = None + + def key(self): + return self.link, self.publication_date + + +@dataclass +class RssFeed: + """ + Data class to store a list of RSS Items + """ + rss_items: List[RssItem] diff --git a/rss_parse/parse/rss_keys.py b/rss_parse/parse/rss_keys.py new file mode 100644 index 00000000..5566a8e9 --- 
/dev/null +++ b/rss_parse/parse/rss_keys.py @@ -0,0 +1,13 @@ +RSS_ROOT = 'rss' +RSS_CHANNEL = 'channel' +RSS_ITEMS = 'item' +RSS_ITEM_TITLE = 'title' +RSS_ITEM_DESCRIPTION = 'description' +RSS_ITEM_LINK = 'link' +RSS_ITEM_PUB_DATE = 'pubDate' +RSS_IMAGE_ROOT = 'image' +RSS_IMAGE_ROOT_ENCLOSURE = 'enclosure' +RSS_IMAGE_ROOT_MEDIA_CONTENT = 'media:content' +RSS_IMAGE_ROOT_MEDIA_THUMBNAIL = 'media:thumbnail' +RSS_IMAGE_URL_ATTR = '@url' +RSS_SOURCE = 'source' diff --git a/rss_parse/parse/rss_mapper.py b/rss_parse/parse/rss_mapper.py new file mode 100644 index 00000000..2b486d01 --- /dev/null +++ b/rss_parse/parse/rss_mapper.py @@ -0,0 +1,64 @@ +import json +from datetime import timezone, datetime + +from rss_parse.parse.rss_feed import RssFeed, RssItem +from rss_parse.parse.rss_keys import * +from rss_parse.utils.formatting_utils import format_date_pretty, get_description_plain + + +class RssJsonMapper: + """ + Class to do a conversion of RSS Feed TO and FROM json + """ + __DATE_TIME_FORMAT = "%Y-%m-%d %H:%M:%S" + + def to_json(self, rss_feed: RssFeed, indent=None, pretty=False): + res = { + RSS_ITEMS: [self.__item_to_json(item, pretty) for item in rss_feed.rss_items] + } + + return json.dumps(res, indent=indent, ensure_ascii=False) + + def __item_to_json(self, item: RssItem, pretty): + res = { + RSS_ITEM_TITLE: item.title, + RSS_ITEM_LINK: item.link, + } + # Store as UTC + publication_date = item.publication_date.astimezone(timezone.utc) \ + .strftime(RssJsonMapper.__DATE_TIME_FORMAT) + description = item.description + if pretty: + publication_date = format_date_pretty(item.publication_date) + description = get_description_plain(description) + + res[RSS_ITEM_PUB_DATE] = publication_date + + if description: + res[RSS_ITEM_DESCRIPTION] = description + + if item.image_url: + res[RSS_IMAGE_ROOT] = item.image_url + + if item.source: + res[RSS_SOURCE] = item.source + + return res + + def from_json(self, rss_feed_json): + rss_dict = json.loads(rss_feed_json) + items = 
[self.__parse_item(item) for item in rss_dict[RSS_ITEMS]] + return RssFeed(items) + + def __parse_item(self, item): + title = item[RSS_ITEM_TITLE] + description = item.get(RSS_ITEM_DESCRIPTION, None) + publication_date = datetime.strptime(item[RSS_ITEM_PUB_DATE], RssJsonMapper.__DATE_TIME_FORMAT) \ + .replace(tzinfo=timezone.utc).astimezone() + link = item[RSS_ITEM_LINK] + image_url = item.get(RSS_IMAGE_ROOT, None) + source = item.get(RSS_SOURCE, None) + return RssItem(title, description, publication_date, link, image_url, source) + + +RSS_FEED_JSON_MAPPER = RssJsonMapper() diff --git a/rss_parse/parse/rss_parser.py b/rss_parse/parse/rss_parser.py new file mode 100644 index 00000000..6ee3a804 --- /dev/null +++ b/rss_parse/parse/rss_parser.py @@ -0,0 +1,141 @@ +import os.path +import xml +from abc import ABC, abstractmethod + +import requests +import xmltodict +from requests.exceptions import InvalidSchema, InvalidURL, MissingSchema + +from rss_parse.exceptions.exceptions import ParsingException +from rss_parse.parse.rss_feed import RssFeed, RssItem +from rss_parse.parse.rss_keys import * +from rss_parse.parse.rss_mapper import RSS_FEED_JSON_MAPPER +from rss_parse.utils.messaging_utils import MESSAGE_CONSUMER_NOOP +from rss_parse.utils.parsing_utils import sanitize_text, to_date + + +class RssParser(ABC): + """ + Abstraction to parse RSS Feed from different sources (URL, XML, JSON, etc.) + """ + + def __init__(self, mc=MESSAGE_CONSUMER_NOOP): + self._mc = mc + + @abstractmethod + def parse(self) -> RssFeed: + """ + Reads and Returns Rss Feed from some source. 
+ """ + pass + + +class RssJsonParser(RssParser): + """ + Implementation of RSSParser that reads RSS Feed from a file in a json format + """ + + def __init__(self, file_name, mc=None): + super().__init__(mc) + self.__file_name = file_name + self._mc = mc + + def parse(self): + if not os.path.exists(self.__file_name): + return RssFeed([]) + with open(self.__file_name, "r", encoding="UTF-8") as f: + rss_json = f.read() + return RSS_FEED_JSON_MAPPER.from_json(rss_json) + + +class RssXmlParser(RssParser): + """ + Implementation of RSSParser that reads RSS Feed from an XML string + """ + + def __init__(self, xml_feed, mc=None): + super().__init__(mc) + self.__xml_feed = xml_feed + self._mc = mc + + def parse(self): + + self._mc.add_message("Parsing RSS Feed by elements") + try: + rss_feed_dict = xmltodict.parse(self.__xml_feed)[RSS_ROOT] + except (xml.parsers.expat.ExpatError, KeyError): + raise ParsingException("Source doesn't contain a valid RSS Feed.") + + self._mc.add_message("Parsing items info") + rss_items = self.parse_items(rss_feed_dict) + + self._mc.add_message("Parsing finished") + return RssFeed(rss_items) + + def parse_items(self, rss_feed_dict): + rss_items_raw = rss_feed_dict[RSS_CHANNEL][RSS_ITEMS] + res = [] + for rss_item_dict in rss_items_raw: + item = self.parse_item(rss_item_dict) + if self.__validate_correctness(item): + res.append(item) + else: + self._mc.add_message("Item skipped because it is invalid (required fields are absent)") + return res + + def parse_item(self, rss_item_dict): + title = sanitize_text(rss_item_dict.get(RSS_ITEM_TITLE, None)) + description = rss_item_dict.get(RSS_ITEM_DESCRIPTION, None) + publication_date = to_date(rss_item_dict.get(RSS_ITEM_PUB_DATE, None)) + link = rss_item_dict.get(RSS_ITEM_LINK, None) + image_url = self.parse_image(rss_item_dict) + + return RssItem(title, description, publication_date, link, image_url) + + def parse_image(self, rss_item_dict): + image_url = rss_item_dict.get(RSS_IMAGE_ROOT, None) + if 
not image_url: + image_url = rss_item_dict.get(RSS_IMAGE_ROOT_MEDIA_CONTENT, {}).get(RSS_IMAGE_URL_ATTR, None) + if not image_url: + image_url = rss_item_dict.get(RSS_IMAGE_ROOT_MEDIA_THUMBNAIL, {}).get(RSS_IMAGE_URL_ATTR, None) + if not image_url: + enclosure = rss_item_dict.get(RSS_IMAGE_ROOT_ENCLOSURE, {}) + if enclosure.get('@type', "").startswith("image/"): + image_url = enclosure.get(RSS_IMAGE_URL_ATTR, None) + return image_url + + def __validate_correctness(self, item: RssItem): + return item.title and item.publication_date and item.link + + +class RssUrlParser(RssParser): + """ + Implementation of RSSParser that reads RSS Feed from URL in XML format + """ + + def __init__(self, source, mc=None): + super().__init__(mc) + self.__source = source + self._mc = mc + + def parse(self): + try: + self._mc.add_message(f"Reaching out to {self.__source}") + with requests.get(self.__source) as f: + if f.status_code != 200: + raise Exception + rss_raw_xml = f.text + except (InvalidSchema, InvalidURL, MissingSchema): + self._mc.add_message(f"Encountered an error during reading RSS Feed from URL") + raise ParsingException(f"Invalid source URL: {self.__source}") + except: # ConnectionError + self._mc.add_message(f"Unable to connect") + raise ParsingException(f"Unable to connect to {self.__source}") + + rss_xml_parser = RssXmlParser(rss_raw_xml, mc=self._mc) + + feed = rss_xml_parser.parse() + for item in feed.rss_items: + item.source = self.__source + + return feed diff --git a/rss_parse/parse/rss_parser_factory.py b/rss_parse/parse/rss_parser_factory.py new file mode 100644 index 00000000..302e42d0 --- /dev/null +++ b/rss_parse/parse/rss_parser_factory.py @@ -0,0 +1,12 @@ +from rss_parse.parse.rss_cache import CacheJsonParser +from rss_parse.parse.rss_parser import RssUrlParser +from rss_parse.utils.messaging_utils import MESSAGE_CONSUMER_NOOP + + +def get_parser(params, mc=MESSAGE_CONSUMER_NOOP): + """ + Fetch correct implementation of RssParser based on input parameters 
+ """ + if params.pub_date: + return CacheJsonParser(params.pub_date, params.source, mc=mc) + return RssUrlParser(params.source, mc=mc) diff --git a/rss_parse/preprocessor/__init__.py b/rss_parse/preprocessor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_parse/preprocessor/rss_preprocessor.py b/rss_parse/preprocessor/rss_preprocessor.py new file mode 100644 index 00000000..381958f3 --- /dev/null +++ b/rss_parse/preprocessor/rss_preprocessor.py @@ -0,0 +1,61 @@ +from abc import ABC, abstractmethod + +from rss_parse.exceptions.exceptions import CacheException +from rss_parse.parse.rss_cache import TmpDirectoryCache +from rss_parse.parse.rss_feed import RssFeed +from rss_parse.utils.messaging_utils import MESSAGE_CONSUMER_NOOP + + +class RssPreprocessor(ABC): + """ + Abstraction to do preprocess/modify RSS Feed + """ + + def __init__(self, mc=MESSAGE_CONSUMER_NOOP): + self._mc = mc + + @abstractmethod + def preprocess(self, rss_feed: RssFeed) -> RssFeed: + """ + Method gets RSS Feed as an input and returns modified view of it + """ + pass + + +class RssCachePreprocessor(RssPreprocessor): + """ + Implementation of RSSPreprocessor that stores RSS Feed in cache + """ + + def preprocess(self, rss_feed): + self._mc.add_message("Trying to add fetched news to the local cache") + try: + rss_cache = TmpDirectoryCache(rss_feed, mc=self._mc) + rss_cache.cache() + except CacheException: + self._mc.add_message("Unable to save RSS Feed to cache. 
Proceeding...") + return rss_feed + + +class RssSortPreprocessor(RssPreprocessor): + """ + Implementation of RSSPreprocessor that sorts RSS Feed by publication date descending + """ + + def preprocess(self, rss_feed): + rss_items = sorted(rss_feed.rss_items, key=lambda item: item.publication_date, reverse=True) + return RssFeed(rss_items) + + +class RssLimitPreprocessor(RssPreprocessor): + """ + Implementation of RSSPreprocessor that gets limited number of RSS Items from RSS Feed + """ + + def __init__(self, limit, mc=None): + super().__init__(mc) + self.__limit = limit + + def preprocess(self, rss_feed): + rss_items = rss_feed.rss_items[:self.__limit] + return RssFeed(rss_items) diff --git a/rss_parse/preprocessor/rss_preprocessor_factory.py b/rss_parse/preprocessor/rss_preprocessor_factory.py new file mode 100644 index 00000000..7e9be96f --- /dev/null +++ b/rss_parse/preprocessor/rss_preprocessor_factory.py @@ -0,0 +1,14 @@ +from rss_parse.preprocessor.rss_preprocessor import RssSortPreprocessor, RssCachePreprocessor, RssLimitPreprocessor +from rss_parse.utils.messaging_utils import MESSAGE_CONSUMER_NOOP + + +def get_preprocessors(params, mc=MESSAGE_CONSUMER_NOOP): + """ + Fetch correct implementation of RssPreprocessor based on input parameters + """ + preprocessors = [RssSortPreprocessor(mc)] + if not params.pub_date: + preprocessors.append(RssCachePreprocessor(mc)) + if params.limit: + preprocessors.append(RssLimitPreprocessor(params.limit, mc)) + return preprocessors diff --git a/rss_parse/processor/__init__.py b/rss_parse/processor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_parse/processor/rss_html_converter.py b/rss_parse/processor/rss_html_converter.py new file mode 100644 index 00000000..29dd4cad --- /dev/null +++ b/rss_parse/processor/rss_html_converter.py @@ -0,0 +1,50 @@ +import os.path + +from rss_parse.exceptions.exceptions import ProcessingException +from rss_parse.processor.rss_processor import RssProcessor +from 
rss_parse.utils.formatting_utils import format_date_pretty + + +class RssToHtmlConverter(RssProcessor): + """ + Converts RSS to an HTML Format and saves it in a file + """ + HTML_FILE_NAME = "rss_feed.html" + + def __init__(self, rss_feed, dir, mc=None): + super().__init__(rss_feed, mc=mc) + self.__dir = dir + if not os.path.exists(dir): + raise ProcessingException(f"Path {dir} doesn't exist") + + def process(self): + html_res = self.__convert_to_html() + with open(os.path.join(self.__dir, RssToHtmlConverter.HTML_FILE_NAME), "w", encoding="UTF-8") as f: + f.write(html_res) + + def __convert_to_html(self): + html_res = ( + '' + '' + '' + 'RSS Feed' + '' + '' + '

RSS Feed

' + ) + + for item in self.rss_feed.rss_items: + html_res += '
' + html_res += f'

{item.title}

' + html_res += f'

{format_date_pretty(item.publication_date)}

' + html_res += f'

Link

' + if item.image_url: + html_res += f'
' + + if item.description: + html_res += f'

{item.description}

' + + html_res += '


' + html_res += '' + + return html_res diff --git a/rss_parse/processor/rss_pdf_converter.py b/rss_parse/processor/rss_pdf_converter.py new file mode 100644 index 00000000..4a7646e2 --- /dev/null +++ b/rss_parse/processor/rss_pdf_converter.py @@ -0,0 +1,55 @@ +import os.path + +from fpdf import FPDF, HTMLMixin + +from rss_parse.exceptions.exceptions import ProcessingException +from rss_parse.processor.rss_processor import RssProcessor +from rss_parse.utils.formatting_utils import format_date_pretty + + +class PdfWithHtml(FPDF, HTMLMixin): + pass + + +class RssToPdfConverter(RssProcessor): + """ + Converts RSS to a PDF Format and saves it in a file + """ + PDF_FILE_NAME = "rss_feed.pdf" + + def __init__(self, rss_feed, dir, mc=None): + super().__init__(rss_feed, mc=mc) + self.__dir = dir + if not os.path.exists(dir): + raise ProcessingException(f"Path {dir} doesn't exist") + + def process(self): + pdf = PdfWithHtml() + pdf.add_font('OpenSans', '', 'OpenSans.ttf', uni=True) + pdf.set_font('OpenSans', size=12) + pdf.add_page() + + items = self.rss_feed.rss_items + for index, item in enumerate(items): + pdf.multi_cell(w=0, h=5, txt=item.title, new_x="LEFT") + pdf.multi_cell(w=0, h=5, txt="", new_x="LEFT") + pdf.multi_cell(w=0, h=5, txt=f"Date: {format_date_pretty(item.publication_date)}", new_x="LEFT") + pdf.multi_cell(w=0, h=5, txt="", new_x="LEFT") + pdf.set_text_color(0, 0, 255) + pdf.multi_cell(w=0, h=5, txt=item.link, new_x="LEFT") + pdf.set_text_color(0, 0, 0) + pdf.multi_cell(w=0, h=5, txt="", new_x="LEFT") + try: + if item.image_url: + pdf.image(item.image_url, h=70) + pdf.multi_cell(w=0, h=5, txt="", new_x="LEFT") + except: + pass + + if item.description: + pdf.write_html(item.description) + + if index != len(items) - 1: + pdf.add_page() + + pdf.output(os.path.join(self.__dir, RssToPdfConverter.PDF_FILE_NAME)) diff --git a/rss_parse/processor/rss_processor.py b/rss_parse/processor/rss_processor.py new file mode 100644 index 00000000..3f1f2bc9 --- /dev/null +++ 
b/rss_parse/processor/rss_processor.py @@ -0,0 +1,80 @@ +from abc import ABC, abstractmethod + +from rss_parse.parse.rss_feed import RssFeed, RssItem +from rss_parse.parse.rss_mapper import RSS_FEED_JSON_MAPPER +from rss_parse.utils.formatting_utils import format_date_pretty, get_description_plain +from rss_parse.utils.messaging_utils import MESSAGE_CONSUMER_NOOP + + +class RssProcessor(ABC): + """ + Abstraction to do processing on RSS Feed (print, store it, save in db, post it somewhere, etc.) + """ + + def __init__(self, rss_feed: RssFeed, mc=MESSAGE_CONSUMER_NOOP): + self.rss_feed = rss_feed + self._mc = mc + + @abstractmethod + def process(self): + """ + Do some processing of RSS Feed + """ + pass + + +class RssPrinter(RssProcessor): + """ + Implementation of RSSProcessor that prints RSS Feed in a human-readable form to console + """ + __SEPARATOR = "----------" + + def __init__(self, rss_feed: RssFeed, file_descriptor, mc=None): + super().__init__(rss_feed, mc) + self.file_descriptor = file_descriptor + + def process(self): + self._mc.add_message("Staring to print the feed in a human-readable format") + self.__print() + self.print_items_info() + self._mc.add_message("Finishing printing") + + def print_items_info(self): + rss_items = self.rss_feed.rss_items + for item in rss_items: + self.print_item_info(item) + self.__print() + self.__print(RssPrinter.__SEPARATOR) + self.__print() + + def print_item_info(self, item: RssItem): + self.__print(f"Title: {item.title}") + self.__print(f"Date: {format_date_pretty(item.publication_date)}") + self.__print(f"Link: {item.link}") + + image_url = item.image_url + if image_url: + image_title = item.title + self.__print(f"Image: [{image_title}]({image_url})]") + + if item.description: + self.__print() + self.__print(get_description_plain(item.description)) + + def __print(self, v="", sep=None): + print(v, file=self.file_descriptor, sep=sep) + + +class RssJsonPrinter(RssProcessor): + """ + Implementation of RSSProcessor that 
prints RSS Feed in a human-readable form to console as json + """ + + def process(self): + self._mc.add_message("Converting to json") + + json_str = RSS_FEED_JSON_MAPPER.to_json(self.rss_feed, indent=4, pretty=True) + + self._mc.add_message("Printing json") + print(json_str) + self._mc.add_message("Printing finished") diff --git a/rss_parse/processor/rss_processor_factory.py b/rss_parse/processor/rss_processor_factory.py new file mode 100644 index 00000000..fc9b8350 --- /dev/null +++ b/rss_parse/processor/rss_processor_factory.py @@ -0,0 +1,26 @@ +import sys + +from rss_parse.processor.rss_html_converter import RssToHtmlConverter +from rss_parse.processor.rss_pdf_converter import RssToPdfConverter +from rss_parse.processor.rss_processor import RssPrinter, RssJsonPrinter +from rss_parse.utils.messaging_utils import MESSAGE_CONSUMER_NOOP + + +def get_processors(rss_feed, params, mc=MESSAGE_CONSUMER_NOOP): + """ + Fetch correct implementation of RssProcessor based on input parameters + """ + processors = [] + + if params.is_json: + processors.append(RssJsonPrinter(rss_feed, mc=mc)) + else: + processors.append(RssPrinter(rss_feed, sys.stdout, mc=mc)) + + if params.html_dir: + processors.append(RssToHtmlConverter(rss_feed, params.html_dir, mc=mc)) + + if params.pdf_dir: + processors.append(RssToPdfConverter(rss_feed, params.pdf_dir, mc=mc)) + + return processors diff --git a/rss_parse/rss_reader.py b/rss_parse/rss_reader.py new file mode 100644 index 00000000..dd4fd825 --- /dev/null +++ b/rss_parse/rss_reader.py @@ -0,0 +1,85 @@ +import argparse +import sys + +from rss_parse import __version__ +from rss_parse.exceptions.exceptions import ProcessingException +from rss_parse.parse.params import Params +from rss_parse.parse.rss_parser import ParsingException +from rss_parse.parse.rss_parser_factory import get_parser +from rss_parse.preprocessor.rss_preprocessor_factory import get_preprocessors +from rss_parse.processor.rss_html_converter import RssToHtmlConverter +from 
rss_parse.processor.rss_pdf_converter import RssToPdfConverter +from rss_parse.processor.rss_processor_factory import get_processors +from rss_parse.utils.arg_parse_types import date_YYYYMMDD, dir_path +from rss_parse.utils.messaging_utils import get_message_consumer +from rss_parse.utils.messaging_utils import print_error + + +def parse_params_from_arguments(): + parser = argparse.ArgumentParser(description="Pure Python command-line RSS reader.") + parser.add_argument("source", help="RSS URL", nargs='?' if '--date' in sys.argv else None) + parser.add_argument("--version", help="Print version info", action="version", + version=f"Version {__version__}") + parser.add_argument("--json", help="Print result as JSON in stdout", action="store_true") + parser.add_argument("--verbose", help="Output verbose status messages", action="store_true") + parser.add_argument("--limit", help="Limit news topics if this parameter provided", type=int, default=-1) + parser.add_argument("--date", help="Limit the feed by publication date - format YYYYMMDD", type=date_YYYYMMDD) + parser.add_argument("--to-html", + help=f"Directory to store generated html file. " + f"File name will be {RssToHtmlConverter.HTML_FILE_NAME}", + type=dir_path) + parser.add_argument("--to-pdf", + help=f"Directory to store generated pdf file. " + f"File name will be {RssToPdfConverter.PDF_FILE_NAME}", + type=dir_path) + args = parser.parse_args() + + return Params.from_args(args) + + +def main(): + # Parse arguments from console + params = parse_params_from_arguments() + mc = get_message_consumer(params.is_verbose) + + mc.add_message("Program started") + + mc.add_message("Initializing parser...") + parser = get_parser(params, mc) + mc.add_message("Parser initialized") + + rss_feed = None + try: + # parse RSS from different sources (cache, URL, XML file, etc.) 
+ rss_feed = parser.parse() + except ParsingException as ex: + mc.add_message("Encountered an error during parsing") + print_error(str(ex)) + mc.add_message("Exiting the program") + exit(2) + + # modify original feed based on some needs (sorting, limit, etc) + preprocessors = get_preprocessors(params, mc) + for preprocessor in preprocessors: + rss_feed = preprocessor.preprocess(rss_feed) + + # do something with the feed (print, convert, save as file) + try: + processors = get_processors(rss_feed, params, mc) + for processor in processors: + processor.process() + except ProcessingException as ex: + print_error(str(ex)) + + +def main_wrapper(): + try: + main() + exit(0) + except Exception: + print_error("Unknown error, please rerun the application") + exit(1) + + +if __name__ == '__main__': + main_wrapper() diff --git a/rss_parse/utils/__init__.py b/rss_parse/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_parse/utils/arg_parse_types.py b/rss_parse/utils/arg_parse_types.py new file mode 100644 index 00000000..96265977 --- /dev/null +++ b/rss_parse/utils/arg_parse_types.py @@ -0,0 +1,23 @@ +import argparse +import os.path +from datetime import datetime + + +def date_YYYYMMDD(s): + """ + argparse type that reads date in a format of YYYYMMDD + """ + try: + return datetime.strptime(s, "%Y%m%d") + except ValueError: + raise argparse.ArgumentTypeError(f"Not a valid format of date: {s}") + + +def dir_path(dir): + """ + argparse type that represents an existing directory + """ + if os.path.isdir(dir): + return dir + else: + raise argparse.ArgumentTypeError(f"{dir} should be an existing directory") diff --git a/rss_parse/utils/collection_utils.py b/rss_parse/utils/collection_utils.py new file mode 100644 index 00000000..8d09bb10 --- /dev/null +++ b/rss_parse/utils/collection_utils.py @@ -0,0 +1,21 @@ +from collections import defaultdict + + +def group_by(it, key=lambda x: x): + """ + Group an iterable by key + """ + d = defaultdict(list) + for 
item in it: + d[key(item)].append(item) + return d.items() + + +def merge_by_key(it, key=lambda x: x): + """ + Merge an iterable by key. If the key is the same for multiple items, only the latest stays + """ + d = {} + for item in it: + d[key(item)] = item + return list(d.values()) diff --git a/rss_parse/utils/formatting_utils.py b/rss_parse/utils/formatting_utils.py new file mode 100644 index 00000000..5284d4a4 --- /dev/null +++ b/rss_parse/utils/formatting_utils.py @@ -0,0 +1,32 @@ +from html2text import HTML2Text + + +def __configure_translator(): + translator = HTML2Text() + translator.inline_links = False + translator.wrap_links = False + return translator + + +HTML_TO_TEXT_TRANSLATOR = __configure_translator() + + +def format_date_pretty(pub_date): + """ + Format date in a human-readable form + """ + if not pub_date: + return "" + return pub_date.strftime("%a, %d %b %Y %H:%M:%S %z") + + +def get_description_plain(description): + """ + Format a text that might be an HTML by parsing its tags and conveting them to plain text alternatives + """ + if not description: + return description + desc = description.strip() + # try to parse description as html + html = HTML_TO_TEXT_TRANSLATOR.handle(desc) + return html.rstrip() if html else desc diff --git a/rss_parse/utils/messaging_utils.py b/rss_parse/utils/messaging_utils.py new file mode 100644 index 00000000..999d733f --- /dev/null +++ b/rss_parse/utils/messaging_utils.py @@ -0,0 +1,46 @@ +import sys +from abc import ABC, abstractmethod + + +class MessageConsumer(ABC): + """ + Abstraction for adding messages + """ + + @abstractmethod + def add_message(self, message): + pass + + +class MessageConsumerNoop(MessageConsumer): + """ + Implementation of MessageConsumer that skips messages and does nothing. 
+ """ + + def add_message(self, message): + pass + + +class VerboseMessageConsumer(MessageConsumer): + """ + Implementation of MessageConsumer that prints messages surrounding them with [[[ msg ]]] + """ + + def add_message(self, message): + print(f'[[[ {message} ]]]') + + +def get_message_consumer(is_verbose): + if is_verbose: + return VerboseMessageConsumer() + return MESSAGE_CONSUMER_NOOP + + +MESSAGE_CONSUMER_NOOP = MessageConsumerNoop() + + +def print_error(*args, **kwargs): + """ + Shortcut to print error messages to a console + """ + print(*args, file=sys.stderr, **kwargs) diff --git a/rss_parse/utils/parsing_utils.py b/rss_parse/utils/parsing_utils.py new file mode 100644 index 00000000..69a92cf6 --- /dev/null +++ b/rss_parse/utils/parsing_utils.py @@ -0,0 +1,21 @@ +from dateutil import parser as iso_date_parser +from dateutil.parser import ParserError + + +def sanitize_text(txt): + """ + Removes some encoded text from a string + """ + return txt.replace(" ", " ") + + +def to_date(date_str): + """ + Reads a date from a string and converts in to a user timezone + """ + if not date_str: + return None + try: + return iso_date_parser.parse(date_str).astimezone() + except ParserError: + return None diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..5e2a4f13 --- /dev/null +++ b/setup.py @@ -0,0 +1,101 @@ +import re +import shutil + +from setuptools import setup, find_packages +from setuptools.command.develop import develop +from setuptools.command.egg_info import egg_info +from setuptools.command.install import install + +VERSION_FILE = 'rss_parse/__init__.py' + + +def version(): + _version_re = re.compile(r'^\s*__version__\s*=\s*[\'"](.*)[\'"]\s*$') + + with open(VERSION_FILE, 'r') as f: + res = _version_re.search(f.read()) + if res is None: + raise RuntimeError(f"Unable to find version string in {VERSION_FILE}.") + ver = res.group(1) + + return ver + + +fonts_installed = False + + +def install_fdpf_fonts(): + global fonts_installed + if 
class CustomInstallCommand(install):
    """install command that additionally provisions the fpdf fonts."""

    def run(self):
        install.run(self)
        install_fdpf_fonts()


class CustomDevelopCommand(develop):
    """develop (pip install -e) command that additionally provisions the fpdf fonts."""

    def run(self):
        develop.run(self)
        install_fdpf_fonts()


class CustomEggInfoCommand(egg_info):
    """egg_info command that additionally provisions the fpdf fonts."""

    def run(self):
        egg_info.run(self)
        install_fdpf_fonts()


setup(
    name='rss_reader',
    version=version(),
    description='Pure Python command-line RSS reader.',
    author='Aleksandra Khorosheva',
    author_email='Aleksandra_Khorosheva@epam.com',
    zip_safe=False,
    keywords=['RSS Reader', 'RSS Feed Parser'],
    install_requires=[
        'setuptools~=57.0.0',
        'requests~=2.27.1',
        'xmltodict~=0.13.0',
        'fpdf2>=2.5.5',
        'python-dateutil~=2.8.0',
        'html2text>=2020.1.16',
    ],
    python_requires=">=3.8",
    packages=find_packages(),
    # expose the CLI as `rss_reader` after installation
    entry_points={
        'console_scripts': [
            'rss_reader=rss_parse.rss_reader:main_wrapper',
        ],
    },
    # hook font installation into the standard commands
    cmdclass={
        'install': CustomInstallCommand,
        'develop': CustomDevelopCommand,
        'egg_info': CustomEggInfoCommand,
    },
)


# --- tests/it/test_it_arg_parse_types.py ---
import argparse
from datetime import datetime

import pytest

from rss_parse.utils.arg_parse_types import date_YYYYMMDD


def test_date_YYYYMMDD_parsing():
    # a well-formed YYYYMMDD string parses to midnight of that day
    parsed = date_YYYYMMDD("20201010")
    assert parsed == datetime(2020, 10, 10)
date_YYYYMMDD("20201010") + assert actual == datetime(2020, 10, 10) + + +def test_date_YYYYMMDD_invalid_format_error(): + with pytest.raises(argparse.ArgumentTypeError): + date_YYYYMMDD("2020-10-10") diff --git a/tests/unit/test_collection_utils.py b/tests/unit/test_collection_utils.py new file mode 100644 index 00000000..b2ffd524 --- /dev/null +++ b/tests/unit/test_collection_utils.py @@ -0,0 +1,37 @@ +from rss_parse.utils.collection_utils import group_by, merge_by_key + + +def test_group_by_no_key(): + actual = group_by([1, 1, 2, 3, 2]) + assert actual == dict([(1, [1] * 2), (2, [2] * 2), (3, [3])]).items() + + +def test_group_by_modifying_key(): + actual = group_by([1, 1, 2, 3, 2], key=lambda x: x // 2) + assert actual == dict([(0, [1, 1]), (1, [2, 3, 2])]).items() + + +def test_group_by_field(): + def a_b(a, b): + return {"a": a, "b": b} + + actual = group_by([a_b(1, 2), a_b(2, 3), a_b(2, "a"), a_b("c", 3)], key=lambda x: x["a"]) + assert actual == dict([(1, [a_b(1, 2)]), (2, [a_b(2, 3), a_b(2, "a")]), ("c", [a_b("c", 3)])]).items() + + +def test_merge_by_key_distinct(): + actual = merge_by_key([1, 1, 1, 2, 3, 2]) + assert actual == [1, 2, 3] + + +def test_merge_by_key_modifying_key(): + actual = merge_by_key([1, 1, 1, 2, 3, 2, 3, 4], lambda x: x // 2) + assert actual == [1, 3, 4] + + +def test_merge_by_key_field(): + def a_b(a, b): + return {"a": a, "b": b} + + actual = merge_by_key([a_b(1, 2), a_b(1, 3), a_b(1, 4), a_b(2, 3), a_b(3, 2)], key=lambda x: x["a"]) + assert actual == [a_b(1, 4), a_b(2, 3), a_b(3, 2)] diff --git a/tests/unit/test_messaging_utils.py b/tests/unit/test_messaging_utils.py new file mode 100644 index 00000000..7ebfb926 --- /dev/null +++ b/tests/unit/test_messaging_utils.py @@ -0,0 +1,13 @@ +from rss_parse.utils.messaging_utils import print_error + + +def test_print_error_stdout_empty(capfd): + print_error("Not stdout") + out, err = capfd.readouterr() + assert not out + + +def test_print_error_stderr_not_empty(capfd): + 
print_error("Definitely stderr") + out, err = capfd.readouterr() + assert err == "Definitely stderr\n" diff --git a/tests/unit/test_parsing_utils.py b/tests/unit/test_parsing_utils.py new file mode 100644 index 00000000..e5c4ea38 --- /dev/null +++ b/tests/unit/test_parsing_utils.py @@ -0,0 +1,26 @@ +from dateutil.parser import ParserError +from pytest_mock import MockerFixture + +from rss_parse.utils.parsing_utils import sanitize_text, to_date + + +def test_sanitize_text_simple_text_not_changed(): + actual = sanitize_text("abcd xyz 123") + assert actual == "abcd xyz 123" + + +def test_sanitize_text_nbsp_becomes_space(): + actual = sanitize_text("a   c") + assert actual == "a c" + + +def test_to_date_empty_input_none(): + actual = to_date("") + assert actual is None + + +def test_to_date_exception_error(mocker: MockerFixture): + mocker.patch('dateutil.parser.parse', side_effect=ParserError()) + + actual = to_date("1996-04-02 12:12:12") + assert actual is None diff --git a/tests/unit/test_rss_feed_mapper.py b/tests/unit/test_rss_feed_mapper.py new file mode 100644 index 00000000..718e35d2 --- /dev/null +++ b/tests/unit/test_rss_feed_mapper.py @@ -0,0 +1,42 @@ +from datetime import datetime, timezone + +from rss_parse.parse.rss_feed import RssItem, RssFeed +from rss_parse.parse.rss_mapper import RssJsonMapper + + +def test_rss_json_mapper_to_json(): + rss_feed = RssFeed([ + RssItem("title1", "description1", datetime(2020, 11, 11, tzinfo=timezone.utc), "link1", "image_url1", + source="source1"), + RssItem("title2", "description2", datetime(2020, 10, 10, tzinfo=timezone.utc), "link2", "image_url2", + source="source2"), + ]) + expected = '{"item": [' \ + '{"title": "title1", "link": "link1", "pubDate": "2020-11-11 00:00:00", ' \ + '"description": "description1", "image": "image_url1", "source": "source1"}, ' \ + '{"title": "title2", "link": "link2", "pubDate": "2020-10-10 00:00:00", ' \ + '"description": "description2", "image": "image_url2", "source": "source2"}' \ + 
def test_rss_json_mapper_from_json():
    # round-trip target: dates come back timezone-aware, in the local zone
    expected = RssFeed([
        RssItem("title1", "description1", datetime(2020, 11, 11, tzinfo=timezone.utc).astimezone(), "link1",
                "image_url1", source="source1"),
        RssItem("title2", "description2", datetime(2020, 10, 10, tzinfo=timezone.utc).astimezone(), "link2",
                "image_url2", source="source2"),
    ])
    rss_json = (
        '{"item": ['
        '{"title": "title1", "link": "link1", "pubDate": "2020-11-11 00:00:00", '
        '"description": "description1", "image": "image_url1", "source": "source1"}, '
        '{"title": "title2", "link": "link2", "pubDate": "2020-10-10 00:00:00", '
        '"description": "description2", "image": "image_url2", "source": "source2"}'
        ']}'
    )

    deserialized = RssJsonMapper().from_json(rss_json)

    assert deserialized == expected


# --- tests/unit/test_rss_feed_preprocessor.py ---
from datetime import datetime

from rss_parse.parse.rss_feed import RssFeed, RssItem
from rss_parse.preprocessor.rss_preprocessor import RssSortPreprocessor, RssLimitPreprocessor


def rss_item(pub_date):
    """Build a minimal RssItem whose only meaningful field is pub_date."""
    return RssItem("", "", pub_date, "", "")


def test_rss_sort_preprocessor():
    # unsorted feed is reordered newest-first
    newest = rss_item(datetime(2020, 8, 12, 12, 12, 14))
    second = rss_item(datetime(2020, 8, 12, 11, 12, 14))
    third = rss_item(datetime(2019, 10, 28))
    oldest = rss_item(datetime(2018, 10, 28))
    feed = RssFeed([third, oldest, newest, second])

    sorted_feed = RssSortPreprocessor().preprocess(feed)

    assert sorted_feed == RssFeed([newest, second, third, oldest])


def test_rss_limit_preprocessor_shrink():
    # a limit smaller than the feed keeps only the first N items as-is
    i1 = rss_item(datetime(2019, 10, 28))
    i2 = rss_item(datetime(2018, 10, 28))
    i3 = rss_item(datetime(2020, 8, 12, 12, 12, 14))
    i4 = rss_item(datetime(2020, 8, 12, 11, 12, 14))
    feed = RssFeed([i1, i2, i3, i4])

    limited = RssLimitPreprocessor(2).preprocess(feed)

    assert limited == RssFeed([i1, i2])
def test_rss_limit_preprocessor_more_than_max_all():
    # a limit larger than the feed leaves the feed unchanged
    items = [
        rss_item(datetime(2019, 10, 28)),
        rss_item(datetime(2018, 10, 28)),
        rss_item(datetime(2020, 8, 12, 12, 12, 14)),
        rss_item(datetime(2020, 8, 12, 11, 12, 14)),
    ]
    feed = RssFeed(items)

    limited = RssLimitPreprocessor(20).preprocess(feed)

    assert limited == feed