diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..cc5f2f96 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.idea + +dist +RSS_Reader.egg-info + +__pycache__ \ No newline at end of file diff --git a/Narek Arsenyan/README.md b/Narek Arsenyan/README.md new file mode 100644 index 00000000..ef39b92a --- /dev/null +++ b/Narek Arsenyan/README.md @@ -0,0 +1,144 @@ + +# Feed Format + +--- + +**`Feed:`**      Feed source title + +  + +**`Title:`**    Title of the new + +**`Date:`**      Publishing date of the new + +**`Link:`**      Link of the new + +  + +### `Content of the new` + +  + +**`Links:`** + + +`[1]: Link 1` (type of source) + +`[2]: Link 2` (type of source) + +. + +. + +. + +`[n]: Link n` (type of source) + + +--- + +# Feed Format (JSON) + +--- + +```json +{ + "source_title": "Feed source title", + "source_url": "Feed source url", + "title": "Title of the new", + "date": "Publishing date of the new", + "link": "Link of the new", + "content": "Content of the new", + "non_media_links": [ + { + "href": "url of link used in the new", + "link_type": "link type of the link" + } + ], + "media_links": [ + { + "href": "url of media used in the new", + "link_type": "type of media" + } + ] +} +``` + +--- + +# Feed caching + +--- + +Previously fetched feeds are cached in user cache directory. + +**For macOS:** + +>/Users/[USER]/Library/Caches/RSSReader + +**For Linux:** + +>/home/[USER]/.cache/RSSReader + +**For Windows 7:** + +>C:\Users\[USER]\AppData\Local\nmac99\RSSReader\Cache + +## Format of caching + +Fetched feeds are stored in `[DATE].json` files, where `[DATE]` is the date of publication of the feed. + +Inside `.json` file is JSON object where keys are fetched feeds' sources and values are feeds' data list in JSON format. 
+ +**Example:** + +`2022-06-11.json` + +```json +{ + "https://timesofindia.indiatimes.com/rssfeedstopstories.cms": [ + "{\n \"source_title\": Times of India,\n \"source_url\": \"https://timesofindia.indiatimes.com/rssfeedstopstories.cms\",\n \"title\": \"Presidential polls: Mamata invites 22 oppn CMs, leaders for joint meeting on June 15\",\n \"date\": \"2022-06-11T16:22:36+05:30\",\n \"link\": \"https://timesofindia.indiatimes.com/india/presidential-polls-mamata-invites-22-oppn-cms-leaders-for-joint-meeting-on-june-15/articleshow/92146582.cms\",\n \"content\": \"With the Rajya Sabha results exposing dissension and lack of cohesion among opposition parties, West Bengal chief minister Mamata Banerjee on Saturday reached out to her counterparts and other leaders to participate in a meeting in Delhi on June 15 to discuss the upcoming presidential polls, which are scheduled for July 18.\",\n \"non_media_links\": [\n {\n \"href\": \"https://timesofindia.indiatimes.com/india/presidential-polls-mamata-invites-22-oppn-cms-leaders-for-joint-meeting-on-june-15/articleshow/92146582.cms\",\n \"link_type\": \"link\"\n }\n ],\n \"media_links\": []\n}" + ] +} +``` +  + +3 types of cache checks are implemented: + +1. When cache files for dates are exceeding count of 10, the earliest date cache file is deleted +2. When cache sources in one cache file are exceeding count of 10, the first source is deleted with its content +3. When cached feeds in one cache source are exceeding count of 10, the first cached feed in that source is deleted + +  + +When reading from cache, JSON objects are being converted to normalized Feed objects + +--- + +# Feeds conversion + +--- + +Currently, there are **2 types** of conversion available: + +1. HTML +2. EPUB + +  + +You can easily convert your feeds to these 2 formats, whether they are newly fetched or were read from cache. 
+ +Converted files will be saved in your provided directory, however if that directory does not exist, files will be saved +in user data directory. + +**For macOS:** + +>/Users/[USER]/Library/Application Support/RSSReader + +**For Linux:** + +>/home/[USER]/.local/share/RSSReader + +**For Windows 7:** + +>C:\Users\[USER]\AppData\Local\nmac99\RSSReader diff --git a/Narek Arsenyan/__init__.py b/Narek Arsenyan/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Narek Arsenyan/requirements.txt b/Narek Arsenyan/requirements.txt new file mode 100644 index 00000000..bf467ba0 --- /dev/null +++ b/Narek Arsenyan/requirements.txt @@ -0,0 +1,6 @@ +appdirs==1.4.4 +beautifulsoup4==4.11.1 +EbookLib==0.17.1 +requests==2.28.0 +setuptools==61.2.0 +yattag==1.14.0 diff --git a/Narek Arsenyan/rss_reader_package/.DS_Store b/Narek Arsenyan/rss_reader_package/.DS_Store new file mode 100644 index 00000000..5dd04e9b Binary files /dev/null and b/Narek Arsenyan/rss_reader_package/.DS_Store differ diff --git a/Narek Arsenyan/rss_reader_package/__init__.py b/Narek Arsenyan/rss_reader_package/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Narek Arsenyan/rss_reader_package/cache_worker.py b/Narek Arsenyan/rss_reader_package/cache_worker.py new file mode 100644 index 00000000..3de3ff74 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/cache_worker.py @@ -0,0 +1,127 @@ +""" +Module for CacheWorker class + +exports CacheWorker class +""" +import os +import utils.config as config +from appdirs import user_cache_dir +from datetime import datetime +from json import dumps, load +from feed import Feed +from utils.exceptions import CachedFeedNotFoundError + + +class CacheWorker: + """Class for working with feeds caching""" + + appname = config.appname + appauthor = config.appauthor + + @staticmethod + def store_feed_in_cache(feed: Feed): + """ + Function that stores feed in cache files + + Args: + feed: Feed object with all necessary data + """ + + date = 
str(datetime.strptime(feed.date, "%a, %d %b %Y %H:%M:%S %z").date()) + source = feed.source_url + feed_id = feed.link + config.verbose_print("Getting user cache directory", "bold") + cache_dir = user_cache_dir(CacheWorker.appname, CacheWorker.appauthor) + if not os.path.exists(cache_dir): + os.mkdir(cache_dir) + config.verbose_print("Checking user cache directory overload", "und") + cache_files = [] + for (_, __, filenames) in os.walk(cache_dir): + for file in filenames: + if file.endswith(".json"): + cache_files.append(file) + if len(cache_files) > 10: + config.verbose_print("User cache directory is overloaded. Removing old cache file", "warn") + os.remove(os.path.join(cache_dir, f"{cache_files[0]}")) + try: + with open(os.path.join(cache_dir, f"{date}.json"), "r") as cache_file: + config.verbose_print("Reading date cache file", "bold") + try: + cache = load(cache_file) + except Exception as e: + config.verbose_print(f"Warning: JSON not found ({e})", "warn") + cache = dict() + except Exception as e: + config.verbose_print(f"Warning: date cache not found ({e})", "warn") + cache = dict() + config.verbose_print("Checking date cache file overload", "und") + if len(cache.keys()) > 10: + config.verbose_print("Date cache file is overloaded. Removing old cache entry", "warn") + del cache[list(cache.keys())[0]] + try: + with open(os.path.join(cache_dir, f"{date}.json"), "w") as cache_file: + config.verbose_print("Checking if feed is already in cache", "und") + feed_is_in_cache = False + if source in cache: + for feed in cache[source]: + if feed_id in feed: + config.verbose_print("Feed is in cache already", "green") + feed_is_in_cache = True + break + if not feed_is_in_cache: + config.verbose_print("Feed not in cache. 
Storing feed in cache", "bold") + if len(cache[source]) > 10: + cache[source].pop(0) + cache[source].append(feed.to_json()) + else: + cache[source] = [feed.to_json()] + config.verbose_print("Update date cache file", "bold") + cache_file.write(dumps(cache, indent=4)) + except Exception as e: + config.verbose_print(f"Unable to open cache file ({e})", "warn") + + @staticmethod + def read_feed_from_cache(date: str, source: str or None, limit: int or None) -> [Feed]: + """ + Args: + date: for which date cached feed should be read + source: specific source for feed + limit: limit of feeds that should be retrieved + + Returns: + [Feed]: list of fetched feeds from cache + + Raises: + CachedFeedNotFoundError + """ + + config.verbose_print("Opening user cache directory", "bold") + cache_dir = user_cache_dir(CacheWorker.appname, CacheWorker.appauthor) + try: + with open(os.path.join(cache_dir, f"{date}.json"), "r") as cache_file: + try: + cache = load(cache_file) + except Exception as e: + config.verbose_print(f"Cannot read JSON from cache file ({e})", "warn") + cache = dict() + if len(cache.keys()) == 0: + raise CachedFeedNotFoundError("Error: Cached Feed not found") + formatted_cached_feeds = [] + limit_final = 100 + if limit is not None: + limit_final = limit + config.verbose_print("Reading feeds from cache", "bold") + if source is None: + for cached_feed in cache.values(): + if len(formatted_cached_feeds) == limit_final: + break + formatted_cached_feeds.append(Feed.json_to_feed(cached_feed[0])) + else: + for cached_feed in cache[source]: + if len(formatted_cached_feeds) == limit_final: + break + formatted_cached_feeds.append(Feed.json_to_feed(cached_feed)) + return formatted_cached_feeds + except Exception as e: + config.verbose_print(f"Date cache file not found ({e})", "warn") + raise CachedFeedNotFoundError("Error: Cached Feed not found") diff --git a/Narek Arsenyan/rss_reader_package/date.py b/Narek Arsenyan/rss_reader_package/date.py new file mode 100644 index 
00000000..065ad348 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/date.py @@ -0,0 +1,25 @@ +""" +Module for datetime enhancements + +exports valid_date +""" +from argparse import ArgumentTypeError +from datetime import datetime + + +def valid_date(s: str) -> datetime.date: + """ + Parses string with datetime to date format + + Args: + s: datetime containing string + + Returns: + datetime.date: parsed string to date + """ + + try: + return datetime.strptime(s, "%Y%m%d").date() + except ValueError: + msg = "Not a valid date: {0!r}".format(s) + raise ArgumentTypeError(msg) diff --git a/Narek Arsenyan/rss_reader_package/feed.py b/Narek Arsenyan/rss_reader_package/feed.py new file mode 100644 index 00000000..54ecf5d7 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/feed.py @@ -0,0 +1,127 @@ +""" +Module for Feed class + +exports Feed class +""" + +import json +import utils.config as config +from bs4 import BeautifulSoup +from link import Link +from utils.exceptions import ConvertJSONError + + +class Feed: + """Class which describes normalized Feed object""" + def __init__(self, + source_title: str, + source_url: str, + title: str, + date: str, + link: str, + content: str, + non_media_links: list[Link], + media_links: list[Link] + ): + """ + The constructor of Feed class + + Args: + source_title: the title of source from where the feed was fetched + title: the title of the current feed + date: the date of publication of the current feed + link: the link where feed is stored + content: the content of the current feed + non_media_links: non-media links, that are used in the feed + media_links: media links, that are used in the feed + """ + + self.source_title = source_title + self.source_url = source_url + self.title = title + self.date = date.replace("Z", "+00:00") + self.link = link + self.content = content + self.non_media_links = non_media_links + self.media_links = media_links + + def to_json(self): + """ + The function for converting Feed object to JSON 
string + + Returns: + JSON string + + Raises: + ConvertJSONError + """ + + config.verbose_print("Converting feed to JSON format", "blue") + try: + return json.dumps(self, default=lambda o: o.__dict__, indent=4) + except Exception as e: + raise ConvertJSONError(f"Error when converting feed to JSON ({e})") + + def to_readable(self) -> str: + """ + The function for converting Feed type object to readable string + + Returns: + str: Readable string based on Feed object + """ + + config.verbose_print("Converting feed to readable format", "blue") + all_links = self.non_media_links.copy() + all_links.extend(self.media_links) + formatted_links = "" + for i in range(0, len(all_links)): + formatted_links += f"[{i + 1}]: {str(all_links[i])}\n" + if formatted_links == "": + formatted_links = None + + content = BeautifulSoup(self.content, 'lxml') + feed_links = content.find_all("a", href = True) + + for a in content.select("a"): + anchor_index = next((i for i, item in enumerate(feed_links) if item["href"] == a["href"]), None) + anchor_before = f"[link {anchor_index + 1}: " + anchor_after = f"][{anchor_index + 1}]" + a.insert_before(anchor_before) + a.insert_after(anchor_after) + a.unwrap() + + formatted_content = content.get_text() + formatted_feed = \ + f"Feed: {self.source_title}\n\n" + \ + f"Title: {self.title}\n" + \ + f"Date: {self.date}\n" + \ + f"Link: {self.link}\n\n" + \ + f"{formatted_content}\n\n" + \ + f"Links:\n{formatted_links}\n\n" + + return formatted_feed + + @staticmethod + def json_to_feed(json_feed: str): + """ + The function for converting JSON of feed to Feed object + + Args: + json_feed: JSON string of feed + + Returns: + Feed: Feed object + """ + + converted_feed = json.loads(json_feed) + non_media_links = list(map(lambda l: Link(l["href"], l["link_type"]), converted_feed["non_media_links"])) + media_links = list(map(lambda l: Link(l["href"], l["link_type"]), converted_feed["media_links"])) + + return Feed(converted_feed["source_title"], + 
converted_feed["source_url"], + converted_feed["title"], + converted_feed["date"], + converted_feed["link"], + converted_feed["content"], + non_media_links, + media_links) diff --git a/Narek Arsenyan/rss_reader_package/feed_fetcher.py b/Narek Arsenyan/rss_reader_package/feed_fetcher.py new file mode 100644 index 00000000..2d5ef6c4 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/feed_fetcher.py @@ -0,0 +1,75 @@ +""" +Module for FeedFetcher class + +exports FeedFetcher class +""" +import requests +from bs4 import BeautifulSoup +import utils.config as config +from link import Link +from feed import Feed +from cache_worker import CacheWorker +from utils.exceptions import WrongLimitError, WrongUrlError + + +class FeedFetcher: + """Class for fetching feeds and storing feeds""" + + def __init__(self, source: str, limit: int or None): + """ + The constructor for FeedFetcher class + + Args: + source: the source url of rss feeds + limit: limit of the feeds, that must be shown + + Raises: + WrongLimitError + """ + + if limit is not None and limit < 1: + raise WrongLimitError("Argument LIMIT must be greater than 0") + + self.__source = source + self.__limit = limit + self.feeds_formatted = [] + + def fetch_feeds(self): + """The function that fetches and stores feeds using source and limit of object """ + + try: + r = requests.get(self.__source) + soup = BeautifulSoup(r.content, features="xml") + source_title = soup.find("channel").find("title").text + config.verbose_print("Feeds fetched", "green") + if self.__limit is None: + limit = 100 + else: + limit = self.__limit + items = soup.find_all('item')[:limit] + for item in items: + config.verbose_print("Scraping feed title", "bold") + title = item.find('title').text + config.verbose_print("Scraping publish date", "bold") + date = item.find('pubDate').text + config.verbose_print("Scraping feed link", "bold") + link = item.find('link').text + config.verbose_print("Scraping feed content", "bold") + raw_content = 
item.find('description').text + config.verbose_print("Scraping media links", "bold") + media_links_raw = item.find_all("enclosure") + config.verbose_print("Scraping non-media links", "bold") + content = BeautifulSoup(raw_content, 'lxml') + feed_links_raw = content.find_all("a", href=True) + media_links = list() + feed_links = list() + for feed_link in feed_links_raw: + feed_links.append(Link(feed_link['href'], 'link')) + for media_link in media_links_raw: + media_links.append(Link(media_link["url"], media_link["type"].split("/")[0])) + config.verbose_print("Storing fetched feeds in cache", "bold") + formatted_feed = Feed(source_title, self.__source, title, date, link, raw_content, feed_links, media_links) + CacheWorker.store_feed_in_cache(formatted_feed) + self.feeds_formatted.append(formatted_feed) + except Exception: + raise WrongUrlError("Specified url is not rss type") diff --git a/Narek Arsenyan/rss_reader_package/format_converter.py b/Narek Arsenyan/rss_reader_package/format_converter.py new file mode 100644 index 00000000..87ec1e4d --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/format_converter.py @@ -0,0 +1,198 @@ +""" +Module for converting feeds to specific format + +exports FormatConverter +""" + +import os +import uuid +import utils.config as config +from utils.config import print +from appdirs import user_data_dir +from yattag import Doc, indent +from ebooklib import epub +from datetime import datetime +from feed import Feed +from utils.count_files import count_files_by_type +from utils.exceptions import NotSupportedConversionFormat + +SUPPORTED_FORMATS = ["html", "epub"] + +# Global css that is used for html conversion +CSS = """ +#feed-images-container { width: 100%; overflow-x: auto; display: flex; flex-direction: row; } +img { height: 200px; width: auto; margin: 0 10px; } +""" + + +class FormatConverter: + """ + Class for working with conversion of feeds + """ + @staticmethod + def convert_feeds(feeds: [Feed], convert_format: str or None, 
destination_path: str or None, print_json: bool): + """ + Function that processes conversion. Does generic part, which is used in all conversion types + + Args: + feeds: Feed objects, that should be converted + convert_format: conversion format. Supported formats are "epub" and "html" + destination_path: location of file, where converted feeds should take place. If destination_path is not + provided, user data directory will be used as location (See DOCS) + print_json: if feeds should be printed in JSON format in stdout during conversion + + Raises: + NotSupportedConversionFormat + """ + + config.verbose_print(f"Starting conversion to '{convert_format}'", "bold") + if convert_format is None or convert_format not in SUPPORTED_FORMATS: + raise NotSupportedConversionFormat(f"Conversion format '{convert_format}' is not supported") + else: + if destination_path is None or not os.path.exists(destination_path): + print("Warning: Specified directory is wrong or does not exist. " + "Falling back for user data directory", "warn") + destination = user_data_dir(config.appname, config.appauthor) + else: + destination = destination_path + if convert_format == "html": + FormatConverter.feeds_to_html(feeds, destination) + elif convert_format == "epub": + FormatConverter.feeds_to_epub(feeds, destination) + if print_json: + config.verbose_print("Printing feeds in JSON format in stdout", "bold") + for feed in feeds: + print(feed.to_json()) + + @staticmethod + def feeds_to_html(feeds: [Feed], destination_path: str): + """ + Function that converts feeds to html format file in specified directory + + Args: + feeds: Feed objects, that will be included in html file + destination_path: location, where converted html file will be saved + """ + + config.verbose_print("Creating html file for feeds", "bold") + html_files_count = count_files_by_type(destination_path, ".html") + doc, tag, text = Doc().tagtext() + with tag("html"): + with tag("head"): + with doc.tag('style', type='text/css'): + 
config.verbose_print("Applying css", "bold") + doc.asis(CSS) + doc.stag("meta", charset="UTF-8") + with tag("body"): + config.verbose_print("Appending feeds to html", "bold") + for feed in feeds: + doc.asis(FormatConverter.single_feed_to_html_content(feed, False)) + config.verbose_print("Saving html file", "bold") + with open(os.path.join(destination_path, f"Feeds {html_files_count + 1}.html"), "w") as html_file: + html_file.write(indent(doc.getvalue())) + print(f"Html file saved in {destination_path}", "green") + + @staticmethod + def feeds_to_epub(feeds: [Feed], destination_path: str): + """ + Function that converts feeds to epub format file in specified directory + + Args: + feeds: Feed objects, that will be included in html file + destination_path: location, where converted html file will be saved + """ + + config.verbose_print("Creating html file for feeds", "bold") + epub_files_count = count_files_by_type(destination_path, ".epub") + new_file_name = f"Feeds {epub_files_count + 1}" + book = epub.EpubBook() + + book.set_identifier(str(uuid.uuid4())) + book.set_title(new_file_name) + + book.add_author(config.appauthor) + + spine = ["nav"] + + config.verbose_print("Creating chapters for epub book", "bold") + for i in range(0, len(feeds)): + doc, tag, text = Doc().tagtext() + with tag("html"): + with tag("head"): + with doc.tag('style', type='text/css'): + doc.asis(CSS) + with tag("body"): + doc.asis(FormatConverter.single_feed_to_html_content(feeds[i], True)) + chapter = epub.EpubHtml(title=feeds[i].title, file_name=f"chap_{i + 1}.xhtml") + chapter.content = indent(doc.getvalue()) + book.add_item(chapter) + spine.append(chapter) + + config.verbose_print("Creating Table of Contents", "bold") + book.toc = tuple(spine[1::]) + + book.add_item(epub.EpubNcx()) + book.add_item(epub.EpubNav()) + + book.spine = spine + + config.verbose_print("Saving epub book", "bold") + epub.write_epub(os.path.join(destination_path, new_file_name + ".epub"), book, {}) + print(f"Epub book 
saved in {destination_path}", "green") + + @staticmethod + def single_feed_to_html_content(feed: Feed, for_book: bool) -> str: + """ + Function that converts single Feed object to html body content. Used for both .epub and .html file conversions + + Args: + feed: Feed object + for_book: argument, that specifies if converting is for book type conversion or no + + Returns: + str: Feed object converted to html string, which contains Feed data + """ + + config.verbose_print(f"Creating feed content in html format (Feed: {feed.title})", "bold") + doc, tag, text = Doc().tagtext() + with tag("h3"): + text(feed.title) + with tag("p"): + text(str(datetime.strptime(feed.date, "%a, %d %b %Y %H:%M:%S %z").__format__("%d.%m.%Y, %H:%M"))) + doc.stag("br") + if not for_book: + with tag("div", id="feed-images-container"): + if len(feed.media_links) < 1: + text("No media was provided") + else: + for media_link in feed.media_links: + if media_link.link_type == "image": + doc.stag("img", src=media_link.href, klass="feed-image") + if media_link.link_type == "audio": + doc.stag("audio", controls=True, src=media_link.href) + if media_link.link_type == "video": + with tag("video", controls=True): + doc.stag("source", src=media_link.href) + doc.stag("br") + doc.asis(feed.content) + doc.stag("br") + with tag("a", href=feed.link, target="_blank"): + text("Read full article") + doc.stag("br") + doc.stag("br") + with tag("span"): + with tag("strong"): + text("Related links:") + with tag("div"): + for i in range(0, len(feed.non_media_links)): + with tag("div"): + with tag("span"): + text(f"[{i + 1}]") + with tag("a", href=feed.non_media_links[i].href, target="_blank"): + text(feed.non_media_links[i].href) + if not for_book: + for i in range(0, 5): + doc.stag("br") + doc.stag("hr") + + return doc.getvalue() diff --git a/Narek Arsenyan/rss_reader_package/link.py b/Narek Arsenyan/rss_reader_package/link.py new file mode 100644 index 00000000..0da6c2f2 --- /dev/null +++ b/Narek 
Arsenyan/rss_reader_package/link.py @@ -0,0 +1,34 @@ +""" +Module for Link class + +exports Link class +""" + +from utils.exceptions import LinkWithNoSourceError + + +class Link: + """Class for specifying single Link object""" + def __init__(self, href, link_type: str = "unknown"): + """ + The constructor of Link class + + Args: + href: source of the link + link_type: type of the source + """ + + if href is None or type(href) is not str: + raise LinkWithNoSourceError("Provided source is missing or of wrong type") + self.href = href + self.link_type = link_type + + def __str__(self) -> str: + """ + The function for overriding string conversion behaviour + + Returns: + str: formatted string version of Link object + """ + + return f"{self.href} ({self.link_type})" diff --git a/Narek Arsenyan/rss_reader_package/mocks/__init__.py b/Narek Arsenyan/rss_reader_package/mocks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Narek Arsenyan/rss_reader_package/mocks/raw_feed_mock.py b/Narek Arsenyan/rss_reader_package/mocks/raw_feed_mock.py new file mode 100644 index 00000000..2ac416da --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/mocks/raw_feed_mock.py @@ -0,0 +1,3 @@ +"""Module for raw feed mock data""" + +mock_feed_raw = "{\n \"source_title\": \"The Daily\",\n \"source_url\": \"http://rss.art19.com/the-daily\",\n \"title\": \"Why Is It So Hard to Buy a House in America Right Now?\",\n \"date\": \"Tue, 21 Jun 2022 09:50:00 +0000\",\n \"link\": \"https://www.nytimes.com/the-daily\",\n \"content\": \"

This episode contains strong language.

When Drew Mena and Amena Sengal decided last year to sell their home in New York and relocate their young family to Austin, Texas, they figured they\\u2019d have no problem.

What they hadn\\u2019t realized was that, across the country, home prices \\u2014 and competition to secure properties \\u2014 had risen to jaw-dropping levels.

Guest: Francesca Mari, a contributing writer for The New York Times Magazine and a fellow at the think tank New America.

Want more from The Daily? For one big idea on the news each week from our team, subscribe to our newsletter.\\u00a0

Background reading:\\u00a0

For more information on today\\u2019s episode, visit nytimes.com/thedaily. Transcripts of each episode will be made available by the next workday.\\u00a0

\\n\",\n \"non_media_links\": [\n {\n \"href\": \"https://www.nytimes.com/newsletters/the-daily?module=inline\",\n \"link_type\": \"link\"\n },\n {\n \"href\": \"https://www.nytimes.com/2021/11/12/magazine/real-estate-pandemic.html\",\n \"link_type\": \"link\"\n },\n {\n \"href\": \"http://nytimes.com/thedaily?smid=pc-thedaily\",\n \"link_type\": \"link\"\n }\n ],\n \"media_links\": [\n {\n \"href\": \"https://dts.podtrac.com/redirect.mp3/chrt.fm/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/03d8b493-87fc-4bd1-931f-8a8e9b945d8a/episodes/230797bf-6d47-4648-81b5-79750b8d8023/audio/128/default.mp3?aid=rss_feed&awCollectionId=03d8b493-87fc-4bd1-931f-8a8e9b945d8a&awEpisodeId=230797bf-6d47-4648-81b5-79750b8d8023&feed=54nAGcIl\",\n \"link_type\": \"audio\"\n }\n ]\n}" \ No newline at end of file diff --git a/Narek Arsenyan/rss_reader_package/rss_reader.py b/Narek Arsenyan/rss_reader_package/rss_reader.py new file mode 100644 index 00000000..e4031f2c --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/rss_reader.py @@ -0,0 +1,122 @@ +""" +Main module of rss_reader + +It handles parsing arguments, prints version, sets verbose mode, initializes FeedFetcher and fetches feeds +""" +import os +import argparse +from utils import config +from utils.config import print +from utils.version import __version__ +from feed_fetcher import FeedFetcher +from cache_worker import CacheWorker +from format_converter import FormatConverter +from date import valid_date +from utils.exceptions import WrongLimitError,\ + ConvertJSONError,\ + WrongUrlError,\ + LinkWithNoSourceError,\ + NotSupportedConversionFormat + + +my_parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader') + +my_parser.add_argument('source', + metavar='source', + nargs='?', + type=str, + help='RSS URL') +my_parser.add_argument("--version", + help="Prints version info", + action="store_true") +my_parser.add_argument("--date", + action="store", + type=valid_date, + help="Fetches feeds from cache by 
specified date") +my_parser.add_argument("--limit", + action="store", + type=int, + help="Limits news topics if this parameter is provided") +my_parser.add_argument("--json", + action="store_true", + help="Prints result as JSON in stdout") +my_parser.add_argument("--to-html", + action="store", + type=str, + help="Converts feeds to html format in specified directory. \ + If specified directory does not exist, file will be created in application's data directory. \ + See DOCS for further info") +my_parser.add_argument("--to-epub", + action="store", + type=str, + help="Converts feeds to epub format in specified directory. \ + If specified directory does not exist, file will be created in application's data directory. \ + See DOCS for further info") +my_parser.add_argument("--verbose", + action="store_true", + help="Outputs verbose status messages") +my_parser.add_argument("--colorize", + action="store_true", + help="Enables colorized output for stdout") + +args = my_parser.parse_args() + +if args.verbose: + config.verbose_print = print +if args.colorize: + config.COLORIZED_MODE = True + + +def rss_reader_func(): + if args.version: + print(f"Version {__version__}", "pink") + return + convert_dir = args.to_html + if args.to_html and not os.path.isdir(args.to_html) or args.to_epub and not os.path.isdir(args.to_epub): + convert_dir = None + if args.date: + cached_feeds = CacheWorker.read_feed_from_cache(args.date, args.source, args.limit) + if args.to_html or args.to_epub: + if args.to_html: + FormatConverter.convert_feeds(cached_feeds, "html", convert_dir, args.json) + if args.to_epub: + FormatConverter.convert_feeds(cached_feeds, "epub", convert_dir, args.json) + return + for feed in cached_feeds: + if args.json: + print(feed.to_json()) + else: + print(feed.to_readable()) + return + config.verbose_print("Initializing url and limit", "pink") + try: + feed_fetcher = FeedFetcher(args.source, args.limit) + config.verbose_print("Fetching feeds...", "bold") + 
feed_fetcher.fetch_feeds() + if args.to_html or args.to_epub: + if args.to_html: + FormatConverter.convert_feeds(feed_fetcher.feeds_formatted, "html", convert_dir, args.json) + if args.to_epub: + FormatConverter.convert_feeds(feed_fetcher.feeds_formatted, "epub", convert_dir, args.json) + return + for feed in feed_fetcher.feeds_formatted: + if args.json: + print(feed.to_json()) + else: + print(feed.to_readable()) + except WrongLimitError as message: + print(str(message), "err") + except WrongUrlError as message: + print(str(message), "err") + except ConvertJSONError as message: + print(str(message), "err") + except LinkWithNoSourceError as message: + print(str(message), "err") + except NotSupportedConversionFormat as message: + print(str(message), "err") + except Exception as e: + print(f"Error ({e})", "err") + + +if __name__ == "__main__": + rss_reader_func() diff --git a/Narek Arsenyan/rss_reader_package/tests/__init__.py b/Narek Arsenyan/rss_reader_package/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Narek Arsenyan/rss_reader_package/tests/test_count_files.py b/Narek Arsenyan/rss_reader_package/tests/test_count_files.py new file mode 100644 index 00000000..7e85cbaf --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/tests/test_count_files.py @@ -0,0 +1,28 @@ +import unittest +import os +import uuid +import shutil +from appdirs import user_cache_dir +from rss_reader_package.utils.count_files import count_files_by_type + + +class TestCountFiles(unittest.TestCase): + def test_count_files_by_type(self): + test_dir = os.path.join(user_cache_dir(), "Test") + os.mkdir(test_dir) + for i in range(5): + with open(os.path.join(test_dir, f"{uuid.uuid4()}.txt"), "w") as temp_file: + temp_file.write("") + self.assertEqual(count_files_by_type(test_dir, ".txt"), 5) + self.assertEqual(count_files_by_type(test_dir, None), 5) + self.assertEqual(count_files_by_type(test_dir, ".html"), 0) + for i in range(3): + with open(os.path.join(test_dir, 
f"{uuid.uuid4()}.tmp"), "w") as temp_file: + temp_file.write("") + self.assertEqual(count_files_by_type(test_dir, ".tmp"), 3) + self.assertEqual(count_files_by_type(test_dir, None), 8) + self.assertEqual(count_files_by_type(test_dir, ".txt"), 5) + shutil.rmtree(test_dir) + + +unittest.main() diff --git a/Narek Arsenyan/rss_reader_package/tests/test_date.py b/Narek Arsenyan/rss_reader_package/tests/test_date.py new file mode 100644 index 00000000..a6b6012c --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/tests/test_date.py @@ -0,0 +1,15 @@ +import unittest +from argparse import ArgumentTypeError +from rss_reader_package.date import valid_date + + +class TestDate(unittest.TestCase): + def test_valid_date(self): + self.assertEqual(str(valid_date("20210730")), "2021-07-30") + with self.assertRaises(ArgumentTypeError): + valid_date("word") + with self.assertRaises(ArgumentTypeError): + valid_date(valid_date("30071999")) + + +unittest.main() diff --git a/Narek Arsenyan/rss_reader_package/tests/test_feed.py b/Narek Arsenyan/rss_reader_package/tests/test_feed.py new file mode 100644 index 00000000..3c645809 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/tests/test_feed.py @@ -0,0 +1,30 @@ +import unittest +from rss_reader_package.mocks.raw_feed_mock import mock_feed_raw +from rss_reader_package.feed import Feed + + +class TestFeed(unittest.TestCase): + parsed_feed = Feed.json_to_feed(mock_feed_raw) + + def test_process_feed(self): + self.assertEqual(TestFeed.parsed_feed.source_title, "The Daily") + self.assertEqual(TestFeed.parsed_feed.title, "Why Is It So Hard to Buy a House in America Right Now?") + self.assertEqual(TestFeed.parsed_feed.date, "Tue, 21 Jun 2022 09:50:00 +0000") + self.assertEqual(TestFeed.parsed_feed.link, + "https://www.nytimes.com/the-daily") + self.assertEqual(len(TestFeed.parsed_feed.content), 1139) + self.assertEqual(len(TestFeed.parsed_feed.non_media_links), 3) + self.assertEqual(TestFeed.parsed_feed.media_links[0].href, + 
"https://dts.podtrac.com/redirect.mp3/chrt.fm/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/" + "03d8b493-87fc-4bd1-931f-8a8e9b945d8a/episodes/230797bf-6d47-4648-81b5-79750b8d8023/audio/" + "128/default.mp3?aid=rss_feed&awCollectionId=03d8b493-87fc-4bd1-931f-8a8e9b945d8a&awEpisodeId=" + "230797bf-6d47-4648-81b5-79750b8d8023&feed=54nAGcIl") + + def test_to_json(self): + self.assertEqual(type(TestFeed.parsed_feed.to_json()), str) + + def test_to_readable(self): + self.assertEqual(type(TestFeed.parsed_feed.to_json()), str) + + +unittest.main() diff --git a/Narek Arsenyan/rss_reader_package/tests/test_feed_fetcher.py b/Narek Arsenyan/rss_reader_package/tests/test_feed_fetcher.py new file mode 100644 index 00000000..3b0ed4f4 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/tests/test_feed_fetcher.py @@ -0,0 +1,21 @@ +import unittest +from rss_reader_package.feed_fetcher import FeedFetcher +from rss_reader_package.utils.exceptions import WrongLimitError, WrongUrlError + + +class TestFeedFetcher(unittest.TestCase): + def test_feed_fetcher(self): + with self.assertRaises(WrongLimitError): + FeedFetcher("source", -1) + with self.assertRaises(WrongUrlError): + ff = FeedFetcher("source", None) + ff.fetch_feeds() + ff2 = FeedFetcher("https://timesofindia.indiatimes.com/rssfeedstopstories.cms", 1) + ff2.fetch_feeds() + self.assertEqual(len(ff2.feeds_formatted), 1) + ff3 = FeedFetcher("https://timesofindia.indiatimes.com/rssfeedstopstories.cms", 3) + ff3.fetch_feeds() + self.assertEqual(len(ff3.feeds_formatted), 3) + + +unittest.main() diff --git a/Narek Arsenyan/rss_reader_package/tests/test_format_converter.py b/Narek Arsenyan/rss_reader_package/tests/test_format_converter.py new file mode 100644 index 00000000..b7ec87f9 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/tests/test_format_converter.py @@ -0,0 +1,34 @@ +import os +import shutil +import unittest +from rss_reader_package.format_converter import FormatConverter +from rss_reader_package.mocks.raw_feed_mock 
import mock_feed_raw +from rss_reader_package.feed import Feed +from appdirs import user_data_dir +from rss_reader_package.utils.count_files import count_files_by_type +from rss_reader_package.utils.exceptions import NotSupportedConversionFormat + + +class TestFormatConverter(unittest.TestCase): + parsed_feed = Feed.json_to_feed(mock_feed_raw) + + def test_convert_feeds(self): + user_dir = user_data_dir() + with self.assertRaises(NotSupportedConversionFormat): + FormatConverter.convert_feeds([TestFormatConverter.parsed_feed], "my_format", None, False) + os.mkdir(os.path.join(user_dir, "Test")) + FormatConverter.convert_feeds([TestFormatConverter.parsed_feed], "html", os.path.join(user_dir, "Test"), False) + self.assertEqual(count_files_by_type(os.path.join(user_dir, "Test"), ".html"), 1) + shutil.rmtree(os.path.join(user_dir, "Test")) + os.mkdir(os.path.join(user_dir, "Test")) + FormatConverter.convert_feeds([TestFormatConverter.parsed_feed], "epub", os.path.join(user_dir, "Test"), False) + self.assertEqual(count_files_by_type(os.path.join(user_dir, "Test"), ".epub"), 1) + shutil.rmtree(os.path.join(user_dir, "Test")) + + def test_single_feed_to_html_content(self): + self.assertEqual(type(FormatConverter.single_feed_to_html_content(TestFormatConverter.parsed_feed, True)), str) + self.assertEqual( + type(FormatConverter.single_feed_to_html_content(TestFormatConverter.parsed_feed, False)), str) + + +unittest.main() diff --git a/Narek Arsenyan/rss_reader_package/tests/test_link.py b/Narek Arsenyan/rss_reader_package/tests/test_link.py new file mode 100644 index 00000000..2050518d --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/tests/test_link.py @@ -0,0 +1,18 @@ +import unittest +from rss_reader_package.link import Link +from rss_reader_package.utils.exceptions import LinkWithNoSourceError + + +class TestLink(unittest.TestCase): + def test_link(self): + link = Link("source", "image") + self.assertEqual(link.href, "source") + self.assertEqual(link.link_type, 
"image") + self.assertEqual(str(link), "source (image)") + link2 = Link("source") + self.assertEqual(link2.link_type, "unknown") + with self.assertRaises(LinkWithNoSourceError): + Link(True) + + +unittest.main() diff --git a/Narek Arsenyan/rss_reader_package/utils/__init__.py b/Narek Arsenyan/rss_reader_package/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Narek Arsenyan/rss_reader_package/utils/config.py b/Narek Arsenyan/rss_reader_package/utils/config.py new file mode 100644 index 00000000..8f909526 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/utils/config.py @@ -0,0 +1,37 @@ +""" +Module for application internal configs + +exports appname, appauthor, verbose_print +""" +import builtins as __builtin__ + +# E731 do not assign a lambda expression, use a def +# Breaking the rule, because it will lead to less readability and more code +appname = "RSSReader" +appauthor = "nmac99" + +COLORIZED_MODE = False +verbose_print = lambda *a, **k: None + + +COLORS = { + "pink": "\033[95m", + "blue": "\033[94m", + "cyan": "\033[96m", + "green": "\033[92m", + "warn": "\033[93m", + "err": "\033[91m", + "bold": "\033[1m", + "und": "\033[4m", + "none": "" +} + + +def print(text: str, type: str = "none"): + """Custom print() function to handle colorized mode""" + if not COLORIZED_MODE: + return __builtin__.print(text) + else: + endc = "\033[0m" + formatted_text = f"{COLORS[type]}{text}{endc}" + return __builtin__.print(formatted_text) diff --git a/Narek Arsenyan/rss_reader_package/utils/count_files.py b/Narek Arsenyan/rss_reader_package/utils/count_files.py new file mode 100644 index 00000000..6736ebfc --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/utils/count_files.py @@ -0,0 +1,30 @@ +""" +Module for counting files in directory + +exports count_files_by_type +""" +import os + + +def count_files_by_type(dir_path: str, file_type: str or None) -> int: + """ + Function that counts files by its type. 
If type is not provided, will count all files + + Args: + dir_path: path of directory where files should be counted + file_type: file extension that should be considered for counting + + Returns: + int: number of files after counting + """ + + files = [] + for (_, __, filenames) in os.walk(dir_path): + if file_type is None: + files.extend(filenames) + else: + for file in filenames: + if file.endswith(file_type): + files.append(file) + + return len(files) diff --git a/Narek Arsenyan/rss_reader_package/utils/exceptions.py b/Narek Arsenyan/rss_reader_package/utils/exceptions.py new file mode 100644 index 00000000..ae6ed9e2 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/utils/exceptions.py @@ -0,0 +1,40 @@ +""" +Module for custom Errors + +exports WrongUrlError, WrongLimitError, ConvertJSONError, LinkWithNoSourceError, CachedFeedNotFoundError, NotSupportedConversionFormat +""" + + +class Error(Exception): + """Base class for custom errors""" + pass + + +class WrongUrlError(Error): + """Error for wrong url specification""" + pass + + +class WrongLimitError(Error): + """Error for wrong limit specification""" + pass + + +class ConvertJSONError(Error): + """Error for JSON converting issues""" + pass + + +class LinkWithNoSourceError(Error): + """Error for not providing source to Link object""" + pass + + +class CachedFeedNotFoundError(Error): + """Error for not found feed cache""" + pass + + +class NotSupportedConversionFormat(Error): + """Error for not supported conversion formats""" + pass diff --git a/Narek Arsenyan/rss_reader_package/utils/version.py b/Narek Arsenyan/rss_reader_package/utils/version.py new file mode 100644 index 00000000..1057eea9 --- /dev/null +++ b/Narek Arsenyan/rss_reader_package/utils/version.py @@ -0,0 +1,6 @@ +""" +Module for application versioning + +exports __version__ +""" +__version__ = "1.5.0" diff --git a/Narek Arsenyan/setup.cfg b/Narek Arsenyan/setup.cfg new file mode 100644 index 00000000..6d7c590e --- /dev/null +++ b/Narek Arsenyan/setup.cfg @@ -0,0 +1,25 @@ 
+[pycodestyle] +count = False +ignore = E731 +max-line-length = 120 +statistics = True + +[metadata] +name = rss_reader +version = 1.5.0 + +[options] +packages = + rss_reader_package + rss_reader_package.utils +install_requires = + appdirs + yattag + EbookLib + beautifulsoup4 + requests + +[options.entry_points] +console_scripts = + rss-reader = rss_reader_package.rss_reader:rss_reader_func + diff --git a/Narek Arsenyan/setup.py b/Narek Arsenyan/setup.py new file mode 100644 index 00000000..37417ed7 --- /dev/null +++ b/Narek Arsenyan/setup.py @@ -0,0 +1,58 @@ +from setuptools import find_packages, setup +import os + +# Optional rss_reader_package description in README.md: +current_directory = os.path.dirname(os.path.abspath(__file__)) + +try: + with open( + os.path.join(current_directory, "README.md"), + encoding="utf-8") as f: + long_description = f.read() +except Exception as e: + print(e) + long_description = "" +setup( + # Project name: + name="RSS Reader", + + # Packages to include in the distribution: + packages=find_packages(), + + # Project version number: + version="1.5.0", + + # List a license for the project, eg. 
MIT License + license="MIT License", + + # Short description of your library: + description="Pure Python command-line RSS reader", + + # Long description of your library: + long_description=long_description, + long_description_content_type='text/markdown', + + # Your name: + author="Narek Arsenyan", + + # Your email address: + author_email="narekarsenyan99@gmail.com", + + # Link to your github repository or website: + url="https://github.com/NMac99", + + # Download Link from where the project can be downloaded from: + download_url="", + + # List of keywords: + keywords=[], + + # List project dependencies: + install_requires=["appdirs", "yattag", "EbookLib", "beautifulsoup4", "requests"], + + # https://pypi.org/classifiers/ + classifiers=[], + entry_points={"console_scripts": [ + "rss-reader=rss_reader_package.rss_reader:rss_reader_func" + ]} + )