From dd5aa41471f0a06ca106a749990c9a869b5bb854 Mon Sep 17 00:00:00 2001 From: NMac99 Date: Mon, 23 May 2022 17:19:38 +0400 Subject: [PATCH 1/9] ADD gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..485dee64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea From ea8b21ce2ea1109994ba770e5a009df6e1432d72 Mon Sep 17 00:00:00 2001 From: narekarsenyan Date: Sat, 18 Jun 2022 00:14:10 +0400 Subject: [PATCH 2/9] FINISH iterations 1,2,3,4,5 --- .gitignore | 5 + DOCS.md | 144 +++++++++++++ requirements.txt | 5 + rss_reader_package/__init__.py | 0 rss_reader_package/cache_worker.py | 124 +++++++++++ rss_reader_package/date.py | 25 +++ rss_reader_package/feed.py | 181 ++++++++++++++++ rss_reader_package/feed_fetcher.py | 57 +++++ rss_reader_package/format_converter.py | 194 ++++++++++++++++++ rss_reader_package/link.py | 34 +++ rss_reader_package/mocks/raw_feed_mock.py | 16 ++ rss_reader_package/rss_reader.py | 122 +++++++++++ rss_reader_package/tests/test_count_files.py | 28 +++ rss_reader_package/tests/test_date.py | 15 ++ rss_reader_package/tests/test_feed.py | 32 +++ rss_reader_package/tests/test_feed_fetcher.py | 21 ++ .../tests/test_format_converter.py | 35 ++++ rss_reader_package/tests/test_link.py | 18 ++ rss_reader_package/utils/__init__.py | 0 rss_reader_package/utils/config.py | 37 ++++ rss_reader_package/utils/count_files.py | 32 +++ rss_reader_package/utils/exceptions.py | 40 ++++ rss_reader_package/utils/version.py | 6 + setup.cfg | 24 +++ setup.py | 58 ++++++ 25 files changed, 1253 insertions(+) create mode 100644 DOCS.md create mode 100644 requirements.txt create mode 100644 rss_reader_package/__init__.py create mode 100644 rss_reader_package/cache_worker.py create mode 100644 rss_reader_package/date.py create mode 100644 rss_reader_package/feed.py create mode 100644 rss_reader_package/feed_fetcher.py create mode 100644 
rss_reader_package/format_converter.py create mode 100644 rss_reader_package/link.py create mode 100644 rss_reader_package/mocks/raw_feed_mock.py create mode 100644 rss_reader_package/rss_reader.py create mode 100644 rss_reader_package/tests/test_count_files.py create mode 100644 rss_reader_package/tests/test_date.py create mode 100644 rss_reader_package/tests/test_feed.py create mode 100644 rss_reader_package/tests/test_feed_fetcher.py create mode 100644 rss_reader_package/tests/test_format_converter.py create mode 100644 rss_reader_package/tests/test_link.py create mode 100644 rss_reader_package/utils/__init__.py create mode 100644 rss_reader_package/utils/config.py create mode 100644 rss_reader_package/utils/count_files.py create mode 100644 rss_reader_package/utils/exceptions.py create mode 100644 rss_reader_package/utils/version.py create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 485dee64..cc5f2f96 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,6 @@ .idea + +dist +RSS_Reader.egg-info + +__pycache__ \ No newline at end of file diff --git a/DOCS.md b/DOCS.md new file mode 100644 index 00000000..2000c214 --- /dev/null +++ b/DOCS.md @@ -0,0 +1,144 @@ + +# Feed Format + +--- + +**`Feed:`**      Feed source title + +  + +**`Title:`**    Title of the new + +**`Date:`**      Publishing date of the new + +**`Link:`**      Link of the new + +  + +### `Content of the new` + +  + +**`Links:`** + + +`[1]: Link 1` (type of source) + +`[2]: Link 2` (type of source) + +. + +. + +. 
+ +`[n]: Link n` (type of source) + + +--- + +# Feed Format (JSON) + +--- + +```json +{ + "source_title": "Feed source title", + "title": "Title of the new", + "date": "Publishing date of the new", + "link": "Link of the new", + "content": "Content of the new", + "non_media_links": [ + { + "href": "url of link used in the new", + "link_type": "link type of the link" + } + ], + "media_links": [ + { + "href": "url of media used in the new", + "link_type": "type of media" + } + ] +} +``` + +--- + +# Feed caching + +--- + +Previously fetched feeds are cached in user cache directory. + +**For macOS:** + +>/Users/[USER]/Library/Caches/RSSReader + +**For Linux:** + +>/home/[USER]/.cache/RSSReader + +**For Windows 7:** + +>C:\Users\[USER]\AppData\Local\nmac99\RSSReader\Cache + +## Format of caching + +Fetched feeds are stored in `[DATE].json` files, where `[DATE]` is the date of publication of the feed. + +Inside `.json` file is JSON object where keys are fetched feeds' sources and values are feeds' data list in JSON format. 
+ +**Example:** + +`2022-06-11.json` + +```json +{ + "https://timesofindia.indiatimes.com/rssfeedstopstories.cms": [ + "{\n \"source_title\": null,\n \"title\": \"Presidential polls: Mamata invites 22 oppn CMs, leaders for joint meeting on June 15\",\n \"date\": \"2022-06-11T16:22:36+05:30\",\n \"link\": \"https://timesofindia.indiatimes.com/india/presidential-polls-mamata-invites-22-oppn-cms-leaders-for-joint-meeting-on-june-15/articleshow/92146582.cms\",\n \"content\": \"With the Rajya Sabha results exposing dissension and lack of cohesion among opposition parties, West Bengal chief minister Mamata Banerjee on Saturday reached out to her counterparts and other leaders to participate in a meeting in Delhi on June 15 to discuss the upcoming presidential polls, which are scheduled for July 18.\",\n \"non_media_links\": [\n {\n \"href\": \"https://timesofindia.indiatimes.com/india/presidential-polls-mamata-invites-22-oppn-cms-leaders-for-joint-meeting-on-june-15/articleshow/92146582.cms\",\n \"link_type\": \"link\"\n }\n ],\n \"media_links\": []\n}" + ] +} +``` +  + +3 types of cache checks are implemented: + +1. When cache files for dates are exceeding count of 10, the earliest date cache file is deleted +2. When cache sources in one cache file are exceeding count of 10, the first source is deleted with its content +3. When cached feeds in one cache source are exceeding count of 10, the first cached feed in that source is deleted + +  + +When reading from cache, JSON objects are being converted to normalized Feed objects + +--- + +# Feeds conversion + +--- + +Currently, there are **2 types** of conversion available: + +1. HTML +2. EPUB + +  + +You can easily convert your feeds to these 2 formats, whether they are newly fetched or were read from cache. + +Converted files will be saved in your provided directory, however if that directory does not exist, files will be saved +in user data directory. 
class CacheWorker:
    """Class for working with feeds caching.

    Cache layout (see DOCS.md): one ``[DATE].json`` file per publication date
    in the user cache directory; inside, a JSON object mapping feed source
    URLs to lists of cached feed JSON strings. Three overflow checks each cap
    their level at 10 entries.
    """

    appname = config.appname
    appauthor = config.appauthor

    @staticmethod
    def store_feed_in_cache(date: str, source: str, json: str, feed_id: str):
        """
        Store one feed's JSON in the date-keyed cache file, enforcing caps.

        Args:
            date: date of publication of feed (``YYYY-MM-DD``, becomes the file name)
            source: source of feed, from where it was fetched
            json: JSON string of converted feed
            feed_id: id of the feed (matched by substring against cached JSON)
        """

        config.verbose_print("Getting user cache directory", "bold")
        cache_dir = user_cache_dir(CacheWorker.appname, CacheWorker.appauthor)
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        config.verbose_print("Checking user cache directory overload", "und")
        cache_files = []
        for (_, __, filenames) in os.walk(cache_dir):
            for file in filenames:
                if file.endswith(".json"):
                    cache_files.append(file)
        # BUG FIX: os.walk yields filenames in arbitrary filesystem order, so
        # cache_files[0] was not guaranteed to be the earliest date file that
        # DOCS.md promises to evict. File names are ISO "YYYY-MM-DD.json", so
        # a lexicographic sort is also a chronological sort.
        cache_files.sort()
        if len(cache_files) > 10:
            config.verbose_print("User cache directory is overloaded. Removing old cache file", "warn")
            os.remove(os.path.join(cache_dir, f"{cache_files[0]}"))
        try:
            with open(os.path.join(cache_dir, f"{date}.json"), "r") as cache_file:
                config.verbose_print("Reading date cache file", "bold")
                try:
                    cache = load(cache_file)
                except Exception as e:
                    # File exists but holds no/invalid JSON: start fresh.
                    config.verbose_print(f"Warning: JSON not found ({e})", "warn")
                    cache = dict()
        except Exception as e:
            # No cache file for this date yet: start fresh.
            config.verbose_print(f"Warning: date cache not found ({e})", "warn")
            cache = dict()
        config.verbose_print("Checking date cache file overload", "und")
        if len(cache.keys()) > 10:
            # dicts preserve insertion order, so the first key is the oldest
            # source recorded in this date file.
            config.verbose_print("Date cache file is overloaded. Removing old cache entry", "warn")
            del cache[list(cache.keys())[0]]
        try:
            with open(os.path.join(cache_dir, f"{date}.json"), "w") as cache_file:
                config.verbose_print("Checking if feed is already in cache", "und")
                feed_is_in_cache = False
                if source in cache:
                    # Dedup by feed_id substring match against the stored JSON.
                    for feed in cache[source]:
                        if feed_id in feed:
                            config.verbose_print("Feed is in cache already", "green")
                            feed_is_in_cache = True
                            break
                    if not feed_is_in_cache:
                        config.verbose_print("Feed not in cache. Storing feed in cache", "bold")
                        if len(cache[source]) > 10:
                            cache[source].pop(0)
                        cache[source].append(json)
                else:
                    cache[source] = [json]
                config.verbose_print("Update date cache file", "bold")
                cache_file.write(dumps(cache, indent=4))
        except Exception as e:
            config.verbose_print(f"Unable to open cache file ({e})", "warn")

    @staticmethod
    def read_feed_from_cache(date: str, source: str or None, limit: int or None) -> [Feed]:
        """
        Read cached feeds for a date, optionally filtered by source.

        Args:
            date: for which date cached feed should be read
            source: specific source for feed; None means one feed per source
            limit: limit of feeds that should be retrieved (default cap 100)

        Returns:
            [Feed]: list of fetched feeds from cache

        Raises:
            CachedFeedNotFoundError: when the date file is missing/empty, or
                ``source`` is not in the cache (the KeyError is swallowed by
                the outer handler and re-raised as this error).
        """

        config.verbose_print("Opening user cache directory", "bold")
        cache_dir = user_cache_dir(CacheWorker.appname, CacheWorker.appauthor)
        try:
            with open(os.path.join(cache_dir, f"{date}.json"), "r") as cache_file:
                try:
                    cache = load(cache_file)
                except Exception as e:
                    config.verbose_print(f"Cannot read JSON from cache file ({e})", "warn")
                    cache = dict()
                if len(cache.keys()) == 0:
                    # NOTE: this raise is intercepted by the outer except below
                    # and re-raised with the same error type/message.
                    raise CachedFeedNotFoundError("Error: Cached Feed not found")
                formatted_cached_feeds = []
                limit_final = 100
                if limit is not None:
                    limit_final = limit
                config.verbose_print("Reading feeds from cache", "bold")
                if source is None:
                    # No source filter: take the first cached feed of each source.
                    for cached_feed in cache.values():
                        if len(formatted_cached_feeds) == limit_final:
                            break
                        formatted_cached_feeds.append(Feed.json_to_feed(cached_feed[0]))
                else:
                    for cached_feed in cache[source]:
                        if len(formatted_cached_feeds) == limit_final:
                            break
                        formatted_cached_feeds.append(Feed.json_to_feed(cached_feed))
                return formatted_cached_feeds
        except Exception as e:
            config.verbose_print(f"Date cache file not found ({e})", "warn")
            raise CachedFeedNotFoundError("Error: Cached Feed not found")
def valid_date(s: str) -> datetime.date:
    """
    Parse a ``YYYYMMDD`` string into a ``datetime.date``.

    Intended for use as an argparse ``type=`` callable, hence the
    ``ArgumentTypeError`` on failure.

    Args:
        s: datetime containing string in ``%Y%m%d`` format

    Returns:
        datetime.date: parsed string to date

    Raises:
        ArgumentTypeError: if *s* does not encode a valid ``YYYYMMDD`` date
    """

    try:
        return datetime.strptime(s, "%Y%m%d").date()
    except ValueError as err:
        # f-string instead of str.format; chain the ValueError explicitly so
        # the root cause survives in __cause__ for debugging.
        raise ArgumentTypeError(f"Not a valid date: {s!r}") from err
class Feed:
    """Class which describes normalized Feed object.

    Holds one fetched entry's fields plus conversion helpers to JSON, to a
    readable string, and back from JSON (used by the cache layer).
    """
    def __init__(self,
                 source_title: str or None,
                 title: str or None,
                 date: str or None,
                 link: str or None,
                 content: str or None,
                 non_media_links: list[Link] or None,
                 media_links: list[Link] or None
                 ):
        """
        The constructor of Feed class

        Args:
            source_title: the title of source from where the feed was fetched
            title: the title of the current feed
            date: the date of publication of the current feed
            link: the link where feed is stored
            content: the content of the current feed
            non_media_links: non-media links, that are used in the feed
            media_links: media links, that are used in the feed
        """

        self.source_title = source_title
        self.title = title
        # Normalize an ISO-8601 "Z" (UTC) suffix to "+00:00" so the stored
        # date can later be parsed by datetime.fromisoformat() (see
        # FeedFetcher.fetch_feeds and FormatConverter).
        try:
            self.date = date.replace("Z", "+00:00")
        except Exception as e:
            # date may be None (or otherwise lack .replace); keep it as-is.
            config.verbose_print(f"Feed date is not string ({e})", "warn")
            self.date = date
        self.link = link
        self.content = content
        self.non_media_links = non_media_links
        self.media_links = media_links

    @staticmethod
    def process_feed(feed: feedparser.util.FeedParserDict, source_title: str or None):
        """
        The static function for normalizing fetched feed.

        Each field is read EAFP-style: a missing attribute on the feedparser
        entry raises, is logged in verbose mode, and the field falls back to
        None (or an empty list for the link collections).

        Args:
            feed: the raw feed object, that was fetched
            source_title: the source title of fetched news

        Returns:
            Feed: Feed type object, that is already normalized
        """

        feed_source_title = source_title
        config.verbose_print("Scraping feed title", "bold")
        try:
            title = feed.title
        except Exception as e:
            config.verbose_print(f"Warning: Feed title not found ({e})", "warn")
            title = None
        config.verbose_print("Scraping publish date", "bold")
        try:
            date = feed.published
        except Exception as e:
            config.verbose_print(f"Warning: Publish date not found ({e})", "warn")
            date = None
        config.verbose_print("Scraping feed link", "bold")
        try:
            link = feed.link
        except Exception as e:
            config.verbose_print(f"Warning: Feed link not found ({e})", "warn")
            link = None
        config.verbose_print("Scraping feed content", "bold")
        try:
            content = feed.summary
        except Exception as e:
            config.verbose_print(f"Warning: Content not found ({e})", "warn")
            content = None
        config.verbose_print("Scraping non-media links", "bold")
        try:
            # Attribute-style .href access; raises (and yields []) when the
            # link entries do not support it.
            non_media_links = list(map(lambda l: Link(l.href, 'link'), feed.links))
        except Exception as e:
            config.verbose_print(f"Warning: Non-media links not found ({e})", "warn")
            non_media_links = []
        config.verbose_print("Scraping media links", "bold")
        try:
            # Media entries are indexed by key, unlike the non-media links.
            media_links = list(map(lambda l: Link(l['url'], 'image'), feed.media_content))
        except Exception as e:
            config.verbose_print(f"Warning: Media links not found ({e})", "warn")
            media_links = []

        normalized_feed = Feed(
            feed_source_title,
            title,
            date,
            link,
            content,
            non_media_links,
            media_links
        )

        config.verbose_print("Feed object created", "green")

        return normalized_feed

    def to_json(self):
        """
        The function for converting Feed object to JSON string.

        Serializes via each object's __dict__, which also flattens the
        nested Link objects.

        Returns:
            JSON string

        Raises:
            ConvertJSONError
        """

        config.verbose_print("Converting feed to JSON format", "blue")
        try:
            return json.dumps(self, default=lambda o: o.__dict__, indent=4)
        except Exception as e:
            raise ConvertJSONError(f"Error when converting feed to JSON ({e})")

    def to_readable(self) -> str:
        """
        The function for converting Feed type object to readable string.

        Renders the layout documented in DOCS.md: source, title, date, link,
        content, then a numbered list of all (non-media + media) links.

        Returns:
            str: Readable string based on Feed object
        """

        config.verbose_print("Converting feed to readable format", "blue")
        all_links = self.non_media_links.copy()
        all_links.extend(self.media_links)
        formatted_links = ""
        for i in range(0, len(all_links)):
            formatted_links += f"[{i + 1}]: {str(all_links[i])}\n"
        if formatted_links == "":
            # NOTE(review): with no links this renders the literal text
            # "None" under the "Links:" header -- confirm that is intended.
            formatted_links = None

        formatted_feed = \
            f"Feed: {self.source_title}\n\n" + \
            f"Title: {self.title}\n" + \
            f"Date: {self.date}\n" + \
            f"Link: {self.link}\n\n" + \
            f"{self.content}\n\n" + \
            f"Links:\n{formatted_links}\n\n"

        return formatted_feed

    @staticmethod
    def json_to_feed(json_feed: str):
        """
        The function for converting JSON of feed to Feed object.

        Inverse of to_json(); used when reading feeds back from the cache.

        Args:
            json_feed: JSON string of feed

        Returns:
            Feed: Feed object
        """

        converted_feed = json.loads(json_feed)
        # Rebuild Link objects from their serialized dict form.
        non_media_links = list(map(lambda l: Link(l["href"], l["link_type"]), converted_feed["non_media_links"]))
        media_links = list(map(lambda l: Link(l["href"], l["link_type"]), converted_feed["media_links"]))

        return Feed(converted_feed["source_title"],
                    converted_feed["title"],
                    converted_feed["date"],
                    converted_feed["link"],
                    converted_feed["content"],
                    non_media_links,
                    media_links)
class FeedFetcher:
    """Class for fetching feeds and storing feeds using feedparser.py.

    Fetched entries are normalized into Feed objects and written to the
    on-disk cache via CacheWorker.
    """

    def __init__(self, source: str, limit: int or None):
        """
        The constructor for FeedFetcher class

        Args:
            source: the source url of rss feeds
            limit: limit of the feeds, that must be shown (None = no limit)

        Raises:
            WrongLimitError: if limit is given and is less than 1
        """

        if limit is not None and limit < 1:
            raise WrongLimitError("Argument LIMIT must be greater than 0")

        self.__source = source
        self.__limit = limit
        self.feeds = []            # raw feedparser result
        self.feeds_formatted = []  # normalized Feed objects

    def fetch_feeds(self):
        """The function that fetches and stores feeds using source and limit of object.

        Raises:
            WrongUrlError: if the parsed result has no entries (not an RSS url)
        """

        self.feeds = feedparser.parse(self.__source)
        if "entries" not in self.feeds or len(self.feeds.entries) < 1:
            raise WrongUrlError("Specified url is not rss type")
        config.verbose_print("Feeds fetched", "green")
        # Never request more entries than were actually fetched.
        if self.__limit is None:
            limit = len(self.feeds.entries)
        else:
            limit = min(len(self.feeds.entries), self.__limit)
        for i in range(0, limit):
            self.feeds_formatted.append(
                Feed.process_feed(self.feeds.entries[i], self.feeds.feed.title)
            )
        config.verbose_print("Storing fetched feeds in cache", "bold")
        # NOTE(review): fromisoformat assumes the feed publishes ISO-8601
        # dates (e.g. "2022-06-07T19:16:43Z", normalized in Feed.__init__);
        # feeds using RFC-822 dates would raise here -- TODO confirm.
        for f in self.feeds_formatted:
            CacheWorker.store_feed_in_cache(str(datetime.fromisoformat(f.date).date()),
                                            self.__source,
                                            f.to_json(),
                                            f.link)
# Conversion formats accepted by FormatConverter.convert_feeds.
SUPPORTED_FORMATS = ["html", "epub"]

# Global css that is used for html conversion (horizontal media strip,
# fixed-height thumbnails).
CSS = """
#feed-images-container { width: 100%; overflow-x: auto; display: flex; flex-direction: row; }
img { height: 200px; width: auto; margin: 0 10px; }
"""


class FormatConverter:
    """
    Class for working with conversion of feeds.

    Note: ``print`` in this module is the project's styled printer imported
    from utils.config (it takes a style argument), not the builtin.
    """
    @staticmethod
    def convert_feeds(feeds: [Feed], convert_format: str or None, destination_path: str or None, print_json: bool):
        """
        Function that processes conversion. Does generic part, which is used in all conversion types

        Args:
            feeds: Feed objects, that should be converted
            convert_format: conversion format. Supported formats are "epub" and "html"
            destination_path: location of file, where converted feeds should take place. If destination_path is not
                provided, user data directory will be used as location (See DOCS)
            print_json: if feeds should be printed in JSON format in stdout during conversion

        Raises:
            NotSupportedConversionFormat
        """

        config.verbose_print(f"Starting conversion to '{convert_format}'", "bold")
        if convert_format is None or convert_format not in SUPPORTED_FORMATS:
            raise NotSupportedConversionFormat(f"Conversion format '{convert_format}' is not supported")
        else:
            # Fall back to the per-user data directory when the caller's
            # destination is missing or invalid (documented in DOCS.md).
            if destination_path is None or not os.path.exists(destination_path):
                print("Warning: Specified directory is wrong or does not exist. "
                      "Falling back for user data directory", "warn")
                destination = user_data_dir(config.appname, config.appauthor)
            else:
                destination = destination_path
            if convert_format == "html":
                FormatConverter.feeds_to_html(feeds, destination)
            elif convert_format == "epub":
                FormatConverter.feeds_to_epub(feeds, destination)
            if print_json:
                config.verbose_print("Printing feeds in JSON format in stdout", "bold")
                for feed in feeds:
                    print(feed.to_json())

    @staticmethod
    def feeds_to_html(feeds: [Feed], destination_path: str):
        """
        Function that converts feeds to html format file in specified directory

        Args:
            feeds: Feed objects, that will be included in html file
            destination_path: location, where converted html file will be saved
                (file is named "Feeds N.html", N = existing .html count + 1)
        """

        config.verbose_print("Creating html file for feeds", "bold")
        html_files_count = count_files_by_type(destination_path, ".html")
        doc, tag, text = Doc().tagtext()
        with tag("html"):
            with tag("head"):
                with doc.tag('style', type='text/css'):
                    config.verbose_print("Applying css", "bold")
                    doc.asis(CSS)
            with tag("body"):
                config.verbose_print("Appending feeds to html", "bold")
                for feed in feeds:
                    doc.asis(FormatConverter.single_feed_to_html_content(feed, False))
        config.verbose_print("Saving html file", "bold")
        with open(os.path.join(destination_path, f"Feeds {html_files_count + 1}.html"), "w") as html_file:
            html_file.write(indent(doc.getvalue()))
        config.verbose_print(f"Html file saved in {destination_path}", "green")

    @staticmethod
    def feeds_to_epub(feeds: [Feed], destination_path: str):
        """
        Function that converts feeds to an epub format file in specified directory,
        one chapter per feed.

        Args:
            feeds: Feed objects, that will be included in the epub book
            destination_path: location, where converted epub file will be saved
                (named "Feeds N.epub", N = existing .epub count + 1)
        """

        config.verbose_print("Creating html file for feeds", "bold")
        epub_files_count = count_files_by_type(destination_path, ".epub")
        new_file_name = f"Feeds {epub_files_count + 1}"
        book = epub.EpubBook()

        book.set_identifier(str(uuid.uuid4()))
        book.set_title(new_file_name)

        book.add_author(config.appauthor)

        # "nav" is the epub navigation document; chapters are appended after it.
        spine = ["nav"]

        config.verbose_print("Creating chapters for epub book", "bold")
        for i in range(0, len(feeds)):
            doc, tag, text = Doc().tagtext()
            with tag("html"):
                with tag("head"):
                    with doc.tag('style', type='text/css'):
                        doc.asis(CSS)
                with tag("body"):
                    # for_book=True: book chapters omit the media strip and
                    # trailing separators.
                    doc.asis(FormatConverter.single_feed_to_html_content(feeds[i], True))
            chapter = epub.EpubHtml(title=feeds[i].title, file_name=f"chap_{i + 1}.xhtml")
            chapter.content = indent(doc.getvalue())
            book.add_item(chapter)
            spine.append(chapter)

        config.verbose_print("Creating Table of Contents", "bold")
        # TOC is every spine item except the leading "nav" entry.
        book.toc = tuple(spine[1::])

        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        book.spine = spine

        config.verbose_print("Saving epub book", "bold")
        epub.write_epub(os.path.join(destination_path, new_file_name + ".epub"), book, {})
        config.verbose_print(f"Epub book saved in {destination_path}", "green")

    @staticmethod
    def single_feed_to_html_content(feed: Feed, for_book: bool) -> str:
        """
        Function that converts single Feed object to html body content.
        Used for both .epub and .html file conversions

        Args:
            feed: Feed object
            for_book: argument, that specifies if converting is for book type conversion or no

        Returns:
            str: Feed object converted to html string, which contains Feed data
        """

        config.verbose_print(f"Creating feed content in html format (Feed: {feed.title})", "bold")
        doc, tag, text = Doc().tagtext()
        with tag("h3"):
            text(feed.title)
        with tag("p"):
            # NOTE(review): assumes feed.date is ISO-8601 (see Feed.__init__
            # "Z" normalization); non-ISO dates would raise -- confirm.
            text(str(datetime.fromisoformat(feed.date).__format__("%d.%m.%Y, %H:%M")))
        doc.stag("br")
        if not for_book:
            # Media strip is only rendered for plain HTML output.
            with tag("div", id="feed-images-container"):
                if len(feed.media_links) < 1:
                    text("No media was provided")
                else:
                    for media_link in feed.media_links:
                        doc.stag("img", src=media_link.href, klass="feed-image")
            doc.stag("br")
        with tag("p"):
            if feed.content is None:
                text("No feed content in summary was provided")
            else:
                text(feed.content)
        doc.stag("br")
        with tag("a", href=feed.link, target="_blank"):
            text("Read full article")
        doc.stag("br")
        doc.stag("br")
        with tag("span"):
            with tag("strong"):
                text("Related links:")
        with tag("div"):
            for i in range(0, len(feed.non_media_links)):
                with tag("span"):
                    text(f"[{i + 1}]")
                with tag("a", href=feed.non_media_links[i].href, target="_blank"):
                    text(feed.non_media_links[i].href)
        if not for_book:
            # Visual separator between consecutive feeds in the HTML page.
            for i in range(0, 5):
                doc.stag("br")
            doc.stag("hr")

        return doc.getvalue()
class Link:
    """Class for specifying single Link object (a URL plus its media type)."""
    def __init__(self, href, link_type: str = "unknown"):
        """
        The constructor of Link class

        Args:
            href: source (URL) of the link; must be a string
            link_type: type of the source, e.g. "link" or "image"

        Raises:
            LinkWithNoSourceError: if href is missing or not a string
        """

        # isinstance() instead of `type(href) is not str`: idiomatic type
        # check, also covers None and accepts str subclasses.
        if not isinstance(href, str):
            raise LinkWithNoSourceError("Provided source is missing or of wrong type")
        self.href = href
        self.link_type = link_type

    def __str__(self) -> str:
        """
        The function for overriding string conversion behaviour

        Returns:
            str: formatted string version of Link object, "href (link_type)"
        """

        return f"{self.href} ({self.link_type})"
# Command-line interface definition. Parsing happens at import time because
# this module is the program entry point; rss_reader_func reads the global
# `args` below.
my_parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader')

# Positional source URL is optional (nargs='?') so that --version and
# --date-only cache reads work without it.
my_parser.add_argument('source',
                       metavar='source',
                       nargs='?',
                       type=str,
                       help='RSS URL')
my_parser.add_argument("--version",
                       help="Prints version info",
                       action="store_true")
# valid_date converts YYYYMMDD strings and raises ArgumentTypeError on bad input.
my_parser.add_argument("--date",
                       action="store",
                       type=valid_date,
                       help="Fetches feeds from cache by specified date")
my_parser.add_argument("--limit",
                       action="store",
                       type=int,
                       help="Limits news topics if this parameter is provided")
my_parser.add_argument("--json",
                       action="store_true",
                       help="Prints result as JSON in stdout")
my_parser.add_argument("--to-html",
                       action="store",
                       type=str,
                       help="Converts feeds to html format in specified directory. \
                       If specified directory does not exist, file will be created in application's data directory. \
                       See DOCS for further info")
my_parser.add_argument("--to-epub",
                       action="store",
                       type=str,
                       help="Converts feeds to epub format in specified directory. \
                       If specified directory does not exist, file will be created in application's data directory. \
                       See DOCS for further info")
my_parser.add_argument("--verbose",
                       action="store_true",
                       help="Outputs verbose status messages")
my_parser.add_argument("--colorize",
                       action="store_true",
                       help="Enables colorized output for stdout")

args = my_parser.parse_args()

# Verbose mode swaps the no-op config.verbose_print for the styled printer;
# colorize flips the global flag that printer consults.
if args.verbose:
    config.verbose_print = print
if args.colorize:
    config.COLORIZED_MODE = True
def rss_reader_func():
    """Entry point: dispatch on the parsed CLI arguments.

    Order of precedence: --version short-circuits everything; --date reads
    from cache instead of fetching; otherwise feeds are fetched from the
    source URL. With --to-html/--to-epub the feeds are converted instead of
    printed; otherwise they are printed as JSON (--json) or readable text.
    """

    if args.version:
        print(f"Version {__version__}", "pink")
        return
    # BUG FIX: previously a single convert_dir was derived from args.to_html
    # only, so `--to-epub <dir>` alone always fell back to the user data
    # directory, and an invalid dir for one format discarded the other's too.
    # Resolve each format's destination independently; None triggers
    # FormatConverter's documented fallback to the user data directory.
    html_dir = args.to_html if args.to_html and os.path.isdir(args.to_html) else None
    epub_dir = args.to_epub if args.to_epub and os.path.isdir(args.to_epub) else None
    if args.date:
        cached_feeds = CacheWorker.read_feed_from_cache(args.date, args.source, args.limit)
        if args.to_html or args.to_epub:
            if args.to_html:
                FormatConverter.convert_feeds(cached_feeds, "html", html_dir, args.json)
            if args.to_epub:
                FormatConverter.convert_feeds(cached_feeds, "epub", epub_dir, args.json)
            return
        for feed in cached_feeds:
            if args.json:
                print(feed.to_json())
            else:
                print(feed.to_readable())
        return
    config.verbose_print("Initializing url and limit", "pink")
    try:
        feed_fetcher = FeedFetcher(args.source, args.limit)
        config.verbose_print("Fetching feeds...", "bold")
        feed_fetcher.fetch_feeds()
        if args.to_html or args.to_epub:
            if args.to_html:
                FormatConverter.convert_feeds(feed_fetcher.feeds_formatted, "html", html_dir, args.json)
            if args.to_epub:
                FormatConverter.convert_feeds(feed_fetcher.feeds_formatted, "epub", epub_dir, args.json)
            return
        for feed in feed_fetcher.feeds_formatted:
            if args.json:
                print(feed.to_json())
            else:
                print(feed.to_readable())
    # All domain errors get the same treatment, so one combined clause
    # replaces five identical except blocks.
    except (WrongLimitError,
            WrongUrlError,
            ConvertJSONError,
            LinkWithNoSourceError,
            NotSupportedConversionFormat) as message:
        print(str(message), "err")
    except Exception as e:
        print(f"Error ({e})", "err")
print(str(message), "err") + except Exception as e: + print(f"Error ({e})", "err") + + +if __name__ == "__main__": + rss_reader_func() diff --git a/rss_reader_package/tests/test_count_files.py b/rss_reader_package/tests/test_count_files.py new file mode 100644 index 00000000..7e85cbaf --- /dev/null +++ b/rss_reader_package/tests/test_count_files.py @@ -0,0 +1,28 @@ +import unittest +import os +import uuid +import shutil +from appdirs import user_cache_dir +from rss_reader_package.utils.count_files import count_files_by_type + + +class TestCountFiles(unittest.TestCase): + def test_count_files_by_type(self): + test_dir = os.path.join(user_cache_dir(), "Test") + os.mkdir(test_dir) + for i in range(5): + with open(os.path.join(test_dir, f"{uuid.uuid4()}.txt"), "w") as temp_file: + temp_file.write("") + self.assertEqual(count_files_by_type(test_dir, ".txt"), 5) + self.assertEqual(count_files_by_type(test_dir, None), 5) + self.assertEqual(count_files_by_type(test_dir, ".html"), 0) + for i in range(3): + with open(os.path.join(test_dir, f"{uuid.uuid4()}.tmp"), "w") as temp_file: + temp_file.write("") + self.assertEqual(count_files_by_type(test_dir, ".tmp"), 3) + self.assertEqual(count_files_by_type(test_dir, None), 8) + self.assertEqual(count_files_by_type(test_dir, ".txt"), 5) + shutil.rmtree(test_dir) + + +unittest.main() diff --git a/rss_reader_package/tests/test_date.py b/rss_reader_package/tests/test_date.py new file mode 100644 index 00000000..a6b6012c --- /dev/null +++ b/rss_reader_package/tests/test_date.py @@ -0,0 +1,15 @@ +import unittest +from argparse import ArgumentTypeError +from rss_reader_package.date import valid_date + + +class TestDate(unittest.TestCase): + def test_valid_date(self): + self.assertEqual(str(valid_date("20210730")), "2021-07-30") + with self.assertRaises(ArgumentTypeError): + valid_date("word") + with self.assertRaises(ArgumentTypeError): + valid_date(valid_date("30071999")) + + +unittest.main() diff --git 
a/rss_reader_package/tests/test_feed.py b/rss_reader_package/tests/test_feed.py new file mode 100644 index 00000000..cde13a54 --- /dev/null +++ b/rss_reader_package/tests/test_feed.py @@ -0,0 +1,32 @@ +import unittest +import feedparser +from rss_reader_package.mocks.raw_feed_mock import mock_feed_raw +from rss_reader_package.feed import Feed + + +class TestFeed(unittest.TestCase): + parsed_feed = Feed.process_feed(feedparser.util.FeedParserDict(mock_feed_raw), None) + + def test_process_feed(self): + self.assertIsNone(TestFeed.parsed_feed.source_title) + self.assertEqual(TestFeed.parsed_feed.title, "Taiwanese F-16 fighter makes emergency landing in Hawaii") + self.assertEqual(TestFeed.parsed_feed.date, "2022-06-07T19:16:43Z") + self.assertEqual(TestFeed.parsed_feed.link, + "https://news.yahoo.com/taiwanese-f-16-fighter-makes-191643064.html") + self.assertIsNone(TestFeed.parsed_feed.content) + self.assertEqual(len(TestFeed.parsed_feed.non_media_links), 0) + self.assertEqual(TestFeed.parsed_feed.media_links[0].href, + "https://s.yimg.com/uu/api/res/1.2/xEduvF_K_md_I0S4N_bKPA--~B/" + "aD0zMzMzO3c9NTAwMDthcHBpZD15dGFjaHlvbg--/" + "https://media.zenfs.com/en/ap.org/d6b6186493eecb5a8085e62472973496") + + def test_to_json(self): + self.assertEqual(type(TestFeed.parsed_feed.to_json()), str) + self.assertEqual(len(TestFeed.parsed_feed.to_json()), 540) + + def test_to_readable(self): + self.assertEqual(type(TestFeed.parsed_feed.to_json()), str) + self.assertEqual(len(TestFeed.parsed_feed.to_json()), 374) + + +unittest.main() diff --git a/rss_reader_package/tests/test_feed_fetcher.py b/rss_reader_package/tests/test_feed_fetcher.py new file mode 100644 index 00000000..51ab0fdf --- /dev/null +++ b/rss_reader_package/tests/test_feed_fetcher.py @@ -0,0 +1,21 @@ +import unittest +from feed_fetcher import FeedFetcher +from exceptions import WrongLimitError, WrongUrlError + + +class TestFeedFetcher(unittest.TestCase): + def test_feed_fetcher(self): + with 
self.assertRaises(WrongLimitError): + FeedFetcher("source", -1) + with self.assertRaises(WrongUrlError): + ff = FeedFetcher("source", None) + ff.fetch_feeds() + ff2 = FeedFetcher("https://timesofindia.indiatimes.com/rssfeedstopstories.cms", 1) + ff2.fetch_feeds() + self.assertEqual(len(ff2.feeds_formatted), 1) + ff3 = FeedFetcher("https://timesofindia.indiatimes.com/rssfeedstopstories.cms", 3) + ff3.fetch_feeds() + self.assertEqual(len(ff3.feeds_formatted), 3) + + +unittest.main() diff --git a/rss_reader_package/tests/test_format_converter.py b/rss_reader_package/tests/test_format_converter.py new file mode 100644 index 00000000..7352631d --- /dev/null +++ b/rss_reader_package/tests/test_format_converter.py @@ -0,0 +1,35 @@ +import os +import shutil +import unittest +import feedparser +from rss_reader_package.format_converter import FormatConverter +from rss_reader_package.mocks.raw_feed_mock import mock_feed_raw +from rss_reader_package.feed import Feed +from appdirs import user_data_dir +from rss_reader_package.utils.count_files import count_files_by_type +from rss_reader_package.utils.exceptions import NotSupportedConversionFormat + + +class TestFormatConverter(unittest.TestCase): + parsed_feed = Feed.process_feed(feedparser.util.FeedParserDict(mock_feed_raw), None) + + def test_convert_feeds(self): + user_dir = user_data_dir() + with self.assertRaises(NotSupportedConversionFormat): + FormatConverter.convert_feeds([TestFormatConverter.parsed_feed], "my_format", None, False) + os.mkdir(os.path.join(user_dir, "Test")) + FormatConverter.convert_feeds([TestFormatConverter.parsed_feed], "html", os.path.join(user_dir, "Test"), False) + self.assertEqual(count_files_by_type(os.path.join(user_dir, "Test"), ".html"), 1) + shutil.rmtree(os.path.join(user_dir, "Test")) + os.mkdir(os.path.join(user_dir, "Test")) + FormatConverter.convert_feeds([TestFormatConverter.parsed_feed], "epub", os.path.join(user_dir, "Test"), False) + 
self.assertEqual(count_files_by_type(os.path.join(user_dir, "Test"), ".epub"), 1) + shutil.rmtree(os.path.join(user_dir, "Test")) + + def test_single_feed_to_html_content(self): + self.assertEqual(type(FormatConverter.single_feed_to_html_content(TestFormatConverter.parsed_feed, True)), str) + self.assertEqual( + type(FormatConverter.single_feed_to_html_content(TestFormatConverter.parsed_feed, False)), str) + + +unittest.main() diff --git a/rss_reader_package/tests/test_link.py b/rss_reader_package/tests/test_link.py new file mode 100644 index 00000000..2050518d --- /dev/null +++ b/rss_reader_package/tests/test_link.py @@ -0,0 +1,18 @@ +import unittest +from rss_reader_package.link import Link +from rss_reader_package.utils.exceptions import LinkWithNoSourceError + + +class TestLink(unittest.TestCase): + def test_link(self): + link = Link("source", "image") + self.assertEqual(link.href, "source") + self.assertEqual(link.link_type, "image") + self.assertEqual(str(link), "source (image)") + link2 = Link("source") + self.assertEqual(link2.link_type, "unknown") + with self.assertRaises(LinkWithNoSourceError): + Link(True) + + +unittest.main() diff --git a/rss_reader_package/utils/__init__.py b/rss_reader_package/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_reader_package/utils/config.py b/rss_reader_package/utils/config.py new file mode 100644 index 00000000..8f909526 --- /dev/null +++ b/rss_reader_package/utils/config.py @@ -0,0 +1,37 @@ +""" +Module for application internal configs + +exports appname, appauthor, verbose_print +""" +import builtins as __builtin__ + +# E731 do not assign a lambda expression, use a def +# Breaking the rule, because it will lead to less readability and more code +appname = "RSSReader" +appauthor = "nmac99" + +COLORIZED_MODE = False +verbose_print = lambda *a, **k: None + + +COLORS = { + "pink": "\033[95m", + "blue": "\033[94m", + "cyan": "\033[96m", + "green": "\033[92m", + "warn": "\033[93m", + "err": 
"\033[91m", + "bold": "\033[1m", + "und": "\033[4m", + "none": "" +} + + +def print(text: str, type: str = "none"): + """Custom print() function to handle colorized mode""" + if not COLORIZED_MODE: + return __builtin__.print(text) + else: + endc = "\033[0m" + formatted_text = f"{COLORS[type]}{text}{endc}" + return __builtin__.print(formatted_text) diff --git a/rss_reader_package/utils/count_files.py b/rss_reader_package/utils/count_files.py new file mode 100644 index 00000000..c4e4f505 --- /dev/null +++ b/rss_reader_package/utils/count_files.py @@ -0,0 +1,32 @@ +""" +Module for counting files in directory + +exports count_files_by_type +""" +import os +import rss_reader_package.utils.config as config + + +def count_files_by_type(dir_path: str, file_type: str or None) -> int: + """ + Function that counts files by its type. If type is not provided, will count all files + + Args: + dir_path: path of directory where files should be counted + file_type: file extension that should be considered for counting + + Returns: + int: number of files after counting + """ + + files = [] + for (_, __, filenames) in os.walk(dir_path): + if file_type is None: + config.verbose_print("File type not specified. 
Counting all files", "warn") + files.extend(filenames) + else: + for file in filenames: + if file.endswith(file_type): + files.append(file) + + return len(files) diff --git a/rss_reader_package/utils/exceptions.py b/rss_reader_package/utils/exceptions.py new file mode 100644 index 00000000..ae6ed9e2 --- /dev/null +++ b/rss_reader_package/utils/exceptions.py @@ -0,0 +1,40 @@ +""" +Module for custom Errors + +exports WringUrlError, WrongLimitError, ConvertJSONError, LinkWithNoSourceError, CachedFeedNotFoundError +""" + + +class Error(Exception): + """Base class for custom errors""" + pass + + +class WrongUrlError(Error): + """Error for wrong url specification""" + pass + + +class WrongLimitError(Error): + """Error for wrong limit specification""" + pass + + +class ConvertJSONError(Error): + """Error for JSON converting issues""" + pass + + +class LinkWithNoSourceError(Error): + """Error for not providing source to Link object""" + pass + + +class CachedFeedNotFoundError(Error): + """Error for not found feed cache""" + pass + + +class NotSupportedConversionFormat(Error): + """Error for not supported conversion formats""" + pass diff --git a/rss_reader_package/utils/version.py b/rss_reader_package/utils/version.py new file mode 100644 index 00000000..1057eea9 --- /dev/null +++ b/rss_reader_package/utils/version.py @@ -0,0 +1,6 @@ +""" +Module for application versioning + +exports __version__ +""" +__version__ = "1.5.0" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..dfe04b81 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,24 @@ +[pycodestyle] +count = False +ignore = E731 +max-line-length = 120 +statistics = True + +[metadata] +name = rss_reader +version = 1.5.0 + +[options] +packages = + rss_reader_package + rss_reader_package.utils +install_requires = + feedparser + appdirs + yattag + EbookLib + +[options.entry_points] +console_scripts = + rss-reader = rss_reader_package.rss_reader:rss_reader_func + diff --git a/setup.py b/setup.py new file mode 
100644 index 00000000..d13e9727 --- /dev/null +++ b/setup.py @@ -0,0 +1,58 @@ +from setuptools import find_packages, setup +import os + +# Optional rss_reader_package description in README.md: +current_directory = os.path.dirname(os.path.abspath(__file__)) + +try: + with open( + os.path.join(current_directory, "README.md"), + encoding="utf-8") as f: + long_description = f.read() +except Exception as e: + print(e) + long_description = "" +setup( + # Project name: + name="RSS Reader", + + # Packages to include in the distribution: + packages=find_packages(','), + + # Project version number: + version="1.5.0", + + # List a license for the project, eg. MIT License + license="MIT License", + + # Short description of your library: + description="Pure Python command-line RSS reader", + + # Long description of your library: + long_description=long_description, + long_description_content_type='text/markdown', + + # Your name: + author="Narek Arsenyan", + + # Your email address: + author_email="narekarsenyan99@gmail.com", + + # Link to your github repository or website: + url="https://github.com/NMac99", + + # Download Link from where the project can be downloaded from: + download_url="", + + # List of keywords: + keywords=[], + + # List project dependencies: + install_requires=["feedparser", "appdirs", "yattag", "EbookLib"], + + # https://pypi.org/classifiers/ + classifiers=[], + entry_points={"console_scripts": [ + "rss-reader=rss_reader_package.rss_reader:rss_reader_func" + ]} + ) From 56bbf63d7355e1b0633174a9ee9c9b3bc94311d4 Mon Sep 17 00:00:00 2001 From: narekarsenyan Date: Tue, 28 Jun 2022 23:29:18 +0400 Subject: [PATCH 3/9] REMOVE feedparser dependency; REFACTOR project using beautiful soup and requests --- requirements.txt | 3 +- rss_reader_package/.DS_Store | Bin 0 -> 6148 bytes rss_reader_package/cache_worker.py | 25 ++-- rss_reader_package/feed.py | 110 +++++------------- rss_reader_package/feed_fetcher.py | 70 +++++++---- rss_reader_package/format_converter.py | 
40 ++++--- rss_reader_package/link.py | 2 +- rss_reader_package/mocks/raw_feed_mock.py | 15 +-- rss_reader_package/rss_reader.py | 16 +-- rss_reader_package/tests/test_feed.py | 3 +- rss_reader_package/tests/test_feed_fetcher.py | 4 +- .../tests/test_format_converter.py | 3 +- rss_reader_package/utils/count_files.py | 2 - setup.cfg | 3 +- setup.py | 2 +- 15 files changed, 129 insertions(+), 169 deletions(-) create mode 100644 rss_reader_package/.DS_Store diff --git a/requirements.txt b/requirements.txt index 7eabfe5a..bf467ba0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ appdirs==1.4.4 +beautifulsoup4==4.11.1 EbookLib==0.17.1 -feedparser==6.0.10 +requests==2.28.0 setuptools==61.2.0 yattag==1.14.0 diff --git a/rss_reader_package/.DS_Store b/rss_reader_package/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5dd04e9b5784add22ac253e0c1ecea02c118e36f GIT binary patch literal 6148 zcmeHKOHKko5Uo-SL{v!JxG|X%ka&ZU;KH>B80Di9kRg0D>}KyByn^R*rLU?RGt)C{ z4KY=u`!zkUx~d+TZkmX=yB=N65A$B z0aIY#6p(kfsqgHVW|V(_H@DMt(4VG#Fl2kUM(cd5VYKy{(}o80fc8uySts!N47^$e z_8XE=W>BOeV^t&1K0>qRN&BkZOi$8v-|#Ebh2-zfGO~=6yRF#uGd9L z{%kE3CuePhc8?||yv(7xp!3JE4&*4lKvToBNCv>zW9ASuF#98*GFWE{{HX#zp 10: cache[source].pop(0) - cache[source].append(json) + cache[source].append(feed.to_json()) else: - cache[source] = [json] + cache[source] = [feed.to_json()] config.verbose_print("Update date cache file", "bold") cache_file.write(dumps(cache, indent=4)) except Exception as e: diff --git a/rss_reader_package/feed.py b/rss_reader_package/feed.py index ce303ea4..54ecf5d7 100644 --- a/rss_reader_package/feed.py +++ b/rss_reader_package/feed.py @@ -5,22 +5,23 @@ """ import json -import rss_reader_package.utils.config as config -import feedparser -from rss_reader_package.link import Link -from rss_reader_package.utils.exceptions import ConvertJSONError +import utils.config as config +from bs4 import BeautifulSoup +from link import Link +from utils.exceptions import 
ConvertJSONError class Feed: """Class which describes normalized Feed object""" def __init__(self, - source_title: str or None, - title: str or None, - date: str or None, - link: str or None, - content: str or None, - non_media_links: list[Link] or None, - media_links: list[Link] or None + source_title: str, + source_url: str, + title: str, + date: str, + link: str, + content: str, + non_media_links: list[Link], + media_links: list[Link] ): """ The constructor of Feed class @@ -36,82 +37,14 @@ def __init__(self, """ self.source_title = source_title + self.source_url = source_url self.title = title - try: - self.date = date.replace("Z", "+00:00") - except Exception as e: - config.verbose_print(f"Feed date is not string ({e})", "warn") - self.date = date + self.date = date.replace("Z", "+00:00") self.link = link self.content = content self.non_media_links = non_media_links self.media_links = media_links - @staticmethod - def process_feed(feed: feedparser.util.FeedParserDict, source_title: str or None): - """ - The static function for normalizing fetched feed - - Args: - feed: the raw feed object, that was fetched - source_title: the source title of fetched news - - Returns: - Feed: Feed type object, that is already normalized - """ - - feed_source_title = source_title - config.verbose_print("Scraping feed title", "bold") - try: - title = feed.title - except Exception as e: - config.verbose_print(f"Warning: Feed title not found ({e})", "warn") - title = None - config.verbose_print("Scraping publish date", "bold") - try: - date = feed.published - except Exception as e: - config.verbose_print(f"Warning: Publish date not found ({e})", "warn") - date = None - config.verbose_print("Scraping feed link", "bold") - try: - link = feed.link - except Exception as e: - config.verbose_print(f"Warning: Feed link not found ({e})", "warn") - link = None - config.verbose_print("Scraping feed content", "bold") - try: - content = feed.summary - except Exception as e: - 
config.verbose_print(f"Warning: Content not found ({e})", "warn") - content = None - config.verbose_print("Scraping non-media links", "bold") - try: - non_media_links = list(map(lambda l: Link(l.href, 'link'), feed.links)) - except Exception as e: - config.verbose_print(f"Warning: Non-media links not found ({e})", "warn") - non_media_links = [] - config.verbose_print("Scraping media links", "bold") - try: - media_links = list(map(lambda l: Link(l['url'], 'image'), feed.media_content)) - except Exception as e: - config.verbose_print(f"Warning: Media links not found ({e})", "warn") - media_links = [] - - normalized_feed = Feed( - feed_source_title, - title, - date, - link, - content, - non_media_links, - media_links - ) - - config.verbose_print("Feed object created", "green") - - return normalized_feed - def to_json(self): """ The function for converting Feed object to JSON string @@ -146,12 +79,24 @@ def to_readable(self) -> str: if formatted_links == "": formatted_links = None + content = BeautifulSoup(self.content, 'lxml') + feed_links = content.find_all("a", href = True) + + for a in content.select("a"): + anchor_index = next((i for i, item in enumerate(feed_links) if item["href"] == a["href"]), None) + anchor_before = f"[link {anchor_index + 1}: " + anchor_after = f"][{anchor_index + 1}]" + a.insert_before(anchor_before) + a.insert_after(anchor_after) + a.unwrap() + + formatted_content = content.get_text() formatted_feed = \ f"Feed: {self.source_title}\n\n" + \ f"Title: {self.title}\n" + \ f"Date: {self.date}\n" + \ f"Link: {self.link}\n\n" + \ - f"{self.content}\n\n" + \ + f"{formatted_content}\n\n" + \ f"Links:\n{formatted_links}\n\n" return formatted_feed @@ -173,6 +118,7 @@ def json_to_feed(json_feed: str): media_links = list(map(lambda l: Link(l["href"], l["link_type"]), converted_feed["media_links"])) return Feed(converted_feed["source_title"], + converted_feed["source_url"], converted_feed["title"], converted_feed["date"], converted_feed["link"], diff 
--git a/rss_reader_package/feed_fetcher.py b/rss_reader_package/feed_fetcher.py index 705318e1..d54d11a6 100644 --- a/rss_reader_package/feed_fetcher.py +++ b/rss_reader_package/feed_fetcher.py @@ -3,16 +3,17 @@ exports FeedFetcher class """ -import feedparser -import rss_reader_package.utils.config as config -from datetime import datetime -from rss_reader_package.feed import Feed -from rss_reader_package.cache_worker import CacheWorker -from rss_reader_package.utils.exceptions import WrongLimitError, WrongUrlError +import requests +from bs4 import BeautifulSoup +import utils.config as config +from link import Link +from feed import Feed +from cache_worker import CacheWorker +from utils.exceptions import WrongLimitError, WrongUrlError class FeedFetcher: - """Class for fetching feeds and storing feeds using feedparser.py""" + """Class for fetching feeds and storing feeds""" def __init__(self, source: str, limit: int or None): """ @@ -31,27 +32,46 @@ def __init__(self, source: str, limit: int or None): self.__source = source self.__limit = limit - self.feeds = [] self.feeds_formatted = [] def fetch_feeds(self): """The function that fetches and stores feeds using source and limit of object """ - self.feeds = feedparser.parse(self.__source) - if "entries" not in self.feeds or len(self.feeds.entries) < 1: + try: + r = requests.get(self.__source) + soup = BeautifulSoup(r.content, features="xml") + source_title = soup.find("channel").find("title").text + config.verbose_print("Feeds fetched", "green") + if self.__limit is None: + limit = 100 + else: + limit = self.__limit + items = soup.find_all('item')[:limit] + for item in items: + config.verbose_print("Scraping feed title", "bold") + title = item.find('title').text + config.verbose_print("Scraping publish date", "bold") + date = item.find('pubDate').text + config.verbose_print("Scraping feed link", "bold") + link = item.find('link').text + config.verbose_print("Scraping feed content", "bold") + raw_content = 
item.find('description').text + config.verbose_print("Scraping media links", "bold") + media_links_raw = item.find_all("enclosure") + config.verbose_print("Scraping non-media links", "bold") + content = BeautifulSoup(raw_content, 'lxml') + feed_links_raw = content.find_all("a", href=True) + media_links = list() + feed_links = list() + for feed_link in feed_links_raw: + feed_links.append(Link(feed_link['href'], 'link')) + for media_link in media_links_raw: + media_links.append(Link(media_link["url"], media_link["type"].split("/")[0])) + config.verbose_print("Storing fetched feeds in cache", "bold") + formatted_feed = Feed(source_title, self.__source, title, date, link, raw_content, feed_links, media_links) + CacheWorker.store_feed_in_cache(formatted_feed) + self.feeds_formatted.append(formatted_feed) + with open("content.txt", "w") as f: + f.write(str(items)) + except Exception: raise WrongUrlError("Specified url is not rss type") - config.verbose_print("Feeds fetched", "green") - if self.__limit is None: - limit = len(self.feeds.entries) - else: - limit = min(len(self.feeds.entries), self.__limit) - for i in range(0, limit): - self.feeds_formatted.append( - Feed.process_feed(self.feeds.entries[i], self.feeds.feed.title) - ) - config.verbose_print("Storing fetched feeds in cache", "bold") - for f in self.feeds_formatted: - CacheWorker.store_feed_in_cache(str(datetime.fromisoformat(f.date).date()), - self.__source, - f.to_json(), - f.link) diff --git a/rss_reader_package/format_converter.py b/rss_reader_package/format_converter.py index e07579f2..87ec1e4d 100644 --- a/rss_reader_package/format_converter.py +++ b/rss_reader_package/format_converter.py @@ -6,15 +6,15 @@ import os import uuid -import rss_reader_package.utils.config as config -from rss_reader_package.utils.config import print +import utils.config as config +from utils.config import print from appdirs import user_data_dir from yattag import Doc, indent from ebooklib import epub from datetime import 
datetime -from rss_reader_package.feed import Feed -from rss_reader_package.utils.count_files import count_files_by_type -from rss_reader_package.utils.exceptions import NotSupportedConversionFormat +from feed import Feed +from utils.count_files import count_files_by_type +from utils.exceptions import NotSupportedConversionFormat SUPPORTED_FORMATS = ["html", "epub"] @@ -82,6 +82,7 @@ def feeds_to_html(feeds: [Feed], destination_path: str): with doc.tag('style', type='text/css'): config.verbose_print("Applying css", "bold") doc.asis(CSS) + doc.stag("meta", charset="UTF-8") with tag("body"): config.verbose_print("Appending feeds to html", "bold") for feed in feeds: @@ -89,7 +90,7 @@ def feeds_to_html(feeds: [Feed], destination_path: str): config.verbose_print("Saving html file", "bold") with open(os.path.join(destination_path, f"Feeds {html_files_count + 1}.html"), "w") as html_file: html_file.write(indent(doc.getvalue())) - config.verbose_print(f"Html file saved in {destination_path}", "green") + print(f"Html file saved in {destination_path}", "green") @staticmethod def feeds_to_epub(feeds: [Feed], destination_path: str): @@ -137,7 +138,7 @@ def feeds_to_epub(feeds: [Feed], destination_path: str): config.verbose_print("Saving epub book", "bold") epub.write_epub(os.path.join(destination_path, new_file_name + ".epub"), book, {}) - config.verbose_print(f"Epub book saved in {destination_path}", "green") + print(f"Epub book saved in {destination_path}", "green") @staticmethod def single_feed_to_html_content(feed: Feed, for_book: bool) -> str: @@ -157,7 +158,7 @@ def single_feed_to_html_content(feed: Feed, for_book: bool) -> str: with tag("h3"): text(feed.title) with tag("p"): - text(str(datetime.fromisoformat(feed.date).__format__("%d.%m.%Y, %H:%M"))) + text(str(datetime.strptime(feed.date, "%a, %d %b %Y %H:%M:%S %z").__format__("%d.%m.%Y, %H:%M"))) doc.stag("br") if not for_book: with tag("div", id="feed-images-container"): @@ -165,13 +166,15 @@ def 
single_feed_to_html_content(feed: Feed, for_book: bool) -> str: text("No media was provided") else: for media_link in feed.media_links: - doc.stag("img", src=media_link.href, klass="feed-image") + if media_link.link_type == "image": + doc.stag("img", src=media_link.href, klass="feed-image") + if media_link.link_type == "audio": + doc.stag("audio", controls=True, src=media_link.href) + if media_link.link_type == "video": + with tag("video", controls=True): + doc.stag("source", src=media_link.href) doc.stag("br") - with tag("p"): - if feed.content is None: - text("No feed content in summary was provided") - else: - text(feed.content) + doc.asis(feed.content) doc.stag("br") with tag("a", href=feed.link, target="_blank"): text("Read full article") @@ -182,10 +185,11 @@ def single_feed_to_html_content(feed: Feed, for_book: bool) -> str: text("Related links:") with tag("div"): for i in range(0, len(feed.non_media_links)): - with tag("span"): - text(f"[{i + 1}]") - with tag("a", href=feed.non_media_links[i].href, target="_blank"): - text(feed.non_media_links[i].href) + with tag("div"): + with tag("span"): + text(f"[{i + 1}]") + with tag("a", href=feed.non_media_links[i].href, target="_blank"): + text(feed.non_media_links[i].href) if not for_book: for i in range(0, 5): doc.stag("br") diff --git a/rss_reader_package/link.py b/rss_reader_package/link.py index 0a01dbdd..0da6c2f2 100644 --- a/rss_reader_package/link.py +++ b/rss_reader_package/link.py @@ -4,7 +4,7 @@ exports Link class """ -from rss_reader_package.utils.exceptions import LinkWithNoSourceError +from utils.exceptions import LinkWithNoSourceError class Link: diff --git a/rss_reader_package/mocks/raw_feed_mock.py b/rss_reader_package/mocks/raw_feed_mock.py index 1aa9fb44..2ac416da 100644 --- a/rss_reader_package/mocks/raw_feed_mock.py +++ b/rss_reader_package/mocks/raw_feed_mock.py @@ -1,16 +1,3 @@ """Module for raw feed mock data""" -mock_feed_raw = {'title': 'Taiwanese F-16 fighter makes emergency landing in 
Hawaii', - 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'https://news.yahoo.com/rss/', - 'value': 'Taiwanese F-16 fighter makes emergency landing in Hawaii'}, 'links': [ - {'rel': 'alternate', 'type': 'text/html', - 'href': 'https://news.yahoo.com/taiwanese-f-16-fighter-makes-191643064.html'}], - 'link': 'https://news.yahoo.com/taiwanese-f-16-fighter-makes-191643064.html', - 'published': '2022-06-07T19:16:43Z', - 'source': {'href': 'http://www.ap.org/', 'title': 'Associated Press'}, - 'id': 'taiwanese-f-16-fighter-makes-191643064.html', 'guidislink': False, 'media_content': [ - {'height': '86', - 'url': 'https://s.yimg.com/uu/api/res/1.2/xEduvF_K_md_I0S4N_bKPA--~B/' - 'aD0zMzMzO3c9NTAwMDthcHBpZD15dGFjaHlvbg--/' - 'https://media.zenfs.com/en/ap.org/d6b6186493eecb5a8085e62472973496', - 'width': '130'}], 'media_credit': [{'role': 'publishing company'}], 'credit': ''} +mock_feed_raw = "{\n \"source_title\": \"The Daily\",\n \"source_url\": \"http://rss.art19.com/the-daily\",\n \"title\": \"Why Is It So Hard to Buy a House in America Right Now?\",\n \"date\": \"Tue, 21 Jun 2022 09:50:00 +0000\",\n \"link\": \"https://www.nytimes.com/the-daily\",\n \"content\": \"

This episode contains strong language.

When Drew Mena and Amena Sengal decided last year to sell their home in New York and relocate their young family to Austin, Texas, they figured they\\u2019d have no problem.

What they hadn\\u2019t realized was that, across the country, home prices \\u2014 and competition to secure properties \\u2014 had risen to jaw-dropping levels.

Guest: Francesca Mari, a contributing writer for The New York Times Magazine and a fellow at the think tank New America.

Want more from The Daily? For one big idea on the news each week from our team, subscribe to our newsletter.\\u00a0

Background reading:\\u00a0

For more information on today\\u2019s episode, visit nytimes.com/thedaily. Transcripts of each episode will be made available by the next workday.\\u00a0

\\n\",\n \"non_media_links\": [\n {\n \"href\": \"https://www.nytimes.com/newsletters/the-daily?module=inline\",\n \"link_type\": \"link\"\n },\n {\n \"href\": \"https://www.nytimes.com/2021/11/12/magazine/real-estate-pandemic.html\",\n \"link_type\": \"link\"\n },\n {\n \"href\": \"http://nytimes.com/thedaily?smid=pc-thedaily\",\n \"link_type\": \"link\"\n }\n ],\n \"media_links\": [\n {\n \"href\": \"https://dts.podtrac.com/redirect.mp3/chrt.fm/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/03d8b493-87fc-4bd1-931f-8a8e9b945d8a/episodes/230797bf-6d47-4648-81b5-79750b8d8023/audio/128/default.mp3?aid=rss_feed&awCollectionId=03d8b493-87fc-4bd1-931f-8a8e9b945d8a&awEpisodeId=230797bf-6d47-4648-81b5-79750b8d8023&feed=54nAGcIl\",\n \"link_type\": \"audio\"\n }\n ]\n}" \ No newline at end of file diff --git a/rss_reader_package/rss_reader.py b/rss_reader_package/rss_reader.py index 60833f37..e4031f2c 100644 --- a/rss_reader_package/rss_reader.py +++ b/rss_reader_package/rss_reader.py @@ -5,14 +5,14 @@ """ import os import argparse -import rss_reader_package.utils.config as config -from rss_reader_package.utils.config import print -from rss_reader_package.utils.version import __version__ -from rss_reader_package.feed_fetcher import FeedFetcher -from rss_reader_package.cache_worker import CacheWorker -from rss_reader_package.format_converter import FormatConverter -from rss_reader_package.date import valid_date -from rss_reader_package.utils.exceptions import WrongLimitError,\ +from utils import config +from utils.config import print +from utils.version import __version__ +from feed_fetcher import FeedFetcher +from cache_worker import CacheWorker +from format_converter import FormatConverter +from date import valid_date +from utils.exceptions import WrongLimitError,\ ConvertJSONError,\ WrongUrlError,\ LinkWithNoSourceError,\ diff --git a/rss_reader_package/tests/test_feed.py b/rss_reader_package/tests/test_feed.py index cde13a54..94095e6c 100644 --- 
a/rss_reader_package/tests/test_feed.py +++ b/rss_reader_package/tests/test_feed.py @@ -1,11 +1,10 @@ import unittest -import feedparser from rss_reader_package.mocks.raw_feed_mock import mock_feed_raw from rss_reader_package.feed import Feed class TestFeed(unittest.TestCase): - parsed_feed = Feed.process_feed(feedparser.util.FeedParserDict(mock_feed_raw), None) + parsed_feed = Feed.json_to_feed(mock_feed_raw) def test_process_feed(self): self.assertIsNone(TestFeed.parsed_feed.source_title) diff --git a/rss_reader_package/tests/test_feed_fetcher.py b/rss_reader_package/tests/test_feed_fetcher.py index 51ab0fdf..3b0ed4f4 100644 --- a/rss_reader_package/tests/test_feed_fetcher.py +++ b/rss_reader_package/tests/test_feed_fetcher.py @@ -1,6 +1,6 @@ import unittest -from feed_fetcher import FeedFetcher -from exceptions import WrongLimitError, WrongUrlError +from rss_reader_package.feed_fetcher import FeedFetcher +from rss_reader_package.utils.exceptions import WrongLimitError, WrongUrlError class TestFeedFetcher(unittest.TestCase): diff --git a/rss_reader_package/tests/test_format_converter.py b/rss_reader_package/tests/test_format_converter.py index 7352631d..b7ec87f9 100644 --- a/rss_reader_package/tests/test_format_converter.py +++ b/rss_reader_package/tests/test_format_converter.py @@ -1,7 +1,6 @@ import os import shutil import unittest -import feedparser from rss_reader_package.format_converter import FormatConverter from rss_reader_package.mocks.raw_feed_mock import mock_feed_raw from rss_reader_package.feed import Feed @@ -11,7 +10,7 @@ class TestFormatConverter(unittest.TestCase): - parsed_feed = Feed.process_feed(feedparser.util.FeedParserDict(mock_feed_raw), None) + parsed_feed = Feed.json_to_feed(mock_feed_raw) def test_convert_feeds(self): user_dir = user_data_dir() diff --git a/rss_reader_package/utils/count_files.py b/rss_reader_package/utils/count_files.py index c4e4f505..6736ebfc 100644 --- a/rss_reader_package/utils/count_files.py +++ 
b/rss_reader_package/utils/count_files.py @@ -4,7 +4,6 @@ exports count_files_by_type """ import os -import rss_reader_package.utils.config as config def count_files_by_type(dir_path: str, file_type: str or None) -> int: @@ -22,7 +21,6 @@ def count_files_by_type(dir_path: str, file_type: str or None) -> int: files = [] for (_, __, filenames) in os.walk(dir_path): if file_type is None: - config.verbose_print("File type not specified. Counting all files", "warn") files.extend(filenames) else: for file in filenames: diff --git a/setup.cfg b/setup.cfg index dfe04b81..6d7c590e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,10 +13,11 @@ packages = rss_reader_package rss_reader_package.utils install_requires = - feedparser appdirs yattag EbookLib + beautifulsoup4 + requests [options.entry_points] console_scripts = diff --git a/setup.py b/setup.py index d13e9727..f3deb690 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ keywords=[], # List project dependencies: - install_requires=["feedparser", "appdirs", "yattag", "EbookLib"], + install_requires=["appdirs", "yattag", "EbookLib", "beautifulsoup4", "requests"], # https://pypi.org/classifiers/ classifiers=[], From 375871f223cfd1793a2d120da5a7f2165f2faeda Mon Sep 17 00:00:00 2001 From: narekarsenyan Date: Tue, 28 Jun 2022 23:52:16 +0400 Subject: [PATCH 4/9] REARRANGE file structure so that all project is in one folder with name --- DOCS.md => Narek Arsenyan/DOCS.md | 0 {rss_reader_package => Narek Arsenyan}/__init__.py | 0 requirements.txt => Narek Arsenyan/requirements.txt | 0 .../rss_reader_package}/.DS_Store | Bin .../rss_reader_package}/__init__.py | 0 .../rss_reader_package}/cache_worker.py | 0 .../rss_reader_package}/date.py | 0 .../rss_reader_package}/feed.py | 0 .../rss_reader_package}/feed_fetcher.py | 0 .../rss_reader_package}/format_converter.py | 0 .../rss_reader_package}/link.py | 0 Narek Arsenyan/rss_reader_package/mocks/__init__.py | 0 .../rss_reader_package}/mocks/raw_feed_mock.py | 0 
.../rss_reader_package}/rss_reader.py | 0 Narek Arsenyan/rss_reader_package/tests/__init__.py | 0 .../rss_reader_package}/tests/test_count_files.py | 0 .../rss_reader_package}/tests/test_date.py | 0 .../rss_reader_package}/tests/test_feed.py | 0 .../rss_reader_package}/tests/test_feed_fetcher.py | 0 .../tests/test_format_converter.py | 0 .../rss_reader_package}/tests/test_link.py | 0 Narek Arsenyan/rss_reader_package/utils/__init__.py | 0 .../rss_reader_package}/utils/config.py | 0 .../rss_reader_package}/utils/count_files.py | 0 .../rss_reader_package}/utils/exceptions.py | 0 .../rss_reader_package}/utils/version.py | 0 setup.cfg => Narek Arsenyan/setup.cfg | 0 setup.py => Narek Arsenyan/setup.py | 2 +- 28 files changed, 1 insertion(+), 1 deletion(-) rename DOCS.md => Narek Arsenyan/DOCS.md (100%) rename {rss_reader_package => Narek Arsenyan}/__init__.py (100%) rename requirements.txt => Narek Arsenyan/requirements.txt (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/.DS_Store (100%) rename {rss_reader_package/utils => Narek Arsenyan/rss_reader_package}/__init__.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/cache_worker.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/date.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/feed.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/feed_fetcher.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/format_converter.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/link.py (100%) create mode 100644 Narek Arsenyan/rss_reader_package/mocks/__init__.py rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/mocks/raw_feed_mock.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/rss_reader.py (100%) create mode 100644 Narek Arsenyan/rss_reader_package/tests/__init__.py rename {rss_reader_package => Narek 
Arsenyan/rss_reader_package}/tests/test_count_files.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/tests/test_date.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/tests/test_feed.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/tests/test_feed_fetcher.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/tests/test_format_converter.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/tests/test_link.py (100%) create mode 100644 Narek Arsenyan/rss_reader_package/utils/__init__.py rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/utils/config.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/utils/count_files.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/utils/exceptions.py (100%) rename {rss_reader_package => Narek Arsenyan/rss_reader_package}/utils/version.py (100%) rename setup.cfg => Narek Arsenyan/setup.cfg (100%) rename setup.py => Narek Arsenyan/setup.py (96%) diff --git a/DOCS.md b/Narek Arsenyan/DOCS.md similarity index 100% rename from DOCS.md rename to Narek Arsenyan/DOCS.md diff --git a/rss_reader_package/__init__.py b/Narek Arsenyan/__init__.py similarity index 100% rename from rss_reader_package/__init__.py rename to Narek Arsenyan/__init__.py diff --git a/requirements.txt b/Narek Arsenyan/requirements.txt similarity index 100% rename from requirements.txt rename to Narek Arsenyan/requirements.txt diff --git a/rss_reader_package/.DS_Store b/Narek Arsenyan/rss_reader_package/.DS_Store similarity index 100% rename from rss_reader_package/.DS_Store rename to Narek Arsenyan/rss_reader_package/.DS_Store diff --git a/rss_reader_package/utils/__init__.py b/Narek Arsenyan/rss_reader_package/__init__.py similarity index 100% rename from rss_reader_package/utils/__init__.py rename to Narek Arsenyan/rss_reader_package/__init__.py diff --git 
a/rss_reader_package/cache_worker.py b/Narek Arsenyan/rss_reader_package/cache_worker.py similarity index 100% rename from rss_reader_package/cache_worker.py rename to Narek Arsenyan/rss_reader_package/cache_worker.py diff --git a/rss_reader_package/date.py b/Narek Arsenyan/rss_reader_package/date.py similarity index 100% rename from rss_reader_package/date.py rename to Narek Arsenyan/rss_reader_package/date.py diff --git a/rss_reader_package/feed.py b/Narek Arsenyan/rss_reader_package/feed.py similarity index 100% rename from rss_reader_package/feed.py rename to Narek Arsenyan/rss_reader_package/feed.py diff --git a/rss_reader_package/feed_fetcher.py b/Narek Arsenyan/rss_reader_package/feed_fetcher.py similarity index 100% rename from rss_reader_package/feed_fetcher.py rename to Narek Arsenyan/rss_reader_package/feed_fetcher.py diff --git a/rss_reader_package/format_converter.py b/Narek Arsenyan/rss_reader_package/format_converter.py similarity index 100% rename from rss_reader_package/format_converter.py rename to Narek Arsenyan/rss_reader_package/format_converter.py diff --git a/rss_reader_package/link.py b/Narek Arsenyan/rss_reader_package/link.py similarity index 100% rename from rss_reader_package/link.py rename to Narek Arsenyan/rss_reader_package/link.py diff --git a/Narek Arsenyan/rss_reader_package/mocks/__init__.py b/Narek Arsenyan/rss_reader_package/mocks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_reader_package/mocks/raw_feed_mock.py b/Narek Arsenyan/rss_reader_package/mocks/raw_feed_mock.py similarity index 100% rename from rss_reader_package/mocks/raw_feed_mock.py rename to Narek Arsenyan/rss_reader_package/mocks/raw_feed_mock.py diff --git a/rss_reader_package/rss_reader.py b/Narek Arsenyan/rss_reader_package/rss_reader.py similarity index 100% rename from rss_reader_package/rss_reader.py rename to Narek Arsenyan/rss_reader_package/rss_reader.py diff --git a/Narek Arsenyan/rss_reader_package/tests/__init__.py b/Narek 
Arsenyan/rss_reader_package/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_reader_package/tests/test_count_files.py b/Narek Arsenyan/rss_reader_package/tests/test_count_files.py similarity index 100% rename from rss_reader_package/tests/test_count_files.py rename to Narek Arsenyan/rss_reader_package/tests/test_count_files.py diff --git a/rss_reader_package/tests/test_date.py b/Narek Arsenyan/rss_reader_package/tests/test_date.py similarity index 100% rename from rss_reader_package/tests/test_date.py rename to Narek Arsenyan/rss_reader_package/tests/test_date.py diff --git a/rss_reader_package/tests/test_feed.py b/Narek Arsenyan/rss_reader_package/tests/test_feed.py similarity index 100% rename from rss_reader_package/tests/test_feed.py rename to Narek Arsenyan/rss_reader_package/tests/test_feed.py diff --git a/rss_reader_package/tests/test_feed_fetcher.py b/Narek Arsenyan/rss_reader_package/tests/test_feed_fetcher.py similarity index 100% rename from rss_reader_package/tests/test_feed_fetcher.py rename to Narek Arsenyan/rss_reader_package/tests/test_feed_fetcher.py diff --git a/rss_reader_package/tests/test_format_converter.py b/Narek Arsenyan/rss_reader_package/tests/test_format_converter.py similarity index 100% rename from rss_reader_package/tests/test_format_converter.py rename to Narek Arsenyan/rss_reader_package/tests/test_format_converter.py diff --git a/rss_reader_package/tests/test_link.py b/Narek Arsenyan/rss_reader_package/tests/test_link.py similarity index 100% rename from rss_reader_package/tests/test_link.py rename to Narek Arsenyan/rss_reader_package/tests/test_link.py diff --git a/Narek Arsenyan/rss_reader_package/utils/__init__.py b/Narek Arsenyan/rss_reader_package/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_reader_package/utils/config.py b/Narek Arsenyan/rss_reader_package/utils/config.py similarity index 100% rename from rss_reader_package/utils/config.py rename to Narek 
Arsenyan/rss_reader_package/utils/config.py diff --git a/rss_reader_package/utils/count_files.py b/Narek Arsenyan/rss_reader_package/utils/count_files.py similarity index 100% rename from rss_reader_package/utils/count_files.py rename to Narek Arsenyan/rss_reader_package/utils/count_files.py diff --git a/rss_reader_package/utils/exceptions.py b/Narek Arsenyan/rss_reader_package/utils/exceptions.py similarity index 100% rename from rss_reader_package/utils/exceptions.py rename to Narek Arsenyan/rss_reader_package/utils/exceptions.py diff --git a/rss_reader_package/utils/version.py b/Narek Arsenyan/rss_reader_package/utils/version.py similarity index 100% rename from rss_reader_package/utils/version.py rename to Narek Arsenyan/rss_reader_package/utils/version.py diff --git a/setup.cfg b/Narek Arsenyan/setup.cfg similarity index 100% rename from setup.cfg rename to Narek Arsenyan/setup.cfg diff --git a/setup.py b/Narek Arsenyan/setup.py similarity index 96% rename from setup.py rename to Narek Arsenyan/setup.py index f3deb690..37417ed7 100644 --- a/setup.py +++ b/Narek Arsenyan/setup.py @@ -6,7 +6,7 @@ try: with open( - os.path.join(current_directory, "README.md"), + os.path.join(current_directory, "../README.md"), encoding="utf-8") as f: long_description = f.read() except Exception as e: From ec478311c35f1f9799e52ff6f2fa9ab2d0ac5a3f Mon Sep 17 00:00:00 2001 From: narekarsenyan Date: Tue, 28 Jun 2022 23:54:45 +0400 Subject: [PATCH 5/9] RENAME docs.md to readme --- Narek Arsenyan/{DOCS.md => README.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Narek Arsenyan/{DOCS.md => README.md} (100%) diff --git a/Narek Arsenyan/DOCS.md b/Narek Arsenyan/README.md similarity index 100% rename from Narek Arsenyan/DOCS.md rename to Narek Arsenyan/README.md From 064f47615899f98416725b5f6cc17a7eaeb75b8b Mon Sep 17 00:00:00 2001 From: NMac99 <87823992+NMac99@users.noreply.github.com> Date: Thu, 30 Jun 2022 14:22:24 +0400 Subject: [PATCH 6/9] UPDATE readme file --- Narek 
Arsenyan/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Narek Arsenyan/README.md b/Narek Arsenyan/README.md index 2000c214..ef39b92a 100644 --- a/Narek Arsenyan/README.md +++ b/Narek Arsenyan/README.md @@ -44,6 +44,7 @@ ```json { "source_title": "Feed source title", + "source_url": "Feed source url", "title": "Title of the new", "date": "Publishing date of the new", "link": "Link of the new", @@ -96,7 +97,7 @@ Inside `.json` file is JSON object where keys are fetched feeds' sources and val ```json { "https://timesofindia.indiatimes.com/rssfeedstopstories.cms": [ - "{\n \"source_title\": null,\n \"title\": \"Presidential polls: Mamata invites 22 oppn CMs, leaders for joint meeting on June 15\",\n \"date\": \"2022-06-11T16:22:36+05:30\",\n \"link\": \"https://timesofindia.indiatimes.com/india/presidential-polls-mamata-invites-22-oppn-cms-leaders-for-joint-meeting-on-june-15/articleshow/92146582.cms\",\n \"content\": \"With the Rajya Sabha results exposing dissension and lack of cohesion among opposition parties, West Bengal chief minister Mamata Banerjee on Saturday reached out to her counterparts and other leaders to participate in a meeting in Delhi on June 15 to discuss the upcoming presidential polls, which are scheduled for July 18.\",\n \"non_media_links\": [\n {\n \"href\": \"https://timesofindia.indiatimes.com/india/presidential-polls-mamata-invites-22-oppn-cms-leaders-for-joint-meeting-on-june-15/articleshow/92146582.cms\",\n \"link_type\": \"link\"\n }\n ],\n \"media_links\": []\n}" + "{\n \"source_title\": Times of India,\n \"source_url\": \"https://timesofindia.indiatimes.com/rssfeedstopstories.cms\",\n \"title\": \"Presidential polls: Mamata invites 22 oppn CMs, leaders for joint meeting on June 15\",\n \"date\": \"2022-06-11T16:22:36+05:30\",\n \"link\": \"https://timesofindia.indiatimes.com/india/presidential-polls-mamata-invites-22-oppn-cms-leaders-for-joint-meeting-on-june-15/articleshow/92146582.cms\",\n \"content\": 
\"With the Rajya Sabha results exposing dissension and lack of cohesion among opposition parties, West Bengal chief minister Mamata Banerjee on Saturday reached out to her counterparts and other leaders to participate in a meeting in Delhi on June 15 to discuss the upcoming presidential polls, which are scheduled for July 18.\",\n \"non_media_links\": [\n {\n \"href\": \"https://timesofindia.indiatimes.com/india/presidential-polls-mamata-invites-22-oppn-cms-leaders-for-joint-meeting-on-june-15/articleshow/92146582.cms\",\n \"link_type\": \"link\"\n }\n ],\n \"media_links\": []\n}" ] } ``` @@ -141,4 +142,3 @@ in user data directory. **For Windows 7:** >C:\Users\[USER]\AppData\Local\nmac99\RSSReader - From 1668f3cebd079fc9fa0a77cc62423a4789eee5a4 Mon Sep 17 00:00:00 2001 From: NMac99 <87823992+NMac99@users.noreply.github.com> Date: Thu, 30 Jun 2022 14:23:59 +0400 Subject: [PATCH 7/9] UPDATE feed.py test with new feed mock data --- .../rss_reader_package/tests/test_feed.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/Narek Arsenyan/rss_reader_package/tests/test_feed.py b/Narek Arsenyan/rss_reader_package/tests/test_feed.py index 94095e6c..3c645809 100644 --- a/Narek Arsenyan/rss_reader_package/tests/test_feed.py +++ b/Narek Arsenyan/rss_reader_package/tests/test_feed.py @@ -7,25 +7,24 @@ class TestFeed(unittest.TestCase): parsed_feed = Feed.json_to_feed(mock_feed_raw) def test_process_feed(self): - self.assertIsNone(TestFeed.parsed_feed.source_title) - self.assertEqual(TestFeed.parsed_feed.title, "Taiwanese F-16 fighter makes emergency landing in Hawaii") - self.assertEqual(TestFeed.parsed_feed.date, "2022-06-07T19:16:43Z") + self.assertEqual(TestFeed.parsed_feed.source_title, "The Daily") + self.assertEqual(TestFeed.parsed_feed.title, "Why Is It So Hard to Buy a House in America Right Now?") + self.assertEqual(TestFeed.parsed_feed.date, "Tue, 21 Jun 2022 09:50:00 +0000") self.assertEqual(TestFeed.parsed_feed.link, - 
"https://news.yahoo.com/taiwanese-f-16-fighter-makes-191643064.html") - self.assertIsNone(TestFeed.parsed_feed.content) - self.assertEqual(len(TestFeed.parsed_feed.non_media_links), 0) + "https://www.nytimes.com/the-daily") + self.assertEqual(len(TestFeed.parsed_feed.content), 1139) + self.assertEqual(len(TestFeed.parsed_feed.non_media_links), 3) self.assertEqual(TestFeed.parsed_feed.media_links[0].href, - "https://s.yimg.com/uu/api/res/1.2/xEduvF_K_md_I0S4N_bKPA--~B/" - "aD0zMzMzO3c9NTAwMDthcHBpZD15dGFjaHlvbg--/" - "https://media.zenfs.com/en/ap.org/d6b6186493eecb5a8085e62472973496") + "https://dts.podtrac.com/redirect.mp3/chrt.fm/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/" + "03d8b493-87fc-4bd1-931f-8a8e9b945d8a/episodes/230797bf-6d47-4648-81b5-79750b8d8023/audio/" + "128/default.mp3?aid=rss_feed&awCollectionId=03d8b493-87fc-4bd1-931f-8a8e9b945d8a&awEpisodeId=" + "230797bf-6d47-4648-81b5-79750b8d8023&feed=54nAGcIl") def test_to_json(self): self.assertEqual(type(TestFeed.parsed_feed.to_json()), str) - self.assertEqual(len(TestFeed.parsed_feed.to_json()), 540) def test_to_readable(self): self.assertEqual(type(TestFeed.parsed_feed.to_json()), str) - self.assertEqual(len(TestFeed.parsed_feed.to_json()), 374) unittest.main() From f68bfbe1037056a819ad3d0394956cb6a799972e Mon Sep 17 00:00:00 2001 From: NMac99 <87823992+NMac99@users.noreply.github.com> Date: Thu, 30 Jun 2022 14:25:07 +0400 Subject: [PATCH 8/9] REMOVE unnecessary prints --- Narek Arsenyan/rss_reader_package/cache_worker.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Narek Arsenyan/rss_reader_package/cache_worker.py b/Narek Arsenyan/rss_reader_package/cache_worker.py index d505ff95..3de3ff74 100644 --- a/Narek Arsenyan/rss_reader_package/cache_worker.py +++ b/Narek Arsenyan/rss_reader_package/cache_worker.py @@ -27,9 +27,7 @@ def store_feed_in_cache(feed: Feed): feed: Feed object with all necessary data """ - print("here") date = str(datetime.strptime(feed.date, "%a, %d %b %Y %H:%M:%S 
%z").date()) - print("here 2") source = feed.source_url feed_id = feed.link config.verbose_print("Getting user cache directory", "bold") From 82cfd7eee680b3bba1278cbc61d44da3001153b0 Mon Sep 17 00:00:00 2001 From: NMac99 <87823992+NMac99@users.noreply.github.com> Date: Thu, 30 Jun 2022 14:26:43 +0400 Subject: [PATCH 9/9] REMOVE unnecessary file creation --- Narek Arsenyan/rss_reader_package/feed_fetcher.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Narek Arsenyan/rss_reader_package/feed_fetcher.py b/Narek Arsenyan/rss_reader_package/feed_fetcher.py index d54d11a6..2d5ef6c4 100644 --- a/Narek Arsenyan/rss_reader_package/feed_fetcher.py +++ b/Narek Arsenyan/rss_reader_package/feed_fetcher.py @@ -71,7 +71,5 @@ def fetch_feeds(self): formatted_feed = Feed(source_title, self.__source, title, date, link, raw_content, feed_links, media_links) CacheWorker.store_feed_in_cache(formatted_feed) self.feeds_formatted.append(formatted_feed) - with open("content.txt", "w") as f: - f.write(str(items)) except Exception: raise WrongUrlError("Specified url is not rss type")