Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
results
venv
.idea
rss_reader.egg-info
.pytest_cache
69 changes: 42 additions & 27 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,28 +1,43 @@
# How to create a PR with a homework task

1. Create fork from the following repo: https://github.com/E-P-T/Homework. (Docs: https://docs.github.com/en/get-started/quickstart/fork-a-repo )
2. Clone your forked repo in your local folder.
3. Create a separate branch for each session. Example: `session_2`, `session_3` and so on.
4. Create a folder named after your first and last name in your forked repo, on the branch created for the session.
5. Add your task into created folder
6. Push finished session task in the appropriate branch in accordance with written above.
You should get the structure that looks something like that

```
Branch: Session_2
DzmitryKolb
|___Task1.py
|___Task2.py
Branch: Session_3
DzmitryKolb
|___Task1.py
|___Task2.py
```

7. When you finish your work on task you should create Pull request to the appropriate branch of the main repo https://github.com/E-P-T/Homework (Docs: https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork).
Please use the following instructions to prepare good description of the pull request:
- Pull request header should be: `Session <Number of the session> - <FirstName> <LastName>`.
Example: `Session 2 - Dzmitry Kolb`
- Pull request body: You should write here what tasks were implemented.
Example: `Finished: Task 1.2, Task 1.3, Task 1.6`
## Documentation

### Minimal requirements:
__Python 3.9__\
On Linux, please alias _python_ to _python3_; see [here](https://askubuntu.com/questions/320996/how-to-make-python-program-command-execute-python-3) for instructions.

### Setup:
#### Virtual Environment (Optional)
Creating Virtual Environment (from the root of the project)\
Windows: `python -m venv ./venv`\
Linux: `virtualenv venv`

Activate Virtual Environment:\
Windows: `./venv/Scripts/activate`\
Linux: `source venv/bin/activate`

*_On Windows you might need to give rights to execute commands from PowerShell via the following command (running as Administrator)_\
`Set-ExecutionPolicy Unrestricted`

*_If you want to exit Virtual Environment please run `deactivate`_

#### Required Steps
Update pip:\
`python -m pip install --upgrade pip`

Install requirements:\
`pip install -r ./requirements.txt`

### Run Application:
Run `python ./rss_parse/rss_reader.py --help` to find available options

### Cache
The application caches RSS feeds locally in the `rss_reader` sub-folder of the system temp directory.\
For more info on what is considered a temp directory please look [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir)

### Run Tests:
Run `pytest ./tests` to run tests

### Package distribution:
To create a distribution package please run\
`pip install -e .`\
You will be able to run `rss_reader` directly\
You should also run this command because it makes the required font available to the fpdf library.
Binary file added fonts/OpenSans.ttf
Binary file not shown.
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-e .
pytest
pytest-mock
1 change: 1 addition & 0 deletions rss_parse/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "1.0"
Empty file.
19 changes: 19 additions & 0 deletions rss_parse/exceptions/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
class ParsingException(Exception):
    """
    Raised when something goes wrong while parsing RSS
    """


class CacheException(Exception):
    """
    Raised when something goes wrong while caching an RSS feed
    """


class ProcessingException(Exception):
    """
    Raised when something goes wrong while processing an RSS feed
    """
Empty file added rss_parse/parse/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions rss_parse/parse/params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
class Params:
    """
    Stores parameters to run rss reader.

    Every constructor argument is kept unchanged on the attribute of the
    same name.
    """

    def __init__(self, is_verbose, is_json, limit, source, pub_date, html_dir, pdf_dir):
        self.is_verbose = is_verbose
        self.is_json = is_json
        self.limit = limit
        self.source = source
        self.pub_date = pub_date
        self.html_dir = html_dir
        self.pdf_dir = pdf_dir

    @classmethod
    def from_args(cls, args):
        """
        Alternate constructor that builds the parameters from an arguments object.

        :param args: object exposing the attributes ``verbose``, ``json``,
            ``limit``, ``source``, ``date``, ``to_html`` and ``to_pdf``
            (presumably an argparse namespace; confirm against the caller)
        :return: an instance of the class the method was called on
        """
        # A classmethod (rather than a staticmethod hard-coding Params) lets
        # subclasses reuse this constructor and get an instance of themselves.
        return cls(
            is_verbose=args.verbose,
            is_json=args.json,
            limit=args.limit,
            source=args.source,
            pub_date=args.date,
            html_dir=args.to_html,
            pdf_dir=args.to_pdf,
        )
83 changes: 83 additions & 0 deletions rss_parse/parse/rss_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os
import tempfile

from rss_parse.exceptions.exceptions import CacheException, ParsingException
from rss_parse.parse.rss_feed import RssFeed
from rss_parse.parse.rss_mapper import RSS_FEED_JSON_MAPPER
from rss_parse.parse.rss_parser import RssJsonParser
from rss_parse.utils.collection_utils import group_by, merge_by_key
from rss_parse.utils.messaging_utils import MESSAGE_CONSUMER_NOOP


class TmpDirectoryCache:
    """
    Class to store RSS Feed in a temporary directory.

    Items are written to one JSON file per publication day; the files live in
    the ``rss_reader`` sub-folder of the system temp directory.
    """
    __DATE_TO_FILE_NAME_PATTERN = '%Y%m%d'

    def __init__(self, rss_feed, mc=MESSAGE_CONSUMER_NOOP):
        """
        :param rss_feed: feed whose ``rss_items`` should be cached (may be falsy)
        :param mc: message consumer handed over to RssJsonParser
        """
        self.__rss_feed = rss_feed
        self.__mc = mc
        self.__base_dir = TmpDirectoryCache.get_cache_base_path()

    @staticmethod
    def get_cache_base_path():
        """
        Returns the directory where all Cached files are stored
        """
        return os.path.join(tempfile.gettempdir(), "rss_reader")

    @staticmethod
    def get_cache_path(pub_date):
        """
        Builds the path to the cache file based on a publication date

        :param pub_date: object with ``strftime``; formatted as ``%Y%m%d``
        """
        file_name = f"{pub_date.strftime(TmpDirectoryCache.__DATE_TO_FILE_NAME_PATTERN)}.json"
        return os.path.join(TmpDirectoryCache.get_cache_base_path(), file_name)

    def cache(self):
        """
        Merges the items of the feed into the per-day cache files.

        The cache directory is created even when there is nothing to store.
        For every publication day the items already present in that day's file
        are read, merged with the new ones by ``item.key()`` and written back.

        :raises CacheException: if the cache directory cannot be created
        """
        try:
            # exist_ok removes the check-then-create race of exists() + mkdir()
            os.makedirs(self.__base_dir, exist_ok=True)
        except OSError as err:
            # Only filesystem errors are translated; the cause is kept for debugging
            raise CacheException("Unable to create a directory for local cache") from err

        if not self.__rss_feed or not self.__rss_feed.rss_items:
            return

        # NOTE(review): assumes group_by yields (key, items) pairs - confirm in collection_utils
        feed_by_date = group_by(self.__rss_feed.rss_items,
                                key=lambda x: x.publication_date.strftime(
                                    TmpDirectoryCache.__DATE_TO_FILE_NAME_PATTERN))
        for pub_date, new_items in feed_by_date:
            file_name = os.path.join(self.__base_dir, f"{pub_date}.json")
            # NOTE(review): relies on RssJsonParser.parse() coping with a file
            # that does not exist yet - confirm in rss_parser
            json_parser = RssJsonParser(file_name, self.__mc)
            existing_items = json_parser.parse().rss_items
            all_items = merge_by_key([*existing_items, *new_items], key=lambda x: x.key())
            all_feed = RssFeed(all_items)
            rss_json = RSS_FEED_JSON_MAPPER.to_json(all_feed)
            with open(file_name, "w", encoding="UTF-8") as f:
                f.write(rss_json)


class CacheJsonParser(RssJsonParser):
    """
    Reads a previously cached RSS Feed for a single publication date
    """

    def __init__(self, date, source, mc=None):
        cache_file = TmpDirectoryCache.get_cache_path(date)
        super().__init__(cache_file, mc)

        self.__source = source

    def parse(self):
        """
        Loads the cached items for the date, keeping only those of the
        requested source when one was given.
        Raises ParsingException if no items remain.
        """
        cached_items = super().parse().rss_items
        if self.__source:
            cached_items = [entry for entry in cached_items if entry.source == self.__source]
        if cached_items:
            return RssFeed(cached_items)
        raise ParsingException("No cached news for the date")
28 changes: 28 additions & 0 deletions rss_parse/parse/rss_feed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from dataclasses import dataclass
from datetime import datetime
from typing import List


@dataclass
class RssItem:
    """
    A single RSS Item/News entry
    """

    title: str
    description: str
    publication_date: datetime
    link: str
    image_url: str
    # The only optional field: stays None when not supplied
    source: str = None

    def key(self):
        """
        Returns the (link, publication_date) pair of the item
        """
        identity = (self.link, self.publication_date)
        return identity


@dataclass
class RssFeed:
    """
    Container for the RSS Items that make up a feed
    """
    # Items of the feed, in the order they were supplied
    rss_items: List[RssItem]
13 changes: 13 additions & 0 deletions rss_parse/parse/rss_keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Structural elements
RSS_ROOT = 'rss'
RSS_CHANNEL = 'channel'
RSS_ITEMS = 'item'

# Keys of a single item
RSS_ITEM_TITLE = 'title'
RSS_ITEM_LINK = 'link'
RSS_ITEM_DESCRIPTION = 'description'
RSS_ITEM_PUB_DATE = 'pubDate'
RSS_SOURCE = 'source'

# Image-related keys
RSS_IMAGE_ROOT = 'image'
RSS_IMAGE_ROOT_ENCLOSURE = 'enclosure'
RSS_IMAGE_ROOT_MEDIA_CONTENT = 'media:content'
RSS_IMAGE_ROOT_MEDIA_THUMBNAIL = 'media:thumbnail'
# NOTE(review): the '@' prefix looks like an XML-attribute marker of the parser in use - confirm
RSS_IMAGE_URL_ATTR = '@url'
64 changes: 64 additions & 0 deletions rss_parse/parse/rss_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import json
from datetime import timezone, datetime

from rss_parse.parse.rss_feed import RssFeed, RssItem
from rss_parse.parse.rss_keys import *
from rss_parse.utils.formatting_utils import format_date_pretty, get_description_plain


class RssJsonMapper:
    """
    Converts an RSS Feed to its JSON representation and back
    """
    __DATE_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def to_json(self, rss_feed: RssFeed, indent=None, pretty=False):
        """
        Serializes the feed into a JSON string holding the list of its items.
        `indent` is forwarded to json.dumps; `pretty` switches the date and
        the description to their human-friendly form.
        """
        serialized_items = []
        for rss_item in rss_feed.rss_items:
            serialized_items.append(self.__item_to_json(rss_item, pretty))

        return json.dumps({RSS_ITEMS: serialized_items}, indent=indent, ensure_ascii=False)

    def __item_to_json(self, item: RssItem, pretty):
        """
        Builds the dictionary for a single item; empty optional fields are left out
        """
        payload = {RSS_ITEM_TITLE: item.title, RSS_ITEM_LINK: item.link}

        # The stored (non-pretty) form of the date is always UTC
        utc_date = item.publication_date.astimezone(timezone.utc)
        stored_date = utc_date.strftime(RssJsonMapper.__DATE_TIME_FORMAT)
        if pretty:
            payload[RSS_ITEM_PUB_DATE] = format_date_pretty(item.publication_date)
            description = get_description_plain(item.description)
        else:
            payload[RSS_ITEM_PUB_DATE] = stored_date
            description = item.description

        # Order matters: it defines the order of the keys in the output
        optional_fields = (
            (RSS_ITEM_DESCRIPTION, description),
            (RSS_IMAGE_ROOT, item.image_url),
            (RSS_SOURCE, item.source),
        )
        for field_name, field_value in optional_fields:
            if field_value:
                payload[field_name] = field_value

        return payload

    def from_json(self, rss_feed_json):
        """
        Restores a feed from the JSON string produced by the non-pretty to_json
        """
        parsed = json.loads(rss_feed_json)
        restored_items = []
        for raw_item in parsed[RSS_ITEMS]:
            restored_items.append(self.__parse_item(raw_item))
        return RssFeed(restored_items)

    def __parse_item(self, item):
        """
        Restores a single item; title, date and link are mandatory keys
        """
        title = item[RSS_ITEM_TITLE]
        description = item.get(RSS_ITEM_DESCRIPTION)
        stored_date = datetime.strptime(item[RSS_ITEM_PUB_DATE], RssJsonMapper.__DATE_TIME_FORMAT)
        # The stored value is UTC; convert it to the local timezone
        local_date = stored_date.replace(tzinfo=timezone.utc).astimezone()
        return RssItem(
            title=title,
            description=description,
            publication_date=local_date,
            link=item[RSS_ITEM_LINK],
            image_url=item.get(RSS_IMAGE_ROOT),
            source=item.get(RSS_SOURCE),
        )


# Shared module-level mapper instance
RSS_FEED_JSON_MAPPER = RssJsonMapper()
Loading