From d62e05bc4e2c723f3dc2cab50bc98f6ff752ed4a Mon Sep 17 00:00:00 2001 From: Valery Litskevich Date: Tue, 24 May 2022 18:57:26 +0500 Subject: [PATCH 1/8] Add final task --- Final_Task.md | 195 ++++++++++++++++++++++++++++++++++++++++++++++++++ RULES.md | 12 ++++ 2 files changed, 207 insertions(+) create mode 100644 Final_Task.md create mode 100644 RULES.md diff --git a/Final_Task.md b/Final_Task.md new file mode 100644 index 00000000..2e2e618a --- /dev/null +++ b/Final_Task.md @@ -0,0 +1,195 @@ +# Introduction to Python. Final task. +You are proposed to implement Python RSS-reader using **python 3.9**. + +The task consists of few iterations. Do not start new iteration if the previous one is not implemented yet. + +## Common requirements. +* It is mandatory to use `argparse` module. +* Codebase must be covered with unit tests with at least 50% coverage. It's a mandatory requirement. +* Your script should **not** require installation of other services such as mysql server, +postgresql and etc. (except Iteration 6). If it does require such programs, +they should be installed automatically by your script, without user doing anything. +* In case of any mistakes utility should print human-readable +error explanation. Exception tracebacks in stdout are prohibited in final version of application. +* Docstrings are mandatory for all methods, classes, functions and modules. +* Code must correspond to `pep8` (use `pycodestyle` utility for self-check). + * You can set line length up to 120 symbols. +* Commit messages should provide correct and helpful information about changes in commit. Messages like `Fix bug`, +`Tried to make workable`, `Temp commit` and `Finally works` are prohibited. +* All used third-party packages should be written in the `requirements.txt` file and in installation files (`setup.py`, `setup.cfg`, etc.). +* You have to write a file with documentation. Everything must be documented: how to run scripts, how to run tests, how to install the library and etc. 
+ +## [Iteration 1] One-shot command-line RSS reader. +RSS reader should be a command-line utility which receives [RSS](wikipedia.org/wiki/RSS) URL and prints results in human-readable format. + +You are free to choose format of the news console output. The textbox below provides an example of how it can be implemented: + +```shell +$ rss_reader.py "https://news.yahoo.com/rss/" --limit 1 + +Feed: Yahoo News - Latest News & Headlines + +Title: Nestor heads into Georgia after tornados damage Florida +Date: Sun, 20 Oct 2019 04:21:44 +0300 +Link: https://news.yahoo.com/wet-weekend-tropical-storm-warnings-131131925.html + +[image 2: Nestor heads into Georgia after tornados damage Florida][2]Nestor raced across Georgia as a post-tropical cyclone late Saturday, hours after the former tropical storm spawned a tornado that damaged +homes and a school in central Florida while sparing areas of the Florida Panhandle devastated one year earlier by Hurricane Michael. The storm made landfall Saturday on St. Vincent Island, a nature preserve +off Florida's northern Gulf Coast in a lightly populated area of the state, the National Hurricane Center said. Nestor was expected to bring 1 to 3 inches of rain to drought-stricken inland areas on its +march across a swath of the U.S. Southeast. + + +Links: +[1]: https://news.yahoo.com/wet-weekend-tropical-storm-warnings-131131925.html (link) +[2]: http://l2.yimg.com/uu/api/res/1.2/Liyq2kH4HqlYHaS5BmZWpw--/YXBwaWQ9eXRhY2h5b247aD04Njt3PTEzMDs-/https://media.zenfs.com/en/ap.org/5ecc06358726cabef94585f99050f4f0 (image) + +``` + +Utility should provide the following interface: +```shell +usage: rss_reader.py [-h] [--version] [--json] [--verbose] [--limit LIMIT] + source + +Pure Python command-line RSS reader. 
+ +positional arguments: + source RSS URL + +optional arguments: + -h, --help show this help message and exit + --version Print version info + --json Print result as JSON in stdout + --verbose Outputs verbose status messages + --limit LIMIT Limit news topics if this parameter provided + +``` + +In case of using `--json` argument your utility should convert the news into [JSON](https://en.wikipedia.org/wiki/JSON) format. +You should come up with the JSON structure on your own and describe it in the README.md file for your repository or in a separate documentation file. + + + +With the argument `--verbose` your program should print all logs in stdout. + +### Task clarification (I) + +1) If `--version` option is specified app should _just print its version_ and stop. +2) User should be able to use `--version` option without specifying RSS URL. For example: +``` +> python rss_reader.py --version +"Version 1.4" +``` +3) The version is supposed to change with every iteration. +4) If `--limit` is not specified, then user should get _all_ available feed. +5) If `--limit` is larger than feed size then user should get _all_ available news. +6) `--verbose` should print logs _in the process_ of application running, _not after everything is done_. +7) Make sure that your app **has no encoding issues** (meaning symbols like `'` and etc) when printing news to _stdout_. +8) Make sure that your app **has no encoding issues** (meaning symbols like `'` and etc) when printing news to _stdout in JSON format_. +9) It is preferable to have different custom exceptions for different situations (if needed). +10) The `--limit` argument should also affect JSON generation. + + +## [Iteration 2] Distribution. + +* Utility should be wrapped into distribution package with `setuptools`. +* This package should export CLI utility named `rss-reader`. 
+ + +### Task clarification (II) + +1) User should be able to run your application _both_ with and without installation of CLI utility, +meaning that this should work: + +``` +> python rss_reader.py ... +``` + +as well as this: + +``` +> rss_reader ... +``` +2) Make sure your second iteration works on a clean machine with python 3.9. (!) +3) Keep in mind that installed CLI utility should have the same functionality, so do not forget to update dependencies and packages. + + +## [Iteration 3] News caching. +The RSS news should be stored in a local storage while reading. The way and format of this storage you can choose yourself. +Please describe it in a separate section of README.md or in the documentation. + +New optional argument `--date` must be added to your utility. It should take a date in `%Y%m%d` format. +For example: `--date 20191020` +Here date means actual *publishing date* not the date when you fetched the news. + +The cached news can be read with it. The news from the specified day will be printed out. +If the news are not found return an error. + +If the `--date` argument is not provided, the utility should work like in the previous iterations. + +### Task clarification (III) +1) Try to make your application cross-platform, meaning that it should work on both Linux and Windows. +For example when working with filesystem, try to use `os.path` lib instead of manually concatenating file paths. +2) `--date` should **not** require internet connection to fetch news from local cache. +3) User should be able to use `--date` without specifying RSS source. For example: +``` +> python rss_reader.py --date 20191206 +...... +``` +Or for second iteration (when installed using setuptools): +``` +> rss_reader --date 20191206 +...... +``` +4) If `--date` specified _together with RSS source_, then app should get news _for this date_ from local cache that _were fetched from specified source_. 
+5) `--date` should work correctly with both `--json`, `--limit`, `--verbose` and their different combinations. + +## [Iteration 4] Format converter. + +You should implement the conversion of news in at least two of the suggested formats: `.mobi`, `.epub`, `.fb2`, `.html`, `.pdf` + +New optional argument must be added to your utility. This argument receives the path where new file will be saved. The argument should represent which format will be generated. + +For example: `--to-mobi` or `--to-fb2` or `--to-epub` + +You can choose yourself the way in which the news will be displayed, but the final text result should contain pictures and links, if they exist in the original article and if the format permits to store this type of data. + +### Task clarification (IV) + +Conversion options should work correctly together with all arguments that were implemented in Iterations 1-3. For example: +* Format conversion process should be influenced by `--limit`. +* If `--json` is specified together with conversion options, then JSON news should +be printed to stdout, and converted file should contain news in normal format. +* Logs from `--verbose` should be printed in stdout and not added to the resulting file. +* `--date` should also work correctly with format converter and not require internet access. + +## * [Iteration 5] Output colorization. +> Note: An optional iteration, it is not necessary to implement it. You can move on with it only if all the previous iterations (from 1 to 4) are completely implemented. + +You should add new optional argument `--colorize`, that will print the result of the utility in colorized mode. + +*If the argument is not provided, the utility should work like in the previous iterations.* + +> Note: Take a look at the [colorize](https://pypi.org/project/colorize/) library + +## * [Iteration 6] Web-server. +> Note: An optional iteration, it is not necessary to implement it. 
You can move on with it only if all the previous iterations (from 1 to 4) are completely implemented. Introduction to Python course does not cover the topics that are needed for the implementation of this part. + +There are several mandatory requirements in this iteration: +* `Docker` + `docker-compose` usage (at least 2 containers: one for web-application, one for DB) +* Web application should provide all the functionality implemented in the previous parts of the task, using the REST API: + * One-shot conversion from RSS to Human readable format + * Server-side news caching + * Conversion in epub, mobi, fb2 or other formats + +Feel free to choose the way of implementation, libraries and frameworks. (We suggest you `Django Rest Framework` + `PostgreSQL` combination) + +You can implement any functionality that you want. The only requirement is to add the description into README file or update project documentation, for example: +* authorization/authentication +* automatic scheduled news update +* adding new RSS sources using API + +--- +Implementations will be checked with the latest cPython interpreter of 3.9 branch. +--- + +> Always code as if the guy who ends up maintaining your code will be a violent psychopath who knows where you live. Code for readability. **John F. Woods** diff --git a/RULES.md b/RULES.md new file mode 100644 index 00000000..9a72034f --- /dev/null +++ b/RULES.md @@ -0,0 +1,12 @@ +# Final task +Final task (`FT`) for EPAM Python Training 2022.03 + +## Rules +* All work has to be implemented in the `master` branch in forked repository. If you think that `FT` is ready, please open a pull request (`PR`) to our repo. +* When a `PR` is ready, please mark it with the `final_task` label. +* You have one month to finish `FT`. Commits committed after deadline will be ignored. +* At least the first 4 iterations must be done. +* `FT` you can find in the `Final_Task.md` file. + +### Good luck! 
+ From ed8cbb1eb4f30ee812b269bdb2a82b7c6ab3d9fc Mon Sep 17 00:00:00 2001 From: Natallia Paliashchuk Date: Wed, 29 Jun 2022 15:41:29 +0400 Subject: [PATCH 2/8] Final task --- .gitignore | 23 +++++ NatalliaPaliashchuk/README.md | 99 +++++++++++++++++++ NatalliaPaliashchuk/requirements.txt | 6 ++ NatalliaPaliashchuk/rss_reader.py | 10 ++ NatalliaPaliashchuk/rss_reader/__init__.py | 10 ++ NatalliaPaliashchuk/rss_reader/caching.py | 38 +++++++ NatalliaPaliashchuk/rss_reader/converting.py | 40 ++++++++ NatalliaPaliashchuk/rss_reader/exceptions.py | 26 +++++ NatalliaPaliashchuk/rss_reader/parsing.py | 73 ++++++++++++++ NatalliaPaliashchuk/rss_reader/run.py | 69 +++++++++++++ .../rss_reader/templates/to_.html | 24 +++++ NatalliaPaliashchuk/setup.py | 18 ++++ .../test_rss_reader/__init__.py | 0 .../test_rss_reader/test_caching.py | 20 ++++ .../test_rss_reader/test_converting.py | 43 ++++++++ .../test_rss_reader/test_feed.py | 16 +++ .../test_rss_reader/test_parsing.py | 69 +++++++++++++ 17 files changed, 584 insertions(+) create mode 100644 .gitignore create mode 100644 NatalliaPaliashchuk/README.md create mode 100644 NatalliaPaliashchuk/requirements.txt create mode 100644 NatalliaPaliashchuk/rss_reader.py create mode 100644 NatalliaPaliashchuk/rss_reader/__init__.py create mode 100644 NatalliaPaliashchuk/rss_reader/caching.py create mode 100644 NatalliaPaliashchuk/rss_reader/converting.py create mode 100644 NatalliaPaliashchuk/rss_reader/exceptions.py create mode 100644 NatalliaPaliashchuk/rss_reader/parsing.py create mode 100644 NatalliaPaliashchuk/rss_reader/run.py create mode 100644 NatalliaPaliashchuk/rss_reader/templates/to_.html create mode 100644 NatalliaPaliashchuk/setup.py create mode 100644 NatalliaPaliashchuk/test_rss_reader/__init__.py create mode 100644 NatalliaPaliashchuk/test_rss_reader/test_caching.py create mode 100644 NatalliaPaliashchuk/test_rss_reader/test_converting.py create mode 100644 NatalliaPaliashchuk/test_rss_reader/test_feed.py create mode 
100644 NatalliaPaliashchuk/test_rss_reader/test_parsing.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..ecb4530f --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Cache +cache.pk1 + +# Unit test / coverage reports +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Distribution / packaging +dist/ +build/ +*.egg-info/ + diff --git a/NatalliaPaliashchuk/README.md b/NatalliaPaliashchuk/README.md new file mode 100644 index 00000000..81b86bf3 --- /dev/null +++ b/NatalliaPaliashchuk/README.md @@ -0,0 +1,99 @@ +# RSS reader +Pure Python command-line RSS reader. +___ +## Installation +### Install all dependencies +`$ pip install -r requirements.txt` +### Install CLI utility +`$ python setup.py install` +### The application can be used also without installation of CLI utility +To run the application use `rss_reader.py` file.\ +Example:\ +`$ python rss_reader.py --version` +___ +## Usage +`rss_reader [-h] [--version] [--json] [--to-html FILE] [--verbose] [--to-epub DIRECTORY] [--limit LIMIT] [--date DATE] source` + +| Positional argument | Description | +|-|-| +| `source` | RSS URL | + +| Options | Actions | +|-|-| +| `--version` | Print version info | +| `--json` | Print result as JSON in stdout | +| `--to-html FILE` | Convert RSS feed into html and save as a file to the path | +| `--verbose` | Outputs verbose status messages | +| `--to-epub DIRECTORY` | Convert RSS feed into epub and save as a file in the directory | +| `--limit LIMIT` | Limit news topics if this parameter provided | +| `--date DATE ` | Extract news from archive. 
Take a start publish date in format YYYYMMDD | +### Examples +`$ python rss_reader.py --limit 1 http://feeds.bbci.co.uk/news/world/rss.xml +Feed: BBC News - World` +Return example +```bash +Title: Jan 6 hearings: Ex-aide paints devastating picture of Trump +Date: 2022-06-29 03:20:29+00:00 +Link: https://www.bbc.co.uk/news/world-us-canada-61970258?at_medium=RSS&at_campaign=KARANGA + +Enraged president tried to grab the steering wheel to direct his limousine to the Capitol, ex-aide says. +``` +`$ rss_reader --json --limit 1 https://tech.onliner.by/feed`\ +Return example +```json +{ + "feed_title": "Технологии Onlíner", + "feed_items": [ + { + "item_title": "Sony представила игровые мониторы для PS5 и геймерские наушники. Известны цены", + "item_pub_date": "2022-06-29 09:44:11+03:00", + "item_url": "https://tech.onliner.by/2022/06/29/sony-predstavila-igrovye-monitory-dlya-ps5-i-gejmerskie-naushniki-izvestny-ceny", + "item_desc_text": "[1]Sony официально представила игровой бренд InZone — под ним будут выходить гаджеты для геймеров. 
Начали с мониторов и наушников, [2 сообщает] engadget.[3 Читать далее…]", + "item_desc_links": [ + { + "link_pos": 1, + "link_url": "https://content.onliner.by/news/thumbnail/625fdac3c028b390f2d80f9c26fe90de.jpeg", + "link_type": "image" + }, + { + "link_pos": 2, + "link_url": "https://www.engadget.com/sony-inzone-gaming-monitors-headsets-specs-pricing-availability-210056794.html", + "link_type": "link" + }, + { + "link_pos": 3, + "link_url": "https://tech.onliner.by/2022/06/29/sony-predstavila-igrovye-monitory-dlya-ps5-i-gejmerskie-naushniki-izvestny-ceny", + "link_type": "link" + } + ], + "item_image_url": "https://content.onliner.by/news/default/625fdac3c028b390f2d80f9c26fe90de.jpeg" + } + ] +} +``` +`$ ./rss_reader.py --limit 1 https://www.dailymail.co.uk/articles.rss`\ +`$ ./rss_reader.py --json --date 20000101 https://www.dailymail.co.uk/articles.rss`\ +Return example +```json +{ + "feed_title": " Articles | Mail Online", + "feed_items": [ + { + "item_title": "Nick Kyrgios admits he spat in the direction of an abusive spectator at Wimbledon", + "item_pub_date": "2022-06-29 07:48:27+01:00", + "item_url": "https://www.dailymail.co.uk/sport/sportsnews/article-10962269/Nick-Kyrgios-admits-spat-direction-abusive-spectator-Wimbledon.html?ns_mchannel=rss&ito=1490&ns_campaign=1490", + "item_desc_text": "The Australian called one line judge 'a snitch' for reporting his abuse and suggested another was in his 90s and 'can't see the ball' during his five-set win over Britain's Paul Jubb.", + "item_desc_links": [], + "item_image_url": "https://i.dailymail.co.uk/1s/2022/06/28/19/59635777-0-image-a-46_1656440320325.jpg" + } + ] +} +``` +___ +## Caching +The application provides cashing RSS feed using `cache.pk1` file in the application directory. The option `--date` with date in `%Y%m%d` format can be used to read cashed news from the specified date also without Internet connection. 
+___ +## Tests +To run unittests use the following command:\ +`$ python -m unittest discover` +___ diff --git a/NatalliaPaliashchuk/requirements.txt b/NatalliaPaliashchuk/requirements.txt new file mode 100644 index 00000000..f60569d7 --- /dev/null +++ b/NatalliaPaliashchuk/requirements.txt @@ -0,0 +1,6 @@ +jinja2~=3.1.2 +requests~=2.28.0 +html2epub~=1.2 +beautifulsoup4~=4.11.1 +lxml~=4.9.0 +python-dateutil~=2.8.2 \ No newline at end of file diff --git a/NatalliaPaliashchuk/rss_reader.py b/NatalliaPaliashchuk/rss_reader.py new file mode 100644 index 00000000..64fae02c --- /dev/null +++ b/NatalliaPaliashchuk/rss_reader.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +import sys +from rss_reader.run import run +from rss_reader.exceptions import PythonVersionError + +if __name__ == '__main__': + sys.tracebacklimit = -1 + if sys.version_info < (3, 8): + raise PythonVersionError('It needs at least 3.9 version of Python to run the application') + run() diff --git a/NatalliaPaliashchuk/rss_reader/__init__.py b/NatalliaPaliashchuk/rss_reader/__init__.py new file mode 100644 index 00000000..2387e46d --- /dev/null +++ b/NatalliaPaliashchuk/rss_reader/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +__all__ = [ + '__version__', + '__author__', + '__email__', +] + +__version__ = '1.4' +__author__ = 'Natallia Palyashchuk' +__email__ = 'paurra@live.com' \ No newline at end of file diff --git a/NatalliaPaliashchuk/rss_reader/caching.py b/NatalliaPaliashchuk/rss_reader/caching.py new file mode 100644 index 00000000..3d2dc73c --- /dev/null +++ b/NatalliaPaliashchuk/rss_reader/caching.py @@ -0,0 +1,38 @@ +import pickle +import logging +from .exceptions import CachingError + +logger = logging.getLogger(__name__) + + +def cache_feed(feed): + '''Serialization feed into a pickle file''' + logger.debug('Feed serialization started') + try: + with open('cache.pk1', 'wb') as f: + pickle.dump(feed, f) + except Exception as e: + logger.debug(f'Cashing can\'t be done due to {e}') + raise 
CachingError(e) from None + logger.debug('Done') + + +def get_feed_by_date(date, url, limit): + '''Get feed by date from cache''' + logger.debug(f'Getting feed by {date} started') + try: + with open('cache.pk1', 'rb') as f: + feed = pickle.load(f) + except Exception as e: + logger.debug(f'Getting from cache can\'t be done due to {e}') + raise CachingError(e) from None + if url not in feed: + logger.debug(f'{url} not found in cache') + return None + logger.debug('Done') + if limit == 0: + return {url: {'feed_title': feed[url]['feed_title'], + 'feed_items': [item for item in feed[url]['feed_items'] if item['item_pub_date'] >= date]}} + else: + return {url: {'feed_title': feed[url]['feed_title'], + 'feed_items': [item for item in feed[url]['feed_items'] if item['item_pub_date'] >= date][:limit]}} diff --git a/NatalliaPaliashchuk/rss_reader/converting.py b/NatalliaPaliashchuk/rss_reader/converting.py new file mode 100644 index 00000000..3a02a93f --- /dev/null +++ b/NatalliaPaliashchuk/rss_reader/converting.py @@ -0,0 +1,40 @@ +import os +import json +import logging +import html2epub +from jinja2 import Template +from .exceptions import ConvertError + +logger = logging.getLogger(__name__) + + +def feed_to_html(feed): + '''Convert feed to html''' + here = os.path.abspath(os.path.dirname(__file__)) + logger.debug('Creating html') + try: + with open(os.path.join(here, 'templates', 'to_.html')) as f: + html = Template(f.read()).render(feed) + except Exception as e: + logger.debug(f'Creating html can\'t be done due to {e}') + raise ConvertError(e) from None + logger.debug('Done') + return html + + +def feed_to_json(feed, indent=None): + '''Convert feed to json''' + logger.debug('Creating json') + json_ = json.dumps(feed, ensure_ascii=False, indent=indent, default=str) + logger.debug('Done') + return json_ + + +def feed_to_epub(feed, path): + '''Convert feed to epub''' + logger.debug('Creating epub') + epub = html2epub.Epub(feed['feed_title']) + chapter = 
html2epub.create_chapter_from_string(feed_to_html(feed)) + epub.add_chapter(chapter) + epub.create_epub(path) + logger.debug('Done') diff --git a/NatalliaPaliashchuk/rss_reader/exceptions.py b/NatalliaPaliashchuk/rss_reader/exceptions.py new file mode 100644 index 00000000..7a5bffde --- /dev/null +++ b/NatalliaPaliashchuk/rss_reader/exceptions.py @@ -0,0 +1,26 @@ +class ParserError(Exception): + pass + + +class LimitError(Exception): + pass + + +class NotFoundError(Exception): + pass + + +class PythonVersionError(Exception): + pass + + +class RequirementsError(Exception): + pass + + +class ConvertError(Exception): + pass + + +class CachingError(Exception): + pass diff --git a/NatalliaPaliashchuk/rss_reader/parsing.py b/NatalliaPaliashchuk/rss_reader/parsing.py new file mode 100644 index 00000000..f371ad44 --- /dev/null +++ b/NatalliaPaliashchuk/rss_reader/parsing.py @@ -0,0 +1,73 @@ +import logging +import requests +from bs4 import BeautifulSoup +from dateutil.parser import parse +from .exceptions import ParserError + +logger = logging.getLogger(__name__) + + +def get_item_desc(item): + '''Parse description of RSS item''' + logger.debug('Parsing description of RSS feed') + url_tags = {'a': {'attr': 'href', 'type': 'link', 'alt_text': ''}, + 'img': {'attr': 'src', 'type': 'image', 'alt_text': 'alt'}} + text, links, link_pos = '', [], 1 + if desc := item.description: + bs = BeautifulSoup(desc.get_text(strip=True), features='html.parser') + for tag in bs.find_all(): + if tag.attrs and not tag.find_all(): + url = tag.attrs.get(url_tags.get(tag.name, {}).get('attr', '')) + if url: + links.append({'link_pos': link_pos, 'link_url': url, 'link_type': url_tags.get(tag.name)['type']}) + alt = tag.text or tag.attrs.get(url_tags.get(tag.name)['alt_text'], '') + tag.replace_with(f'[{link_pos} {alt}'.strip() + ']') + link_pos += 1 + tag.smooth() + text = bs.text + logger.debug('Done') + return {'desc_text': text, 'desc_links': links} + + +def get_item_image_url(item, url): + 
'''Parse image url of RSS item''' + logger.debug('Parsing image url of RSS feed') + image_url = '' + if media_content := item.find('media:content', attrs={'url': True}): + if media_content.has_attr('medium'): + if media_content.get('medium') == 'image': + image_url = media_content['url'] + else: + if requests.head(media_content['url']).headers.get('Content-Type')[:5] == 'image': + image_url = media_content['url'] + else: + bs_html = BeautifulSoup(requests.get(url).content, features='html.parser') + if preview_image := bs_html.find('meta', property='og:image', attrs={'content': True}): + image_url = preview_image.attrs.get('content') + logger.debug('Done') + return image_url.strip() + + +def parse_rss(url, limit=0): + '''Parse a RSS feed and return a RSS dict''' + logger.debug(f'Parsing RSS from {url} started') + try: + bs_xml = BeautifulSoup(requests.get(url).content, features='xml') + except Exception as e: + raise ParserError(f'The {url} can\'t be parsed due to {e}') from None + if not bs_xml.rss: + raise ParserError(f'The {url} can\'t be parsed as RSS') + try: + feed = {url: {'feed_title': bs_xml.title.text, 'feed_items': []}} + for item in bs_xml.findAll('item')[:None if limit == 0 else limit]: + item_dict = {'item_title': getattr(item.title, 'text', '').strip(), + 'item_pub_date': parse(getattr(item.pubDate, 'text', '0001-01-01')), + 'item_url': getattr(item.link, 'text', '').strip(), + 'item_desc_text': get_item_desc(item)['desc_text'].strip(), + 'item_desc_links': get_item_desc(item)['desc_links']} + item_dict['item_image_url'] = get_item_image_url(item, item_dict['item_url']) + feed[url]['feed_items'].append(item_dict) + except Exception as e: + raise ParserError(f'The {url} can\'t be parsed as RSS due to {e}') from None + logger.debug('Done') + return feed diff --git a/NatalliaPaliashchuk/rss_reader/run.py b/NatalliaPaliashchuk/rss_reader/run.py new file mode 100644 index 00000000..cbf61ab7 --- /dev/null +++ b/NatalliaPaliashchuk/rss_reader/run.py @@ -0,0 
+1,69 @@ +import argparse +import logging +import sys +from . import * +from datetime import datetime, timezone +from .caching import cache_feed, get_feed_by_date +from .parsing import parse_rss +from .exceptions import NotFoundError, LimitError +from .converting import feed_to_json, feed_to_epub, feed_to_html + +logger = logging.getLogger(__name__) + + +def run(): + '''The main function of the application''' + sys.tracebacklimit = -1 + parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader', prog='rss_reader') + parser.add_argument('source', help='RSS URL') + parser.add_argument('--version', action='version', help='Print version info', version=f'Version {__version__}') + parser.add_argument('--json', action='store_true', help='Print result as JSON in stdout') + parser.add_argument('--to-html', help='Convert RSS feed into html and save as a file to the path', metavar='FILE') + parser.add_argument('--verbose', action='store_true', help='Outputs verbose status messages') + parser.add_argument( + '--to-epub', help='Convert RSS feed into epub and save as a file in the directory', metavar='DIRECTORY') + parser.add_argument('--limit', type=int, default=0, help='Limit news topics if this parameter provided') + parser.add_argument('--date', help='Extract news from archive. Take a start publish date in format YYYYMMDD') + + args = parser.parse_args() + + if args.verbose: + logging.basicConfig( + level=logging.DEBUG, format='%(asctime)s — %(name)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s', stream=sys.stdout) + + if args.limit == 0: + logger.debug('Limit of items was not specified. 
All items will be displayed.') + elif args.limit > 0: + logger.debug(f'{args.limit} items will be displayed.') + else: + raise LimitError('Limit of items must be greater than 1') + + if args.date: + feed = get_feed_by_date(datetime.strptime(args.date, '%Y%m%d').replace( + tzinfo=timezone.utc), args.source, args.limit) + else: + feed = parse_rss(args.source, args.limit) + cache_feed(feed) + + if not feed: + raise NotFoundError('News not found') + + if args.json: + print(feed_to_json(feed[args.source], indent=2)) + else: + print(f'Feed: {feed[args.source]["feed_title"]}\n') + for item in feed[args.source]['feed_items']: + print(f'Title: {item["item_title"]}\nDate: {item["item_pub_date"]}\nLink: {item["item_url"]}\ + \n\n{item["item_desc_text"]}\n') + if item['item_desc_links']: + print('Links:') + for link in item['item_desc_links']: + print(f'[{link["link_pos"]}]: {link["link_url"]} {link["link_type"]}') + print('\n') + + if args.to_html: + with open(args.to_html, 'w', encoding='UTF-8') as f: + f.write(feed_to_html(feed[args.source])) + + if args.to_epub: + feed_to_epub(feed[args.source], args.to_epub) diff --git a/NatalliaPaliashchuk/rss_reader/templates/to_.html b/NatalliaPaliashchuk/rss_reader/templates/to_.html new file mode 100644 index 00000000..4f94e400 --- /dev/null +++ b/NatalliaPaliashchuk/rss_reader/templates/to_.html @@ -0,0 +1,24 @@ + + + + + {{ feed_title }} + + +

{{ feed_title }}

+ {%- for item in feed_items %} +

+ Title: {{ item.item_title }}
+ Date: {{ item.item_pub_date }}
+ Link: {{ item.item_url }}

+ {%- if item.item_image_url != '' %} + + {%- endif %}
+ {{ item.item_desc_text }}
+ {%- for link in item.item_desc_links %} + [{{ link.link_pos }}]: {{ link.link_type }}
+ {%- endfor %} +

+ {%- endfor %} + + \ No newline at end of file diff --git a/NatalliaPaliashchuk/setup.py b/NatalliaPaliashchuk/setup.py new file mode 100644 index 00000000..04e7f220 --- /dev/null +++ b/NatalliaPaliashchuk/setup.py @@ -0,0 +1,18 @@ +from setuptools import setup +from rss_reader import * + + +with open('README.md', encoding="utf8") as readme, open('requirements.txt', encoding="utf8") as requirements: + read_me_description, required_dependencies = readme.read(), requirements.readlines() + +setup( + name='rss_reader', + description='CLI RSS-reader', + long_description=read_me_description, + long_description_content_type='text/markdown', + install_requires=required_dependencies, + author=__author__, + packages=['rss_reader'], + package_data={'rss_reader': ['templates/*']}, + python_requires='>=3.9', + entry_points={'console_scripts': ['rss_reader = rss_reader.run:run']}) diff --git a/NatalliaPaliashchuk/test_rss_reader/__init__.py b/NatalliaPaliashchuk/test_rss_reader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/NatalliaPaliashchuk/test_rss_reader/test_caching.py b/NatalliaPaliashchuk/test_rss_reader/test_caching.py new file mode 100644 index 00000000..76dc3037 --- /dev/null +++ b/NatalliaPaliashchuk/test_rss_reader/test_caching.py @@ -0,0 +1,20 @@ +import unittest +import os +from rss_reader.caching import * +from .test_feed import test_feed +from rss_reader.exceptions import CachingError + + +class TestCaching(unittest.TestCase): + def test_caching(self, *args): + cache_feed({'url': test_feed}) + cached_feed = get_feed_by_date(test_feed['feed_items'][0]['item_pub_date'], 'url', 1) + self.assertEqual(cached_feed['url'], test_feed) + none_cached_feed = get_feed_by_date(test_feed['feed_items'][0]['item_pub_date'], 'not_url', 1) + self.assertIsNone(none_cached_feed) + os.remove('cache.pk1') + self.assertRaises(CachingError, get_feed_by_date, test_feed['feed_items'][0]['item_pub_date'], 'url', 1) + + +if __name__ == '__main__': + unittest.main() 
diff --git a/NatalliaPaliashchuk/test_rss_reader/test_converting.py b/NatalliaPaliashchuk/test_rss_reader/test_converting.py new file mode 100644 index 00000000..5de7ea7f --- /dev/null +++ b/NatalliaPaliashchuk/test_rss_reader/test_converting.py @@ -0,0 +1,43 @@ +import unittest +from rss_reader.converting import feed_to_html, feed_to_json +from .test_feed import test_feed + + +class TestConverting(unittest.TestCase): + def test_feed_to_html(self, *args): + self.maxDiff = None + html = feed_to_html(test_feed) + self.assertEqual(html, ''' + + + + Feed title + + +

Feed title

+

+ Title: Item title 1
+ Date: 2022-06-22 12:12:12+00:00
+ Link: http://example.com/item_1.html

+
+ [1 Image comment]Item description 1[2 Link comment]
+ [1]: image
+ [2]: link
+

+ +''') + + def test_feed_to_json(self, *args): + json_ = feed_to_json(test_feed) + self.assertEqual(json_, '''{"feed_title": \ +"Feed title", "feed_items": [{"item_title": "Item title 1", \ +"item_pub_date": "2022-06-22 12:12:12+00:00", "item_url": \ +"http://example.com/item_1.html", "item_desc_text": \ +"[1 Image comment]Item description 1[2 Link comment]", "item_desc_links": \ +[{"link_pos": 1, "link_url": "http://example.com/files/item_1.jpg", "link_type": \ +"image"}, {"link_pos": 2, "link_url": "http://example.com/item_1.html", "link_type": \ +"link"}], "item_image_url": "http://example.com/item_1.jpg"}]}''') + + +if __name__ == '__main__': + unittest.main() diff --git a/NatalliaPaliashchuk/test_rss_reader/test_feed.py b/NatalliaPaliashchuk/test_rss_reader/test_feed.py new file mode 100644 index 00000000..7712a894 --- /dev/null +++ b/NatalliaPaliashchuk/test_rss_reader/test_feed.py @@ -0,0 +1,16 @@ +import datetime +from dateutil.tz import tzutc + +test_feed = {'feed_title': 'Feed title', 'feed_items': [ + {'item_title': 'Item title 1', + 'item_pub_date': datetime.datetime(2022, 6, 22, 12, 12, 12, tzinfo=tzutc()), + 'item_url': 'http://example.com/item_1.html', + 'item_desc_text': '[1 Image comment]Item description 1[2 Link comment]', + 'item_desc_links': [ + {'link_pos': 1, + 'link_url': 'http://example.com/files/item_1.jpg', + 'link_type': 'image'}, + {'link_pos': 2, + 'link_url': 'http://example.com/item_1.html', + 'link_type': 'link'}], + 'item_image_url': 'http://example.com/item_1.jpg'}]} diff --git a/NatalliaPaliashchuk/test_rss_reader/test_parsing.py b/NatalliaPaliashchuk/test_rss_reader/test_parsing.py new file mode 100644 index 00000000..c128c78c --- /dev/null +++ b/NatalliaPaliashchuk/test_rss_reader/test_parsing.py @@ -0,0 +1,69 @@ +import datetime +import unittest +from unittest.mock import patch +from dateutil.tz import tzutc +from rss_reader.parsing import parse_rss +from rss_reader.exceptions import ParserError + +test_content = { + 
'http://example.com/rss/': ''' + + + + Feed title + http://example.com/ + + Item title 1 + http://example.com/item_1.html + 2022-06-22T12:12:12Z + + + Item description 1Link comment]]> + + + + Item title 2 + http://example.com/item_2.html + Fri, 24 Jun 2022 12:12:12 +0000 + + +'''} + + +class MockResponse: + def __init__(self, url): + self.content = test_content.get(url, '') + + +def mock_requests_get(url): + return MockResponse(url) + + +class TestParsing(unittest.TestCase): + @patch('rss_reader.parsing.requests.get', side_effect=mock_requests_get) + def test_parse_rss(self, *args): + feed = parse_rss('http://example.com/rss/') + self.assertDictEqual(feed, {'http://example.com/rss/': {'feed_title': 'Feed title', 'feed_items': [ + {'item_title': 'Item title 1', + 'item_pub_date': datetime.datetime(2022, 6, 22, 12, 12, 12, tzinfo=tzutc()), + 'item_url': 'http://example.com/item_1.html', + 'item_desc_text': '[1 Image comment]Item description 1[2 Link comment]', + 'item_desc_links': [ + {'link_pos': 1, + 'link_url': 'http://example.com/files/item_1.jpg', + 'link_type': 'image'}, + {'link_pos': 2, + 'link_url': 'http://example.com/item_1.html', + 'link_type': 'link'}], + 'item_image_url': 'http://example.com/item_1.jpg'}, + {'item_title': 'Item title 2', + 'item_pub_date': datetime.datetime(2022, 6, 24, 12, 12, 12, tzinfo=tzutc()), + 'item_url': 'http://example.com/item_2.html', + 'item_desc_text': '', + 'item_desc_links': [], + 'item_image_url': ''}]}}) + self.assertRaises(ParserError, parse_rss, 'not a valid url') + + +if __name__ == '__main__': + unittest.main() From ef5757009a3bc1e86671355e393bc24d55f051cd Mon Sep 17 00:00:00 2001 From: Natallia Paliashchuk Date: Wed, 29 Jun 2022 20:07:53 +0400 Subject: [PATCH 3/8] Installation process was updated --- NatalliaPaliashchuk/README.md | 16 ++++++++++------ NatalliaPaliashchuk/setup.py | 7 ++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/NatalliaPaliashchuk/README.md b/NatalliaPaliashchuk/README.md 
index 81b86bf3..1347e6da 100644 --- a/NatalliaPaliashchuk/README.md +++ b/NatalliaPaliashchuk/README.md @@ -1,13 +1,17 @@ # RSS reader -Pure Python command-line RSS reader. +Pure Python command-line RSS reader ___ ## Installation -### Install all dependencies -`$ pip install -r requirements.txt` +The following dependencies are required for installation: +- Python 3.9 or above +- PIP + ### Install CLI utility -`$ python setup.py install` -### The application can be used also without installation of CLI utility -To run the application use `rss_reader.py` file.\ +`$ python setup.py install`\ +### Using the application without installation of CLI utility +- install all dependencies\ +`$ pip install -r requirements.txt` +- run the application using `rss_reader.py` file\ Example:\ `$ python rss_reader.py --version` ___ diff --git a/NatalliaPaliashchuk/setup.py b/NatalliaPaliashchuk/setup.py index 04e7f220..39fa2064 100644 --- a/NatalliaPaliashchuk/setup.py +++ b/NatalliaPaliashchuk/setup.py @@ -1,16 +1,17 @@ +import os from setuptools import setup from rss_reader import * +os.system('pip install -r requirements.txt') -with open('README.md', encoding="utf8") as readme, open('requirements.txt', encoding="utf8") as requirements: - read_me_description, required_dependencies = readme.read(), requirements.readlines() +with open('README.md', encoding="utf8") as readme: + read_me_description = readme.read() setup( name='rss_reader', description='CLI RSS-reader', long_description=read_me_description, long_description_content_type='text/markdown', - install_requires=required_dependencies, author=__author__, packages=['rss_reader'], package_data={'rss_reader': ['templates/*']}, From 5cf75853041b90c48a80b17158f77d1c8eb99ab5 Mon Sep 17 00:00:00 2001 From: Natallia Paliashchuk Date: Wed, 29 Jun 2022 21:16:01 +0400 Subject: [PATCH 4/8] .gitignore was moved to personal directory --- .gitignore => NatalliaPaliashchuk/.gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) 
rename .gitignore => NatalliaPaliashchuk/.gitignore (92%) diff --git a/.gitignore b/NatalliaPaliashchuk/.gitignore similarity index 92% rename from .gitignore rename to NatalliaPaliashchuk/.gitignore index ecb4530f..4c149016 100644 --- a/.gitignore +++ b/NatalliaPaliashchuk/.gitignore @@ -10,11 +10,10 @@ pip-delete-this-directory.txt cache.pk1 # Unit test / coverage reports -.tox/ .coverage .cache -nosetests.xml coverage.xml +htmlcov/ # Distribution / packaging dist/ From 8d05e45d5e9a386c7083dc73b7f73b4a0528371a Mon Sep 17 00:00:00 2001 From: Natallia Paliashchuk Date: Wed, 29 Jun 2022 23:04:22 +0400 Subject: [PATCH 5/8] Python version, gitignore were updated --- NatalliaPaliashchuk/.gitignore | 7 +++++++ NatalliaPaliashchuk/README.md | 2 +- NatalliaPaliashchuk/rss_reader.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/NatalliaPaliashchuk/.gitignore b/NatalliaPaliashchuk/.gitignore index 4c149016..d8fb74fa 100644 --- a/NatalliaPaliashchuk/.gitignore +++ b/NatalliaPaliashchuk/.gitignore @@ -20,3 +20,10 @@ dist/ build/ *.egg-info/ +# Venv +share/ +pyvenv.cfg +bin/ +include/ +lib/ + diff --git a/NatalliaPaliashchuk/README.md b/NatalliaPaliashchuk/README.md index 1347e6da..0b4d19a7 100644 --- a/NatalliaPaliashchuk/README.md +++ b/NatalliaPaliashchuk/README.md @@ -7,7 +7,7 @@ The following dependencies are required for installation: - PIP ### Install CLI utility -`$ python setup.py install`\ +`# python setup.py install`\ ### Using the application without installation of CLI utility - install all dependencies\ `$ pip install -r requirements.txt` diff --git a/NatalliaPaliashchuk/rss_reader.py b/NatalliaPaliashchuk/rss_reader.py index 64fae02c..545938ce 100644 --- a/NatalliaPaliashchuk/rss_reader.py +++ b/NatalliaPaliashchuk/rss_reader.py @@ -5,6 +5,6 @@ if __name__ == '__main__': sys.tracebacklimit = -1 - if sys.version_info < (3, 8): + if sys.version_info < (3, 9): raise PythonVersionError('It needs at least 3.9 version of Python to run the 
application') run() From f33f872327c747d3e5e4d5052bab2f53dea227ab Mon Sep 17 00:00:00 2001 From: Natallia Paliashchuk Date: Thu, 30 Jun 2022 11:53:07 +0400 Subject: [PATCH 6/8] Error handling, docstrings were added --- NatalliaPaliashchuk/rss_reader/caching.py | 25 ++++++++- NatalliaPaliashchuk/rss_reader/converting.py | 55 ++++++++++++++++--- NatalliaPaliashchuk/rss_reader/run.py | 12 ++-- .../test_rss_reader/test_converting.py | 2 + 4 files changed, 78 insertions(+), 16 deletions(-) diff --git a/NatalliaPaliashchuk/rss_reader/caching.py b/NatalliaPaliashchuk/rss_reader/caching.py index 3d2dc73c..d8432d1f 100644 --- a/NatalliaPaliashchuk/rss_reader/caching.py +++ b/NatalliaPaliashchuk/rss_reader/caching.py @@ -6,7 +6,14 @@ def cache_feed(feed): - '''Serialization feed into a pickle file''' + '''Serialization feed into a pickle file + + Args: + feed (dict): RSS feed + + Raises: + CachingError: if an error is detected during caching + ''' logger.debug('Feed serialization started') try: with open('cache.pk1', 'wb') as f: @@ -18,8 +25,20 @@ def cache_feed(feed): def get_feed_by_date(date, url, limit): - '''Get feed by date from cache''' - logger.debug(f'Getting feed by {date} started') + '''Get feed by date from cache + + Args: + date (datetime): starting date + url (str): URL of RSS feed + limit (int): quantity of RSS items + + Raises: + CachingError: _description_ + + Returns: + dict: RSS feed + ''' + logger.debug(f'Getting {url} feed from {date} with {limit} limit') try: with open('cache.pk1', 'rb') as f: feed = pickle.load(f) diff --git a/NatalliaPaliashchuk/rss_reader/converting.py b/NatalliaPaliashchuk/rss_reader/converting.py index 3a02a93f..dcca5d2f 100644 --- a/NatalliaPaliashchuk/rss_reader/converting.py +++ b/NatalliaPaliashchuk/rss_reader/converting.py @@ -9,7 +9,17 @@ def feed_to_html(feed): - '''Convert feed to html''' + '''Convert RSS feed to html + + Args: + feed (dict): RSS feed + + Raises: + ConvertError: if an error is detected during converting + + 
Returns: + str: string that contains html document + ''' here = os.path.abspath(os.path.dirname(__file__)) logger.debug('Creating html') try: @@ -23,18 +33,45 @@ def feed_to_html(feed): def feed_to_json(feed, indent=None): - '''Convert feed to json''' + '''Convert RSS feed to json + + Args: + feed (dict): RSS feed + indent (int, optional): indent level. Defaults to None + + Raises: + ConvertError: if an error is detected during converting + + Returns: + str: string that contains JSON document + ''' logger.debug('Creating json') - json_ = json.dumps(feed, ensure_ascii=False, indent=indent, default=str) + try: + json_ = json.dumps(feed, ensure_ascii=False, indent=indent, default=str) + except Exception as e: + logger.debug(f'Creating JSON can\'t be done due to {e}') + raise ConvertError(e) from None logger.debug('Done') return json_ def feed_to_epub(feed, path): - '''Convert feed to epub''' - logger.debug('Creating epub') - epub = html2epub.Epub(feed['feed_title']) - chapter = html2epub.create_chapter_from_string(feed_to_html(feed)) - epub.add_chapter(chapter) - epub.create_epub(path) + '''Convert feed to epub + + Args: + feed (dict): RSS feed + path (str): path of directory + + Raises: + ConvertError: if an error is detected during converting + ''' + logger.debug(f'Creating epub in {path} directory') + try: + epub = html2epub.Epub(feed['feed_title']) + chapter = html2epub.create_chapter_from_string(feed_to_html(feed)) + epub.add_chapter(chapter) + epub.create_epub(path) + except Exception as e: + logger.debug(f'Creating epub can\'t be done due to {e}') + raise ConvertError(e) from None logger.debug('Done') diff --git a/NatalliaPaliashchuk/rss_reader/run.py b/NatalliaPaliashchuk/rss_reader/run.py index cbf61ab7..114d236b 100644 --- a/NatalliaPaliashchuk/rss_reader/run.py +++ b/NatalliaPaliashchuk/rss_reader/run.py @@ -12,7 +12,12 @@ def run(): - '''The main function of the application''' + '''The main function of the application + + Raises: + LimitError: if the limit 
option was defined under zero + NotFoundError: if the URL feed is not in the cache + ''' sys.tracebacklimit = -1 parser = argparse.ArgumentParser(description='Pure Python command-line RSS reader', prog='rss_reader') parser.add_argument('source', help='RSS URL') @@ -41,13 +46,12 @@ def run(): if args.date: feed = get_feed_by_date(datetime.strptime(args.date, '%Y%m%d').replace( tzinfo=timezone.utc), args.source, args.limit) + if not feed: + raise NotFoundError('News not found') else: feed = parse_rss(args.source, args.limit) cache_feed(feed) - if not feed: - raise NotFoundError('News not found') - if args.json: print(feed_to_json(feed[args.source], indent=2)) else: diff --git a/NatalliaPaliashchuk/test_rss_reader/test_converting.py b/NatalliaPaliashchuk/test_rss_reader/test_converting.py index 5de7ea7f..377ec3d8 100644 --- a/NatalliaPaliashchuk/test_rss_reader/test_converting.py +++ b/NatalliaPaliashchuk/test_rss_reader/test_converting.py @@ -1,6 +1,7 @@ import unittest from rss_reader.converting import feed_to_html, feed_to_json from .test_feed import test_feed +from rss_reader.exceptions import ConvertError class TestConverting(unittest.TestCase): @@ -26,6 +27,7 @@ def test_feed_to_html(self, *args):

''') + self.assertRaises(ConvertError, feed_to_html, 1) def test_feed_to_json(self, *args): json_ = feed_to_json(test_feed) From 4999bc14a6c6991b1e1e9e8b88ffe555bcfcb9b8 Mon Sep 17 00:00:00 2001 From: Natallia Paliashchuk Date: Thu, 30 Jun 2022 21:26:10 +0400 Subject: [PATCH 7/8] Doctstrings were updated --- NatalliaPaliashchuk/rss_reader/caching.py | 4 +-- NatalliaPaliashchuk/rss_reader/parsing.py | 32 ++++++++++++++++++++--- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/NatalliaPaliashchuk/rss_reader/caching.py b/NatalliaPaliashchuk/rss_reader/caching.py index d8432d1f..b4eff3f9 100644 --- a/NatalliaPaliashchuk/rss_reader/caching.py +++ b/NatalliaPaliashchuk/rss_reader/caching.py @@ -30,10 +30,10 @@ def get_feed_by_date(date, url, limit): Args: date (datetime): starting date url (str): URL of RSS feed - limit (int): quantity of RSS items + limit (int): max quantity of RSS items Raises: - CachingError: _description_ + CachingError: if an error is detected during getting feed from cache Returns: dict: RSS feed diff --git a/NatalliaPaliashchuk/rss_reader/parsing.py b/NatalliaPaliashchuk/rss_reader/parsing.py index f371ad44..35108807 100644 --- a/NatalliaPaliashchuk/rss_reader/parsing.py +++ b/NatalliaPaliashchuk/rss_reader/parsing.py @@ -8,7 +8,14 @@ def get_item_desc(item): - '''Parse description of RSS item''' + '''Parse description of RSS item + + Args: + item (str): RSS item + + Returns: + dict: description text and links + ''' logger.debug('Parsing description of RSS feed') url_tags = {'a': {'attr': 'href', 'type': 'link', 'alt_text': ''}, 'img': {'attr': 'src', 'type': 'image', 'alt_text': 'alt'}} @@ -30,7 +37,15 @@ def get_item_desc(item): def get_item_image_url(item, url): - '''Parse image url of RSS item''' + '''Parse image url of RSS item + + Args: + item (str): RSS item + url (str): URL of RSS feed + + Returns: + str: URL of image + ''' logger.debug('Parsing image url of RSS feed') image_url = '' if media_content := 
item.find('media:content', attrs={'url': True}): @@ -49,7 +64,18 @@ def get_item_image_url(item, url): def parse_rss(url, limit=0): - '''Parse a RSS feed and return a RSS dict''' + '''Parse a RSS feed and return a RSS dict + + Args: + url (str): URL of RSS feed + limit (int, optional): max quantity of RSS items. Defaults to 0. + + Raises: + ParserError: if an error is detected during parcing + + Returns: + dict: RSS feed + ''' logger.debug(f'Parsing RSS from {url} started') try: bs_xml = BeautifulSoup(requests.get(url).content, features='xml') From 94acb5d19356777d892d13710e97b68b9e9a47d0 Mon Sep 17 00:00:00 2001 From: Natallia Paliashchuk Date: Thu, 30 Jun 2022 21:57:59 +0400 Subject: [PATCH 8/8] Parsing optimization --- NatalliaPaliashchuk/README.md | 19 +++++++------------ NatalliaPaliashchuk/rss_reader/parsing.py | 4 ++-- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/NatalliaPaliashchuk/README.md b/NatalliaPaliashchuk/README.md index 0b4d19a7..320be25f 100644 --- a/NatalliaPaliashchuk/README.md +++ b/NatalliaPaliashchuk/README.md @@ -49,28 +49,23 @@ Return example "feed_title": "Технологии Onlíner", "feed_items": [ { - "item_title": "Sony представила игровые мониторы для PS5 и геймерские наушники. Известны цены", - "item_pub_date": "2022-06-29 09:44:11+03:00", - "item_url": "https://tech.onliner.by/2022/06/29/sony-predstavila-igrovye-monitory-dlya-ps5-i-gejmerskie-naushniki-izvestny-ceny", - "item_desc_text": "[1]Sony официально представила игровой бренд InZone — под ним будут выходить гаджеты для геймеров. Начали с мониторов и наушников, [2 сообщает] engadget.[3 Читать далее…]", + "item_title": "Чип М2 в новом MacBook может нагреваться до 108 градусов", + "item_pub_date": "2022-06-30 17:20:10+03:00", + "item_url": "https://tech.onliner.by/2022/06/30/chip-m2-v-novom-macbook-mozhet-nagrevatsya-do-108-gradusov", + "item_desc_text": "[1]Новый чип М2 оказался не столь революционным, как его предшественник. 
Более того, новинка склонна к перегреву и, как следствие, троттлингу, то есть жесткому снижению частот. К таким выводам пришел блогер из Max Tech. Он заполучил новый MacBook Pro на М2 и запустил на нем экспорт видео в формате RAW и с разрешением 8K.[2 Читать далее…]", "item_desc_links": [ { "link_pos": 1, - "link_url": "https://content.onliner.by/news/thumbnail/625fdac3c028b390f2d80f9c26fe90de.jpeg", + "link_url": "https://content.onliner.by/news/thumbnail/7b957a17487635c74ef5a743f07ebe75.jpeg", "link_type": "image" }, { "link_pos": 2, - "link_url": "https://www.engadget.com/sony-inzone-gaming-monitors-headsets-specs-pricing-availability-210056794.html", - "link_type": "link" - }, - { - "link_pos": 3, - "link_url": "https://tech.onliner.by/2022/06/29/sony-predstavila-igrovye-monitory-dlya-ps5-i-gejmerskie-naushniki-izvestny-ceny", + "link_url": "https://tech.onliner.by/2022/06/30/chip-m2-v-novom-macbook-mozhet-nagrevatsya-do-108-gradusov", "link_type": "link" } ], - "item_image_url": "https://content.onliner.by/news/default/625fdac3c028b390f2d80f9c26fe90de.jpeg" + "item_image_url": "https://content.onliner.by/news/default/7b957a17487635c74ef5a743f07ebe75.jpeg" } ] } diff --git a/NatalliaPaliashchuk/rss_reader/parsing.py b/NatalliaPaliashchuk/rss_reader/parsing.py index 35108807..3a1d2c88 100644 --- a/NatalliaPaliashchuk/rss_reader/parsing.py +++ b/NatalliaPaliashchuk/rss_reader/parsing.py @@ -89,8 +89,8 @@ def parse_rss(url, limit=0): item_dict = {'item_title': getattr(item.title, 'text', '').strip(), 'item_pub_date': parse(getattr(item.pubDate, 'text', '0001-01-01')), 'item_url': getattr(item.link, 'text', '').strip(), - 'item_desc_text': get_item_desc(item)['desc_text'].strip(), - 'item_desc_links': get_item_desc(item)['desc_links']} + 'item_desc_text': (desc_result := get_item_desc(item))['desc_text'].strip(), + 'item_desc_links': desc_result['desc_links']} item_dict['item_image_url'] = get_item_image_url(item, item_dict['item_url']) 
feed[url]['feed_items'].append(item_dict) except Exception as e: