diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..3de56af9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +.vscode +.idea +.coverage +__pycache__ +build/ +source/_templates +source/_static +docs/build +rss_reader.egg-info/ +dist/ \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..26391479 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include rss_reader/saver/to_html/templates/main.html \ No newline at end of file diff --git a/README.md b/README.md index c86d1e65..ef387248 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,22 @@ -# How to create a PR with a homework task -1. Create fork from the following repo: https://github.com/E-P-T/Homework. (Docs: https://docs.github.com/en/get-started/quickstart/fork-a-repo ) -2. Clone your forked repo in your local folder. -3. Create separate branches for each session.Example(`session_2`, `session_3` and so on) -4. Create folder with you First and Last name in you forked repo in the created session. -5. Add your task into created folder -6. Push finished session task in the appropriate branch in accordance with written above. - You should get the structure that looks something like that +# rss-reader +## rss-reader is a command line utility that allows you to view RSS feeds + +## Usage + +``` +> python -m rss_reader https://news.yahoo.com/rss/ ``` - Branch: Session_2 - DzmitryKolb - |___Task1.py - |___Task2.py - Branch: Session_3 - DzmitryKolb - |___Task1.py - |___Task2.py +or: ``` +> rss-reader https://news.yahoo.com/rss/ +``` + +## Contributing +Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. -7. 
When you finish your work on task you should create Pull request to the appropriate branch of the main repo https://github.com/E-P-T/Homework (Docs: https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork). -Please use the following instructions to prepare good description of the pull request: - - Pull request header should be: `Session - `. - Example: `Session 2 - Dzmitry Kolb` - - Pull request body: You should write here what tasks were implemented. - Example: `Finished: Task 1.2, Task 1.3, Task 1.6` +Please make sure to update tests as appropriate. +## License +[MIT](https://choosealicense.com/licenses/mit/) diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d0c3cbf1 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..dc1312ab --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..bd860633 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,58 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('../..')) + + +# -- Project information ----------------------------------------------------- + +project = 'rss-reader' +copyright = '2022, Andrey Ozerets' +author = 'Andrey Ozerets' + +# The full version, including alpha/beta/rc tags +release = '0.0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/source/display_information.rst b/docs/source/display_information.rst new file mode 100644 index 00000000..45d1eb3b --- /dev/null +++ b/docs/source/display_information.rst @@ -0,0 +1,115 @@ +Display of information. +======================= + +This part of the documentation describes how rss-reader displays information. +----------------------------------------------------------------------------- + +Information display: +-------------------- +Normally. +~~~~~~~~~~~ + * When all data is available. + + .. figure:: /images/output.jpg + + * When there is missing data. + + .. figure:: /images/output-empty.jpg + + * When the date parameter is used and the result contains different news sources. + + .. figure:: /images/date_parametr.jpg + + +With the given \-\-json parameter: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * When all data is available. + + .. 
code-block:: JSON + + [ + { + "title_web_resource": "Yahoo News - Latest News & Headlines", + "link": "https://news.yahoo.com/rss/", + "items": + [ + { + "title": "1955 warrant family seeks", + "link": "https://news.yahoo.com/1955-war79.html", + "pubDate": "2022-06-29T19:41:30Z", + "source": "Associated Press", + "content": { + "url": "https://s.yimg.com/uu/api/res/1.2/z8bf83", + "title": null + } + } + ] + } + ] + + * When there is missing data. + + .. code-block:: JSON + + [ + { + "title_web_resource": "Yahoo News - Latest News Headlines", + "link": "https://news.yahoo.com/rss/", + "items": + [ + { + "title": "Biden takes", + "link": "", + "pubDate": "2022-06-18T14:37:24Z", + "source": "", + "content": { + "url": "", + "title": null + } + } + ] + } + ] + + +With the given \-\-date and \-\-json parameter: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: JSON + + [ + { + "title_web_resource": "Yahoo News - Latest News & Headlines", + "link": "https://news.yahoo.com/rss/", + "items": + [ + { + "title": "Hong Kongs", + "link": "https://news.yahoo.com/hong.html", + "pubDate": "2022-06-01", + "source": "Associated Press", + "content": { + "url": "https://s.yimg.com/uu/api/reen/ap.org/0c", + "title": null + } + } + ] + }, + { + "title_web_resource": "Новости ООН - Здравоохранение", + "link": "https://news.un.org/feed/rss.xml", + "items": + [ + { + "title": "ВОЗ: необходимы антибиотики нового поколения", + "link": "https://news.un.org/08.html", + "pubDate": "2022-06-01", + "source": "Associated Press", + "content": { + "url": "https://s.yimg.com/uu/api-rg/0725fc", + "title": null + } + } + ] + } + ] \ No newline at end of file diff --git a/docs/source/images/date_parametr.jpg b/docs/source/images/date_parametr.jpg new file mode 100644 index 00000000..ec627e55 Binary files /dev/null and b/docs/source/images/date_parametr.jpg differ diff --git a/docs/source/images/local_storage.jpg b/docs/source/images/local_storage.jpg new file mode 100644 index 
00000000..ce492989 Binary files /dev/null and b/docs/source/images/local_storage.jpg differ diff --git a/docs/source/images/output-empty.jpg b/docs/source/images/output-empty.jpg new file mode 100644 index 00000000..c93c8641 Binary files /dev/null and b/docs/source/images/output-empty.jpg differ diff --git a/docs/source/images/output.jpg b/docs/source/images/output.jpg new file mode 100644 index 00000000..9482f25d Binary files /dev/null and b/docs/source/images/output.jpg differ diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000..862b346a --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,27 @@ +.. rss-reader documentation master file, created by + sphinx-quickstart on Sun Jun 26 12:48:42 2022. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to rss-reader's documentation! +====================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + start_program.rst + display_information.rst + local_storage.rst + to_html_mode.rst + to_pdf_mode.rst + tests.rst + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/local_storage.rst b/docs/source/local_storage.rst new file mode 100644 index 00000000..ed8ecd82 --- /dev/null +++ b/docs/source/local_storage.rst @@ -0,0 +1,11 @@ +Local storage. +======================= + +This part of the documentation describes how rss-reader stores local information. +--------------------------------------------------------------------------------- + +All information is stored in 'home directory'/.rss-reader/local_storage.csv + +Representation of the local storage file. + +.. 
figure:: /images/local_storage.jpg \ No newline at end of file diff --git a/docs/source/start_program.rst b/docs/source/start_program.rst new file mode 100644 index 00000000..87e66b7d --- /dev/null +++ b/docs/source/start_program.rst @@ -0,0 +1,17 @@ +Program launch +============== + +This part of the documentation describes how to run rss-reader. +--------------------------------------------------------------- + +To start rss-reader, just run this simple command in the terminal of your choice:: + + $ python -m rss_reader + +The program supports the following keys: + * source - RSS URL. Required argument. + * \-\-verbose - Outputs verbose status messages. + * \-\-json - Print result as JSON in stdout. + * \-\-limit - Limit news topics if this parameter provided. + * \-\-version - Print version info. + * \-\-date - Search for news on a specified date. Date in the format Y-m-d (for example: 20191206). diff --git a/docs/source/tests.rst b/docs/source/tests.rst new file mode 100644 index 00000000..69810ddf --- /dev/null +++ b/docs/source/tests.rst @@ -0,0 +1,14 @@ +Program testing. +================ + +This part of the documentation describes how to test the rss-reader. +-------------------------------------------------------------------- + +To start testing just run the following code:: + + $ python -m pytest + +For reference: + +* Each package contains its own tests. +* Integration tests are collected in the rss_reader/tests package. \ No newline at end of file diff --git a/docs/source/to_html_mode.rst b/docs/source/to_html_mode.rst new file mode 100644 index 00000000..07923109 --- /dev/null +++ b/docs/source/to_html_mode.rst @@ -0,0 +1,14 @@ +Mode for saving work results to an HTML file. +============================================= + +This part of the documentation describes how to save the results of your work in an HTML file. 
+--------------------------------------------------------------------------------------------- + +You can start this mode:: + + $ rss_reader https://rss.nytimes.com/services/xml/rss/nyt/World.xml --to-html D:\ + +The file will be saved as news.html. + +The save handler in the HTML file is called through the chain pattern, and the specific way +the file is generated is determined through the strategy pattern. \ No newline at end of file diff --git a/docs/source/to_pdf_mode.rst b/docs/source/to_pdf_mode.rst new file mode 100644 index 00000000..8aaebe1f --- /dev/null +++ b/docs/source/to_pdf_mode.rst @@ -0,0 +1,14 @@ +Mode for saving work results to a PDF file. +============================================= + +This part of the documentation describes how to save the results of your work in a PDF file. +--------------------------------------------------------------------------------------------- + +You can start this mode:: + + $ rss_reader https://rss.nytimes.com/services/xml/rss/nyt/World.xml --to-pdf D:\ + +The file will be saved as news.pdf. + +The save handler in the PDF file is called by the chain pattern. +Temporary files are stored in the user's directory and are deleted after saving the PDF file. 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..41f53010 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,72 @@ +alabaster==0.7.12 +appdirs==1.4.4 +atomicwrites==1.4.0 +attrs==21.4.0 +autopep8==1.6.0 +Babel==2.10.3 +beautifulsoup4==4.11.1 +Brotli==1.0.9 +cairocffi==1.3.0 +CairoSVG==2.5.2 +certifi==2022.6.15 +cffi==1.15.0 +charset-normalizer==2.0.12 +colorama==0.4.5 +coverage==6.4.1 +cssselect2==0.6.0 +defusedxml==0.7.1 +docutils==0.18.1 +esbonio==0.13.0 +fonttools==4.33.3 +fpdf==1.7.2 +html5lib==1.1 +idna==3.3 +imagesize==1.3.0 +importlib-metadata==4.12.0 +iniconfig==1.1.1 +Jinja2==3.1.2 +lxml==4.9.0 +MarkupSafe==2.1.1 +numpy==1.23.0 +packaging==21.3 +pandas==1.4.3 +pdfkit==1.0.0 +Pillow==9.1.1 +pluggy==1.0.0 +py==1.11.0 +pycodestyle==2.8.0 +pycparser==2.21 +pydantic==1.8.2 +pydyf==0.2.0 +pygls==0.11.3 +Pygments==2.12.0 +pyparsing==3.0.9 +pyphen==0.12.0 +pyspellchecker==0.6.3 +pytest==7.1.2 +pytest-cov==3.0.0 +pytest-mock==3.8.1 +python-dateutil==2.8.2 +pytz==2022.1 +requests==2.28.0 +six==1.16.0 +snowballstemmer==2.2.0 +soupsieve==2.3.2.post1 +Sphinx==5.0.2 +sphinxcontrib-applehelp==1.0.2 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-htmlhelp==2.0.0 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-serializinghtml==1.1.5 +tinycss2==1.1.1 +toml==0.10.2 +tomli==2.0.1 +typeguard==2.13.3 +types-requests==2.27.31 +types-urllib3==1.26.15 +typing-extensions==4.2.0 +urllib3==1.26.9 +webencodings==0.5.1 +zipp==3.8.0 +zopfli==0.2.1 diff --git a/rss_reader/__init__.py b/rss_reader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_reader/__main__.py b/rss_reader/__main__.py new file mode 100644 index 00000000..644849b4 --- /dev/null +++ b/rss_reader/__main__.py @@ -0,0 +1,54 @@ +"""Entry point to the program.""" + + +from rss_reader.starter.base import (create_logger, + init_arguments_functionality as iaf) +from rss_reader.logger.logger import Logger +from 
rss_reader.starter.ecxeptions import NonNumericError +from rss_reader.crawler.exceptions import BadURLError +from rss_reader.loader.exceptions import EmptyURLError, DataFileNotFoundError, DataEmptyError + + +def main(): + """Entry point to the program.""" + + args = iaf() + create_logger(args.get('verbose')) + log = Logger.get_logger(__name__) + + from rss_reader.starter.starter import Starter + log.debug("Create a Starter object.") + s = Starter(args) + + def print_exc(e: Exception): + print(f'Sorry, we have to stop working. Because:') + print(f'\t {e}') + log.error(e) + + try: + log.info("Start the program.") + s.run() + except NonNumericError as e: + print_exc(e) + except BadURLError as e: + print_exc(e) + except EmptyURLError as e: + print_exc(e) + except DataEmptyError as e: + print_exc(e) + except DataFileNotFoundError as e: + print_exc(e) + except FileExistsError as e: + print_exc(e) + except Exception as e: + s = ('Sorry, we have to stop working. Something went wrong.' + 'We are terribly sorry.') + print(s) + log.error(e) + finally: + log.info("Stop the program.") + exit() + + +if __name__ == "__main__": + main() diff --git a/rss_reader/crawler/__init__.py b/rss_reader/crawler/__init__.py new file mode 100644 index 00000000..b6743a1a --- /dev/null +++ b/rss_reader/crawler/__init__.py @@ -0,0 +1 @@ +"""This package contains modules that work with crawlers.""" diff --git a/rss_reader/crawler/crawler.py b/rss_reader/crawler/crawler.py new file mode 100644 index 00000000..15007be1 --- /dev/null +++ b/rss_reader/crawler/crawler.py @@ -0,0 +1,80 @@ +"""This module contains objects that receive data from the internet.""" + + +from requests import Response, get, ConnectionError +from requests.exceptions import MissingSchema + +from rss_reader.interfaces.icrawler.icrawler import ICrawler +from rss_reader.decorator.decorator import send_log_of_start_function +from rss_reader.logger.logger import Logger +from .exceptions import BadURLError, FailStatusCodeError + +log 
= Logger.get_logger(__name__) + + +class SuperCrawler(ICrawler): + """A class to represent a crawler.""" + + def __init__(self, url: str) -> None: + """Initializer. + + :param url: URL of the requested web page. + :type url: str + """ + self._url = url + + @send_log_of_start_function + def get_data(self) -> bytes: + """Get the content of the requested page. + + :raises FailStatusCodeError: If status code is not equal to 200. + :return: Page data. + :rtype: bytes + """ + + r = self._get_response() + status = self._get_status(r) + if status == 200: + return self._get_content(r) + log.error( + f'An unsupported HTTP status code returned. (code = {status})') + raise FailStatusCodeError(status) + + def _get_response(self) -> Response: + """Get the server's response to an HTTP request. + + :raises BadURLError: If the url is wrong. + :return: Contains a server's response to an HTTP request. + :rtype: Response + """ + try: + req = get(self._url) + except ConnectionError as e: + s = f'It is not possible to get data for the given url ({self._url})' + log.error(s) + raise BadURLError(self._url) from e + except MissingSchema as e: + s = f'It is not possible to get data for the given url ({self._url})' + log.error(s) + raise BadURLError(self._url) from e + return req + + def _get_content(self, req: Response) -> bytes: + """Get the content of the Response object. + + :param req: Contains a server's response. + :type req: Response + :return: Content of the response, in bytes. + :rtype: bytes + """ + return req.content + + def _get_status(self, req: Response) -> int: + """Get status code. + + :param req: Contains a server's response. + :type req: Response + :return: Status code. 
+ :rtype: int + """ + return req.status_code diff --git a/rss_reader/crawler/exceptions.py b/rss_reader/crawler/exceptions.py new file mode 100644 index 00000000..4b4e1aaa --- /dev/null +++ b/rss_reader/crawler/exceptions.py @@ -0,0 +1,23 @@ +"""This module contains custom exceptions for the crawler package.""" + + +class BadURLError(Exception): + """"Occurs when the url is specified incorrectly.""" + + def __init__(self, url, *args, **kwargs) -> None: + self.url = url + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return f'It is not possible to get data for the given url ({self.url})' + + +class FailStatusCodeError(Exception): + """Occurs when the server response code differs from the expected one.""" + + def __init__(self, status_code, *args, **kwargs) -> None: + self.status_code = status_code + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return f'An unsupported HTTP status code returned. (code = {self.status_code})' diff --git a/rss_reader/crawler/tests/__init__.py b/rss_reader/crawler/tests/__init__.py new file mode 100644 index 00000000..624cba14 --- /dev/null +++ b/rss_reader/crawler/tests/__init__.py @@ -0,0 +1 @@ +"""Test suite for the crawler module.""" diff --git a/rss_reader/crawler/tests/test_super_crawler.py b/rss_reader/crawler/tests/test_super_crawler.py new file mode 100644 index 00000000..4bda1b26 --- /dev/null +++ b/rss_reader/crawler/tests/test_super_crawler.py @@ -0,0 +1,67 @@ +"""A test suite for the SuperCrawler class.""" + +import pytest +import requests + +from ..crawler import SuperCrawler +from ..exceptions import FailStatusCodeError, BadURLError + + +class MockResponse: + """Mimics the behavior of the Response object.""" + + def __init__(self, content: str, status_code: int = 200) -> None: + """Initializer. + + :param content: The content to be returned. 
+ :type content: str + :param status_code: HTTP status code, defaults to 200 + :type status_code: int, optional + """ + self.content = content + self.status_code = status_code + + +def test_get_data(monkeypatch): + """Checks the type of the returned object. + + Type must be a byte string. + """ + + def mock_get_data(*args, **kwargs): + return MockResponse(b'') + + monkeypatch.setattr(SuperCrawler, '_get_response', mock_get_data) + data = SuperCrawler('https://news.yahoo888.com/rss/').get_data() + + assert isinstance(data, bytes) + + +def test_get_fail_error(monkeypatch): + """Check that a FailStatusCodeError exception is returned. + + An exception is thrown when status code is not equal to 200. + """ + + def mock_get_status(*args, **kwargs): + return MockResponse(b'', 100) + + monkeypatch.setattr(SuperCrawler, '_get_response', mock_get_status) + + with pytest.raises(FailStatusCodeError): + SuperCrawler('https://news.yahoo3.com/rss/').get_data() + + +def test_fail_url_response(monkeypatch): + """Check that a BadURLError exception is returned. + + An exception is thrown when it is not possible to get data from the site. 
+ """ + + def mock_get_error(*args, **kwargs): + raise ConnectionError + + monkeypatch.setattr(requests, 'get', mock_get_error) + + with pytest.raises(BadURLError): + SuperCrawler('https://news.yahoo888.com/rss/').get_data() diff --git a/rss_reader/data_converter/__init__.py b/rss_reader/data_converter/__init__.py new file mode 100644 index 00000000..b6b0fcbb --- /dev/null +++ b/rss_reader/data_converter/__init__.py @@ -0,0 +1 @@ +"""This package contains modules that work with dat converters.""" diff --git a/rss_reader/data_converter/data_converter.py b/rss_reader/data_converter/data_converter.py new file mode 100644 index 00000000..60cff242 --- /dev/null +++ b/rss_reader/data_converter/data_converter.py @@ -0,0 +1,116 @@ +"""This module contains a class that works with data stored in local storage.""" + +from locale import normalize +from typing import List, Optional +from pandas import DataFrame, json_normalize, concat, to_datetime + +from rss_reader.interfaces.idataconverter.idataconverter import IDataConverter + + +class DataConverter(IDataConverter): + """Transforms data from one state to another.""" + + def concat_data(self, data: List[dict], + local_data: DataFrame, + ignore_index: bool) -> Optional[DataFrame]: + """Concat two DataFrames. + + :param data: Data to be merged with data from local storage. + :type data: List[dict] + :param local_data: DataFrame which is obtained from local storage. + :type local_data: DataFrame + :param ignore_index: Returns True when the argument x is true, + False otherwise. The builtins True and False are the only two + instances of the class bool. The class bool is a subclass of the + class int, and cannot be subclassed. + :type ignore_index: bool + :return: Merged DataFrame. 
+ :rtype: Optional[DataFrame] + """ + + data_concat = concat([local_data, data], + ignore_index=ignore_index) + return data_concat + + def drop_duplicates_(self, data: DataFrame, + keep: str = "first", + ignore_index: bool = False) -> Optional[DataFrame]: + """Return DataFrame with duplicate rows removed. + + :param data: The DataFrame in which duplicates should be removed. + :type data: DataFrame + :param keep: {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to keep. + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + :type keep: str + :param inplace: bool, default False + Whether to drop duplicates in place or to return a copy. + :type inplace: bool + :param ignore_index: bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + :type ignore_index: bool + :return: DataFrame without duplicates. + :rtype: Optional[DataFrame] + """ + return data.drop_duplicates(keep=keep, + ignore_index=ignore_index) + + def normalize_(self, data: List[dict], record_path: List[str], + meta: List[str], record_prefix: str) -> DataFrame: + """Normalize semi-structured JSON data into a flat table. + + :param data: Data for normalize. + :type data: List[dict] + :param record_path: Path in each object to list of records. If not + passed, data will be assumed to be an array of records. + :type record_path: List[str] + :param meta: Fields to use as metadata for each record in resulting + table. + :type meta: List[str] + :param record_prefix: If True, prefix records with dotted (?) path, + e.g. foo.bar.field if + path to records is ['foo', 'bar']. + :type record_prefix: str + :return: Normalize semi-structured JSON data into a flat table. 
+ :rtype: DataFrame + """ + return json_normalize(data, record_path=record_path, + meta=meta, + record_prefix=record_prefix) + + def convert_date(self, df: DataFrame, column_name: + str, format: str = '%Y-%m-%d', + utc: bool = True) -> DataFrame: + """Convert the date to the desired format. + + :param df: The DataFrame in which to replace. + :type df: DataFrame + :param column_name: The name of the column in which you want to change. + :type column_name: str + :param format: Return a string representing the date, controlled by an + explicit format string. Format codes referring to hours, minutes or + seconds will see 0 values., defaults to '%Y-%m-%d' + :type format: str, optional + :param utc: Control timezone-related parsing, localization and + conversion. + + If True, the function always returns a timezone-aware UTC-localized + Timestamp, Series or DatetimeIndex. To do this, timezone-naive inputs + are localized as UTC, while timezone-aware inputs are converted to UTC. + + If False (default), inputs will not be coerced to UTC. Timezone-naive + inputs will remain naive, while timezone-aware ones will keep their + time offsets. Limitations exist for mixed offsets (typically, daylight + savings), see Examples section for details., defaults to True + + :type utc: bool + :return: DataFrame with the converted date. 
+ :rtype: DataFrame + """ + + df[column_name] = to_datetime(df.get(column_name), utc=utc) + df[column_name] = df.get(column_name).dt.date.apply( + lambda x: x.strftime(format)) + return df diff --git a/rss_reader/date_converter/__init__.py b/rss_reader/date_converter/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_reader/date_converter/date_converter.py b/rss_reader/date_converter/date_converter.py new file mode 100644 index 00000000..b948dc47 --- /dev/null +++ b/rss_reader/date_converter/date_converter.py @@ -0,0 +1,24 @@ +"""This module contains classes that work with dates""" + + +from datetime import datetime + +from rss_reader.interfaces.idateconverter.idateconverter import IDateConverter + + +class DateConverter(IDateConverter): + """Converts date.""" + + @staticmethod + def date_convert(date: str, format: str = '%Y%m%d') -> str: + """Return the date part. + + :param date: Date in original state. + :type date: str + :param format: Substring selection format. + Example '%Y%m%d'. + :type format: str + :return: Date in the given format. + :rtype: str + """ + return datetime.strptime(date, format).date().__str__() diff --git a/rss_reader/decorator/__init__.py b/rss_reader/decorator/__init__.py new file mode 100644 index 00000000..6876cb57 --- /dev/null +++ b/rss_reader/decorator/__init__.py @@ -0,0 +1 @@ +"""This package contains modules that work with decorators.""" diff --git a/rss_reader/decorator/decorator.py b/rss_reader/decorator/decorator.py new file mode 100644 index 00000000..79462db7 --- /dev/null +++ b/rss_reader/decorator/decorator.py @@ -0,0 +1,21 @@ +"""This module contains decorators used in the program.""" + +from functools import wraps +from rss_reader.logger.logger import Logger + +log = Logger.get_logger(__name__) + + +def send_log_of_start_function(func): + """Log the start and end of the function. + + It is decorator function. 
+ """ + + @wraps(func) + def wrapper(*args, **kwargs): + log.info(f'A {func.__name__} function starts working.') + res = func(*args, **kwargs) + log.info(f'A {func.__name__} function has completed its work.') + return res + return wrapper diff --git a/rss_reader/interfaces/__init__.py b/rss_reader/interfaces/__init__.py new file mode 100644 index 00000000..ad9530e9 --- /dev/null +++ b/rss_reader/interfaces/__init__.py @@ -0,0 +1 @@ +"""This package contains modules with interfaces.""" diff --git a/rss_reader/interfaces/icrawler/__init__.py b/rss_reader/interfaces/icrawler/__init__.py new file mode 100644 index 00000000..5a8fcc6c --- /dev/null +++ b/rss_reader/interfaces/icrawler/__init__.py @@ -0,0 +1 @@ +"""This package contains the crawler interfaces.""" diff --git a/rss_reader/interfaces/icrawler/icrawler.py b/rss_reader/interfaces/icrawler/icrawler.py new file mode 100644 index 00000000..81e3c0ca --- /dev/null +++ b/rss_reader/interfaces/icrawler/icrawler.py @@ -0,0 +1,16 @@ +"""This module contains a set of interfaces for the crawler.""" + +from abc import ABC, abstractmethod + + +class ICrawler(ABC): + """Interface for a crawler.""" + + @abstractmethod + def get_data(self) -> bytes: + """Get the content of the requested page. + + :return: Returns the result as a byte string. 
"""This module contains a set of interfaces for the data converter."""


from abc import abstractmethod, ABC
from typing import List, Optional
from pandas import DataFrame


class IDataConverter(ABC):
    """Basic converter interface."""

    @abstractmethod
    def concat_data(self, data: List[dict],
                    local_data: DataFrame,
                    ignore_index: bool) -> Optional[DataFrame]:
        """Concatenate freshly parsed data with data from local storage.

        :param data: Data to be merged with data from local storage.
        :type data: List[dict]
        :param local_data: DataFrame which is obtained from local storage.
        :type local_data: DataFrame
        :param ignore_index: If True, do not use the index values along the
            concatenation axis; the resulting axis is labeled 0, ..., n - 1.
        :type ignore_index: bool
        :return: Merged DataFrame.
        :rtype: Optional[DataFrame]
        """
        pass

    @abstractmethod
    def drop_duplicates_(self, data: DataFrame,
                         keep: str = "first",
                         ignore_index: bool = False) -> Optional[DataFrame]:
        """Return DataFrame with duplicate rows removed.

        :param data: The DataFrame in which duplicates should be removed.
        :type data: DataFrame
        :param keep: {'first', 'last', False}, default 'first'
            Determines which duplicates (if any) to keep.
            - ``first`` : Drop duplicates except for the first occurrence.
            - ``last`` : Drop duplicates except for the last occurrence.
            - False : Drop all duplicates.
        :type keep: str
        :param ignore_index: bool, default False
            If True, the resulting axis will be labeled 0, 1, ..., n - 1.
        :type ignore_index: bool
        :return: DataFrame without duplicates.
        :rtype: Optional[DataFrame]
        """
        pass

    @abstractmethod
    def normalize_(self, data: List[dict], record_path: List[str],
                   meta: List[str], record_prefix: str) -> DataFrame:
        """Normalize semi-structured JSON data into a flat table.

        :param data: Data to normalize.
        :type data: List[dict]
        :param record_path: Path in each object to the list of records. If
            not passed, data will be assumed to be an array of records.
        :type record_path: List[str]
        :param meta: Fields to use as metadata for each record in the
            resulting table.
        :type meta: List[str]
        :param record_prefix: Prefix applied to record columns, e.g. the
            dotted path ``foo.bar.field`` if the path to records is
            ``['foo', 'bar']``.
        :type record_prefix: str
        :return: Semi-structured JSON data normalized into a flat table.
        :rtype: DataFrame
        """
        pass

    @abstractmethod
    def convert_date(self, df: DataFrame, column_name: str,
                     format: str = '%Y-%m-%d',
                     utc: bool = True) -> DataFrame:
        """Convert the values of a date column to the desired format.

        :param df: The DataFrame in which to replace.
        :type df: DataFrame
        :param column_name: The name of the column to convert.
        :type column_name: str
        :param format: ``strftime``-style format string controlling the
            resulting date representation, defaults to '%Y-%m-%d'.
            NOTE: the parameter name shadows the ``format`` builtin; it is
            kept for backward compatibility with keyword callers.
        :type format: str, optional
        :param utc: Control timezone-related parsing, localization and
            conversion. If True, the result is always timezone-aware and
            UTC-localized: timezone-naive inputs are localized as UTC and
            timezone-aware inputs are converted to UTC. If False, inputs
            are not coerced to UTC and keep their offsets / naivety.
            Defaults to True.
        :type utc: bool
        :return: DataFrame with the converted date.
        :rtype: DataFrame
        """
        pass
"""This module contains a set of interfaces for the loader."""

from abc import ABC, abstractmethod


class IHandler(ABC):
    """Interface for receiving parsed data from one source."""

    # Marked abstract: the class is an interface (it only inherits ABC and
    # the original stub body returned None despite the ``-> dict`` hint),
    # so concrete handlers must provide an implementation.
    @abstractmethod
    def get_data(self, tag_name: str,
                 title_tag: str,
                 source: str,
                 limit: int) -> dict:
        """Return a dictionary with parsed data.

        :param tag_name: The name of the tag in which the news is stored.
        :type tag_name: str
        :param title_tag: The name of the tag in which the name
            of the rss resource is stored.
        :type title_tag: str
        :param source: Resource URL.
        :type source: str
        :param limit: Number of news items to display.
        :type limit: int
        :return: Dictionary with parsed data.
        :rtype: dict
        """
        pass


class ILoadHandler(ABC):
    """Interface for data loaders (chain-of-responsibility links)."""

    @abstractmethod
    def set_next(self, handler: 'ILoadHandler') -> 'ILoadHandler':
        """Set the next loader in the handler chain.

        :param handler: Next handler.
        :type handler: ILoadHandler
        :return: The handler that was passed in, enabling chained calls.
        :rtype: ILoadHandler
        """
        pass

    @abstractmethod
    def get_data(self) -> list:
        """Get the requested data.

        :return: List with data.
        :rtype: list
        """
        pass
+""" + + +from abc import ABC, abstractmethod +from typing import Generator, List + + +class IParser(ABC): + """Interface for a parser.""" + + @abstractmethod + def create_parser(self, markup: bytes, features: str = 'xml') -> None: + """Create a parser object. + + :param markup: A string or a file-like object representing markup + to be parsed. + :type markup: bytes + :param features: Desirable features of the parser to be used. This may + be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type + of markup to be used ("html", "html5", "xml"). + Defaults to 'xml'. + :type features: str, optional + """ + + @abstractmethod + def get_tags_text(self, + selector: str, + limit_elms: int = None) -> Generator[str, None, None]: + """Returns the text stored in the tag(s). + + :param selector: A string containing a CSS selector. + :type selector: str + :param limit_elms: The number of elements to return, defaults to None. + :type limit_elms: int, optional + :yield: Returns the text of each element. + :rtype: Generator[str, None, None] + """ + pass + + @abstractmethod + def get_items(self, + template: dict, + name: str, + limit_elms: int = None) -> List[dict]: + """Get a list of found items. + + :param template: Specifies the element search pattern. Represents a + dictionary. The keys of the dictionary are the tags + to be found, and the value is just a string + (for example, 'text'). If you need to find the + attributes of a tag, then the value is a list that + lists all the attributes you need to search + (for example, Iterable: + """Return all tags with the given name. + + :param name: A filter on tag name, defaults to None + :type name: str, optional + :param limit: Stop looking after finding this many results, + defaults to None. + :type limit: int, optional + :return: An iterable object that provides the found elements. 
"""This module contains a set of interfaces for the pathfile package."""


from abc import ABC, abstractmethod


class ICreateFile(ABC):
    """Interface for objects that are able to create files."""

    @abstractmethod
    def create_file(self, file: str) -> None:
        """Create the named file.

        :param file: File name.
        :type file: str
        """
        pass
+ :rtype: Optional[DataFrame] + """ + pass diff --git a/rss_reader/interfaces/isaver/isaver.py b/rss_reader/interfaces/isaver/isaver.py new file mode 100644 index 00000000..befe0e74 --- /dev/null +++ b/rss_reader/interfaces/isaver/isaver.py @@ -0,0 +1,29 @@ +"""This module contains a set of interfaces for savers.""" + +from __future__ import annotations +from abc import ABC, abstractmethod +from typing import List + + +class ISaveHandler(ABC): + """Basic interface of data savers.""" + + @abstractmethod + def set_next(self, handler: ISaveHandler) -> ISaveHandler: + """Set the next saver in the handler chain. + + :param handler: Next handler. + :type handler: ISaveHandler + :return: Handler. + :rtype: ISaveHandler + """ + + pass + + def save(self, data: List[dict]) -> None: + """Save data. + + :param data: Dictionary with data to save. + :type data: dict + """ + pass diff --git a/rss_reader/interfaces/isaver/istrategies.py b/rss_reader/interfaces/isaver/istrategies.py new file mode 100644 index 00000000..2f818343 --- /dev/null +++ b/rss_reader/interfaces/isaver/istrategies.py @@ -0,0 +1,21 @@ +"""This module contains a set of interfaces for HTML strategies.""" + +from abc import ABC, abstractmethod +from typing import List + + +class StrategySaveHTML(ABC): + """Strategy interface.""" + @abstractmethod + def prepare_html(self, data: List[dict]) -> str: + """Prepare html page. + + The operation of preparing a page according to a specific algorithm. + Each specific strategy defines its own page creation logic. + + :param data: Initial data intended for display on the html page. + :type data: List[dict] + :return: Generated html page according to a certain algorithm. 
"""This module implements the decorator pattern over DataFrame operations."""


from abc import ABC, abstractmethod
from pandas import DataFrame


class IComponent(ABC):
    """Basic component interface."""

    @abstractmethod
    def operation(self, data: DataFrame) -> DataFrame:
        """Transform *data* and return the result."""
        pass


class BaseComponent(IComponent):
    """A base concrete component.

    It is a stub: the terminal element of a decorator chain.
    """

    # Fixed: the return annotation was '-> str' although the method
    # returns the DataFrame it received, as IComponent declares.
    def operation(self, data: DataFrame) -> DataFrame:
        """Return *data* unchanged."""
        return data


class Decorator(IComponent):
    """Decorator base class: delegates the operation to the wrapped component."""

    _component: IComponent = None

    def __init__(self, component: IComponent) -> None:
        self._component = component

    @property
    def component(self) -> IComponent:
        """The wrapped component."""
        return self._component

    def operation(self, data: DataFrame) -> DataFrame:
        """Forward the operation to the wrapped component."""
        return self._component.operation(data)


class LimitRecords(Decorator):
    """A decorator that keeps only the first N records."""

    def __init__(self, limit: int, component: IComponent) -> None:
        """Initializer.

        :param limit: How many records to return.
        :type limit: int
        :param component: object of type IComponent.
        :type component: IComponent
        """
        self._limit = limit
        super().__init__(component)

    def operation(self, data: DataFrame) -> DataFrame:
        """Return the first ``limit`` rows of the wrapped result.

        :param data: Sample data.
        :type data: DataFrame
        :return: Data sampling.
        :rtype: DataFrame
        """
        result = self.component.operation(data)
        return result.head(self._limit)
+ :rtype: DataFrame + """ + result = self.component.operation(data) + return result[result[self._search_column] == self._criterion] diff --git a/rss_reader/loader/exceptions.py b/rss_reader/loader/exceptions.py new file mode 100644 index 00000000..a4c82a7c --- /dev/null +++ b/rss_reader/loader/exceptions.py @@ -0,0 +1,38 @@ + +from pandas.errors import EmptyDataError + + +class EmptyURLError(Exception): + """Occurs when the url is empty.""" + + +class DataEmptyError(Exception): + """Occurs when there is no data.""" + + +class DataFileEmptyError(EmptyDataError): + """Occurs when there is no data in the uploaded file.""" + + def __init__(self, file, *args, **kwargs) -> None: + self.file = file + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + a = f'No columns to parse from file ({self.file}). '\ + f'Delete the file and run the program in the mode of reading '\ + f'news from the Internet.' + return a + + +class DataFileNotFoundError(EmptyDataError): + """Occurs when the file is not found.""" + + def __init__(self, file, *args, **kwargs) -> None: + self.file = file + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + a = f'File ({self.file}) does not exist. '\ + f'Run the program in the mode of reading '\ + f'news from the Internet.' + return a diff --git a/rss_reader/loader/loader.py b/rss_reader/loader/loader.py new file mode 100644 index 00000000..326e3698 --- /dev/null +++ b/rss_reader/loader/loader.py @@ -0,0 +1,278 @@ +"""This module contains a set of handlers for receiving data. + +The handler receives data from a specific source, selects the necessary +data elements and forms the final result. 
"""This module contains a set of handlers for receiving data.

A handler receives data from a specific source, selects the necessary
data elements and forms the final result.
"""


from typing import Dict, List, Optional
from numpy import nan
from pandas import DataFrame

from rss_reader.interfaces.iloader.iloader import IHandler, ILoadHandler
from rss_reader.interfaces.icrawler.icrawler import ICrawler
from rss_reader.interfaces.iparser.iparser import IParser
from rss_reader.logger.logger import Logger
from rss_reader.decorator.decorator import send_log_of_start_function
from rss_reader.date_converter.date_converter import DateConverter
from rss_reader.parser.exceptions import EmptyListError


from .reader import ReaderCSVFile
from .decorators import BaseComponent, SortByEqual, LimitRecords
from .exceptions import DataEmptyError, EmptyURLError


log = Logger.get_logger(__name__)


class AbstractLoaderHandler(ILoadHandler):
    """The base class of the handler chain."""

    _next_handler: Optional[ILoadHandler] = None

    @send_log_of_start_function
    def set_next(self, handler: ILoadHandler) -> ILoadHandler:
        """Set the next loader in the handler chain.

        :param handler: Next handler.
        :type handler: ILoadHandler
        :return: The handler passed in, enabling chained registration.
        :rtype: ILoadHandler
        """
        self._next_handler = handler
        return handler

    @send_log_of_start_function
    def get_data(self) -> list:
        """Delegate to the next handler, or return None at the chain end.

        :return: List with data, or None when no handler can serve it.
        :rtype: list
        """
        if self._next_handler:
            return self._next_handler.get_data()
        return None


class FromLocalSTorageHandler(AbstractLoaderHandler):
    """Data loader from local storage.

    NOTE(review): the class name carries a historical typo ('STorage');
    it is kept because external callers reference it.
    """

    def __init__(self, file: str, request: Dict[str, str]) -> None:
        """Initializer.

        :param file: Name of the local CSV storage file.
        :type file: str
        :param request: Query parameters; recognised keys are
            'date', 'source' and 'limit'.
        :type request: Dict[str, str]
        """
        self._file = file
        self._request = request

    def get_data(self) -> list:
        """Return a list with parsed data filtered by the request.

        :raises ValueError: Occurs when the date cannot be converted.
        :raises DataEmptyError: Occurs when there is no data.
        :return: List with data.
        :rtype: list
        """
        date = self._request.get('date')
        if not date:
            # No date requested: local storage is not consulted,
            # pass the request down the chain.
            return super().get_data()

        # load data from local storage
        raw_data = ReaderCSVFile.read(self._file)

        # convert date to string
        try:
            dt = DateConverter().date_convert(date)
        except ValueError as e:
            raise ValueError('Wrong time format') from e

        # design pattern - decorator: build the filter pipeline
        pipeline = SortByEqual('item.pubDate', str(dt), BaseComponent())

        source = self._request.get('source')
        if source:
            # filter by source
            pipeline = SortByEqual('link', source, pipeline)

        limit = self._request.get('limit')
        if limit:
            # cut the result down to the requested number of records
            pipeline = LimitRecords(limit, pipeline)

        # start execution of the decorator pattern
        found = pipeline.operation(raw_data)

        if found.empty:
            raise DataEmptyError(
                'There is no data to provide for the current date.')

        # convert to the given dictionary structure
        return self._convert_to_dict(found)

    def _convert_to_dict(self, raw_data: DataFrame) -> list:
        """Group flat rows back into the nested per-resource structure.

        Each element of the returned list describes one web resource:
        ``{'title_web_resource': ..., 'link': ..., 'items': [...]}`` where
        every item carries 'title', 'link', 'pubDate', 'source' and a
        nested 'content' dict with 'url' and 'title'.

        :param raw_data: Flat rows read from local storage.
        :type raw_data: DataFrame
        :return: List of per-resource dictionaries.
        :rtype: list
        """
        grouped = []

        for _, row in raw_data.iterrows():
            # NaN placeholders in storage mean "value absent"
            row.replace(nan, None, inplace=True)
            link = row.get('link')

            item = {
                'title': row.get('item.title'),
                'link': row.get('item.link'),
                'pubDate': row.get('item.pubDate'),
                'source': row.get('item.source'),
                'content': {
                    'url': row.get('item.content.url'),
                    'title': row.get('item.content.title'),
                },
            }

            # append to an existing resource entry, or open a new one
            for resource in grouped:
                if link == resource.get('link'):
                    resource['items'].append(item)
                    break
            else:
                grouped.append({
                    'title_web_resource': row.get('title_web_resource'),
                    'link': link,
                    'items': [item],
                })

        return grouped


class FromWebHandler(AbstractLoaderHandler):
    """Internet data handler."""

    # Search pattern handed to the parser: tag -> 'text' means "take the
    # tag text", tag -> list means "take these attributes".
    template = {'title': 'text',
                'pubDate': 'text',
                'source': 'text',
                'link': 'text',
                'content': ['url', 'title']
                }

    def __init__(self,
                 tag_name: str,
                 title_tag: str,
                 source: str,
                 limit: Optional[int],
                 crawler: ICrawler,
                 parser: IParser) -> None:
        """Initializer.

        :param tag_name: The name of the tag in which the news is stored,
            for example 'item'.
        :type tag_name: str
        :param title_tag: CSS selector of the tag that contains the title.
        :type title_tag: str
        :param source: News source url.
        :type source: str
        :param limit: The number of elements to return.
        :type limit: Optional[int]
        :param crawler: Crawler class (it is instantiated with the source
            url in get_data) used to fetch information from the Internet.
        :type crawler: ICrawler
        :param parser: Parser object used to parse the fetched markup.
        :type parser: IParser
        """
        self._tag_name = tag_name
        self._title_tag = title_tag
        self._source = source
        self._limit = limit
        self._crawler = crawler
        self._parser = parser

    @send_log_of_start_function
    def get_data(self) -> List[dict]:
        """Return a list with parsed data.

        :raises EmptyURLError: Occurs when the url is empty.
        :raises DataEmptyError: Occurs when there is no data.
        :return: List with data.
        :rtype: List[dict]
        """
        if not self._source:
            raise EmptyURLError('Passed url is empty!')

        # get data from internet
        cr = self._crawler(self._source)
        response_ = cr.get_data()

        log.debug('Start creating the parser.')
        self._parser.create_parser(markup=response_)
        log.debug('Stop creating the parser.')

        log.debug('Start the process of getting the resource title.')
        try:
            title_tag = self._parser.get_tags_text(selector=self._title_tag)
            title_text = next(title_tag)
        except EmptyListError:
            # a missing title is tolerated; the feed is still usable
            title_text = None
        log.debug('Stop the process of getting the resource title.')

        # get news
        items = self._parser.get_items(
            self.template, name=self._tag_name, limit_elms=self._limit)

        if not items:
            raise DataEmptyError('no news')

        # collect all the data in a dictionary
        log.debug('Start generating results.')
        result = {'title_web_resource': title_text,
                  'link': self._source,
                  'items': items}
        log.debug('Result was formed.')

        return [result]
+ :rtype: DataFrame + """ + try: + raw_data = read_csv(file, index_col=index_col, encoding=encoding) + except FileNotFoundError as e: + raise DataFileNotFoundError(file) from e + except EmptyDataError as e: + raise DataFileEmptyError(file) from e + + return raw_data diff --git a/rss_reader/logger/__init__.py b/rss_reader/logger/__init__.py new file mode 100644 index 00000000..b3ee2e23 --- /dev/null +++ b/rss_reader/logger/__init__.py @@ -0,0 +1 @@ +"""This package contains modules that work with logging.""" diff --git a/rss_reader/logger/logger.py b/rss_reader/logger/logger.py new file mode 100644 index 00000000..72d0f021 --- /dev/null +++ b/rss_reader/logger/logger.py @@ -0,0 +1,99 @@ +"""This module contains classes for working with data logging. + +The module implements the Logger class, which provides basic functionality +for the functioning of the data logging system. + +The module has two standard classes that serve as system customizers +logging. + +""" + + +from logging import (getLogger, StreamHandler, Formatter, + NullHandler, Logger as LG, DEBUG as DBG) + +from rss_reader.interfaces.ilogger.ilogger import ISetLoggerConfig + + +class StreamHandlerConfig(ISetLoggerConfig): + """Basic configuration. + + Sets up logging with the output of the result on the screen. + """ + + def set_config(self, name: str) -> LG: + """Set logger configuration. + + :param name: Logger name. + :type name: str + :return: object Logger. + :rtype: LG + """ + + logger = getLogger(name) + logger.setLevel(DBG) + sh = StreamHandler() + sh.setLevel(DBG) + li = ['%(asctime)s', '%(name)s', '%(levelname)s', '%(funcName)s', + '%(lineno)d', '%(message)s'] + str_f = '|'.join(li) + formatter = Formatter(str_f, '%Y-%m-%d %H:%M:%S') + sh.setFormatter(formatter) + logger.addHandler(sh) + return logger + + +class NullHandlerConfig(ISetLoggerConfig): + """Logger configuration with output to the void.""" + + def set_config(self, name: str) -> LG: + """Set logger configuration. 
+ + :param name: Logger name. + :type name: str + :return: object Logger. + :rtype: LG + """ + + logger = getLogger(name) + logger.setLevel(DBG) + sh = NullHandler() + sh.setLevel(DBG) + logger.addHandler(sh) + return logger + + +class Logger: + """Logs data.""" + + NAME_LOGGER = 'base_logger' + + def __init__(self, name_loger: str, config: ISetLoggerConfig) -> None: + """Initializer. + + :param name_loger: Logger name. + :type name_loger: str + :param config: Logger configuration. + :type config: ISetLoggerConfig + """ + Logger.NAME_LOGGER = name_loger + self._config = config + + def setup_logger(self) -> LG: + """Set logger configuration. + + :return: object Logger + :rtype: LG + """ + return self._config.set_config(Logger.NAME_LOGGER) + + @classmethod + def get_logger(cls, module_name: str) -> LG: + """Get logger by name. + + :param module_name: Logger name. + :type module_name: str + :return: object Logger + :rtype: LG + """ + return getLogger(cls.NAME_LOGGER).getChild(module_name) diff --git a/rss_reader/logger/tests/__init__.py b/rss_reader/logger/tests/__init__.py new file mode 100644 index 00000000..63621acf --- /dev/null +++ b/rss_reader/logger/tests/__init__.py @@ -0,0 +1 @@ +"""Test suite for the logger package.""" diff --git a/rss_reader/logger/tests/conftest.py b/rss_reader/logger/tests/conftest.py new file mode 100644 index 00000000..a922ba8f --- /dev/null +++ b/rss_reader/logger/tests/conftest.py @@ -0,0 +1,11 @@ +import pytest + +from ..logger import Logger, StreamHandlerConfig, NullHandlerConfig + + +@pytest.fixture(scope="class", + params=[StreamHandlerConfig, + NullHandlerConfig]) +def logger_obj(request): + """The fixture returns a logger object with different configurations.""" + yield Logger('test_name', request.param()) diff --git a/rss_reader/logger/tests/test_logger.py b/rss_reader/logger/tests/test_logger.py new file mode 100644 index 00000000..f02be8b5 --- /dev/null +++ b/rss_reader/logger/tests/test_logger.py @@ -0,0 +1,41 @@ +"""A test 
"""A test suite for the logger module."""

from logging import Logger

from ..logger import StreamHandlerConfig, NullHandlerConfig


def test_stream_config():
    """StreamHandlerConfig.set_config must return a logging.Logger."""
    assert isinstance(StreamHandlerConfig().set_config('name'), Logger)


def test_null_config():
    """NullHandlerConfig.set_config must return a logging.Logger."""
    assert isinstance(NullHandlerConfig().set_config('name'), Logger)


class TestLogger():
    """Class tests Logger class from rss-reader."""

    def test_setup_logger(self, logger_obj):
        """Check return type of setup_logger function.

        :param logger_obj: Logger from the rss-reader.logger package.
        :type logger_obj: Logger
        """
        assert isinstance(logger_obj.setup_logger(), Logger)

    def test_get_logger(self, logger_obj):
        """Check return type of get_logger function.

        :param logger_obj: Logger from the rss-reader.logger package.
        :type logger_obj: Logger
        """
        assert isinstance(logger_obj.get_logger('test_module'), Logger)
This may + be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type + of markup to be used ("html", "html5", "xml"). + Defaults to 'xml'. + :type features: str, optional + """ + self._subsystem = self._subsystem(markup, features) + + def _find_all(self, name: str = None, limit_elms: int = None) -> Iterable: + """Return all tags with the given name.""" + return self._subsystem.find_all(name, limit=limit_elms) + + def _select(self, selector: str, limit_elms: int = None) -> Iterable: + """Return tags selected by CSS selector.""" + return self._subsystem.select(selector, limit=limit_elms) + + @send_log_of_start_function + def get_tags_text(self, + selector: str, + limit_elms: int = None) -> Generator[str, None, None]: + """Returns the text stored in the tag(s). + + :param selector: A string containing a CSS selector. + :type selector: str + :param limit_elms: The number of elements to return, defaults to None. + :type limit_elms: int, optional + :yield: Returns the text of each element. + :rtype: Generator[str, None, None] + """ + + tags = self._select(selector, limit_elms) + + if not tags: + log.error("No matching tags. Maybe the selector is wrong.") + raise EmptyListError( + "No matching tags. Maybe the selector is wrong.") + + for i in tags: + yield i.text + + @send_log_of_start_function + def get_items(self, + template: dict, + name: str, + limit_elms: int = None) -> List[dict]: + """Get a list of found items. + + :param template: Specifies the element search pattern. Represents a + dictionary. The keys of the dictionary are the tags + to be found, and the value is just a string + (for example, 'text'). If you need to find the + attributes of a tag, then the value is a list that + lists all the attributes you need to search + (for example, None: + """Create file. + + :param file: File name. 
+ :type file: str + """ + file = Path(file) + Path.mkdir(file.parent, parents=True, exist_ok=True) + file.touch(exist_ok=True) + + def home(self) -> None: + """Return a new path pointing to the user's home directory.""" + return Path.home() + + @staticmethod + def exists_file(file: str) -> bool: + path = Path(file) + return path.exists() + + @staticmethod + def unlink(file: str) -> None: + Path(file).unlink() diff --git a/rss_reader/saver/__init__.py b/rss_reader/saver/__init__.py new file mode 100644 index 00000000..6be20347 --- /dev/null +++ b/rss_reader/saver/__init__.py @@ -0,0 +1 @@ +"""This package contains modules that work with savers.""" diff --git a/rss_reader/saver/reader_files.py b/rss_reader/saver/reader_files.py new file mode 100644 index 00000000..fe3d4eda --- /dev/null +++ b/rss_reader/saver/reader_files.py @@ -0,0 +1,52 @@ +"""This module contains objects that read files.""" + + +from pandas import DataFrame, read_csv +from pandas.errors import EmptyDataError +from typing import Optional + +from rss_reader.interfaces.isaver.ireader_files import IReadFile +from rss_reader.interfaces.ipathfile.ipathfile import ICreateFile +from rss_reader.logger.logger import Logger + +log = Logger.get_logger(__name__) + + +class ReaderFiles(IReadFile): + """A class to represent a file reader.""" + + def read_csv_file(self, file: str, + index_col_: str, + creater: ICreateFile, + encoding_: str = 'utf-8') -> Optional[DataFrame]: + """Read csv file. + + If the file does not exist, it will be created + + :param file: File name. + :type file: str + :param index_col_: Column(s) to use as the row labels of the DataFrame, + either given as string name or column index. If a + sequence of int / str is given, a MultiIndex is used. + :type index_col_: str + :param creater: An object that implements the ability to create a file. + Used when the requested file does not exist. 
+ :type creater: ICreateFile + :param encoding_: File encoding, defaults to 'utf-8' + :type encoding_: str, optional + :return: Return the read data as a DataFrame. + :rtype: Optional[DataFrame] + """ + local_storage = None + + try: + local_storage = read_csv(file, + index_col=index_col_, + encoding=encoding_) + except EmptyDataError as e: + log.error(f'{file} is empty') + except FileNotFoundError as e: + log.error(f'No such file or directory: {file}') + creater.create_file(file) + + return local_storage diff --git a/rss_reader/saver/saver.py b/rss_reader/saver/saver.py new file mode 100644 index 00000000..788de323 --- /dev/null +++ b/rss_reader/saver/saver.py @@ -0,0 +1,72 @@ +"""This module contains a specific handler for saving data to local storage.""" + + +import pandas as pd +from typing import Optional, List + + +from rss_reader.interfaces.isaver.isaver import ISaveHandler +from rss_reader.decorator.decorator import send_log_of_start_function +from rss_reader.data_converter.data_converter import DataConverter +from rss_reader.saver.reader_files import ReaderFiles +from rss_reader.pathfile.pathfile import PathFile + + +class AbstractSaveHandler(ISaveHandler): + """An interface that declares a method for constructing a chain of handlers.""" + + _next_handler: Optional[ISaveHandler] = None + + @send_log_of_start_function + def set_next(self, handler: ISaveHandler) -> ISaveHandler: + """Set the next saver in the handler chain. + + :param handler: Next handler. + :type handler: ISaveHandler + :return: Handler. + :rtype: ISaveHandler + """ + self._next_handler = handler + return handler + + @send_log_of_start_function + def save(self, data: List[dict]) -> None: + """Save data. + + :param data: Dictionary with data to save. 
+ :type data: List[dict] + """ + if self._next_handler: + return self._next_handler.save(data) + + +class LocalSaveHandler(AbstractSaveHandler): + """Stores data locally.""" + + def __init__(self, file: str) -> None: + self._file = file + + def save(self, data: List[dict]) -> None: + """Save data. + + :param data: Dictionary with data to save. + :type data: List[dict] + """ + local_data = ReaderFiles().read_csv_file( + self._file, 'index', PathFile()) + + try: + dc = DataConverter() + norm_data = dc.normalize_(data, record_path=['items'], + meta=['title_web_resource', 'link'], + record_prefix="item.") + norm_data = dc.convert_date(norm_data, 'item.pubDate') + data_concat = dc.concat_data(local_data, norm_data, + ignore_index=True) + data_to_file = dc.drop_duplicates_(data_concat, keep='first', + ignore_index=True) + except NotImplementedError as e: + local_data = None + data_to_file = pd.DataFrame() + + data_to_file.to_csv(self._file, encoding='utf-8', index_label='index') diff --git a/rss_reader/saver/to_html/__init__.py b/rss_reader/saver/to_html/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rss_reader/saver/to_html/strategies.py b/rss_reader/saver/to_html/strategies.py new file mode 100644 index 00000000..5a1a6fd0 --- /dev/null +++ b/rss_reader/saver/to_html/strategies.py @@ -0,0 +1,33 @@ +"""This module implements specific strategies that form html files.""" + + +from typing import List + +import pathlib +from jinja2 import FileSystemLoader, Environment + +from rss_reader.interfaces.isaver.istrategies import StrategySaveHTML + + +class SuperStrategySaveHTML(StrategySaveHTML): + """Implements the simplest strategy for generating an HTML file.""" + + def prepare_html(self, data: List[dict]) -> str: + """Prepare html page. + + The operation of preparing a page according to a specific algorithm. + Each specific strategy defines its own page creation logic. + + :param data: Initial data intended for display on the html page. 
+ :type data: List[dict] + :return: Generated html page according to a certain algorithm. + :rtype: str + """ + + path_ = pathlib.Path(__file__).parent + file_loader = FileSystemLoader(path_/'templates') + env = Environment(loader=file_loader) + template_ = env.get_template('main.html') + html_ = template_.render(data=data) + + return html_ diff --git a/rss_reader/saver/to_html/templates/main.html b/rss_reader/saver/to_html/templates/main.html new file mode 100644 index 00000000..fe3ca65f --- /dev/null +++ b/rss_reader/saver/to_html/templates/main.html @@ -0,0 +1,63 @@ + + + + + + + + + + + +
+
+
+
+ {% for element in data %} +
+ {% if element.get('title_web_resource') %} +

{{ element.get('title_web_resource') }}

+ {% endif %} + + {% if element.get('link') %} + Link to feed. + {% endif %} + + {% for item in element.get('items') %} + {% if item %} +
+ + {% if item.get('title') %} +

{{ item.get('title') }}

+ {% endif %} + + {% if item.get('link') %} + Link to news. + {% endif %} + + {% if item.get('pubDate') %} +

Publication date: {{ item.get('pubDate') }}

+ {% endif %} + + {% if item.get('link') %} +

Source: {{ item.get('source') }}

+ {% endif %} + + {% if item.get('content').get('url') %} + {{ item.get('content').get('title') }} + {% endif %} + +
+
+ {% endif %} + {% endfor %} + +
+ {% endfor %} +
+
+ + + \ No newline at end of file diff --git a/rss_reader/saver/to_html/to_html.py b/rss_reader/saver/to_html/to_html.py new file mode 100644 index 00000000..31689723 --- /dev/null +++ b/rss_reader/saver/to_html/to_html.py @@ -0,0 +1,54 @@ +"""Contains a specific HTML file handler.""" + +from typing import Dict, List + + +from rss_reader.pathfile.pathfile import PathFile +from ..saver import AbstractSaveHandler +from .strategies import StrategySaveHTML + + +class HTMLSaveHandler(AbstractSaveHandler): + """Saves the data to an HTML file. + + Processed in a chain. + """ + def __init__(self, request: Dict[str, str], + strategy: StrategySaveHTML) -> None: + """Initializer. + + :param request: A dictionary in which there may be a key + by which this handler will work. + :type request: Dict[str, str] + :param strategy: The strategy by which the data file will be farmed. + :type strategy: StrategySaveHTML + """ + + self._request = request + self._strategy = strategy + + def save(self, data: List[dict]) -> None: + """Save data. + + :param data: Dictionary with data to save. + :type data: List[dict] + :raises FileExistsError: An error occurs when the specified path + does not exist. 
+ """ + + file = self._request.get('to_html') + + if file: + html_ = self._strategy.prepare_html(data) + exists_file = PathFile.exists_file(file) + + if exists_file: + news_file = file + 'news.html' + + with open(news_file, "w", encoding='utf-8') as file_: + file_.write(html_) + else: + raise FileExistsError('The requested path does not exist.') + + else: + super().save(data) diff --git a/rss_reader/saver/to_pdf/__init__.py b/rss_reader/saver/to_pdf/__init__.py new file mode 100644 index 00000000..0dfe0bde --- /dev/null +++ b/rss_reader/saver/to_pdf/__init__.py @@ -0,0 +1 @@ +"""This package contains modules that work with guards in pdf format.""" diff --git a/rss_reader/saver/to_pdf/to_pdf.py b/rss_reader/saver/to_pdf/to_pdf.py new file mode 100644 index 00000000..54d5c7e9 --- /dev/null +++ b/rss_reader/saver/to_pdf/to_pdf.py @@ -0,0 +1,119 @@ +"""Contains a specific PDF file handler.""" + + +from typing import Dict, List +from io import BytesIO + +from fpdf import FPDF +from PIL import Image +import requests + +from rss_reader.pathfile.pathfile import PathFile +from ..saver import AbstractSaveHandler + + +class PDFSaveHandler(AbstractSaveHandler): + """Saves data in pdf format.""" + + def __init__(self, request: Dict[str, str], folder: str) -> None: + """Initializer. + + :param request: A dictionary in which there may be a key + by which this handler will work. + :type request: Dict[str, str] + :param folder: The folder in which the images will be stored during + the formation of the PDF file. + :type folder: str + """ + self._request = request + self._folder = folder + + def convert_to_latin1(self, data: str) -> str: + """Convert to latin1 encoding. + + :param data: Data to be converted. + :type data: str + :return: String in the new encoding. + :rtype: str + """ + data = data.encode('ascii', errors='ignore') + return data.decode('latin1', errors='ignore') + + def save(self, data: List[dict]) -> None: + """Save data to PDF. 
+ + :param data: Dictionary with data to save. + :type data: List[dict] + :raises FileExistsError: An error occurs when the specified path + does not exist. + """ + file = self._request.get('to_pdf') + if file: + pdf = FPDF() + pdf.set_font("Arial", size=14) + + y = 10 + imgs = [] + + for i in data: + pdf.add_page() + + title_ = i.get('title_web_resource') + if title_: + title_ = self.convert_to_latin1(title_) + pdf.cell(150, 10, txt=title_, ln=1, align='L') + + link_feed = i.get('link') + if link_feed: + pdf.cell(150, 10, txt='Link to feed.', ln=1, + align='L', link=link_feed) + + for key, item in enumerate(i.get('items')): + t = item.get('title') + + if t: + t = self.convert_to_latin1(t) + pdf.cell(150, 10, txt=t, ln=1, align='L') + + item_link = item.get('link') + if item_link: + pdf.cell(150, 10, txt='Link to news.', ln=1, + align='L', link=item_link) + + pd = item.get('pubDate') + if pd: + pdf.cell(150, 10, txt=pd, ln=1, align='L') + + s = item.get('source') + if s: + s = self.convert_to_latin1(s) + pdf.cell(150, 10, txt=s, ln=1, align='L') + + url_image = item.get("content").get("url") + if url_image: + resp = requests.get(url_image, stream=True) + img = Image.open(BytesIO(resp.content)) + name_img = f'{self._folder}\\{key}.jpg' + imgs.append(name_img) + + # create empty files. 
+ PathFile().create_file(name_img) + img.save(name_img) + + pdf.image(name_img, x=10, y=pdf.get_y(), w=40) + pdf.line(10, pdf.get_y()+60, 200, pdf.get_y()+60) + pdf.cell(150, pdf.get_y()+25-y, ln=1, align='L') + y = pdf.get_y()-25 + + exists_file = PathFile.exists_file(file) + if exists_file: + pdf_file = file + 'news.pdf' + pdf.output(pdf_file) + else: + raise FileExistsError('The requested path does not exist.') + + # del all images + for i in imgs: + PathFile.unlink(i) + else: + super().save(data) diff --git a/rss_reader/starter/__init__.py b/rss_reader/starter/__init__.py new file mode 100644 index 00000000..8cfce077 --- /dev/null +++ b/rss_reader/starter/__init__.py @@ -0,0 +1 @@ +"""Package for the initial setup of the program.""" diff --git a/rss_reader/starter/base.py b/rss_reader/starter/base.py new file mode 100644 index 00000000..4c11d795 --- /dev/null +++ b/rss_reader/starter/base.py @@ -0,0 +1,86 @@ +"""Initial program setup. + +This module contains functions intended for the initial setup of the program. +""" + + +import argparse +from typing import Dict + +from rss_reader.logger.logger import (Logger, StreamHandlerConfig, + NullHandlerConfig) +from rss_reader.interfaces.ilogger.ilogger import ISetLoggerConfig + + +NAME_LOGGER = 'rss-reader' +version = '0.0.4' + + +def init_arguments_functionality(args=None) -> Dict[str, str]: + """Create command line options. + + :param args: Command line options, defaults to None. + :type args: List, optional. + :return: Command line parameter dictionary. + :rtype: Dict[str, str] + """ + + parser = argparse.ArgumentParser( + prog='RSS reader', + description='Pure Python command-line RSS reader.', + epilog='''(c) 2022. 
Have a nice day.''' + ) + + parser.add_argument('source', + nargs='?', + default='', + help='RSS URL') + + parser.add_argument('--version', + action='version', + version='%(prog)s version {}'.format(version), + help='Print version info') + + parser.add_argument('--json', + action='store_true', + help='Print result as JSON in stdout') + + parser.add_argument('--verbose', + action='store_true', + help='Outputs verbose status messages') + + parser.add_argument('--limit', + help='Limit news topics if this parameter provided') + + parser.add_argument('--date', + help='Search for news on a specified date.\ + Date in the format Y-m-d (for example: 20191206).' + ) + parser.add_argument('--to-html', + help='Specify the path where to save the resulting\ + data in the html file.' + ) + parser.add_argument('--to-pdf', + help='Specify the path where to save the resulting\ + data in the PDF file.' + ) + + namespace_ = parser.parse_args(args) + + return vars(namespace_) + + +def create_logger(verbose: bool) -> None: + """Create a logger. + + :param verbose: Parameter responsible for selecting a specific handler. + The handler is responsible for how the information is + displayed. 
+ :type verbose: bool + """ + if verbose: + config: ISetLoggerConfig = StreamHandlerConfig() + else: + config = NullHandlerConfig() + + Logger(NAME_LOGGER, config).setup_logger() diff --git a/rss_reader/starter/ecxeptions.py b/rss_reader/starter/ecxeptions.py new file mode 100644 index 00000000..0e9e5fab --- /dev/null +++ b/rss_reader/starter/ecxeptions.py @@ -0,0 +1,6 @@ +"""This module contains custom exceptions for the starter package.""" + + +class NonNumericError(Exception): + """Cannot convert incoming value to int.""" + pass diff --git a/rss_reader/starter/starter.py b/rss_reader/starter/starter.py new file mode 100644 index 00000000..141497b6 --- /dev/null +++ b/rss_reader/starter/starter.py @@ -0,0 +1,110 @@ +"""The main configurator for launching the program.""" + +from typing import Dict +from bs4 import BeautifulSoup + +from rss_reader.logger.logger import Logger +from rss_reader.interfaces.iloader.iloader import IHandler, ILoadHandler +from rss_reader.loader.loader import FromWebHandler, FromLocalSTorageHandler +from rss_reader.parser.parser import BeautifulParser +from rss_reader.crawler.crawler import SuperCrawler +from rss_reader.crawler.exceptions import BadURLError +from rss_reader.interfaces.iviewer.iviewer import IViewHandler +from rss_reader.viewer.viewer import StandartViewHandler, JSONViewHandler +from rss_reader.pathfile.pathfile import PathFile +from rss_reader.interfaces.isaver.isaver import ISaveHandler +from rss_reader.saver.saver import LocalSaveHandler +from rss_reader.saver.to_html.to_html import HTMLSaveHandler +from rss_reader.saver.to_html.strategies import SuperStrategySaveHTML +from rss_reader.saver.to_pdf.to_pdf import PDFSaveHandler + +from .ecxeptions import NonNumericError + +log = Logger.get_logger(__name__) + +LOCAL_STORAGE = '.rss-reader/local_storage.csv' + + +class Starter: + """A class to represent a starter main program.""" + + def __init__(self, argv: Dict[str, str]) -> None: + """Initializer. 
+ + :param argv: Command line parameter dictionary. + :type argv: Dict[str, str] + """ + self._argv = argv + + def run(self) -> None: + """Program launch.""" + + data_handler = self._get_data_from_resource() + try: + data = data_handler.get_data() + except BadURLError as e: + log.error(e) + raise + + log.info("Start getting the viewer object.") + viewer = self._get_viewer(self._argv) + log.info("Stop getting the viewer object.") + + log.info("Start displaying data.") + viewer.show(data) + log.info("Stop displaying data.") + + # save data + self._get_saver(self._argv).save(data) + + def _get_limit(self) -> None: + log.info("Get the number of requested news.") + try: + lim = self._argv.get('limit') + limit = int(lim) if lim else None + except ValueError as e: + log.exception(e) + raise NonNumericError("--limit has a non-numeric value") from e + + if limit is not None and limit < 0: + raise ValueError('--limit must be positive') + log.info("Number was received.") + + self._argv['limit'] = limit + + def _get_data_from_resource(self) -> ILoadHandler: + """Configure the data handler.""" + + self._get_limit() + + wh = FromWebHandler('item', + 'channel > title', + self._argv.get('source'), + self._argv.get('limit'), + SuperCrawler, + BeautifulParser(BeautifulSoup)) + local_storage = PathFile().home()/LOCAL_STORAGE + + ls = FromLocalSTorageHandler(local_storage, self._argv) + ls.set_next(wh) + return ls + + def _get_viewer(self, request: Dict[str, str]) -> IViewHandler: + """Get data viewer.""" + stdout_ = StandartViewHandler() + json_ = JSONViewHandler(request) + json_.set_next(stdout_) + return json_ + + def _get_saver(self, request: Dict[str, str]) -> ISaveHandler: + + local_storage = PathFile().home()/LOCAL_STORAGE + + standart_saver = LocalSaveHandler(local_storage) + html_saver = HTMLSaveHandler(request, SuperStrategySaveHTML()) + pdf_saver = PDFSaveHandler( + request, PathFile().home()/'.rss-reader/img') + + pdf_saver.set_next(html_saver).set_next(standart_saver) + + 
return pdf_saver diff --git a/rss_reader/starter/tests/__init__.py b/rss_reader/starter/tests/__init__.py new file mode 100644 index 00000000..4072792b --- /dev/null +++ b/rss_reader/starter/tests/__init__.py @@ -0,0 +1 @@ +"""A test suite for the parser package.""" diff --git a/rss_reader/starter/tests/test_base.py b/rss_reader/starter/tests/test_base.py new file mode 100644 index 00000000..8221965e --- /dev/null +++ b/rss_reader/starter/tests/test_base.py @@ -0,0 +1,19 @@ +"""A test suite for the base module.""" + +import pytest + +from ..base import init_arguments_functionality as iaf + + +@pytest.mark.parametrize("option", [("https://y.com",)]) +def test_init_arguments_functionality(option): + """Check if dictionary returned""" + a = iaf(option) + assert isinstance(a, dict) + + +@pytest.mark.parametrize("option", [("https://y.com", "--json",)]) +def test_init_return_json(option): + """Check if a key exists in a dictionary.""" + a = iaf(option) + assert 'json' in a diff --git a/rss_reader/starter/tests/test_starter.py b/rss_reader/starter/tests/test_starter.py new file mode 100644 index 00000000..4fe6b133 --- /dev/null +++ b/rss_reader/starter/tests/test_starter.py @@ -0,0 +1,45 @@ +"""A test suite for the starter module.""" + +from pytest import raises + + +from ..starter import Starter +from ..ecxeptions import NonNumericError + + +class BadURLError(Exception): + """Serves to simulate the occurrence of a BadURLError error.""" + pass + + +def test_starter_run_NonNumericError(): + """Check NonNumericError exception return + + Occurs when --limit is not equal to a number. + """ + + argv = {'source': 1, 'limit': 'abc'} + s = Starter(argv) + with raises(NonNumericError): + s.run() + + +def test_starter_run_BadURLError(monkeypatch): + """Verify that the BadURLError exception is being caught. + + Occurs when an invalid URL is specified. 
+ """ + + def mock_get_status(*args, **kwargs): + class Mock_BadURLError: + @classmethod + def get_data(self): + raise BadURLError + return Mock_BadURLError() + + monkeypatch.setattr(Starter, '_get_data_from_resource', mock_get_status) + + argv = {'source': 1, 'limit': 1} + s = Starter(argv) + with raises(BadURLError): + s.run() diff --git a/rss_reader/tests/__init__.py b/rss_reader/tests/__init__.py new file mode 100644 index 00000000..70da13d2 --- /dev/null +++ b/rss_reader/tests/__init__.py @@ -0,0 +1 @@ +"""Integration test package.""" diff --git a/rss_reader/tests/loader/__init__.py b/rss_reader/tests/loader/__init__.py new file mode 100644 index 00000000..e440e0d3 --- /dev/null +++ b/rss_reader/tests/loader/__init__.py @@ -0,0 +1 @@ +"""A test suite for the loader package.""" diff --git a/rss_reader/tests/loader/test_loader.py b/rss_reader/tests/loader/test_loader.py new file mode 100644 index 00000000..c1708c0e --- /dev/null +++ b/rss_reader/tests/loader/test_loader.py @@ -0,0 +1,39 @@ +"""A test suite for the loader module.""" + +import pytest + +from rss_reader.loader.loader import FromWebHandler +from rss_reader.crawler.crawler import SuperCrawler +from rss_reader.parser.parser import BeautifulParser + + +@pytest.fixture +def mock_crawler(mocker): + """Replaces crawler methods""" + + m = __name__ + '.SuperCrawler' + mock_cls = mocker.patch(m) + mock_cp = mock_cls.return_value + mock_cp.get_data.return_value = "Response!" + + +@pytest.fixture +def mock_parser(mocker): + """Replaces parser methods""" + + m = __name__ + '.BeautifulParser' + mock_cls = mocker.patch(m) + mock_cp = mock_cls.return_value + mock_cp.create_parser.return_value = "Create!" 
+ mock_cp.get_tags_text.return_value = iter(["Title!"]) + mock_cp.get_items.return_value = [{'mock_item': 1}] + + +def test_get_items_empty(mock_crawler, mock_parser): + """Check that a dictionary of the given structure is returned""" + + h = FromWebHandler('a', 'b', 'c', 1, SuperCrawler, BeautifulParser('s')) + r = h.get_data() + assert r == [{'title_web_resource': 'Title!', + 'link': 'c', + 'items': [{'mock_item': 1}]}] diff --git a/rss_reader/viewer/__init__.py b/rss_reader/viewer/__init__.py new file mode 100644 index 00000000..172970ed --- /dev/null +++ b/rss_reader/viewer/__init__.py @@ -0,0 +1 @@ +"""This package contains modules that work with viewer.""" diff --git a/rss_reader/viewer/tests/__init__.py b/rss_reader/viewer/tests/__init__.py new file mode 100644 index 00000000..1fc7cf93 --- /dev/null +++ b/rss_reader/viewer/tests/__init__.py @@ -0,0 +1 @@ +"""A test suite for the viewer package.""" diff --git a/rss_reader/viewer/tests/test_viewer.py b/rss_reader/viewer/tests/test_viewer.py new file mode 100644 index 00000000..e2c0356e --- /dev/null +++ b/rss_reader/viewer/tests/test_viewer.py @@ -0,0 +1,41 @@ +"""A test suite for the viewer module.""" + +from json import dumps + +from ..viewer import JSONViewHandler, StandartViewHandler + + +def test_StandartViewHandler_show(capsys): + """check that the show method prints the correct data to stdout.""" + + data = [{'title_web_resource': 'mock_show'}] + json_obj = StandartViewHandler() + json_obj.show(data) + out, err = capsys.readouterr() + out = out.strip('\n') + assert out == 'Feed: : mock_show' + + +def test_JSONViewHandler_show(capsys): + """Check that the show method prints data to stdout in the format json.""" + + data = {'test_dict': 1} + json_obj = JSONViewHandler({'json': 1}) + json_obj.show(data) + out, err = capsys.readouterr() + out = out.strip('\n') + x = dumps(data, indent=3) + assert out == x + + +def test_JSONViewHandler_chain_show(mocker): + """Check that the show method is called in 
chain.""" + + m = __name__ + '.StandartViewHandler.show' + mock_show = mocker.patch(m) + data = {'test_dict': 1} + json_obj = JSONViewHandler({}) + stdout_ = StandartViewHandler() + json_obj.set_next(stdout_) + json_obj.show(data) + assert mock_show.called diff --git a/rss_reader/viewer/viewer.py b/rss_reader/viewer/viewer.py new file mode 100644 index 00000000..8f80e520 --- /dev/null +++ b/rss_reader/viewer/viewer.py @@ -0,0 +1,129 @@ +"""This module contains specific viewers. + +Viewers display data in the desired form. +Each viewer is called in a chain. +""" + +from typing import Dict, List, Optional +from json import dumps + +from rss_reader.interfaces.iviewer.iviewer import IViewHandler +from rss_reader.decorator.decorator import send_log_of_start_function +from rss_reader.logger.logger import Logger + +log = Logger.get_logger(__name__) + + +class AbstractViewHandler(IViewHandler): + """The base class of the handler.""" + + _next_handler: Optional[IViewHandler] = None + + @send_log_of_start_function + def set_next(self, handler: IViewHandler) -> IViewHandler: + """Set the next viewer in the handler chain. + + :param handler: Next handler. + :type handler: IViewHandler + :return: Handler. + :rtype: IViewHandler + """ + + self._next_handler = handler + return handler + + @send_log_of_start_function + def show(self, data: List[dict]) -> None: + """Show data. + + :param data: Dictionary with data to be printed on the screen. + :type data: dict + """ + if self._next_handler: + return self._next_handler.show(data) + + +class StandartViewHandler(AbstractViewHandler): + """Displays data on standard output + + It is the base handler. + Executed when others have failed to process the data. + """ + + def show(self, data: List[dict]) -> None: + """Show data. + + :param data: Information td display. 
+ :type data: List[dict] + """ + for i in data: + log.debug('start displaying new block news.') + self._show_item(i) + log.debug('stop displaying new block news.') + + def _show_item(self, data: dict): + """Show data.""" + log.debug('start getting title.') + self._get_info(data, "title_web_resource", "\nFeed: ", end="\n\n\n") + log.debug('stop getting title.') + items = data.get('items') + if isinstance(items, list): + for i in items: + log.debug('start getting new news.') + self._get_info(i, "title", "Title") + self._get_info(i, "source", "Source") + self._get_info(i, "pubDate", "PubDate") + self._get_info(i, "link", "Link") + media_content = i.get("content") + + if not self._is_empty(media_content): + print("Media content:") + self._get_info(media_content, "title", + "[title of media content]") + self._get_info(media_content, "url", + "[source of media content]") + print('\n\n') + log.debug('stop getting new news.') + elif items: + print(items) + + def _is_empty(self, lst: List[Dict[str, str]]) -> bool: + result = True + if lst: + for i in lst: + if lst[i] is not None: + result = False + break + + return result + + def _get_info(self, dict_: dict, attr: str, str_: str, end='\n'): + """Printing a string from a dictionary""" + x = dict_.get(attr) + if x: + print(f'{str_}: {x}', end=end) + + +class JSONViewHandler(AbstractViewHandler): + """Process data as JSON.""" + + def __init__(self, request: Dict[str, str]) -> None: + """Initializer. + + :param request: A dictionary in which there may be a key + by which this handler will work. + :type request: Dict[str, str] + """ + self._request = request + + def show(self, data: List[dict]) -> None: + """Display the data as a JSON structure. + + :param data: Dictionary with data to be printed on the screen. 
+ :type data: dict + """ + log.debug("Start outputting news in json format.") + if self._request.get('json'): + print(dumps(data, indent=3, ensure_ascii=False)) + else: + super().show(data) diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..d1ca8021 --- /dev/null +++ b/setup.py @@ -0,0 +1,39 @@ +# import setuptools +from setuptools import setup, find_packages + +with open("README.md") as file: + read_me_description = file.read() + +setup( + name="rss_reader", + version="0.0.4", + author="Andrey Ozerets", + description="Super rss-reader.", + long_description=read_me_description, + long_description_content_type="text/markdown", + entry_points={ + 'console_scripts': [ + 'rss_reader = rss_reader.__main__:main', + ] + }, + install_requires=[ + "requests==2.28.0", + "beautifulsoup4 == 4.11.1", + "lxml==4.9.0", + "pandas==1.4.3", + "fpdf==1.7.2", + "Jinja2==3.1.2", + "Pillow==9.1.1" + ], + + packages=find_packages(exclude=['test*']), + package_data={ + 'rss_reader': ['saver/to_html/templates/main.html'], + }, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.8', +)