diff --git a/HTML_converter.py b/HTML_converter.py new file mode 100644 index 00000000..a96418b8 --- /dev/null +++ b/HTML_converter.py @@ -0,0 +1,96 @@ +import os +from json2html import * +from bs4 import BeautifulSoup +import datetime + + +with open('base.html', 'r', encoding='utf-8') as readfile: + html_file = readfile.read() + +soup = BeautifulSoup(html_file, "html.parser") +readfile.close() +def convert_to_html(dataset, date=None): + '''Method of data convertion into html''' + folder = 'html_convert' + if os.path.exists(folder): + pass + else: + os.mkdir(folder) + dir_path = "img_storage" + for data in dataset: + main_div = soup.new_tag('div', **{'class': 'card mb-3 em'}) + body_div = soup.new_tag('div', **{'class': 'card-body'}) + tag_p = soup.new_tag('p', **{'class': 'card-text'}) + body_div.append(tag_p) + for key, value in data.items(): + if date: + img_source = "News image_link:" + if key == img_source: + img_tag = soup.new_tag( + "img", **{'class': 'card-img-top'}, src=value, alt="No image link") + main_div.append(img_tag) + else: + # list to store files + res = [] + # Iterate directory + for path in os.listdir(dir_path): + # check if current path is a file + if os.path.isfile(os.path.join(dir_path, path)): + res.append(path) + if key == 'News image_link:': + img_name = value.split('/')[-1] + for i in res: + if img_name == i[:-5]: + img_tag = soup.new_tag( + "img", **{'class': 'card-img-top'}, src=f"../{dir_path}/{img_name}.jpeg", alt="No image link") + main_div.append(img_tag) + if key == 'News title:': + title_h5 = soup.new_tag('h5', **{'class': 'card-title'}) + title_h5.string = value + body_div.append(title_h5) + if key == "News description:": + p_tag = soup.new_tag('p', **{'class': 'card-text'}) + p_tag.string = value + body_div.append(p_tag) + if key == "News link:": + small_tag_link = soup.new_tag( + 'a', **{'class': 'text-muted links'}, href=value) + small_tag_link.string = value + tag_p.append(small_tag_link) + tag_p.append(soup.new_tag('br')) + 
if key == "News date:": + small_tag_date = soup.new_tag( + 'span', **{'class': 'text-muted'}) + small_tag_date.string = value + tag_p.append(small_tag_date) + tag_p.append(soup.new_tag('br')) + if key == "News source:": + small_tag_source = soup.new_tag( + 'a', **{'class': 'text-muted links'}, href=f"{value}
") + small_tag_source.string = value + tag_p.append(small_tag_source) + tag_p.append(soup.new_tag('br')) + if key == "News creator:": + small_tag_creator = soup.new_tag( + 'small', **{'class': 'text-muted'}) + small_tag_creator.string = value + tag_p.append(small_tag_creator) + if key == "News enclosure:": + small_tag_enclosure = soup.new_tag( + 'small', **{'class': 'text-muted'}, href=value) + small_tag_enclosure.string = value + tag_p.append(small_tag_enclosure) + body_div.append(tag_p) + main_div.append(body_div) + soup.body.div.append(main_div) + + if date: + file = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")+'(offline)'+'.html' + else: + file = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")+'(online)'+'.html' + + with open(f'{folder}/{file}', 'w', encoding='utf-8') as writefile: + writefile.write(str(soup)) + writefile.close() + print("\n", f"Your data has been successfully converted into .html and saved in {folder} folder as {file}", "\n") + return 'test' \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..3dc7278c --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,10 @@ +MIT License + +Copyright (c) 2022 Doniyor Karimov + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/PDF_converter.py b/PDF_converter.py new file mode 100644 index 00000000..b8d2eaf6 --- /dev/null +++ b/PDF_converter.py @@ -0,0 +1,41 @@ +from reportlab.pdfgen import canvas +import os +import textwrap +import datetime + + +info = "Your data has been successfully converted into .html and saved in 'pdf_convert' directory with 'local datetime' name" +def convert_to_pdf(infoFromJson, date=None): + '''Method of data convertion into html''' + folder = 'pdf_convert' + if os.path.exists(folder): + pass + else: + os.mkdir(folder) + if date: + file = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")+'(offline)'+'.pdf' + else: + file = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")+'(online)'+'.pdf' + can = canvas.Canvas(f'PDF_convert/{file}') + dir_path = "img_storage" + for item in infoFromJson: + x = 10 + y = 800 + for k, v in item.items(): + img_source = "News image_link:" + if img_source in item: + img_name= item[img_source].split('/')[-1] + can.drawImage(f"{dir_path}/{img_name}"+'.jpeg', 30, 300, width=250, height=200) + text = f"{k}: {v}" + wrap_text = textwrap.wrap(text, width=100) + can.drawString(x, y, wrap_text[0]) + try: + y -= 25 + can.drawString(x+50, y, wrap_text[1]) + except: + pass + y -= 25 + can.showPage() + can.save() + print("\n", info, "\n") + return "test" diff --git a/README.md b/README.md index c86d1e65..85247064 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,288 @@ -# How to create a PR with a homework task - -1. Create fork from the following repo: https://github.com/E-P-T/Homework. (Docs: https://docs.github.com/en/get-started/quickstart/fork-a-repo ) -2. Clone your forked repo in your local folder. -3. 
Create separate branches for each session.Example(`session_2`, `session_3` and so on) -4. Create folder with you First and Last name in you forked repo in the created session. -5. Add your task into created folder -6. Push finished session task in the appropriate branch in accordance with written above. - You should get the structure that looks something like that - -``` - Branch: Session_2 - DzmitryKolb - |___Task1.py - |___Task2.py - Branch: Session_3 - DzmitryKolb - |___Task1.py - |___Task2.py -``` - -7. When you finish your work on task you should create Pull request to the appropriate branch of the main repo https://github.com/E-P-T/Homework (Docs: https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork). -Please use the following instructions to prepare good description of the pull request: - - Pull request header should be: `Session - `. - Example: `Session 2 - Dzmitry Kolb` - - Pull request body: You should write here what tasks were implemented. - Example: `Finished: Task 1.2, Task 1.3, Task 1.6` +# Python RSS reader +`rss-reader` is a command line utility that makes it easy to view RSS feeds in a readable format. + +***Python 3.8 and above preferable*** + +***Tested on Windows and MacOS*** + +## Installation and usage + +You can install it by running the following command: + + pip install ... + +Now, you can run the utility by this command: + + rss_reader {YOUR ARGUMENTS} + +*OR* + +1. Clone the GitHub repository: + + git clone https://github.com/karimoff96/Final-task + +2. Change directory to `karimoff96/Final-task`. + + cd .../Final-task + +3. 
Install necessary dependencies: + + pip install -r requirements.txt + +Now, provided, your current directory is `/Final_task`, you can run `rss_reader` as a +package: + + rss_reader {your arguments} + +or, provided, your current directory is `/Final_task/rss_reader`, you can directly run the +module: + python rss_reader.py {your arguments} + + +!!! Before using short command (rss_reader {your arguments}) generate distribution packages for the package. These are archives that are uploaded to the Python Package Index and can be installed by pip. +Make sure you have the latest version of PyPA’s build installed: + + >>>pip install --upgrade build + +Now run this command from the same directory where pyproject.toml is located: + + >>>python -m build + +This command should output a lot of text and once completed should generate two files in the dist directory: + dist/ + RSS_reader-4.2-py3-none-any.whl + RSS_reader-4.2.tar.gz + +The tar.gz file is a source distribution whereas the .whl file is a built distribution. Newer pip versions preferentially install built distributions, but will fall back to source distributions if needed. You should always upload a source distribution and provide built distributions for the platforms your project is compatible with. In this case, our example package is compatible with Python on any platform so only one built distribution is needed. + + >>>pip install --editable . + +This command will install the package in develop mode, meaning it will just link back to where the sources are. If by any chance the sources are moved or deleted, importing the package will fail. + +## Functionality + +To see help message, please, use `-h/--help` argument: `rss_reader -h`. + + usage: rss_reader [-h] [--verbose] [--json] [-limit LIMIT] [-date DATE] [--to-html] [--to-pdf] [source] + + Pure Python command-line RSS reader. + + positional arguments: + source RSS URL + + optional arguments: + -h, --help Show this help message and exit. 
+ --version Print version info. + --verbose Output verbose status messages. + --json Print news as JSON. + --limit LIMIT Limit news amount to be processed. + --date DATE Get news published on a specific date from cache for further processing. + --to-html Convert news to .html format and save it by the specified folder path (FOLDER_PATH=html_convert, FILE_NAME=current datetime). + --to-pdf Convert news to .pdf format and save it by the specified folder path (FOLDER_PATH=pdf_convert, FILE_NAME=current datetime). + + +## Logging +If `--verbose` argument is **PASSED**, messages with either `INFO` or `ERROR` severities +of `rss_reader` are printed to console, + +If `--verbose` argument is passed, then all `rss_reader` logs are printed to console. + +## Configuration + +Application creates several files: ++ converted to supported formats directories and files: + Folders: + `html_convert`/, + `pdf_convert`/, + `img_storage`/, + Files: + `local_storage.json` +Application uses `base.html` file as the core of the html structure and uses it for creating new html formatted files +By default, the application files are stored inside the home directory in a freshly created `Final Task` folder: + + - Windows: C:\Users\User\Final Task + or C:\Users\Final Task + - Linux and MacOS: /home/Final Task + +## Cache JSON structure + +Cache represents a dictionary of URLs with corresponding lists of dictionaries of items, preceded by a dictionary of feed +info. 
+ +*Example:* + + [ + { + "Source": "https://news.yahoo.com/rss/", + "News title:": "More than 1 million voters switch to GOP in warning for Dems", + "News date:": "2022-06-27 04:08:17", + "News link:": "https://news.yahoo.com/more-1-million-voters-switch-040817454.html", + "News source:": "http://www.ap.org/", + "News image_link:": "https://s.yimg.com/uu/api/res/1.2/j8.JKTuo4zusFSxicKl6iw--~B/aD00MDAwO3c9NjAwMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/ap.org/f2da5e0ede64ea49e83215a3895b1ac5" + }, + { + "Source": "https://news.yahoo.com/rss/", + "News title:": "The impact of Kavanaugh's confirmation on the 2018 elections may reveal how the reversal of Roe v. Wade could impact this year's midterms", + "News date:": "2022-06-27 00:36:15", + "News link:": "https://news.yahoo.com/impact-kavanaughs-confirmation-2018-elections-003615363.html", + "News source:": "https://www.insider.com/", + "News image_link:": "https://s.yimg.com/uu/api/res/1.2/XEOtLfPIRULP6UAqNAihlA--~B/aD00NDgwO3c9NTk3MzthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/insider_articles_922/9ef49ef9b6cfe0e457ff6a2ebc08d87f" + }, + { + "Source": "https://news.yahoo.com/rss/", + "News title:": "\"It's a terrible scene\": At least 21 teens die in tavern mystery", + "News date:": "2022-06-26 18:06:00", + "News link:": "https://news.yahoo.com/terrible-scene-least-21-teens-180658249.html", + "News source:": "https://www.cbsnews.com/", + "News image_link:": "https://s.yimg.com/uu/api/res/1.2/7fJJnU5hoPxnY6a5h0Ki5g--~B/aD0yNTYwO3c9Mzg0MDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/cbs_news_897/0086668e97cf2885e4decaffd9611be7" + }, + { + "Source": "https://news.yahoo.com/rss/", + "News title:": "Officials: Georgia man sentenced to die kills self in prison", + "News date:": "2022-06-27 12:50:43", + "News link:": "https://news.yahoo.com/officials-georgia-man-sentenced-die-125043110.html", + "News source:": "http://www.ap.org/", + "News image_link:": 
"https://s.yimg.com/uu/api/res/1.2/zEeBoPLQVzw1u2VjZt.THA--~B/aD03NDk7dz0xMDAwO2FwcGlkPXl0YWNoeW9u/https://media.zenfs.com/en/ap.org/5cf8e923a8d9dec8758480785184f376" + }, + { + "Source": "https://news.yahoo.com/rss/", + "News title:": "Navy SEALs 'Hell Week' autopsy reveals cause of death of Manalapan man four months later", + "News date:": "2022-06-26 09:00:34", + "News link:": "https://news.yahoo.com/navy-seals-hell-week-autopsy-090034460.html", + "News source:": "https://www.app.com", + "News image_link:": "https://s.yimg.com/uu/api/res/1.2/Gw1CaDrK1F6QV9TZjnZ8bw--~B/aD0xNjAwO3c9MjQwMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/app-com-asbury-park-press/ad51ae10218c6faadf835e52e3616c42" + } + ... + +*Some notes*: + ++ `--json`-printed results are different from ones, stored in cache; as the cache file is just json formatted file the user can easily explore and + modify it (modifing is not reccomended), whereas `--json` argument is a part of the user interface, that's why its output is user-friendly. + +`--json` output example: + + { + "Feed title": "Yahoo News - Latest News & Headlines", + "Feed link": "https://www.yahoo.com/news", + "Feed description:": "The latest news and headlines from Yahoo! News. 
Get breaking news stories and in-depth coverage with videos and photos.", + "Feed date:": "Mon, 27 Jun 2022 10:05:03 -0400", + "Feed items": [ + { + "Source": "https://news.yahoo.com/rss/", + "News title:": "Officials: Georgia man sentenced to die kills self in prison", + "News date:": "2022-06-27 12:50:43", + "News link:": "https://news.yahoo.com/officials-georgia-man-sentenced-die-125043110.html", + "News source:": "http://www.ap.org/", + "News image_link:": "https://s.yimg.com/uu/api/res/1.2/zEeBoPLQVzw1u2VjZt.THA--~B/aD03NDk7dz0xMDAwO2FwcGlkPXl0YWNoeW9u/https://media.zenfs.com/en/ap.org/5cf8e923a8d9dec8758480785184f376" + }, + { + "Source": "https://news.yahoo.com/rss/", + "News title:": "13 daring looks celebrities wore on the BET Awards red carpet", + "News date:": "2022-06-27 12:35:18", + "News link:": "https://news.yahoo.com/13-daring-looks-celebrities-wore-123518632.html", + "News source:": "https://www.insider.com/", + "News image_link:": "https://s.yimg.com/uu/api/res/1.2/28ulMokGwYz3HLNudzDI_A--~B/aD0xOTk5O3c9MjY2NjthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/insider_articles_922/f18744f132d7bb2db49022dae6704f4d" + }, + { + "Source": "https://news.yahoo.com/rss/", + "News title:": "Navy SEALs 'Hell Week' autopsy reveals cause of death of Manalapan man four months later", + "News date:": "2022-06-26 09:00:34", + "News link:": "https://news.yahoo.com/navy-seals-hell-week-autopsy-090034460.html", + "News source:": "https://www.app.com", + "News image_link:": "https://s.yimg.com/uu/api/res/1.2/Gw1CaDrK1F6QV9TZjnZ8bw--~B/aD0xNjAwO3c9MjQwMDthcHBpZD15dGFjaHlvbg--/https://media.zenfs.com/en/app-com-asbury-park-press/ad51ae10218c6faadf835e52e3616c42" + } + ... + ... + +Why is there a list of feeds inside `--json` structure, not just a single feed? Inside cache file there may be items +with the same `pubDate`, but they may belong to different feeds. 
So, when there are such items and a user +passes `--date DATE` argument which represents this exact date, then these several items are returned and attributed to +several newly created `Feed` instances. After that, these `Feed` instances are printed. Printing returned news could be +implemented without respect to the feeds they belong to, but in this case it would be hard to distinguish them. + +## Parsing XML + +XML is parsed by parser implemented from scratch, it exploits the idea of XML *tokenization*, dom-tree is created from +tokens. + +*Features*: + ++ `XML CDATA` parsing support: whenever CDATA is encountered in XML, it gets recursively parsed and substituted by a + normal text in the final form. + \ + XML CDATA example link: https://rss.art19.com/apology-line ++ detecting `invalid XML`: parser notifies user with a wide range of messages whenever invalid syntax or some mistake + was encountered in XML document. + \ + Invalid XML example: http://feeds.bbci.co.uk/news/world/rss.xml + \ + Its fragment (notice tags order): + + In some rss formatted urls there are not given some items like image_link, creator, description... 
so they are omitted and the utility prints out the existing ones + +## Tested RSS links + + + + ++ `<` char inside text is parsed correctly, as well as `commented pieces` are skipped properly: + + https://defenseofthepatience.libsyn.com/rss + + ++ `Big channels` are parsed correctly: + + https://feeds.megaphone.fm/WWO3519750118 + + https://feeds.simplecast.com/54nAGcIl + + ++ `CDATA` is parsed correctly: + + https://rss.art19.com/apology-line + ++ `Image` is parsed correctly: + + https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml + ++ Feeds in `Russian` are handled completely correctly: + + https://rss.dw.com/xml/rss-ru-rus + + https://people.onliner.by/feed + + https://brestcity.com/blog/feed + + https://rss.dw.com/xml/rss-ru-news + + https://lenta.ru/rss/top7 + + https://www.liga.net/tech/battles/rss.xml + + https://vse.sale/news/rss + + ++ Some others: + + https://news.yahoo.com/rss/ + + https://www.liquiddota.com/rss/news.xml + +## Testing + +Before testing please pay attention to notes inside the functions. Some tests require folders or images which are stored locally. The best solution is to run the utility a couple of times checking with different arguments and options. +The project is tested for the file conversions, url validation, argparse options and decoding the html functions. 
+ +Modules tested: + +| Name | Stmts | Miss | Cover | +| ------------------- | ---------- | ---------- | ---------- | +| HTML_converter.py | 77 | 13 | 83% | +| PDF_converter.py | 36 | 2 | 94% | +| rss_reader.py | 319 | 175 | 45% | +| test.py | 48 | 1 | 98% | +| `TOTAL` | `480` | `191` | `60%` | + + *Test coverage is 60%.* diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/__main__.py b/__main__.py new file mode 100644 index 00000000..b0931317 --- /dev/null +++ b/__main__.py @@ -0,0 +1,15 @@ +""" +__main__.py module makes it possible to run application as module like this: rss_reader +""" + +from rss_reader import read_defs +import sys +from pathlib import Path + +# add rss_reader package path to sys.path +rss_reader_pkg_dir_path = str(Path(__file__).parent.resolve()) +sys.path.insert(1, rss_reader_pkg_dir_path) + + +if __name__ == "__main__": + read_defs() diff --git a/base.html b/base.html new file mode 100644 index 00000000..29e2bb3d --- /dev/null +++ b/base.html @@ -0,0 +1,68 @@ + + + + + + + + + + + + Convert + + + + +

RSS Reader html converted version

+
+ +
+ + + + + + + + \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..2c80ae73 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[tool.black] +line-length = 88 +target-version = ['py39'] +exclude = ''' +( + \.eggs + | \.git + | build + | dist + | venv + | .venv +) +''' + +[tool.isort] +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true +skip_gitignore = true +skip_glob = ['**/.venv/**'] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..4b9fe37e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +beautifulsoup4==4.11.1 +certifi==2022.5.18.1 +charset-normalizer==2.0.12 +colorama==0.4.4 +decorator==5.1.1 +idna==3.3 +lxml==4.9.0 +py-console==0.1.4 +requests==2.28.0 +soupsieve==2.3.2.post1 +urllib3==1.26.9 +validators==0.20.0 +python-dateutil==2.8.2 +reportlab==3.6.10 +json2html==1.3.0 \ No newline at end of file diff --git a/rss_reader.py b/rss_reader.py new file mode 100644 index 00000000..4d0a7e12 --- /dev/null +++ b/rss_reader.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +"""The main workspace for RSS_reader application""" +import datetime +import argparse +import html +import validators +import json +from py_console import console +from pprint import pprint #PrettyPrint function +from bs4 import BeautifulSoup +import os.path +from dateutil import parser as pr +from HTML_converter import convert_to_html #HTML converter function +import requests # request img from web +import shutil # save img locally +from PDF_converter import convert_to_pdf # PDF converter function + +def all_args(): + parser = argparse.ArgumentParser( + description="Pure Python command-line RSS reader.") + parser.add_argument("source", help="RSS URL", nargs="?") + parser.add_argument( + "--version", help="Print version info", action="store_true") + parser.add_argument( + "--json", 
help="Print result as JSON in stdout", action="store_true") + parser.add_argument( + "--verbose", help="Output verbose status messages", action="store_true") + parser.add_argument( + "--limit", help="Limit news topics if this parameter provided", type=int) + parser.add_argument( + "--date", help="Get news published on a specific date from cache for further processing.", nargs="*") + parser.add_argument( + "--to-html", help="Convert news to .html format and save it into 'html_convert' directory with 'local datetime' name", action='store_true') + parser.add_argument( + "--to-pdf", help="Convert news to .pdf format and save it into 'pdf_convert' directory with 'local datetime' name", action='store_true') + return parser + + + + +def datetime_now(): + '''Method of defining current datetime''' + return datetime.datetime.now().astimezone().strftime('%Y.%m.%d %H:%M:%S') + +def console_log(msg): + '''Function for printing verbose log messages''' + if verbose_true: + console.info(f'{datetime_now()} - {msg}', showTime=False) + +def console_error(msg): + '''Function for printing verbsoe error messages''' + console.error(f'{datetime_now()} - {msg}', showTime=False) + +def check_version(): + """Method for revealing current version of the utility""" + print("Version 4.3") + +def clean_desc(description): + '''Function for decoding some part of feed item''' + decoded_string = html.unescape(description) + soup = BeautifulSoup(decoded_string, features="lxml") + return soup.get_text() + +def check_url(url): + '''Method for checking url`s validation and availability''' + console_log(f'Checking for validation of URL: {url}') + if validators.url(str(url)): + console_log('URL validation completed') + try: + console_log('Requesting data from URL') + if requests.get(url).status_code == 200: + console_log( + 'Request completed successfully. 
Reading and decoding request data') + return True + console_error('URL is not responding') + return False + except: + console_error(f'This URL can`t be reached ') + return False + console_error(f'URL validation error') + return False + + +argument = all_args().parse_args() +verbose_true = True if argument.verbose else False + +def check_storage(): + '''Method of checking for existance of storage, if the storage is not found, the function will automatically create one''' + console_log('Checking for storage existance') + if os.path.exists('local_storage.json'): + console_log('Scanning the contaminants of the file') + if os.stat("local_storage.json").st_size <= 2: + with open('local_storage.json', 'w', encoding='utf-8') as writefile: + json.dump(list(), writefile, ensure_ascii=False, indent=4) + else: + with open('local_storage.json', encoding='utf-8') as f: + try: + f.read() + except: + with open('local_storage.json', 'w', encoding='utf-8') as writefile: + json.dump(list(), writefile, ensure_ascii=False, indent=4) + else: + console_log('Local storage is not found') + console_log("Creating local storage 'local_storage.json'") + with open('local_storage.json', 'w', encoding='utf-8') as writefile: + json.dump(list(), writefile, ensure_ascii=False, indent=4) + +def get_data(arguments=None): + print(arguments) + """The main function for collectiong the major part of an Feed and its` items.""" + try: + if arguments is None: + arguments = all_args().parse_args() + given_limit = arguments.limit + console_log(f'Limit is given {given_limit}') + if check_url(arguments.source): + console_log(f'Parsing data from URL') + ready_url = requests.get(arguments.source) + soup = BeautifulSoup(ready_url.content, features="xml") + items = soup.find_all('item') + feeds = soup.find('channel') + feed = feeds.find_all('title')[0].text + feed_link = soup.find_all('link')[0].text + feed_desc = soup.find_all('description')[0].text + feed_date = soup.find_all('pubDate')[0].text + if 
arguments.limit: + if given_limit >= len(items): + given_limit = len(items) + else: + console_log(f'Limit is not given') + given_limit = len(items) + news = [] + links = [] + feed_json = {} + item_list = [] + console_log( + "Searching and collecting data from tags: ('title', 'pubDate', 'link', 'content', 'creator', 'enclosure', 'description')") + for i in range(given_limit): + items_json = {} + news.append("\n") + feed_json['Feed title'] = feed + feed_json['Feed link'] = feed_link + feed_json['Feed description:'] = feed_desc + feed_json['Feed date:'] = feed_date + item_date = items[i].pubDate.text + d1 = str(pr.parse(item_date).strftime("%Y-%m-%d %H:%M:%S")) + items_json.update({'Source': arguments.source}) + if items[i].title: + title = items[i].title.text + news.append(f"Title: {title}") + items_json['News title:'] = title + if items[i].pubDate: + news.append(f"Date: {d1}") + items_json['News date:'] = d1 + if items[i].link: + link = items[i].link.text + news.append(f"Link: {link}") + links.append(f"{link}(link)") + items_json['News link:'] = link + if items[i].source: + source = items[i].source['url'] + news.append(f"Source: {source}") + items_json['News source:'] = source + if items[i].content: + image_link = items[i].content['url'] + news.append(f"Image_link: {image_link}") + links.append(f"{image_link}(Image_link)") + items_json['News image_link:'] = image_link + if items[i].creator: + creator = items[i].creator.text + news.append(f"Creator: {creator}") + items_json['News creator:'] = creator + if items[i].enclosure: + enclosure = items[i].enclosure['url'] + news.append(f"Enclosure: {enclosure}") + items_json['News enclosure:'] = enclosure + if items[i].description: + description_decode = str(items[i].description) + description = clean_desc(description_decode) + news.append(f"\nDescription: {description}") + items_json['News description:'] = description + item_list.append(items_json) + console_log('Data successfully collected') + console_log( + f'Organising 
and preparing datas according to given limit {given_limit}') + if arguments.json: + console_log('Converting data into JSON') + console_log('Printing all collected information ') + + print(f'\nFeed: {feed}') + + img_folder='img_storage' + if os.path.exists(img_folder): + pass + else: + os.mkdir(img_folder) + for new in news: + print(new) + + print('\nLinks:') + for i in range(len(links)): + print(f'[{i+1}]: {links[i]}') + print("\n") + + with open('local_storage.json', "r", encoding='utf-8') as file: + data = json.loads(file.read()) + file.close() + for item in item_list: + if os.stat("local_storage.json").st_size == 2: + data.append(item) + else: + if item not in data: + data.append(item) + + with open('local_storage.json', 'w', encoding='utf-8') as writefile: + json.dump(data, writefile, ensure_ascii=False, indent=4) + writefile.close() + + if arguments.json: + """Function to convert feeds to json format.""" + item = {'Feed items': item_list} + feed_json.update(item) + json_object = json.dumps(feed_json, indent=4, ensure_ascii=False) + print(json_object) + + + for item in item_list: + if 'News image_link:' in item: + img_name=item['News image_link:'].split('/')[-1]+'.jpeg' + res = requests.get(item['News image_link:'], stream=True) + if res.status_code == 200: + with open(f'{img_folder}/{img_name}', 'wb') as f: + shutil.copyfileobj(res.raw, f) + + if arguments.to_html: + """Function to convert feeds to html format.""" + data_dicts_html = [] + for data in item_list: + data_dicts_html.append(data) + convert_to_html(data_dicts_html) + console_log("The result for given date successfully convert into html") + + if arguments.to_pdf: + """Function to convert feeds to pdf format.""" + data_dicts_pdf = [] + for data in item_list: + data_dicts_pdf.append(data) + convert_to_pdf(data_dicts_pdf) + console_log("The result for given date successfully convert into pdf") + return True + except: + return False + + +def get_date(arguments = None): + '''Distrubuting collected data 
from local storage "local_storage.json" for different functions according to required options''' + try: + if arguments is None: + arguments = all_args().parse_args() + date_data = [] + if len(arguments.date)>0: + given_date = arguments.date[0] + else: + given_date = arguments.date + given_source = arguments.source + given_limit = arguments.limit + data_json = arguments.json + console_log('Reading the file') + with open('local_storage.json', "r", encoding='utf-8') as file: + data = json.loads(file.read()) + if given_source: + console_log(f'Searching data with cource {given_source}') + console_log('Converting collected data into json') + console_log('Collecting data according to given requirements: [source], [limit], [date], [json]') + console_log('Printing final result') + if arguments.to_html and arguments.to_pdf: + console_log('Converting result into html') + console_log('Converting result into pdf') + elif arguments.to_pdf: + console_log('Converting result into pdf') + elif arguments.to_html: + console_log('Converting result into html') + for item in data: + day = str(pr.parse(item['News date:']).strftime("%Y%m%d")) + source = item['Source'] + if len(given_date)==0 and given_source == source: + if given_source == source: + date_data.append(item) + elif len(given_date) == 0 and given_source is None: + date_data.append(item) + elif given_date == day: + if given_source == source: + date_data.append(item) + if data_json: + + if len(date_data[:given_limit]) > 0: + if arguments.to_html: + + convert_to_html(date_data[:given_limit], arguments.date) + elif arguments.to_pdf: + + convert_to_pdf(date_data[:given_limit], arguments.date) + else: + print(json.dumps( + date_data[:given_limit], indent=4, ensure_ascii=False)) + print('Total data found:', len(date_data[:given_limit])) + else: + if verbose_true: + console_error('No data found') + else: + print('No data found') + + elif arguments.to_html: + + convert_to_html(date_data[:given_limit], arguments.date) + elif 
arguments.to_pdf: + + convert_to_pdf(date_data[:given_limit], arguments.date) + for i in date_data[:given_limit]: + pprint(i) + print('Total data found:', len(date_data[:given_limit])) + elif given_source is None: + console_log('Searching source is not given. Looking up local storage with given date') + console_log('Converting collected data into json') + console_log('Collecting data according to given requirements: [limit], [date], [json]') + console_log('Printing final result') + if arguments.to_html and arguments.to_pdf: + console_log('Converting result into html') + console_log('Converting result into pdf') + elif arguments.to_pdf: + console_log('Converting result into pdf') + elif arguments.to_html: + console_log('Converting result into html') + for item in data: + day = str(pr.parse(item['News date:']).strftime("%Y%m%d")) + source = item['Source'] + if given_date == day: + date_data.append(item) + elif len(given_date)==0: + date_data.append(item) + if data_json: + if len(date_data[:given_limit]) > 0: + if arguments.to_html: + convert_to_html(date_data[:given_limit], arguments.date) + elif arguments.to_pdf: + convert_to_pdf(date_data[:given_limit], arguments.date) + else: + print(json.dumps( + date_data[:given_limit], indent=4, ensure_ascii=False)) + print('Total data found:', len(date_data[:given_limit])) + else: + if verbose_true: + console_error('No data found') + else: + print('No data found') + else: + if arguments.to_html: + console_log('Converting result into html') + if arguments.to_pdf: + console_log('Converting result into pdf') + convert_to_html(date_data[:given_limit], arguments.date) + elif arguments.to_pdf: + if arguments.to_html: + console_log('Converting result into html') + console_log('Converting result into pdf') + convert_to_pdf(date_data[:given_limit], arguments.date) + for i in date_data[:given_limit]: + pprint(i) + print('Total data found:', len(date_data[:given_limit])) + + file.close() + return True + except: + return False + + +def 
read_defs(): + """Method to print obtained feeds to console.""" + if verbose_true: + console_log('Verbose mode turned on') + if argument.json: + console_log('Json mode turned on') + check_storage() + if argument.date is None and argument.source: + get_data() + else: + if argument.date or len(argument.date) == 0: + console_log('Date mode turned on') + console_log(f'Given date is {argument.date}') + get_date() + if argument.version: + check_version() + + + +if __name__ == '__main__': + read_defs() diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..65277469 --- /dev/null +++ b/setup.py @@ -0,0 +1,42 @@ +import os +from setuptools import find_packages, setup +def read(file_name): + with open(os.path.join(os.path.dirname(__file__), file_name)) as file: + return file.read() + + +setup( + name='RSS_reader', + version='4.3', + description='python rss_reader, helps to read and get information from xml and rss formatted urls ', + long_description=read("README.md"), + python_requires=">=3.7", + license="MIT", + classifiers=["Programming Language :: Python :: 3", + "Operating System :: OS Independent", + "License :: OSI Approved :: MIT License"], + author='Doniyorbek Karimov', + author_email='doniyorkarimoff96@gmail.com', + url='https://github.com/karimoff96/Final-task', + packages=find_packages(where='src'), + include_package_data=True, + entry_points={'console_scripts': ['rss_reader=rss_reader:read_defs']}, + keywords=['Rss_reader'], + # install_requires=read("requirements.txt").splitlines(), + install_requires=['beautifulsoup4 == 4.11.1', + 'certifi == 2022.5.18.1', + 'charset-normalizer == 2.0.12', + 'colorama == 0.4.4', + 'decorator == 5.1.1', + 'idna == 3.3', + 'lxml == 4.9.0', + 'py-console == 0.1.4', + 'requests == 2.28.0', + 'soupsieve == 2.3.2.post1', + 'urllib3 == 1.26.9', + 'validators == 0.20.0', + 'python-dateutil==2.8.2', + 'reportlab==3.6.10', + 'json2html==1.3.0', + ], +) diff --git a/test.py b/test.py new file mode 100644 index 
00000000..b9b56595 --- /dev/null +++ b/test.py @@ -0,0 +1,85 @@ + +import unittest +from rss_reader import all_args, check_url, clean_desc, get_data +from PDF_converter import convert_to_pdf +from HTML_converter import convert_to_html + + +class TestRssReader(unittest.TestCase): + '''Testing the projcet`s main functions with Unittest module''' + def setUp(self) -> None: + + self.text = '''<p><a href="https://auto.onliner.by/2022/06/27/v-uruche-ukreplyayut-most"><img src="https://content.onliner.by/news/thumbnail/18dcaf08bb50e746d6e7f1b1007881e6.jpeg" alt="" + /></a></p><p>О том, что с путепроводом на пересечении пр. Независимости и ул. ;''' + + self.text2 = '''О том, что с путепроводом на пересечении пр. Независимости и ул. ;''' + + self.url = "https://news.yahoo.com/rss/" + self.dataset = [{ + "Source": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml", + "News title:": "Shanghai Wrestles With Psychological Scars From Lockdown", + "News date:": "2022-06-29 09:00:46", + "News link:": "https://www.nytimes.com/2022/06/29/world/asia/shanghai-lockdown-china.html", + "News image_link:": "https://static01.nyt.com/images/2022/06/28/world/00china-mentalhealth-01/00china-mentalhealth-01-moth.jpg", + "News creator:": "Vivian Wang", + "News description:": "The lockdown fueled anxiety, fear and depression among the city’s residents. Experts have warned that the mental health impact of the confinement will be long-lasting." + }] + + self.convert_to_html = convert_to_html(self.dataset) + self.convert_to_pdf = convert_to_pdf(self.dataset) + self.url_validation = check_url(self.url) + self.desc_clean = clean_desc(self.text) + self.parser = all_args() + + def test_convet_to_html(self): + '''This function test the html convertion. Berfore testing please run rss_reader url {your option} operation''' + self.assertEqual(self.convert_to_html, 'test') + + def test_convert_to_pdf(self): + '''Function for testing pdf convertion. This func requires images from local storage. 
If there none, the test wont work''' + self.assertEqual(self.convert_to_pdf, 'test') + + def test_url_validation(self): + '''Testing URL validation''' + self.assertEqual(self.url_validation, True) + + def test_args_limit(self): + '''Limit tester''' + parsed = self.parser.parse_args(['--limit', '5']) + self.assertEqual(parsed.limit, 5) + + def test_args_source(self): + parsed = self.parser.parse_args(['https://news.google.com/rss/']) + self.assertEqual(parsed.source, 'https://news.google.com/rss/') + + def test_args_date(self): + '''Gets date as a str''' + parsed = self.parser.parse_args(['--date', '20220630']) + self.assertEqual(parsed.date[0], '20220630') + + def test_clean_desc(self): + unittest.TestCase.maxDiff = None + print(self.desc_clean) + self.assertEqual(self.desc_clean, self.text2) + + def test_arg_source_news(self): + test = get_data(self.parser.parse_args(['https://news.google.com/rss/', '--limit', '1'])) + self.assertEqual(test, True) + + def test_arg_source_news_json(self): + test = get_data(self.parser.parse_args( + ['https://news.google.com/rss/', '--limit', '1', '--json'])) + self.assertEqual(test, True) + + def test_arg_source_news_json_verbose(self): + test = get_data(self.parser.parse_args( + ['https://news.google.com/rss/', '--limit', '1', '--json', '--verbose'])) + self.assertEqual(test, True) + + def test_arg_news_date(self): + test = get_data(self.parser.parse_args( + ['https://news.google.com/rss/', '--limit', '1', '--date', '20220628' ])) + self.assertEqual(test, True) + +if __name__ == '__main__': + unittest.main()