diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..956a693 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +__pycache__ +.env +.git +.venv +.idea \ No newline at end of file diff --git a/.flake8 b/.flake8 index a4ca38c..bb835ee 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 79 -extend-ignore = E501, W503, E203, E402, E712 +extend-ignore = E501, W503, E203, E402, E712, W605 exclude = .git, backend/alembic/versions/*, diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 389ecf4..3bec1c5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,6 +1,10 @@ name: Satire Pulp parser -on: [push] +on: + push: + branches: + - "**" + pull_request: jobs: lint: @@ -30,3 +34,79 @@ jobs: - name: Flake8 Check run: flake8 . + + tests: + name: Pytest + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.11" + + - name: Upgrade pip + run: python -m pip install --upgrade pip + + - name: Install Dependencies + run: pip install -r requirements.txt + + - name: Run Pytest + run: pytest -v + + push_branch_dev_to_docker_hub: + name: Build and Push Docker(dev) + runs-on: ubuntu-latest + needs: lint + + if: github.ref == 'refs/heads/dev' + + steps: + - name: Check out the repo + uses: actions/checkout@v4 + + - name: Setup Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Push to Docker Hub + uses: docker/build-push-action@v5 + with: + push: true + tags: | + dmsn/satire_pulp_parser:dev + + push_branch_main_to_docker_hub: + name: Build and Push Docker(prod) + runs-on: ubuntu-latest + needs: lint + + if: github.ref == 'refs/heads/main' + + steps: + - name: Check out the repo + uses: actions/checkout@v4 + + - 
name: Setup Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Push to Docker Hub + uses: docker/build-push-action@v5 + with: + push: true + tags: | + dmsn/satire_pulp_parser:prod diff --git a/.gitignore b/.gitignore index b7faf40..a2004c9 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ local_settings.py db.sqlite3 db.sqlite3-journal + # Flask stuff: instance/ .webassets-cache @@ -169,11 +170,8 @@ dmypy.json cython_debug/ # PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ + +.idea/ # Abstra # Abstra is an AI-powered process automation framework. @@ -205,3 +203,9 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + + +.DS_Store + +*.db +last_id.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..423d0b2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.11-slim +LABEL maintainer="Dmitry Titenkov " +LABEL version="1.0" +LABEL description="Satire Pulp parser" +WORKDIR /app +COPY requirements.txt . +RUN pip3 install -r /app/requirements.txt --no-cache-dir +COPY . . 
diff --git a/README.md b/README.md index 0af2d79..1ebdad8 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,165 @@ -# satire_pulp_parser -Парсер с сайта сатирических новостей [Панорама](https://panorama.pub/ "Перейти") - -![Satire Pulp parser](https://github.com/dmsnback/satire_pulp_parser/actions/workflows/main.yml/badge.svg) -![Python](https://img.shields.io/badge/python-3.11-blue) -![Tests](https://img.shields.io/badge/tests-pytest-brightgreen) -![Black](https://img.shields.io/badge/code%20style-black-000000) -![License](https://img.shields.io/badge/license-MIT-green) + + +## Satire Pulp Parser + +![Satire Pulp parser](https://github.com/dmsnback/satire_pulp_parser/actions/workflows/main.yml/badge.svg) ![Python](https://img.shields.io/badge/python-3.11-blue) ![Tests](https://img.shields.io/badge/tests-pytest-brightgreen) ![Black](https://img.shields.io/badge/code%20style-black-000000) ![License](https://img.shields.io/badge/license-MIT-green) + + +- [Описание](#Описание) +- [Технологии](#Технологии) +- [Тестирование](#Тестирование) +- [Шаблон заполнения .env-файла](#Шаблон) +- [Запуск проекта](#Запуск) +- [Автор](#Автор) + + + +### Описание + +Проект представляет собой парсер сатирических новостей с сайта [Панорама](https://panorama.pub/ "Перейти") и Telegram-бот для автоматической рассылки новых публикаций пользователям. 
+ +**Возможности:** + +```md + - Парсинг новостей с сайта panorama.pub + - Сохранение новостей в PostgreSQL + - Автоматическая рассылка новых новостей через Telegram-бот + - Планировщик запуска парсера каждые 20 минут + - Асинхронная работа бота с данными +``` + +Парсер написан с использованием **Scrapy**, **SQLAlchemy**, **PostgreSQL** и **Python Telegram Bot** + +В проекте настроен **CI pipeline** с использованием **GitHub Actions**: + +```md + - Автоматическая проверка кода (black, isort, flake8) + - Запуск unit-тестов (`pytest`) + - Сборка Docker-образа + - Публикация образа в **Docker Hub** при пуше в соответствующие ветки +``` + +```md +Проект адаптирован для использования **PostgreSQL** и развёртывания в контейнерах **Docker**. +``` + +> [Вернуться в начало](#Начало) + + + +### Технологии + +[![Python](https://img.shields.io/badge/Python-1000?style=for-the-badge&logo=python&logoColor=ffffff&labelColor=000000&color=000000)](https://www.python.org) +[![Scrapy](https://img.shields.io/badge/Scrapy-1000?style=for-the-badge&logo=scrapy&logoColor=ffffff&labelColor=000000&color=000000)](https://docs.scrapy.org/en/latest/index.html) +[![python_telegram_bot](https://img.shields.io/badge/python_telegram_bot-1000?style=for-the-badge&logo=telegram&logoColor=ffffff&labelColor=000000&color=000000)](https://docs.python-telegram-bot.org/en/stable/index.html) +[![Postgres](https://img.shields.io/badge/Postgres-1000?style=for-the-badge&logo=postgresql&logoColor=ffffff&labelColor=000000&color=000000)](https://www.postgresql.org) +[![SQLAlchemy](https://img.shields.io/badge/SQLAlchemy-1000?style=for-the-badge&logo=sqlalchemy&logoColor=ffffff&labelColor=000000&color=000000)](https://www.sqlalchemy.org) +[![Docker](https://img.shields.io/badge/Docker-1000?style=for-the-badge&logo=docker&logoColor=ffffff&labelColor=000000&color=000000)](https://www.docker.com) 
+[![Pytest](https://img.shields.io/badge/Pytest-1000?style=for-the-badge&logo=pytest&logoColor=ffffff&labelColor=000000&color=000000)](https://docs.pytest.org/en/stable/index.htmlc) +[![GitHub Actions](https://img.shields.io/badge/github%20actions-%232671E5.svg?style=for-the-badge&logo=githubactions&logoColor=ffffff&labelColor=000000&color=000000)](https://github.com/features/actions) + +> [Вернуться в начало](#Начало) + + + +### Тестирование + +В проекте реализованы **unit-тесты** с использованием `pytest`. + +Запуск тестов локально: + +```python +pytest -v +``` + +> [Вернуться в начало](#Начало) + + + +### Шаблон заполнения .env-файла + +> `env.example` с дефолтными значениями расположен в корневой папке + +```python +TELEGRAM_TOKEN=1234567890:Telegram-Token # Токен Telegram бота +DATABASE_URL_SYNC = postgresql+psycopg2://postgres:postgres@db:5432/satire_pulp_db # Указываем адрес БД (Синхронная версия) +DATABASE_URL_ASYNC=postgresql+asyncpg://postgres:postgres@db:5432/satire_pulp_db # Указываем адрес БД (Асинхронная версия) +POSTGRES_DB = satire_pulp_db # Имя базы дданных +POSTGRES_USER = postgres # Имя юзера PostgreSQL +POSTGRES_PASSWORD = yourpassword # Пароль юзера PostgreSQL +POSTGRES_HOST=db # Имя сервиса PostgreSQL в docker-compose +POSTGRES_PORT=5432 # Порт PostgreSQL внутри контейнера +``` + +> [Вернуться в начало](#Начало) + + + +### Запуск проекта + +- Склонируйте репозиторий + +```python +git clone git@github.com:dmsnback/satire_pulp_parser.git +``` + +- Установите и активируйте виртуальное окружение + +```python +python3 -m venv venv +``` + +Для `Windows` + +```python +source venv/Scripts/activate +``` + +Для `Mac/Linux` + +```python +source venv/bin/activate +``` + +- Установите зависимости из файла +`requirements.txt` + +```python +python3 -m pip install --upgrade pip +``` + +```python +pip install -r requirements.txt +``` + +- Запускаем Docker контейнеры (db, bot) + +```python +docker-compose up -d db bot +``` + +- Создаём таблицы в БД + +```python 
+docker-compose exec bot python -m db.init_db +``` + +- Перезапускаем Docker контейнеры + +```python +docker-compose up -d +``` + +- После запуска запустите бота командой ```/start``` + +> Команда ```/show_news``` пришлёт последние 10 новостей из базы, если они ещё не были отправлены, далее бот будет присылать только новые новости, которые появятся на сайте. + +> [Вернуться в начало](#Начало) + + + +### Автор + +- [Титенков Дмитрий](https://github.com/dmsnback) + +> [Вернуться в начало](#Начало) diff --git a/bot/__init__.py b/bot/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bot/bot_storage.py b/bot/bot_storage.py new file mode 100644 index 0000000..ef23f1e --- /dev/null +++ b/bot/bot_storage.py @@ -0,0 +1,79 @@ +import logging + +from db.models import LastSentNews, News +from sqlalchemy import select +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +logger = logging.getLogger(__name__) + + +async def get_news_after_id(last_id: int, session: AsyncSession, n=10): + """ + Получение новостей если если id новости меньше чем last_id отправленной новости, + либо получение 10 последних новостей если last_id = 0 + """ + try: + if last_id == 0: + news = await session.execute( + select(News) + .where(News.id > last_id) + .order_by(News.id.desc()) + .limit(n) + ) + return news.scalars().all()[::-1] + else: + news = await session.execute( + select(News).where(News.id > last_id).order_by(News.id.asc()) + ) + return news.scalars().all() + except SQLAlchemyError as e: + logger.error(f"Ошибка при получении списка новостей: {e}") + raise + + +async def get_last_sent_id(chat_id: int, session: AsyncSession): + """Получение last_id последней отправленной новости""" + try: + last_sent = await session.execute( + select(LastSentNews).where(LastSentNews.chat_id == chat_id) + ) + last_sent_id = last_sent.scalar_one_or_none() + if last_sent_id: + return last_sent_id.last_news_id + else: + return 0 + except SQLAlchemyError 
as e: + logger.error( + f"Ошибка при получении id последней отправленной новости: {e}" + ) + + +async def save_last_sent_news_id( + chat_id: int, last_id: int, session: AsyncSession +): + """Сохранение last_id последней отправленной новости""" + try: + last_sent = await session.execute( + select(LastSentNews).where(LastSentNews.chat_id == chat_id) + ) + last_sent = last_sent.scalar_one_or_none() + if last_sent: + last_sent.last_news_id = last_id + else: + last_sent = LastSentNews(chat_id=chat_id, last_news_id=last_id) + session.add(last_sent) + await session.commit() + except SQLAlchemyError as e: + await session.rollback() + logger.error(f"Ошибка при сохранении last_news_id: {e}") + raise + + +async def get_all_users(session: AsyncSession): + """Получение id юзера""" + try: + users = await session.execute(select(LastSentNews.chat_id)) + return users.scalars().all() + except SQLAlchemyError as e: + logger.error(f"Ошибка получения chat_id пользователей: {e}") diff --git a/bot/handlers.py b/bot/handlers.py new file mode 100644 index 0000000..a21d441 --- /dev/null +++ b/bot/handlers.py @@ -0,0 +1,150 @@ +import logging + +from bot.bot_storage import ( + get_all_users, + get_last_sent_id, + get_news_after_id, + save_last_sent_news_id, +) +from bot.sender import send_news +from config import setup_logger +from db.db_async import AsyncSessionLocal +from dotenv import load_dotenv +from telegram import ( + BotCommand, + InlineKeyboardButton, + InlineKeyboardMarkup, + Update, +) +from telegram.ext import ContextTypes + +load_dotenv() + +setup_logger() +logger = logging.getLogger(__name__) + + +async def auto_send_news(context: ContextTypes.DEFAULT_TYPE): + async with AsyncSessionLocal() as session: + users = await get_all_users(session) + if not users: + return + for chat_id in users: + last_id = await get_last_sent_id(chat_id, session) + news_list = await get_news_after_id(last_id, session) + if not news_list: + continue + for news in news_list: + try: + await send_news( + 
chat_id, + context, + news.title, + news.image, + news.text, + news.url, + ) + await save_last_sent_news_id(chat_id, news.id, session) + except Exception as e: + logger.error( + f"Ошибка при автоматической отправке новости: {e}" + ) + + +async def menu(update: Update, context: ContextTypes.DEFAULT_TYPE): + # chat_id = update.message.chat_id + welcome_text = ( + "Привеет\!\n" + "Я бот, который присылает сатирические новости с сайта [*Панорама*](https://panorama.pub)\.\n\n" + "Нажмите 📰 '*Показать новости*', чтобы получить новости\.\n\n" + "Если на сайте появится новая новость \- я пришлю её\." + ) + keyboard = [ + [ + InlineKeyboardButton( + "📰 Показать новости", callback_data="send_news" + ) + ], + [InlineKeyboardButton("ℹ️ Помощь", callback_data="help")], + ] + reply_markup = InlineKeyboardMarkup(keyboard) + await update.message.reply_text( + welcome_text, reply_markup=reply_markup, parse_mode="MarkdownV2" + ) + + +async def button_handler(update: Update, context: ContextTypes.DEFAULT_TYPE): + query = update.callback_query + try: + await query.answer() + except Exception as e: + logger.warning(f"callback_query не удалось ответить: {e}") + if query.data == "send_news": + chat_id = query.message.chat_id + async with AsyncSessionLocal() as session: + last_id = await get_last_sent_id(chat_id, session) + news_list = await get_news_after_id(last_id, session) + if not news_list: + await query.message.reply_text("Новых новостей пока нет 🙁") + logger.info("Новых новостей нет") + return + for news in news_list: + try: + await send_news( + chat_id, + context, + news.title, + news.image, + news.text, + news.url, + ) + await save_last_sent_news_id(chat_id, news.id, session) + except Exception as e: + logger.error( + f"Ошибка при отправке новости '{news.title[:25]}': {e}" + ) + elif query.data == "help": + help_text = "Нажмите 📰 'Показать новости', чтобы получить новости." 
+ await query.message.reply_text(help_text) + + +async def show_news_command( + update: Update, context: ContextTypes.DEFAULT_TYPE +): + chat_id = update.message.chat_id + async with AsyncSessionLocal() as session: + last_id = await get_last_sent_id(chat_id, session) + news_list = await get_news_after_id(last_id, session) + if not news_list: + await update.message.reply_text("Новостей пока нет 🙁") + logger.info("Новостей нет") + return + try: + for news in news_list: + await send_news( + chat_id, + context, + news.title, + news.image, + news.text, + news.url, + ) + await save_last_sent_news_id(chat_id, news.id, session) + except Exception as e: + logger.error( + f"Ошибка при отправке новости '{news.title[:25]}': {e}" + ) + + +async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE): + help_text = "Нажмите 📰 'Показать новости', чтобы получить новости." + await update.message.reply_text(help_text) + + +async def set_commands(app): + commands = [ + BotCommand("start", "Главное меню"), + BotCommand("show_news", "Показать новости"), + BotCommand("help", "Помощь"), + ] + await app.bot.set_my_commands(commands) diff --git a/bot/sender.py b/bot/sender.py new file mode 100644 index 0000000..9cf63c6 --- /dev/null +++ b/bot/sender.py @@ -0,0 +1,55 @@ +import logging + +from config import setup_logger +from dotenv import load_dotenv +from telegram import InlineKeyboardButton, InlineKeyboardMarkup +from telegram.ext import ContextTypes + +load_dotenv() + +setup_logger() +logger = logging.getLogger(__name__) + + +MAX_CAPTION_LENGTH = 1024 + + +def format_message(title, text): + message = f"*{title}*\n\n{text}\n" + if len(message) > MAX_CAPTION_LENGTH: + message = f"*{title}*\n\n{text[:MAX_CAPTION_LENGTH]} ...✂️\n" + return message + + +async def send_news( + chat_id: int, context: ContextTypes.DEFAULT_TYPE, title, image, text, url +): + message = format_message(title, text) + keyboard = [ + [InlineKeyboardButton("Читать полную версию на сайте", url=url)] + ] + 
reply_markup = InlineKeyboardMarkup(keyboard) + if image: + try: + await context.bot.send_photo( + chat_id=chat_id, + photo=image, + caption=message, + parse_mode="Markdown", + reply_markup=reply_markup, + ) + logger.info("Новость отправлена с картинкой") + return + except Exception as e: + logger.error( + f"Не удалось отправить фото по ссылке, ошибка: {e}", + ) + try: + await context.bot.send_message( + chat_id, message, parse_mode="Markdown", reply_markup=reply_markup + ) + logger.info(f"Новость '{title[:25]}' отправлена без картинки") + except Exception as e: + logger.error( + f"Не удалось отправить сообщение с новостью '{title[:25]}', ошибка: {e}" + ) diff --git a/config.py b/config.py new file mode 100644 index 0000000..0a1746b --- /dev/null +++ b/config.py @@ -0,0 +1,25 @@ +import logging +from logging.handlers import RotatingFileHandler +from pathlib import Path + +LOG_DIR = Path("logs") +LOG_DIR.mkdir(exist_ok=True) + +LOG_FILE = LOG_DIR / "app.log" + + +def setup_logger(): + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s: - |%(levelname)s| %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[ + logging.StreamHandler(), + RotatingFileHandler( + filename=LOG_FILE, + maxBytes=1024 * 1024 * 5, + backupCount=5, + encoding="utf-8", + ), + ], + ) diff --git a/db/__init__.py b/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/db/db_async.py b/db/db_async.py new file mode 100644 index 0000000..44c6e37 --- /dev/null +++ b/db/db_async.py @@ -0,0 +1,15 @@ +import os + +from dotenv import load_dotenv +from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine + +load_dotenv() + + +DATABASE_URL_ASYNC = os.getenv("DATABASE_URL_ASYNC") + + +engine = create_async_engine(DATABASE_URL_ASYNC, echo=False) + + +AsyncSessionLocal = async_sessionmaker(engine, expire_on_commit=False) diff --git a/db/db_sync.py b/db/db_sync.py new file mode 100644 index 0000000..4c9c0eb --- /dev/null +++ b/db/db_sync.py @@ -0,0 
+1,15 @@ +import os + +from dotenv import load_dotenv +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +load_dotenv() + + +DATABASE_URL_SYNC = os.getenv("DATABASE_URL_SYNC") + + +engine = create_engine(DATABASE_URL_SYNC) + +SessionLocal = sessionmaker(engine) diff --git a/db/init_db.py b/db/init_db.py new file mode 100644 index 0000000..88c230a --- /dev/null +++ b/db/init_db.py @@ -0,0 +1,22 @@ +import asyncio +import logging + +from config import setup_logger +from db.db_async import engine +from db.models import Base, LastSentNews, News # noqa +from dotenv import load_dotenv + +load_dotenv() + +setup_logger() +logger = logging.getLogger(__name__) + + +async def init(): + """Создание таблиц в базе""" + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + logger.info("...Таблицы созданы...") + + +asyncio.run(init()) diff --git a/db/models.py b/db/models.py new file mode 100644 index 0000000..89c0fe4 --- /dev/null +++ b/db/models.py @@ -0,0 +1,30 @@ +from datetime import datetime + +from sqlalchemy import BigInteger, DateTime, Integer, String, Text, func +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column + + +class Base(DeclarativeBase): + pass + + +class News(Base): + __tablename__ = "news" + + id: Mapped[int] = mapped_column(primary_key=True) + url: Mapped[str] = mapped_column(String, unique=True, nullable=False) + title: Mapped[str | None] = mapped_column(String, nullable=True) + image: Mapped[str | None] = mapped_column(String, nullable=True) + text: Mapped[str | None] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), default=func.now(), nullable=False + ) + + +class LastSentNews(Base): + __tablename__ = "last_sent_news" + + chat_id: Mapped[int] = mapped_column(BigInteger, primary_key=True) + last_news_id: Mapped[int] = mapped_column( + Integer, nullable=True, default=0 + ) diff --git a/docker-compose.yml 
b/docker-compose.yml new file mode 100644 index 0000000..0ab8446 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,33 @@ +version: '3.8' + +services: + db: + image: postgres:15.0-alpine + volumes: + - postgres_data:/var/lib/postgresql/data/ + ports: + - "5432:5432" + env_file: + - .env + + bot: + image: dmsn/satire_pulp_parser:prod + restart: always + depends_on: + - db + env_file: + - .env + command: python3 main.py + + scheduler: + image: dmsn/satire_pulp_parser:prod + restart: always + command: python3 -m scheduler.scheduler + depends_on: + - db + - bot + env_file: + - .env + +volumes: + postgres_data: \ No newline at end of file diff --git a/env.example b/env.example new file mode 100644 index 0000000..b26e7c3 --- /dev/null +++ b/env.example @@ -0,0 +1,8 @@ +TELEGRAM_TOKEN=1234567890:Telegram-Token +DATABASE_URL_SYNC=postgresql+psycopg2://postgres:postgres@db:5432/satire_pulp_db +DATABASE_URL_ASYNC=postgresql+asyncpg://postgres:postgres@db:5432/satire_pulp_db +POSTGRES_DB=satire_pulp_db +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres +POSTGRES_HOST=db +POSTGRES_PORT=5432 \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..d2e15e0 --- /dev/null +++ b/main.py @@ -0,0 +1,42 @@ +import asyncio +import logging +import os + +from bot.handlers import ( + auto_send_news, + button_handler, + help_command, + menu, + set_commands, + show_news_command, +) +from config import setup_logger +from dotenv import load_dotenv +from telegram.ext import ( + ApplicationBuilder, + CallbackQueryHandler, + CommandHandler, +) + +load_dotenv() + +setup_logger() +logger = logging.getLogger(__name__) + + +def main(): + app = ApplicationBuilder().token(os.getenv("TELEGRAM_TOKEN")).build() + app.job_queue.run_repeating(auto_send_news, interval=600, first=10) + app.add_handler(CommandHandler("start", menu)) + app.add_handler(CommandHandler("show_news", show_news_command)) + app.add_handler(CommandHandler("help", help_command)) + 
app.add_handler(CallbackQueryHandler(button_handler)) + + asyncio.get_event_loop().run_until_complete(set_commands(app)) + logger.info("... Бот запущен ...") + app.run_polling() + logger.info("... Бот остановлен ...") + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 7345173..278ff90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,7 @@ +aiosqlite==0.22.1 +anyio==4.12.1 +APScheduler==3.11.2 +asyncpg==0.31.0 attrs==25.4.0 Automat==25.4.16 black==26.1.0 @@ -11,6 +15,10 @@ cssselect==1.3.0 defusedxml==0.7.1 filelock==3.20.3 flake8==7.3.0 +greenlet==3.3.1 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 hyperlink==21.0.0 idna==3.11 Incremental==24.11.0 @@ -28,6 +36,7 @@ pathspec==1.0.4 platformdirs==4.5.1 pluggy==1.6.0 Protego==0.5.0 +psycopg2-binary==2.9.11 pyasn1==0.6.2 pyasn1_modules==0.4.2 pycodestyle==2.14.0 @@ -37,16 +46,21 @@ pyflakes==3.4.0 Pygments==2.19.2 pyOpenSSL==25.3.0 pytest==9.0.2 +pytest-asyncio==1.3.0 python-dotenv==1.2.1 +python-telegram-bot==22.6 pytokens==0.4.0 queuelib==1.8.0 requests==2.32.5 requests-file==3.0.1 +schedule==1.2.2 Scrapy==2.14.1 service-identity==24.2.0 +SQLAlchemy==2.0.46 tldextract==5.3.1 Twisted==25.5.0 typing_extensions==4.15.0 +tzlocal==5.3.1 urllib3==2.6.3 w3lib==2.3.1 zope.interface==8.2 diff --git a/satire_pulp_parser/__init__.py b/satire_pulp_parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/satire_pulp_parser/items.py b/satire_pulp_parser/items.py new file mode 100644 index 0000000..3b19bb9 --- /dev/null +++ b/satire_pulp_parser/items.py @@ -0,0 +1,8 @@ +import scrapy + + +class NewsItem(scrapy.Item): + title = scrapy.Field() + text = scrapy.Field() + image = scrapy.Field() + url = scrapy.Field() diff --git a/satire_pulp_parser/middlewares.py b/satire_pulp_parser/middlewares.py new file mode 100644 index 0000000..5be215a --- /dev/null +++ b/satire_pulp_parser/middlewares.py @@ -0,0 +1,93 @@ +# from itemadapter import ItemAdapter +from 
scrapy import signals + + +class SatirePulpParserSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + async def process_start(self, start): + # Called with an async iterator over the spider start() method or the + # matching method of an earlier spider middleware. + async for item_or_request in start: + yield item_or_request + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class SatirePulpParserDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+ s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/satire_pulp_parser/pipelines.py b/satire_pulp_parser/pipelines.py new file mode 100644 index 0000000..c863928 --- /dev/null +++ b/satire_pulp_parser/pipelines.py @@ -0,0 +1,24 @@ +import logging + +from db.db_sync import SessionLocal +from satire_pulp_parser.items import NewsItem +from satire_pulp_parser.spider_storage import save_news + +logger = logging.getLogger(__name__) + + +class SaveNewsPipeline: + def process_item(self, item: NewsItem, spider): + try: + with SessionLocal() as session: + save_news( + url=item["url"], + title=item["title"], + image=item["image"], + text=item["text"], + session=session, + ) + logger.info("...Новость сохранена...") + except Exception as e: + logger.error(f"Ошибка при сохранении новости: {e}") + return item diff --git 
a/satire_pulp_parser/settings.py b/satire_pulp_parser/settings.py new file mode 100644 index 0000000..5921b6b --- /dev/null +++ b/satire_pulp_parser/settings.py @@ -0,0 +1,87 @@ +# Scrapy settings for satire_pulp_parser project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "satire_pulp" + +SPIDER_MODULES = ["satire_pulp_parser.spiders"] +NEWSPIDER_MODULE = "satire_pulp_parser.spiders" + +ADDONS = {} + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = "satire_pulp (+http://panorama.pub)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Concurrency and throttling settings +# CONCURRENT_REQUESTS = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 1 +DOWNLOAD_DELAY = 1 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +# } + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# "satire_pulp_parser.middlewares.SatirePulpParserSpiderMiddleware": 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# "satire_pulp_parser.middlewares.SatirePulpParserDownloaderMiddleware": 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +# } 
+ +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + "satire_pulp_parser.pipelines.SaveNewsPipeline": 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = "httpcache" +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +FEED_EXPORT_ENCODING = "utf-8" diff --git a/satire_pulp_parser/spider_storage.py b/satire_pulp_parser/spider_storage.py new file mode 100644 index 0000000..f84e886 --- /dev/null +++ b/satire_pulp_parser/spider_storage.py @@ -0,0 +1,28 @@ +import logging + +from db.models import News +from sqlalchemy.exc import SQLAlchemyError + +logger = logging.getLogger(__name__) + + +def is_news_exists(url: str, session): + """Проверка новости в базе по URL""" + try: + news = session.query(News).filter(News.url == url) + return news.first() is not None + except SQLAlchemyError as e: + logger.error(f"Ошибка проверки новости в базе: {e}") + raise + + +def save_news(url: str, title: str, image: str, text: str, session): + """Сохранение новости в базе""" + try: + news = News(url=url, title=title, image=image, text=text)
class SatirePulpSpider(scrapy.Spider):
    """Scrapy spider for panorama.pub: crawls the front page and parses
    articles that are not yet stored in the database."""

    name = "satire_pulp"
    allowed_domains = ["panorama.pub"]
    start_urls = ["https://panorama.pub"]

    def parse(self, response):
        """Collect article links from the front page and schedule parsing
        of the ones that are not in the database yet."""
        news_links = response.css("div.shrink-0 li a::attr(href)").getall()
        for link in news_links:
            full_link = response.urljoin(link)
            # Short-lived session per link: checked before scheduling a request.
            with SessionLocal() as session:
                if is_news_exists(full_link, session):
                    self.logger.info(f"Новость уже есть: {full_link}")
                    continue

            yield scrapy.Request(url=full_link, callback=self.parse_news)

    def parse_news(self, response):
        """Extract headline, first paragraph and cover image from an
        article page and yield a NewsItem."""
        title = response.css('h1[itemprop="headline"]::text').get()
        text = response.css("div.entry-contents p::text").get()
        image = response.css('meta[itemprop="image"]::attr(content)').get()
        # .get() returns None when a selector matches nothing; guard every
        # field so a markup change cannot crash the spider with
        # `AttributeError: 'NoneType' object has no attribute 'strip'`.
        final_title = title.strip() if title else None
        final_text = text.strip() if text else None
        # Strip BEFORE urljoin so stray whitespace cannot corrupt the URL.
        image = response.urljoin(image.strip()) if image else None
        item = NewsItem(
            title=final_title, text=final_text, image=image, url=response.url
        )
        yield item
import logging
import subprocess

from apscheduler.schedulers.blocking import BlockingScheduler
from config import setup_logger

setup_logger()
logger = logging.getLogger(__name__)


def run_spider():
    """Run the scrapy spider once as a subprocess and log the outcome."""
    try:
        logger.info("...Запуск парсера...")
        # shell=False (list argv) — no shell-injection surface.
        result = subprocess.run(
            ["scrapy", "crawl", "satire_pulp"],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            logger.info("...Парсер успешно завершил работу...")
        else:
            logger.error("Ошибка при запуске парсера: %s", result.stderr)
    except Exception as e:
        logger.error("Неожиданная ошибка при запуске парсера: %s", e)


def run_scheduler():
    """Start the blocking scheduler; the spider runs immediately and then
    every 20 minutes."""
    scheduler = BlockingScheduler()
    try:
        logger.info("...Планировщик запущен...")
        # First run right away so a fresh deploy does not wait 20 minutes.
        run_spider()
        scheduler.add_job(run_spider, "interval", minutes=20)
        scheduler.start()
    except Exception as e:
        logger.error("Ошибка в планировщике: %s", e)


if __name__ == "__main__":
    # Guarded entry point: importing this module must not start the
    # blocking scheduler as a side effect.
    run_scheduler()
import pytest
import pytest_asyncio
from db.models import Base, News
from sqlalchemy import create_engine
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.orm import sessionmaker

DATABASE_SYNC_URL = "sqlite:///:memory:"
DATABASE_ASYNC_URL = "sqlite+aiosqlite:///:memory:"


@pytest.fixture
def session():
    """Synchronous in-memory SQLite session with a freshly created schema."""
    engine = create_engine(DATABASE_SYNC_URL, echo=False)
    Base.metadata.create_all(engine)
    Session = sessionmaker(engine)
    session = Session()
    yield session
    session.close()
    # Mirror the async fixture: release the engine's connection pool.
    engine.dispose()


@pytest_asyncio.fixture
async def async_session():
    """Asynchronous in-memory SQLite session with a freshly created schema."""
    engine = create_async_engine(DATABASE_ASYNC_URL, echo=False)
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    AsyncSession = async_sessionmaker(engine, expire_on_commit=False)
    async with AsyncSession() as async_session:
        yield async_session
    await engine.dispose()


@pytest.fixture
def news():
    """A single news record as a plain dict (not yet persisted)."""
    test_news = {
        "url": "https://panorama.pub/test-news",
        "title": "Test Title",
        "text": "Test Text",
        "image": None,
    }
    return test_news


@pytest_asyncio.fixture
async def news_list(async_session):
    """Two News rows committed to the async session, returned in insert order."""
    first_news = News(
        url="https://panorama.pub/test-news_1",
        title="Test Title 1",
        image=None,
        text="Ttest Text_1",
    )
    second_news = News(
        url="https://panorama.pub/test-news_2",
        title="Test Title 2",
        image=None,
        text="Ttest Text_2",
    )

    async_session.add_all([first_news, second_news])
    await async_session.commit()

    return [first_news, second_news]
from satire_pulp_parser.spider_storage import is_news_exists, save_news


def test_save_and_check_news(session, news):
    """The news item is absent before save_news() and present afterwards."""
    assert not is_news_exists(
        news["url"], session
    ), "Перед сохранением новой новости её не должно быть в базе"
    save_news(
        news["url"],
        news["title"],
        news["image"],
        news["text"],
        session=session,
    )
    assert is_news_exists(
        news["url"], session
    ), "Новость должна появиться в базе после сохранения"