From 0369cd80725fe35856def6d95110e0cb3e48bd88 Mon Sep 17 00:00:00 2001 From: ttlequals0 Date: Thu, 5 Mar 2026 18:10:44 -0500 Subject: [PATCH 1/3] Fix stuck scan bug, reorganize repo, and clean up docs (v2.5.67) (#38) Fix crash recovery session handling -- scans could get permanently stuck after a Celery task crash because the DB session was in a rolled-back state and recovery writes silently failed. Also improve stuck scan detection when Celery task state is unknown. Move all application modules into pixelprobe/ package so root only contains entry points (app.py, celery_worker.py) and build/config files. Update 73+ import paths across 43+ files. Delete legacy files, broken scripts, and obsolete docs. Fix documentation inaccuracies (wrong ports, stale version refs, SQLite references). Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.example | 33 ++- CHANGELOG.MD | 20 ++ README.md | 8 +- app.py | 14 +- database_migrations.py | 94 --------- docker-compose.yml | 8 +- docs/ARCHITECTURE.md | 44 ++-- docs/DOCKER_SETUP.md | 12 +- docs/FIX_INCOMPLETE_SCANS.md | 3 + docs/INSTALLATION.md | 20 +- docs/PROJECT_STRUCTURE.md | 184 ++++++++-------- docs/SYSTEM_ARCHITECTURE.md | 14 +- docs/TROUBLESHOOTING.md | 8 +- docs/api/SCAN_TYPES_DOCUMENTATION.md | 15 +- docs/development/API_CONSOLIDATION_PLAN.md | 71 ------- .../P0_P1_IMPLEMENTATION_STATUS.md | 140 ------------- init_db.py | 10 - migrations/add_authentication_tables.py | 76 ------- migrations/add_celery_task_id_column.py | 161 -------------- migrations/add_celery_task_id_column.sql | 25 --- migrations/add_exclusions_table.py | 33 --- migrations/add_scan_tracking_columns.py | 86 -------- operation_handlers.py | 98 --------- pixelprobe/__init__.py | 2 +- pixelprobe/api/admin_routes.py | 14 +- pixelprobe/api/auth_decorator.py | 2 +- pixelprobe/api/auth_routes.py | 4 +- pixelprobe/api/export_routes.py | 4 +- pixelprobe/api/healthcheck_routes.py | 4 +- pixelprobe/api/maintenance_routes.py | 10 +- pixelprobe/api/notification_routes.py | 4 +- pixelprobe/api/reports_routes.py | 14 +- pixelprobe/api/scan_routes.py | 27 ++- pixelprobe/api/scan_routes_parallel.py | 6 +- pixelprobe/api/stats_routes.py | 8 +- auth.py => pixelprobe/auth.py | 2 +- .../celery_config.py | 0 config.py => pixelprobe/config.py | 0 .../media_checker.py | 6 +- models.py => pixelprobe/models.py | 6 +- pixelprobe/models/__init__.py | 0 pixelprobe/repositories/base_repository.py | 2 +- pixelprobe/repositories/config_repository.py | 2 +- pixelprobe/repositories/scan_repository.py | 2 +- scheduler.py => pixelprobe/scheduler.py | 27 ++- pixelprobe/services/export_service.py | 2 +- pixelprobe/services/maintenance_service.py | 14 +- pixelprobe/services/scan_service.py | 73 +++---- pixelprobe/services/stats_service.py | 4 +- pixelprobe/startup.py | 6 +- pixelprobe/tasks.py | 16 +- pixelprobe/tasks_parallel.py | 10 +- pixelprobe/utils/helpers.py | 187 ++++++++++++++++- pixelprobe/utils/security.py | 4 +- version.py => pixelprobe/version.py | 2 +- scripts/create_test_database.py | 2 +- scripts/docker-run-modern.sh | 45 ---- scripts/fix_database_schema.py | 2 +- scripts/reset_incomplete_scans.py | 2 +- scripts/run_modern_ui.sh | 23 -- scripts/run_test_ui.sh | 22 -- tests/conftest.py | 26 +-- tests/integration/test_admin_endpoints.py | 10 +- tests/integration/test_csrf_and_exclusions.py | 2 +- .../integration/test_maintenance_endpoints.py | 2 +- .../test_scan_endpoints_extended.py | 2 +- tests/integration/test_scan_execution.py | 2 +- tests/test_app.py | 2 +- tests/test_authentication.py | 4 +- tests/test_bulk_reports.py | 2 +- tests/test_concurrency.py | 10 +- tests/test_media_checker.py | 8 +- tests/test_migration_lock.py | 2 +- tests/test_performance.py | 8 +- tests/test_real_media_samples.py | 2 +- tests/test_scheduler.py | 4 +- tests/unit/test_repositories.py | 2 +- tests/unit/test_scan_service.py | 4 +- tests/unit/test_stats_service.py | 2 +- tools/data_retention.py | 2 +- tools/fix_database_schema.py | 2 +- tools/fix_tile_data_false_positives.py | 4 +- tools/patches/v2_2_47_fixes.py | 129 ------------ tools/reset_nal_files_v2.py | 2 +- utils.py | 198 ------------------ 85 files changed, 587 insertions(+), 1581 deletions(-) delete mode 100644 database_migrations.py delete mode 100644 docs/development/API_CONSOLIDATION_PLAN.md delete mode 100644 docs/development/P0_P1_IMPLEMENTATION_STATUS.md delete mode 100644 init_db.py delete mode 100644 migrations/add_authentication_tables.py delete mode 100644 migrations/add_celery_task_id_column.py delete mode 100644 migrations/add_celery_task_id_column.sql delete mode 100644 migrations/add_exclusions_table.py delete mode 100644 migrations/add_scan_tracking_columns.py delete mode 100644 operation_handlers.py rename auth.py => pixelprobe/auth.py (99%) rename celery_config.py => pixelprobe/celery_config.py (100%) rename config.py => pixelprobe/config.py (100%) rename media_checker.py => pixelprobe/media_checker.py (99%) rename models.py => pixelprobe/models.py (99%) delete mode 100644 pixelprobe/models/__init__.py rename scheduler.py => pixelprobe/scheduler.py (95%) rename version.py => pixelprobe/version.py (92%) delete mode 100755 scripts/docker-run-modern.sh delete mode 100755 scripts/run_modern_ui.sh delete mode 100755 scripts/run_test_ui.sh delete mode 100644 tools/patches/v2_2_47_fixes.py delete mode 100644 utils.py diff --git a/.env.example b/.env.example index d4887ba..5ee527e 100644 --- a/.env.example +++ b/.env.example @@ -5,22 +5,30 @@ # Generate with: python -c "import secrets; print(secrets.token_hex(32))" SECRET_KEY=your-secure-random-key-here-replace-this -# Database configuration -# SQLite (default): sqlite:////app/instance/media_checker.db -# PostgreSQL: postgresql://user:password@host:port/database -# MySQL: mysql://user:password@host:port/database -DATABASE_URL=sqlite:////app/instance/media_checker.db +# Database configuration (PostgreSQL REQUIRED since v2.2.0) +# When using docker-compose.yml, the POSTGRES_* variables below are used instead. +# DATABASE_URL is only needed for standalone (non-Docker) deployments. +# DATABASE_URL=postgresql://pixelprobe:changeme@localhost:5432/pixelprobe + +# PostgreSQL credentials (used by docker-compose.yml) +POSTGRES_PASSWORD=changeme # Scanning configuration -# Comma-separated list of directories to monitor +# Comma-separated list of directories to monitor (inside container) SCAN_PATHS=/media -# Optional: Path to your media files on the host system +# Path to your media files on the host system (mounted read-only into container) MEDIA_PATH=/path/to/your/media # Performance tuning -MAX_FILES_TO_SCAN=100 -MAX_SCAN_WORKERS=4 +# MAX_WORKERS: parallel file scanning threads per task (default: 10) +MAX_WORKERS=10 +# BATCH_SIZE: files per database batch commit (default: 100) +BATCH_SIZE=100 + +# Celery configuration +# CELERY_CONCURRENCY: number of concurrent Celery tasks (default: 4) +CELERY_CONCURRENCY=4 # Timezone configuration TZ=America/New_York @@ -40,4 +48,9 @@ CLEANUP_SCHEDULE= # Path exclusions (comma-separated) EXCLUDED_PATHS=/media/temp,/media/cache # Extension exclusions (comma-separated) -EXCLUDED_EXTENSIONS=.tmp,.temp,.cache \ No newline at end of file +EXCLUDED_EXTENSIONS=.tmp,.temp,.cache + +# SSRF trusted hosts (optional) +# Hostnames and/or CIDR ranges that bypass private-IP blocking for healthcheck pings. +# Comma-separated. Example: myhost.local,192.168.5.0/24 +TRUSTED_INTERNAL_HOSTS= diff --git a/CHANGELOG.MD b/CHANGELOG.MD index c21e77e..b4d5ad8 100644 --- a/CHANGELOG.MD +++ b/CHANGELOG.MD @@ -5,6 +5,26 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0). +## [2.5.67] - 2026-03-05 + +### Fixed + +- **Fix stuck scan bug**: Scans could get permanently stuck as "active" after a Celery task crash (e.g., `psycopg2.DatabaseError`). The crash recovery handler in `scan_service.py` attempted to mark the scan as crashed but failed because the DB session was in a rolled-back state. Added `db.session.rollback()` before recovery writes and re-query the scan state with a fresh session. +- **Fix stuck scan detection for lost Celery tasks**: When Celery task state is `None` (task lost/unreachable), the `is_scan_running()` check previously assumed the scan was still running indefinitely. Now falls through to time-based detection -- if no update for over 1 hour with unknown task state, marks as crashed. +- **Fix scheduler stuck scan checker**: The `_check_stuck_scans` scheduler job now also verifies Celery task state. If a Celery task is gone AND no progress update for 5+ minutes, the scan is marked as crashed (previously only relied on 30-minute time threshold). +- Files affected: `pixelprobe/services/scan_service.py`, `pixelprobe/api/scan_routes.py`, `scheduler.py` + +### Changed + +- **Move all application modules into `pixelprobe/` package**: Moved `models.py`, `auth.py`, `config.py`, `media_checker.py`, `scheduler.py`, `celery_config.py`, `version.py` from root into the `pixelprobe/` package. Root now only contains entry points (`app.py`, `celery_worker.py`) and build/config files. Updated 73+ import statements across 43 files. +- **Move root `utils.py` into `pixelprobe/utils/helpers.py`**: Consolidated shared utilities (`ProgressTracker`, `create_state_dict`, `batch_process`, etc.) into the package. +- **Documentation cleanup**: Fixed port 5001 references (should be 5000), updated stale version references (v2.4.48/v2.4.93), fixed SQLite references (PostgreSQL-only since v2.2.0), rewrote PROJECT_STRUCTURE.md, fixed container name references. +- **Repository cleanup**: Removed legacy files (`database_migrations.py`, `init_db.py`, `operation_handlers.py`, `utils.py`, `migrations/` directory, broken shell scripts, obsolete patches, outdated development docs). +- Updated docker-compose.yml image tags from `pixelprobe:test-v2.4.93` to `ttlequals0/pixelprobe:2.5.67`. +- Updated `.env.example` with correct PostgreSQL defaults and added missing env vars. + +--- + ## [2.5.66] - 2026-02-23 ### Fixed diff --git a/README.md b/README.md index 3d45516..c711a51 100644 --- a/README.md +++ b/README.md @@ -257,7 +257,7 @@ Fine-tune scanning behavior by excluding specific paths and file types: ``` 4. **Access the web interface**: - Open http://localhost:5001 in your browser + Open http://localhost:5000 in your browser 5. **Initial Setup** (IMPORTANT - First Run Only): @@ -265,12 +265,12 @@ Fine-tune scanning behavior by excluding specific paths and file types: ```bash # Create admin user with your chosen password - curl -X POST http://localhost:5001/api/auth/setup \ + curl -X POST http://localhost:5000/api/auth/setup \ -H "Content-Type: application/json" \ -d '{"password":"YourSecurePassword123"}' ``` - Or visit http://localhost:5001/login and follow the first-run setup wizard. + Or visit http://localhost:5000/login and follow the first-run setup wizard. **Security Note**: No default admin account exists. You must explicitly create it on first run. @@ -341,7 +341,7 @@ export MEDIA_PATH=/mnt/all-media # Contains subdirs: movies/, tv/, backup/ ### Web Interface -1. **Access the Dashboard**: Navigate to http://localhost:5001 +1. **Access the Dashboard**: Navigate to http://localhost:5000 2. **Start a Scan**: Click "Scan All Files" to begin scanning your media directories 3. **View Results**: Results appear in the table below with corruption status 4. **Filter Results**: Use the filter buttons to show only corrupted or healthy files diff --git a/app.py b/app.py index f9c1dcb..cf6e493 100644 --- a/app.py +++ b/app.py @@ -19,9 +19,9 @@ import json # Import database and models -from models import db -from version import __version__, __github_url__ -from scheduler import MediaScheduler +from pixelprobe.models import db +from pixelprobe.version import __version__, __github_url__ +from pixelprobe.scheduler import MediaScheduler # Import blueprints from new modular structure from pixelprobe.api.scan_routes import scan_bp @@ -36,7 +36,7 @@ from pixelprobe.api.auth_routes import auth_api_bp, auth_ui_bp, auth_bp # auth_bp for backward compat # Import authentication module -from auth import init_auth, auth_required +from pixelprobe.auth import init_auth, auth_required # OpenAPI documentation is available as openapi.yaml in the project root @@ -66,7 +66,7 @@ # Configure app # Require SECRET_KEY in production - no insecure fallback # Load configuration from config module -from config import get_config +from pixelprobe.config import get_config config_name = os.getenv('FLASK_ENV', 'development') config_class = get_config(config_name) config_class.init_app(app) @@ -128,7 +128,7 @@ init_auth(app) # P1 Implementation: Initialize Celery task queue -from celery_config import create_celery, init_celery +from pixelprobe.celery_config import create_celery, init_celery celery = create_celery(app) init_celery(app, celery) # CRITICAL: Attach celery to app so scan_routes can find it @@ -247,7 +247,7 @@ def sync_scan_paths_to_db(): Uses INSERT ... ON CONFLICT DO NOTHING to handle race conditions when multiple gunicorn workers start simultaneously. """ - from models import ScanConfiguration + from pixelprobe.models import ScanConfiguration from sqlalchemy.dialects.postgresql import insert scan_paths = app.config.get('SCAN_PATHS', []) diff --git a/database_migrations.py b/database_migrations.py deleted file mode 100644 index 388c937..0000000 --- a/database_migrations.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python3 -""" -Database migration runner for PixelProbe -Automatically applies missing database schema changes on startup -""" - -import logging -from sqlalchemy import text -from models import db - -logger = logging.getLogger(__name__) - -def run_migrations(): - """Run all pending database migrations""" - migrations_applied = [] - - try: - # Migration 1: Add celery_task_id column to scan_chunks - with db.engine.connect() as conn: - # Check if column exists - result = conn.execute(text(""" - SELECT column_name - FROM information_schema.columns - WHERE table_name = 'scan_chunks' - AND column_name = 'celery_task_id' - """)) - - if not result.fetchone(): - logger.info("Applying migration: Adding celery_task_id to scan_chunks") - conn.execute(text(""" - ALTER TABLE scan_chunks - ADD COLUMN celery_task_id VARCHAR(36) - """)) - - # Create index for performance - conn.execute(text(""" - CREATE INDEX idx_scan_chunks_celery_task_id - ON scan_chunks (celery_task_id) - WHERE celery_task_id IS NOT NULL - """)) - - conn.commit() - migrations_applied.append("add_celery_task_id_to_scan_chunks") - logger.info("Migration applied successfully: celery_task_id column added") - else: - logger.debug("Migration already applied: celery_task_id column exists") - - except Exception as e: - logger.error(f"Migration failed: {e}") - # Don't fail startup if migration fails - app might still work - - if migrations_applied: - logger.info(f"Applied {len(migrations_applied)} migrations: {', '.join(migrations_applied)}") - else: - logger.debug("No pending migrations to apply") - - return migrations_applied - -def check_database_schema(): - """Verify database schema is correct""" - issues = [] - - try: - with db.engine.connect() as conn: - # Check for required columns - required_columns = [ - ('scan_chunks', 'celery_task_id'), - ('scan_results', 'scan_output'), - ('scan_results', 'scan_date'), - ('scan_results', 'is_corrupted'), - ('scan_state', 'celery_task_id'), - ] - - for table, column in required_columns: - result = conn.execute(text(f""" - SELECT column_name - FROM information_schema.columns - WHERE table_name = '{table}' - AND column_name = '{column}' - """)) - - if not result.fetchone(): - issues.append(f"Missing column: {table}.{column}") - - except Exception as e: - logger.error(f"Schema check failed: {e}") - issues.append(f"Schema check error: {e}") - - if issues: - logger.warning(f"Database schema issues found: {', '.join(issues)}") - else: - logger.info("Database schema verification passed") - - return issues \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 683de78..8764db1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,7 +40,7 @@ services: networks: - pixelprobe-network - # Redis for P1 Celery implementation + # Redis for Celery task broker # Performance settings for large task queues (file changes check with 1M+ files): # - REDIS_MAX_MEMORY: Total memory for task queue (default: 2gb, recommended: 1-4gb) redis: @@ -62,7 +62,7 @@ services: # PixelProbe application pixelprobe: - image: pixelprobe:test-v2.4.93 + image: ttlequals0/pixelprobe:2.5.67 container_name: pixelprobe-app environment: # Security @@ -75,7 +75,7 @@ services: POSTGRES_USER: pixelprobe POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-changeme} - # P1 Celery configuration (scaffolded for future implementation) + # Celery configuration CELERY_BROKER_URL: redis://redis:6379/0 CELERY_RESULT_BACKEND: redis://redis:6379/0 @@ -126,7 +126,7 @@ services: # P1 Celery Worker for distributed task processing celery-worker: - image: pixelprobe:test-v2.4.93 + image: ttlequals0/pixelprobe:2.5.67 container_name: pixelprobe-celery-worker command: python celery_worker.py environment: diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index d9b47d3..08cfc14 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -4,7 +4,7 @@ ## Overview -PixelProbe v2.4.48 is a distributed media file corruption detection system built with a modular, layered architecture. The system leverages Celery for distributed processing, Redis for message queuing, and PostgreSQL for persistent storage. +PixelProbe is a distributed media file corruption detection system built with a modular, layered architecture. The system leverages Celery for distributed processing, Redis for message queuing, and PostgreSQL for persistent storage. ## System Components @@ -44,7 +44,7 @@ PixelProbe v2.4.48 is a distributed media file corruption detection system built ▼ ▼ ▼ ┌─────────────────────┐ ┌─────────────┐ ┌─────────────────────┐ │ Celery Workers │ │ Redis │ │ Task Scheduler │ -│ (8+ Parallel Tasks) │ │ (Queue) │ │ (APScheduler) │ +│ (Parallel Tasks) │ │ (Queue) │ │ (APScheduler) │ └─────────────────────┘ └─────────────┘ └─────────────────────┘ │ │ ▼ ▼ @@ -163,7 +163,7 @@ api/ ### Data Layer -**Technology**: SQLite (Development), PostgreSQL (Production) +**Technology**: PostgreSQL (required since v2.2.0) **Models**: - `ScanResult`: File scan results @@ -215,7 +215,7 @@ class MediaScheduler: ## Data Flow -### Scan Request Flow (v2.4.48 Distributed) +### Scan Request Flow (Distributed) ``` Client Request → API Endpoint → Validation → Service Layer @@ -310,22 +310,19 @@ Request → Rate Limiter → CSRF Check → Input Validation ## Deployment Architecture -### Development +### Docker Compose (Standard) ``` -Docker Container - ├── Flask Development Server - ├── SQLite Database - └── Local File System +Docker Compose + ├── pixelprobe-app (Gunicorn + Flask) + ├── pixelprobe-celery-worker (Celery) + ├── pixelprobe-postgres (PostgreSQL 15) + └── pixelprobe-redis (Redis 7) ``` -### Production +### Production (with reverse proxy) ``` -Load Balancer - ├── Web Server (Nginx) - │ └── Gunicorn Workers - │ └── Flask Application - ├── PostgreSQL Database - └── Shared Storage (NFS/S3) +Load Balancer / Nginx + └── Docker Compose stack (as above) ``` ## Extension Points @@ -371,18 +368,9 @@ Load Balancer ## Future Enhancements -### Planned Features -1. **Distributed Scanning**: Multiple scanner nodes -2. **Cloud Storage Support**: S3, Azure Blob -3. **Machine Learning**: Corruption prediction -4. **WebSocket Support**: Real-time updates -5. **Plugin Architecture**: Custom scanners - -### Scalability Path -1. **Horizontal Scaling**: Multiple API instances -2. **Queue-based Processing**: Redis/RabbitMQ -3. **Microservices**: Separate scanner service -4. **Caching Layer**: Redis for results +1. **Cloud Storage Support**: S3, Azure Blob +2. **WebSocket Support**: Real-time updates +3. **Plugin Architecture**: Custom scanners ## Technology Stack diff --git a/docs/DOCKER_SETUP.md b/docs/DOCKER_SETUP.md index 8242b05..ad7ba29 100644 --- a/docs/DOCKER_SETUP.md +++ b/docs/DOCKER_SETUP.md @@ -12,7 +12,7 @@ PixelProbe uses 4 main containers: |-----------|---------|-------|--------------| | **postgres** | Database storage | 5432 | None | | **redis** | Message queue | 6379 | None | -| **mediachecker** | Web UI & API | 5000 | postgres, redis | +| **pixelprobe** | Web UI & API | 5000 | postgres, redis | | **celery-worker** | Background processing | None | postgres, redis | ## Complete Docker Compose File @@ -55,7 +55,7 @@ services: restart: unless-stopped # Main Web Application - Serves UI and API - mediachecker: + pixelprobe: image: ttlequals0/pixelprobe:latest container_name: pixelprobe-web ports: @@ -263,11 +263,11 @@ volumes: ``` **IMPORTANT - User Permissions:** -Both the `mediachecker` (web app) and `celery-worker` containers **MUST run as the same user** to access mounted media files. Add the `user:` directive to both services: +Both the `pixelprobe` (web app) and `celery-worker` containers **MUST run as the same user** to access mounted media files. Add the `user:` directive to both services: ```yaml services: - mediachecker: + pixelprobe: # ... other settings ... user: "1000:1000" # Use your host user's UID:GID volumes: @@ -275,7 +275,7 @@ services: celery-worker: # ... other settings ... - user: "1000:1000" # MUST match mediachecker user + user: "1000:1000" # MUST match pixelprobe user volumes: - /media/movies:/movies:ro ``` @@ -312,7 +312,7 @@ docker exec pixelprobe-postgres pg_dump -U pixelprobe pixelprobe > backup.sql All containers communicate on an internal Docker network: ``` -mediachecker:5000 ←→ redis:6379 ←→ celery-worker +pixelprobe:5000 ←→ redis:6379 ←→ celery-worker ↓ ↓ └────→ postgres:5432 ←────────┘ ``` diff --git a/docs/FIX_INCOMPLETE_SCANS.md b/docs/FIX_INCOMPLETE_SCANS.md index d57ca23..d305f06 100644 --- a/docs/FIX_INCOMPLETE_SCANS.md +++ b/docs/FIX_INCOMPLETE_SCANS.md @@ -1,5 +1,8 @@ # Fixing Incomplete Scans +> **Historical document**: This describes a fix for a bug in versions before v2.2.59. +> If you are running v2.2.59 or later, this issue has been resolved and no action is needed. + ## Problem Due to the chunk query bug in versions before v2.2.59, some files were marked as "completed" but were never actually scanned. These files show: - Tool Details: N/A diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index 562ac00..bc1ed4d 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -274,9 +274,7 @@ After installation, you must create the admin account. ### Option 1: Web Setup Wizard -1. Open your browser and navigate to: - - Docker: http://localhost:5001 - - Manual: http://localhost:5000 +1. Open your browser and navigate to http://localhost:5000 2. You should be redirected to the first-run setup page 3. Create your admin account with a secure password (minimum 8 characters) @@ -288,12 +286,6 @@ After installation, you must create the admin account. Create the admin account via API: ```bash -# Docker installation (port 5001) -curl -X POST http://localhost:5001/api/auth/setup \ - -H "Content-Type: application/json" \ - -d '{"password":"YourSecurePassword123"}' - -# Manual installation (port 5000) curl -X POST http://localhost:5000/api/auth/setup \ -H "Content-Type: application/json" \ -d '{"password":"YourSecurePassword123"}' @@ -305,9 +297,7 @@ curl -X POST http://localhost:5000/api/auth/setup \ ### 1. Check Web Interface -Open your browser: -- Docker: http://localhost:5001 -- Manual: http://localhost:5000 +Open your browser at http://localhost:5000 You should see the login page. Log in with: - Username: `admin` @@ -316,10 +306,6 @@ You should see the login page. Log in with: ### 2. Check API Health ```bash -# Docker -curl http://localhost:5001/health - -# Manual curl http://localhost:5000/health ``` @@ -406,7 +392,7 @@ Common installation issues: - FFmpeg/ImageMagick not found: Ensure they're installed and in PATH - Database connection errors: Check PostgreSQL is running and credentials are correct - Permission errors: Ensure user has read access to media directories (Docker: use `user:` directive) -- Port conflicts: Change PORT in .env if 5000/5001 is already in use +- Port conflicts: Change PORT in .env if 5000 is already in use ## Getting Help diff --git a/docs/PROJECT_STRUCTURE.md b/docs/PROJECT_STRUCTURE.md index 21cfe5e..4cd4799 100644 --- a/docs/PROJECT_STRUCTURE.md +++ b/docs/PROJECT_STRUCTURE.md @@ -1,115 +1,107 @@ # PixelProbe Project Structure -## Overview -This document describes the organization of the PixelProbe codebase after the v2.0 UI overhaul. - ## Directory Structure ``` PixelProbe/ -├── app.py # Main Flask application -├── media_checker.py # Core media scanning logic -├── models.py # SQLAlchemy database models -├── version.py # Version information -├── requirements.txt # Python dependencies -├── Dockerfile # Production Docker image -├── docker-compose.yml # Production Docker Compose -├── docker-compose.simple.yml # Simplified Docker Compose +├── app.py # Flask WSGI entry point (Gunicorn: app:app) +├── celery_worker.py # Celery worker entry point +├── openapi.yaml # OpenAPI spec (served by app) +│ +├── pixelprobe/ # Main application package +│ ├── __init__.py +│ ├── auth.py # Authentication (session + Bearer token) +│ ├── celery_config.py # Celery setup with _make_context_task() factory +│ ├── config.py # Config classes selected via FLASK_ENV +│ ├── constants.py # Canonical extension lists, scan phase constants +│ ├── media_checker.py # Core FFmpeg/ImageMagick/PIL validation logic +│ ├── models.py # All SQLAlchemy models +│ ├── scheduler.py # APScheduler for periodic/scheduled scans +│ ├── scheduler_lock.py # Redis distributed lock for scheduler +│ ├── startup.py # Startup cleanup routines +│ ├── version.py # Single source of truth for version string +│ ├── tasks.py # Celery task definitions +│ ├── tasks_parallel.py # Parallel scan Celery tasks +│ ├── progress_utils.py # Progress tracking utilities +│ │ +│ ├── api/ # Route blueprints +│ │ ├── admin_routes.py # Configurations, schedules, exclusions, ignored patterns +│ │ ├── auth_routes.py # Login, logout, users, tokens +│ │ ├── export_routes.py # CSV/data export, file viewer +│ │ ├── healthcheck_routes.py # Healthcheck integration +│ │ ├── maintenance_routes.py # Cleanup, file-changes, vacuum +│ │ ├── notification_routes.py # Notification providers and rules +│ │ ├── reports_routes.py # Scan reports, PDF generation +│ │ ├── scan_routes.py # Core scan operations +│ │ ├── scan_routes_parallel.py # Parallel scan endpoint +│ │ └── stats_routes.py # Statistics, trends, system info +│ │ +│ ├── services/ # Business logic layer +│ │ ├── db_optimization.py +│ │ ├── export_service.py +│ │ ├── healthcheck_service.py +│ │ ├── maintenance_service.py +│ │ ├── notification_service.py +│ │ ├── scan_executor.py +│ │ ├── scan_service.py +│ │ └── stats_service.py +│ │ +│ ├── repositories/ # Data access layer +│ │ ├── base_repository.py +│ │ ├── config_repository.py +│ │ └── scan_repository.py +│ │ +│ ├── utils/ # Shared utilities +│ │ ├── celery_utils.py # check_celery_available, safe_check_task_state +│ │ ├── decorators.py +│ │ ├── helpers.py # ProgressTracker, batch_process, state utilities +│ │ ├── rate_limiting.py # rate_limit, exempt_from_rate_limit +│ │ ├── security.py # Path validation, SSRF protection, safe subprocess +│ │ ├── timezone.py # Timezone conversion utilities +│ │ └── validators.py # Input validation helpers +│ │ +│ └── migrations/ +│ └── startup.py # DB migrations run at startup │ -├── static/ # Static assets +├── static/ # Frontend assets │ ├── css/ -│ │ ├── desktop.css # Desktop styles (Modern design) -│ │ ├── mobile.css # Mobile-specific styles -│ │ └── logo-styles.css # Logo animations -│ ├── js/ -│ │ └── app.js # Modern ES6+ JavaScript application -│ └── images/ -│ ├── pixelprobe-logo.png -│ └── favicon files +│ ├── dist/ # Webpack build output (gitignored) +│ └── src/ # Webpack source (JS entry points) │ ├── templates/ # Jinja2 templates -│ ├── index.html # Modern UI template -│ └── api_docs.html # API documentation page +│ ├── index.html +│ └── api_docs.html │ -├── docker/ # Docker development files -│ ├── Dockerfile.modern -│ ├── Dockerfile.modern-simple -│ ├── docker-compose.modern.yml -│ └── docker-compose.dev.yml +├── tests/ # Test suite │ -├── scripts/ # Utility scripts -│ ├── docker-run-modern.sh -│ ├── run_modern_ui.sh -│ ├── run_test_ui.sh -│ ├── setup_and_run_local.sh -│ ├── setup_test_env.sh -│ ├── create_test_database.py -│ ├── test_database.py -│ ├── check_db_integrity.py -│ └── create_indexes.py +├── tools/ # Maintenance and utility scripts │ -├── tools/ # Database maintenance tools -│ ├── analyze_*.py # Various analysis scripts -│ ├── fix_*.py # Database fix scripts -│ └── README.md # Tools documentation +├── scripts/ # Shell scripts │ ├── docs/ # Documentation -│ └── screenshots/ # UI screenshots for README -│ ├── desktop-*.png -│ ├── mobile-*.png -│ └── *-modal.png -│ -├── instance/ # Flask instance folder (gitignored) -│ └── media_checker.db # SQLite database +│ ├── api/ # API documentation +│ ├── examples/ # Client examples (Python, Node.js, Bash) +│ └── ... │ -└── venv/ # Python virtual environment (gitignored) +├── Dockerfile # Production Docker image (linux/amd64) +├── docker-compose.yml # Production Docker Compose stack +├── package.json # Node.js dependencies (webpack build) +├── webpack.config.js # Webpack configuration +├── requirements.txt # Python production dependencies +├── requirements-test.txt # Python test dependencies +├── pytest.ini # Pytest configuration +├── .env.example # Environment variable template +├── README.md # Project README +└── CHANGELOG.md # Version changelog ``` -## Key Files - -### Application Core -- `app.py` - Flask routes and API endpoints -- `media_checker.py` - Media file scanning implementation -- `models.py` - Database schema definitions - -### Modern UI (v2.0+) -- `static/js/app.js` - Modular JavaScript with classes for: - - ThemeManager - Dark/light mode handling - - SidebarManager - Mobile navigation - - APIClient - Centralized API communication - - ProgressManager - 3-phase progress tracking - - TableManager - Advanced data display - - PixelProbeApp - Main application controller - -- `static/css/desktop.css` - Modern desktop styles -- `static/css/mobile.css` - Responsive mobile overrides -- `templates/index.html` - Modern UI template - -### Configuration -- `docker-compose.yml` - Production deployment -- `docker-compose.simple.yml` - Simplified deployment -- `Dockerfile` - Production container image - -## Development Notes - -### UI Architecture -The v2.0 UI uses a modern, modular JavaScript architecture with: -- ES6+ classes for organization -- CSS variables for theming -- Separate desktop/mobile stylesheets -- Sidebar navigation layout -- Modern color scheme (#1ce783) - -### API Endpoints -All API endpoints are prefixed with `/api/`: -- `/api/scan-results` - Get/search scan results -- `/api/scan-all` - Start full scan -- `/api/scan-status` - Get current scan status -- `/api/stats` - Get system statistics -- `/api/system-info` - Get detailed system information -- `/api/export-csv` - Export results to CSV +## Key Architecture Notes -### Docker Deployment -- Production: `docker-compose up -d` -- Development: `docker-compose -f docker/docker-compose.dev.yml up` -- Simple mode: `docker-compose -f docker-compose.simple.yml up -d` \ No newline at end of file +- **PostgreSQL only** since v2.2.0 (no SQLite support) +- **Celery** handles all scan tasks via Redis broker +- **APScheduler** runs in the celery-worker container for scheduled scans +- Extension lists are canonical in `pixelprobe/constants.py` +- DB migrations run automatically at startup via `pixelprobe/migrations/startup.py` +- Multi-worker startup uses PostgreSQL advisory locks for migration coordination +- Root contains only entry points (`app.py`, `celery_worker.py`) and build/config files +- All application code lives inside the `pixelprobe/` package diff --git a/docs/SYSTEM_ARCHITECTURE.md b/docs/SYSTEM_ARCHITECTURE.md index 345293a..5289088 100644 --- a/docs/SYSTEM_ARCHITECTURE.md +++ b/docs/SYSTEM_ARCHITECTURE.md @@ -44,8 +44,8 @@ PixelProbe is a distributed media corruption detection system built on a microse ### Container Descriptions -#### 1. Web Application Container (`mediachecker`) -- **Image**: `ttlequals0/pixelprobe:2.4.48` +#### 1. Web Application Container (`pixelprobe-app`) +- **Image**: `ttlequals0/pixelprobe:latest` - **Purpose**: Serves the web UI and REST API - **Responsibilities**: - Handle HTTP requests from users @@ -59,7 +59,7 @@ PixelProbe is a distributed media corruption detection system built on a microse - APScheduler for scheduled tasks #### 2. Celery Worker Container (`celery-worker`) -- **Image**: `ttlequals0/pixelprobe:2.4.48` (same image, different entry point) +- **Image**: `ttlequals0/pixelprobe:latest` (same image, different entry point) - **Purpose**: Process background tasks in parallel - **Responsibilities**: - Execute media scanning tasks @@ -273,7 +273,7 @@ BATCH_SIZE: 100 1. **Add More Workers**: ```yaml celery-worker-2: - image: ttlequals0/pixelprobe:2.2.47 + image: ttlequals0/pixelprobe:latest command: celery -A celery_app worker scale: 4 # Creates 4 instances ``` @@ -411,8 +411,8 @@ services: timeout: 5s retries: 5 - mediachecker: - image: ttlequals0/pixelprobe:2.2.47 + pixelprobe: + image: ttlequals0/pixelprobe:latest ports: - "5000:5000" environment: @@ -430,7 +430,7 @@ services: condition: service_healthy celery-worker: - image: ttlequals0/pixelprobe:2.2.47 + image: ttlequals0/pixelprobe:latest command: celery -A celery_app worker --loglevel=info --concurrency=8 environment: POSTGRES_HOST: postgres diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index b5a52c3..66ddba6 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -36,7 +36,7 @@ docker-compose logs pixelprobe-postgres --tail=50 docker-compose logs pixelprobe-redis --tail=50 # Check health status -curl http://localhost:5001/health +curl http://localhost:5000/health ``` ### Manual Installation @@ -417,7 +417,7 @@ docker exec pixelprobe-postgres psql -U pixelprobe -c \ "SELECT COUNT(*) FROM scan_results;" # Check scan state -curl http://localhost:5001/api/scan-status +curl http://localhost:5000/api/scan-status ``` **Solutions:** @@ -596,7 +596,7 @@ docker-compose logs pixelprobe-app | grep SECRET_KEY 1. **Verify token format:** ```bash -curl http://localhost:5001/api/stats \ +curl http://localhost:5000/api/stats \ -H "Authorization: Bearer your-token-here" ``` @@ -922,7 +922,7 @@ docker-compose ps docker-compose logs --tail=100 > logs.txt # Health check -curl http://localhost:5001/health +curl http://localhost:5000/health # Database status docker exec pixelprobe-postgres psql -U pixelprobe -c \ diff --git a/docs/api/SCAN_TYPES_DOCUMENTATION.md b/docs/api/SCAN_TYPES_DOCUMENTATION.md index bdf1120..a191c4a 100644 --- a/docs/api/SCAN_TYPES_DOCUMENTATION.md +++ b/docs/api/SCAN_TYPES_DOCUMENTATION.md @@ -6,7 +6,7 @@ PixelProbe offers multiple scan types to handle different use cases for media fi ## Scan Types ### 1. Full Scan (`full`) -**Endpoint**: `/api/scan` +**Endpoint**: `POST /api/scan` **Purpose**: Complete scan of specified directories **When to use**: - Initial setup of PixelProbe @@ -30,13 +30,12 @@ PixelProbe offers multiple scan types to handle different use cases for media fi POST /api/scan { "directories": ["/media/movies", "/media/photos"], - "scan_type": "full", "force_rescan": false } ``` ### 2. Parallel Scan (`parallel`) -**Endpoint**: `/api/scan-parallel` or `/api/scan-parallel-v2` +**Endpoint**: `POST /api/scan-parallel` **Purpose**: High-performance scanning using multiple workers **When to use**: - Large media libraries (100k+ files) @@ -62,7 +61,7 @@ POST /api/scan **Example**: ```json -POST /api/scan-parallel-v2 +POST /api/scan-parallel { "directories": ["/media"], "num_workers": 8 @@ -70,7 +69,7 @@ POST /api/scan-parallel-v2 ``` ### 3. Pending Scan (`pending`) -**Endpoint**: `/api/force-scan-pending` +**Endpoint**: `POST /api/force-scan-pending` **Purpose**: Scan only files marked as pending **When to use**: - After interrupted scans @@ -95,7 +94,7 @@ POST /api/force-scan-pending ``` ### 4. File Changes Scan (`file_changes`) -**Endpoint**: `/api/file-changes` +**Endpoint**: `GET/POST /api/file-changes` **Purpose**: Detect and scan modified files **When to use**: - Regular maintenance scans @@ -120,7 +119,7 @@ POST /api/file-changes ``` ### 5. Cleanup (`cleanup`) -**Endpoint**: `/api/cleanup-orphaned` +**Endpoint**: `POST /api/cleanup-orphaned` **Purpose**: Remove database entries for deleted files **When to use**: - After deleting media files @@ -144,7 +143,7 @@ POST /api/cleanup-orphaned ``` ### 6. Single File Scan (`single`) -**Endpoint**: `/api/scan-file` +**Endpoint**: `POST /api/scan-file` **Purpose**: Scan a specific file **When to use**: - Testing scan functionality diff --git a/docs/development/API_CONSOLIDATION_PLAN.md b/docs/development/API_CONSOLIDATION_PLAN.md deleted file mode 100644 index 34578d3..0000000 --- a/docs/development/API_CONSOLIDATION_PLAN.md +++ /dev/null @@ -1,71 +0,0 @@ - - - - -# API Endpoint Consolidation Plan - -## Current State (Too Many Similar Endpoints) - -### Scan Management Endpoints (KEEP & CONSOLIDATE) -- `/scan-all` → **`/scan`** (main endpoint, supports all scan types via parameters) -- `/scan-parallel` → **REMOVE** (merge into `/scan` with `parallel: true`) -- `/scan-file` → **KEEP** (specific file scanning is different enough) -- `/force-scan-pending` → **REMOVE** (merge into `/scan` with `pending_only: true`) - -### Stuck Scan Recovery (TOO MANY - CONSOLIDATE) -- `/reset-stuck-scans` → **REMOVE** -- `/recover-stuck-scan` → **REMOVE** -- `/force-cleanup-scan` → **REMOVE** -- **NEW**: `/scan/recovery` (single endpoint with action parameter) - -### Status & Info (KEEP AS IS) -- `/scan-status` → **KEEP** (essential) -- `/scan-results` → **KEEP** (view results) -- `/worker-status` → **KEEP** (monitoring) - -### Data Management (CONSOLIDATE) -- `/reset-for-rescan` → **REMOVE** -- `/reset-files-by-path` → **REMOVE** -- **NEW**: `/scan/reset` (single endpoint with scope parameter) - -## Proposed New Structure - -### Core Endpoints (v3.0) -``` -POST /api/scan # Start any type of scan - Parameters: - - type: "full" | "parallel" | "pending" | "discovery" - - directories: [] - - file_paths: [] - - force_rescan: bool - - num_workers: int (for parallel) - - deep_scan: bool - -POST /api/scan/file # Scan specific file(s) -POST /api/scan/cancel # Cancel running scan -POST /api/scan/recovery # Recover from stuck state - Parameters: - - action: "cleanup" | "reset" | "force" - -POST /api/scan/reset # Reset scan data - Parameters: - - scope: "all" | "path" | "pending" - - path: string (when scope="path") - -GET /api/scan/status # Current scan status -GET /api/scan/results # View results -GET /api/worker/status # Worker health -``` - -## Migration Strategy - -1. **Phase 1 (v2.2.36)**: Mark old endpoints as deprecated but keep working -2. **Phase 2 (v2.3.0)**: Add new consolidated endpoints -3. **Phase 3 (v3.0.0)**: Remove deprecated endpoints - -## Benefits -- Reduced from 15+ scan endpoints to ~8 -- Clearer API structure -- Easier to maintain -- More RESTful design -- Backward compatibility during transition \ No newline at end of file diff --git a/docs/development/P0_P1_IMPLEMENTATION_STATUS.md b/docs/development/P0_P1_IMPLEMENTATION_STATUS.md deleted file mode 100644 index fcaeebb..0000000 --- a/docs/development/P0_P1_IMPLEMENTATION_STATUS.md +++ /dev/null @@ -1,140 +0,0 @@ -# P0 and P1 Audit Implementation Status Report - -Generated: 2025-08-25 -Version: 2.2.50 - -## Executive Summary - -This report analyzes the completion status of P0 (Critical) and P1 (High Priority) tasks from the 2.1_AUDIT_IMPLEMENTATION_PLAN.md against the current codebase and CHANGELOG.md. - -## P0 (Critical) Tasks Status - -### 1. Memory Leak in Long Scans -**Status: COMPLETED (v2.2.46-47)** -- Implemented connection pool recovery and automatic retry logic -- Added connection disposal and recreation on critical errors -- Created DatabaseConnectionManager for robust connection handling -- Fixed transaction state management with automatic rollback -- Added connection recycling after 1 hour -- Evidence: `pixelprobe/database/connection_manager.py` created in v2.2.47 - -### 2. Scheduled Scans Not Running -**Status: COMPLETED (v2.2.50)** -- Fixed APScheduler Flask context issues -- Implemented HTTP self-call pattern as specified in audit plan -- Changed from non-existent `/api/start-scan` to correct `/api/scan` endpoint -- Added proper request headers and error handling -- Evidence: Complete refactor in v2.2.50 of `scheduler.py` - -### 3. Database Connection Management -**Status: COMPLETED (v2.2.46-47)** -- Implemented robust connection pooling with automatic recovery -- Added pool pre-ping to test connections before use -- Fixed "lost synchronization with server" PostgreSQL errors -- Fixed session lifecycle management in threads -- Added scoped sessions and proper cleanup -- Evidence: Multiple fixes in v2.2.46-47 for connection issues - -### 4. PostgreSQL Migration Evaluation -**Status: PARTIALLY COMPLETED** -- PostgreSQL support added (psycopg2-binary in requirements.txt) -- Connection pooling configuration implemented -- Database adapter pattern partially implemented -- However, full migration tools and scripts from plan not implemented -- SQLite remains the default, PostgreSQL is optional -- Evidence: PostgreSQL dependencies added but migration not forced - -## P1 (High Priority) Tasks Status - -### 1. Task Queue Implementation (Celery) -**Status: COMPLETED (v2.2.32-42)** -Major architectural improvement successfully implemented: -- Celery 5.3.4 integrated with Redis backend -- Fixed multiple Celery execution issues (v2.2.32-39): - - Synchronous execution in workers - - Task timeout issues for large scans - - Task redelivery problems - - Worker timeout configurations -- Universal parallel task distribution system (v2.2.42) -- Dynamic worker detection and utilization -- Proper task queuing with retry logic -- Evidence: `celery_config.py`, `pixelprobe/tasks.py`, `pixelprobe/tasks_parallel.py` - -### 2. Concurrency & Race Condition Fixes -**Status: COMPLETED (v2.2.49 and earlier)** -- Fixed multiple scans running simultaneously (v2.2.49) -- Added database constraints to enforce single active scan -- Fixed race conditions in Celery task queueing -- Implemented proper locking mechanisms -- Fixed stuck scan detection and recovery (v2.2.33-46) -- Added automatic cleanup of orphaned scans -- Evidence: `tools/fix_scan_concurrency.py` added in v2.2.49 - -## Additional Critical Fixes Not in Original Plan - -### Phase 3 Scanning Failure (v2.2.48) -**Status: FIXED** -- Critical bug where scans never reached Phase 3 (actual file scanning) -- Root cause: Missing `is_complete` column in `scan_chunks` table -- Added comprehensive migration tools with lock handling - -### HEVC False Positives (v2.2.49) -**Status: FIXED** -- Removed false warnings for valid 10-bit HEVC files -- Only actual corruption now flagged - -### UI Display Issues (v2.2.49) -**Status: FIXED** -- Fixed "61 million files" display bug -- Added proper number formatting - -## Implementation Summary - -### Completed P0 Tasks: 4 of 4 (100%) -1. Memory Leak in Long Scans -2. Scheduled Scans Not Running -3. Database Connection Management -4. PostgreSQL Migration (Partial) - -### Completed P1 Tasks: 2 of 2 (100%) -1. Task Queue Implementation (Celery) -2. Concurrency & Race Condition Fixes - -### Overall P0+P1 Completion: 6 of 6 tasks (100%) - -## Recommendations - -### All P0/P1 Tasks Complete -All critical and high-priority tasks from the audit implementation plan have been completed as of v2.2.50. - -### Future Improvements (P2/P3 Tasks) -1. **Complete PostgreSQL Migration Tools** - - Implement full migration scripts - - Add database adapter pattern fully - - Create migration documentation - -2. **Continue P2/P3 Tasks** - - Code consolidation (DRY principles) - - Frontend state management improvements - - Test coverage expansion - -## Technical Debt Addressed - -The implementation has successfully addressed most critical technical debt: -- Celery integration provides proper task queuing -- Database connection issues resolved -- Concurrency problems fixed -- Memory management improved -- Phase 3 scanning bug fixed - -## Production Impact - -Current v2.2.50 is production-ready with: -- Stable database connections -- Proper task distribution -- No false positive warnings -- Correct UI display -- Single scan enforcement -- **Working scheduled scans via HTTP self-calls** - -All critical functionality from the audit implementation plan is now operational. \ No newline at end of file diff --git a/init_db.py b/init_db.py deleted file mode 100644 index 2cd4b68..0000000 --- a/init_db.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python3 -"""Initialize database tables for PixelProbe v2.4.0""" - -from app import app -from models import db - -with app.app_context(): - # Create all tables - db.create_all() - print("Database tables created successfully") \ No newline at end of file diff --git a/migrations/add_authentication_tables.py b/migrations/add_authentication_tables.py deleted file mode 100644 index f15d009..0000000 --- a/migrations/add_authentication_tables.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python3 -""" -Migration to add authentication tables for v2.4.0 -""" - -import os -import sys -import logging -from datetime import datetime, timezone -from pathlib import Path - -# Add parent directory to path -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from sqlalchemy import create_engine, text -from config import get_database_url -from models import db, User - -logger = logging.getLogger(__name__) - -def run_migration(): - """Add user authentication and API token tables""" - - database_url = get_database_url() - engine = create_engine(database_url) - - try: - with engine.connect() as conn: - # Create users table - conn.execute(text(""" - CREATE TABLE IF NOT EXISTS users ( - id SERIAL PRIMARY KEY, - username VARCHAR(80) UNIQUE NOT NULL, - email VARCHAR(120) UNIQUE NOT NULL, - password_hash VARCHAR(128) NOT NULL, - is_admin BOOLEAN NOT NULL DEFAULT TRUE, - created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, - last_login TIMESTAMP WITH TIME ZONE, - is_active BOOLEAN NOT NULL DEFAULT TRUE, - first_setup_required BOOLEAN NOT NULL DEFAULT FALSE - ) - """)) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_users_username ON users(username)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_users_email ON users(email)")) - - # Create API tokens table - conn.execute(text(""" - CREATE TABLE IF NOT EXISTS api_tokens ( - id SERIAL PRIMARY KEY, - user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE, - token VARCHAR(64) UNIQUE NOT NULL, - description VARCHAR(200), - created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, - last_used TIMESTAMP WITH TIME ZONE, - expires_at TIMESTAMP WITH TIME ZONE, - is_active BOOLEAN NOT NULL DEFAULT TRUE - ) - """)) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_api_tokens_token ON api_tokens(token)")) - conn.execute(text("CREATE INDEX IF NOT EXISTS idx_api_tokens_user_id ON api_tokens(user_id)")) - - # No longer create default admin user automatically - # Users must use the /api/auth/setup endpoint on first run - logger.info("Authentication tables created. Use /api/auth/setup to create initial admin user.") - - conn.commit() - logger.info("Authentication tables created successfully") - - except Exception as e: - logger.error(f"Migration failed: {e}") - raise - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - run_migration() - print("Authentication tables migration completed successfully") \ No newline at end of file diff --git a/migrations/add_celery_task_id_column.py b/migrations/add_celery_task_id_column.py deleted file mode 100644 index 5359262..0000000 --- a/migrations/add_celery_task_id_column.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python3 -""" -Migration: Add celery_task_id column to scan_state table -P1 Implementation per 2.1_AUDIT_IMPLEMENTATION_PLAN.md - -This migration adds support for tracking Celery task IDs in the scan state. -""" - -import os -import sys -import logging -from sqlalchemy import create_engine, text -from sqlalchemy.exc import OperationalError, ProgrammingError -from urllib.parse import quote_plus - -# Add parent directory to path for imports -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from config import Config - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def get_database_connection(): - """Get database connection based on environment""" - config = Config() - - # Build complete database URI - if config.POSTGRES_PASSWORD: - encoded_password = quote_plus(config.POSTGRES_PASSWORD) - database_uri = ( - f"postgresql://{config.POSTGRES_USER}:{encoded_password}@" - f"{config.POSTGRES_HOST}:{config.POSTGRES_PORT}/{config.POSTGRES_DB}" - ) - else: - database_uri = ( - f"postgresql://{config.POSTGRES_USER}@" - f"{config.POSTGRES_HOST}:{config.POSTGRES_PORT}/{config.POSTGRES_DB}" - ) - - logger.info(f"Connecting to database: {config.POSTGRES_HOST}:{config.POSTGRES_PORT}/{config.POSTGRES_DB}") - - engine = create_engine(database_uri) - return engine - - -def migrate_database(): - """Add celery_task_id column to scan_state table""" - try: - engine = get_database_connection() - - with engine.connect() as conn: - # Start transaction - trans = conn.begin() - - try: - # Check if column already exists - logger.info("Checking if celery_task_id column already exists...") - - result = conn.execute(text(""" - SELECT column_name - FROM information_schema.columns - WHERE table_name = 'scan_state' - AND column_name = 'celery_task_id' - """)) - - if result.fetchone(): - logger.info("celery_task_id column already exists, skipping migration") - trans.rollback() - return True - - # Add the column - logger.info("Adding celery_task_id column to scan_state table...") - conn.execute(text(""" - ALTER TABLE scan_state - ADD COLUMN celery_task_id VARCHAR(36) - """)) - - # Create index for performance - logger.info("Creating index on celery_task_id column...") - conn.execute(text(""" - CREATE INDEX IF NOT EXISTS idx_scan_state_celery_task_id - ON scan_state(celery_task_id) - """)) - - # Commit transaction - trans.commit() - logger.info("Successfully added celery_task_id column and index") - - return True - - except Exception as e: - trans.rollback() - logger.error(f"Error during migration: {e}") - raise - - except Exception as e: - logger.error(f"Migration failed: {e}") - return False - - -def verify_migration(): - """Verify the migration was successful""" - try: - engine = get_database_connection() - - with engine.connect() as conn: - # Check column exists - result = conn.execute(text(""" - SELECT column_name, data_type, is_nullable - FROM information_schema.columns - WHERE table_name = 'scan_state' - AND column_name = 'celery_task_id' - """)) - - column_info = result.fetchone() - if column_info: - logger.info(f"✅ Column verified: {column_info[0]} ({column_info[1]}, nullable: {column_info[2]})") - - # Check index exists - result = conn.execute(text(""" - SELECT indexname - FROM pg_indexes - WHERE tablename = 'scan_state' - AND indexname = 'idx_scan_state_celery_task_id' - """)) - - if result.fetchone(): - logger.info("✅ Index verified: idx_scan_state_celery_task_id") - return True - else: - logger.error("❌ Index not found") - return False - else: - logger.error("❌ Column not found") - return False - - except Exception as e: - logger.error(f"Verification failed: {e}") - return False - - -if __name__ == '__main__': - logger.info("=== P1 Celery Task ID Migration ===") - logger.info("Adding celery_task_id column to scan_state table") - - # Run migration - if migrate_database(): - logger.info("Migration completed successfully") - - # Verify migration - if verify_migration(): - logger.info("Migration verification passed") - sys.exit(0) - else: - logger.error("Migration verification failed") - sys.exit(1) - else: - logger.error("Migration failed") - sys.exit(1) \ No newline at end of file diff --git a/migrations/add_celery_task_id_column.sql b/migrations/add_celery_task_id_column.sql deleted file mode 100644 index 957a22e..0000000 --- a/migrations/add_celery_task_id_column.sql +++ /dev/null @@ -1,25 +0,0 @@ --- Migration to add celery_task_id column to scan_chunks table --- This column was added in v2.2.59 but the migration was never run - --- Check if column exists before adding it -DO $$ -BEGIN - IF NOT EXISTS ( - SELECT 1 - FROM information_schema.columns - WHERE table_name = 'scan_chunks' - AND column_name = 'celery_task_id' - ) THEN - ALTER TABLE scan_chunks - ADD COLUMN celery_task_id VARCHAR(36); - - -- Create index for faster lookups - CREATE INDEX idx_scan_chunks_celery_task_id - ON scan_chunks (celery_task_id) - WHERE celery_task_id IS NOT NULL; - - RAISE NOTICE 'Column celery_task_id added to scan_chunks table'; - ELSE - RAISE NOTICE 'Column celery_task_id already exists in scan_chunks table'; - END IF; -END $$; \ No newline at end of file diff --git a/migrations/add_exclusions_table.py b/migrations/add_exclusions_table.py deleted file mode 100644 index b1b3dd6..0000000 --- a/migrations/add_exclusions_table.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Add exclusions table for storing path and extension exclusions - -This migration creates the exclusions table to store path and extension -exclusions in the database instead of using JSON files, which improves -persistence in containerized environments. -""" - -from alembic import op -import sqlalchemy as sa -from datetime import datetime, timezone - - -def upgrade(): - """Create exclusions table""" - op.create_table( - 'exclusions', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('exclusion_type', sa.String(20), nullable=False), - sa.Column('value', sa.String(500), nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), nullable=False, default=lambda: datetime.now(timezone.utc)), - sa.Column('is_active', sa.Boolean(), nullable=False, default=True), - sa.PrimaryKeyConstraint('id'), - sa.UniqueConstraint('exclusion_type', 'value', name='_type_value_uc') - ) - - # Create index for faster queries - op.create_index('idx_exclusions_type_active', 'exclusions', ['exclusion_type', 'is_active']) - - -def downgrade(): - """Drop exclusions table""" - op.drop_index('idx_exclusions_type_active', 'exclusions') - op.drop_table('exclusions') \ No newline at end of file diff --git a/migrations/add_scan_tracking_columns.py b/migrations/add_scan_tracking_columns.py deleted file mode 100644 index b77af3d..0000000 --- a/migrations/add_scan_tracking_columns.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 -""" -Database migration to add tracking columns to scan_state table -Run this migration to add num_workers, files_added, and files_updated columns -""" - -import os -import sys -import logging -from sqlalchemy import create_engine, text -from sqlalchemy.exc import OperationalError, ProgrammingError - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -def run_migration(): - """Add new tracking columns to scan_state table""" - - # Get database URL from environment - database_url = os.environ.get('DATABASE_URL') - if not database_url: - # Try constructing from individual env vars - pg_host = os.environ.get('POSTGRES_HOST', 'localhost') - pg_port = os.environ.get('POSTGRES_PORT', '5432') - pg_db = os.environ.get('POSTGRES_DB', 'pixelprobe') - pg_user = os.environ.get('POSTGRES_USER', 'pixelprobe') - pg_pass = os.environ.get('POSTGRES_PASSWORD', '') - - if pg_pass: - database_url = f"postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}" - else: - logger.error("Database connection details not found in environment") - return False - - try: - engine = create_engine(database_url) - - with engine.connect() as conn: - # Check if columns already exist - check_query = text(""" - SELECT column_name - FROM information_schema.columns - WHERE table_name='scan_state' - AND column_name IN ('num_workers', 'files_added', 'files_updated') - """) - - existing_columns = [row[0] for row in conn.execute(check_query)] - - # Add missing columns - if 'num_workers' not in existing_columns: - logger.info("Adding num_workers column to scan_state table...") - conn.execute(text("ALTER TABLE scan_state ADD COLUMN num_workers INTEGER DEFAULT 1")) - conn.commit() - logger.info("✓ Added num_workers column") - else: - logger.info("✓ num_workers column already exists") - - if 'files_added' not in existing_columns: - logger.info("Adding files_added column to scan_state table...") - conn.execute(text("ALTER TABLE scan_state ADD COLUMN files_added INTEGER DEFAULT 0")) - conn.commit() - logger.info("✓ Added files_added column") - else: - logger.info("✓ files_added column already exists") - - if 'files_updated' not in existing_columns: - logger.info("Adding files_updated column to scan_state table...") - conn.execute(text("ALTER TABLE scan_state ADD COLUMN files_updated INTEGER DEFAULT 0")) - conn.commit() - logger.info("✓ Added files_updated column") - else: - logger.info("✓ files_updated column already exists") - - logger.info("Migration completed successfully!") - return True - - except (OperationalError, ProgrammingError) as e: - logger.error(f"Database migration failed: {e}") - return False - except Exception as e: - logger.error(f"Unexpected error during migration: {e}") - return False - -if __name__ == "__main__": - success = run_migration() - sys.exit(0 if success else 1) \ No newline at end of file diff --git a/operation_handlers.py b/operation_handlers.py deleted file mode 100644 index 79a332b..0000000 --- a/operation_handlers.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Unified operation handlers to reduce code redundancy""" - -import logging -import time -from datetime import datetime, timezone -from utils import ProgressTracker, log_operation_status, batch_process, mark_operation_complete, mark_operation_error - -logger = logging.getLogger(__name__) - - -class BaseOperationHandler: - """Base class for all operation handlers (scan, cleanup, file-changes) - - This provides common functionality and reduces code duplication across operations. - """ - - def __init__(self, operation_type, db, app): - self.operation_type = operation_type - self.db = db - self.app = app - self.progress_tracker = ProgressTracker(operation_type) - self.start_time = None - self.is_cancelled = False - - def check_cancellation(self, state_record): - """Check if operation has been cancelled""" - if state_record and not state_record.is_active: - self.is_cancelled = True - log_operation_status(self.operation_type, 'cancel') - return True - return False - - def update_progress(self, state_record, **kwargs): - """Update progress in database""" - if state_record: - for key, value in kwargs.items(): - if hasattr(state_record, key): - setattr(state_record, key, value) - self.db.session.commit() - - def handle_error(self, state_record, error_message): - """Handle operation errors consistently""" - logger.error(f"Error in {self.operation_type}: {error_message}") - if state_record: - mark_operation_error(state_record, error_message) - self.db.session.commit() - - def complete_operation(self, state_record, message=None): - """Mark operation as complete""" - if state_record: - mark_operation_complete(state_record, message) - self.db.session.commit() - - def get_progress_message(self, phase_name, files_processed=0, total_files=0, current_file=None): - """Get formatted progress message""" - return self.progress_tracker.get_progress_message( - phase_name, files_processed, total_files, current_file - ) - - def log_batch_progress(self, batch_num, total_batches, batch_start_time): - """Log batch processing progress""" - batch_time = time.time() - batch_start_time - logger.info(f"Completed batch {batch_num}/{total_batches} in {batch_time:.1f}s") - - -def create_async_operation_handler(operation_type): - """Factory function to create operation handlers with common error handling - - This eliminates the redundant async operation patterns. - """ - def wrapper(operation_func): - def async_handler(app, db, operation_id, *args, **kwargs): - with app.app_context(): - try: - # Run the actual operation - result = operation_func(app, db, operation_id, *args, **kwargs) - return result - except Exception as e: - logger.error(f"Error in {operation_type} operation: {str(e)}") - # Update state to error - try: - if operation_type == 'cleanup': - from models import CleanupState - state = CleanupState.query.filter_by(cleanup_id=operation_id).first() - elif operation_type == 'file_changes': - from models import FileChangesState - state = FileChangesState.query.filter_by(check_id=operation_id).first() - else: - state = None - - if state: - mark_operation_error(state, str(e)) - db.session.commit() - except: - logger.error(f"Failed to update {operation_type} state to error") - raise - return async_handler - return wrapper \ No newline at end of file diff --git a/pixelprobe/__init__.py b/pixelprobe/__init__.py index bc09ea9..4fb8dd4 100644 --- a/pixelprobe/__init__.py +++ b/pixelprobe/__init__.py @@ -2,7 +2,7 @@ PixelProbe - Media file corruption detection tool """ -from version import __version__ +from pixelprobe.version import __version__ from . import api from . import services diff --git a/pixelprobe/api/admin_routes.py b/pixelprobe/api/admin_routes.py index 9391f02..3fe27a5 100644 --- a/pixelprobe/api/admin_routes.py +++ b/pixelprobe/api/admin_routes.py @@ -6,10 +6,10 @@ from apscheduler.triggers.cron import CronTrigger -from models import db, ScanResult, IgnoredErrorPattern, ScanConfiguration, ScanSchedule -from scheduler import MediaScheduler +from pixelprobe.models import db, ScanResult, IgnoredErrorPattern, ScanConfiguration, ScanSchedule +from pixelprobe.scheduler import MediaScheduler from pixelprobe.utils.security import validate_json_input, AuditLogger, validate_directory_path -from auth import auth_required +from pixelprobe.auth import auth_required logger = logging.getLogger(__name__) @@ -395,7 +395,7 @@ def delete_schedule(schedule_id): def get_exclusions(): """Get current exclusion settings from database""" try: - from models import Exclusion + from pixelprobe.models import Exclusion # Get all active exclusions path_exclusions = Exclusion.query.filter_by( @@ -423,7 +423,7 @@ def update_exclusions(): data = request.get_json() try: - from models import Exclusion + from pixelprobe.models import Exclusion # Validate data structure if not isinstance(data.get('paths', []), list) or not isinstance(data.get('extensions', []), list): @@ -463,7 +463,7 @@ def add_exclusion(exclusion_type): return {'error': 'Value is required'}, 400 try: - from models import Exclusion + from pixelprobe.models import Exclusion # Check if already exists existing = Exclusion.query.filter_by( @@ -508,7 +508,7 @@ def remove_exclusion(exclusion_type): return {'error': 'Value is required'}, 400 try: - from models import Exclusion + from pixelprobe.models import Exclusion # Find the exclusion exclusion = Exclusion.query.filter_by( diff --git a/pixelprobe/api/auth_decorator.py b/pixelprobe/api/auth_decorator.py index 8d0e915..68692b4 100644 --- a/pixelprobe/api/auth_decorator.py +++ b/pixelprobe/api/auth_decorator.py @@ -4,7 +4,7 @@ """ from functools import wraps -from auth import auth_required +from pixelprobe.auth import auth_required def apply_auth_to_blueprint(blueprint): """ diff --git a/pixelprobe/api/auth_routes.py b/pixelprobe/api/auth_routes.py index ab83182..9677f49 100644 --- a/pixelprobe/api/auth_routes.py +++ b/pixelprobe/api/auth_routes.py @@ -11,8 +11,8 @@ from datetime import datetime, timezone, timedelta from flask import Blueprint, request, jsonify, session, render_template, redirect, url_for from flask_login import login_user, logout_user, login_required, current_user -from models import db, User, APIToken -from auth import authenticate_user, check_first_run, create_initial_admin, auth_required, admin_required, get_authenticated_user +from pixelprobe.models import db, User, APIToken +from pixelprobe.auth import authenticate_user, check_first_run, create_initial_admin, auth_required, admin_required, get_authenticated_user logger = logging.getLogger(__name__) diff --git a/pixelprobe/api/export_routes.py b/pixelprobe/api/export_routes.py index 1309bec..691f732 100644 --- a/pixelprobe/api/export_routes.py +++ b/pixelprobe/api/export_routes.py @@ -6,8 +6,8 @@ import logging from datetime import datetime, timezone -from models import db, ScanResult -from auth import auth_required +from pixelprobe.models import db, ScanResult +from pixelprobe.auth import auth_required logger = logging.getLogger(__name__) diff --git a/pixelprobe/api/healthcheck_routes.py b/pixelprobe/api/healthcheck_routes.py index 2174a42..9fbfb09 100644 --- a/pixelprobe/api/healthcheck_routes.py +++ b/pixelprobe/api/healthcheck_routes.py @@ -6,8 +6,8 @@ import logging from datetime import datetime, timezone -from models import db, HealthcheckConfig, ScanSchedule -from auth import auth_required +from pixelprobe.models import db, HealthcheckConfig, ScanSchedule +from pixelprobe.auth import auth_required from pixelprobe.services.healthcheck_service import HealthcheckService logger = logging.getLogger(__name__) diff --git a/pixelprobe/api/maintenance_routes.py b/pixelprobe/api/maintenance_routes.py index ae77a53..0b075fc 100644 --- a/pixelprobe/api/maintenance_routes.py +++ b/pixelprobe/api/maintenance_routes.py @@ -7,10 +7,10 @@ from datetime import datetime, timezone import pytz -from models import db, ScanResult, CleanupState, FileChangesState -from media_checker import PixelProbe -from auth import auth_required -from utils import ProgressTracker +from pixelprobe.models import db, ScanResult, CleanupState, FileChangesState +from pixelprobe.media_checker import PixelProbe +from pixelprobe.auth import auth_required +from pixelprobe.utils.helpers import ProgressTracker from pixelprobe.services.maintenance_service import MaintenanceService logger = logging.getLogger(__name__) @@ -475,7 +475,7 @@ def check_file_changes(): scan_state = None if file_paths and len(file_paths) == 1: try: - from models import ScanState + from pixelprobe.models import ScanState scan_state = ScanState.create_new_scan() scan_state.scan_id = check_id scan_state.start_scan(file_paths, force_rescan=False) diff --git a/pixelprobe/api/notification_routes.py b/pixelprobe/api/notification_routes.py index 4063196..e91eae6 100644 --- a/pixelprobe/api/notification_routes.py +++ b/pixelprobe/api/notification_routes.py @@ -8,8 +8,8 @@ from datetime import datetime, timezone from typing import Optional -from models import db, NotificationProvider, NotificationRule -from auth import auth_required +from pixelprobe.models import db, NotificationProvider, NotificationRule +from pixelprobe.auth import auth_required from pixelprobe.services.notification_service import NotificationService from pixelprobe.utils.security import validate_safe_url diff --git a/pixelprobe/api/reports_routes.py b/pixelprobe/api/reports_routes.py index 8ab4183..220fc67 100644 --- a/pixelprobe/api/reports_routes.py +++ b/pixelprobe/api/reports_routes.py @@ -8,9 +8,9 @@ import base64 import pytz -from models import db, ScanReport +from pixelprobe.models import db, ScanReport from pixelprobe.utils.security import validate_json_input -from auth import auth_required +from pixelprobe.auth import auth_required logger = logging.getLogger(__name__) @@ -170,7 +170,7 @@ def export_scan_report(report_id): report_dict['created_at'] = convert_to_timezone(report.created_at) # Add scan results - handle cleanup reports differently - from models import ScanResult + from pixelprobe.models import ScanResult if report.scan_type == 'cleanup' and report.directories_scanned: # For cleanup reports, the orphaned files list is stored in directories_scanned field @@ -319,7 +319,7 @@ def generate_pdf_report(scan_type, scan_id): elements.append(Spacer(1, 0.2*inch)) # Query scan results based on scan type - from models import ScanResult + from pixelprobe.models import ScanResult if scan_type == 'rescan' and '_' in scan_id: # For rescan, parse the file path from scan_id file_path = scan_id.replace('_', '/') @@ -661,7 +661,7 @@ def export_scan_report_pdf(report_id): elements.append(Paragraph("Scanned Files", heading_style)) # Query files based on scan type - from models import ScanResult + from pixelprobe.models import ScanResult if report.scan_type == 'cleanup': # For cleanup reports, the orphaned files list is stored in directories_scanned field as JSON @@ -905,7 +905,7 @@ def __init__(self, file_path): # Add scanned files list for scan reports if report.scan_type in ['full_scan', 'rescan']: - from models import ScanResult + from pixelprobe.models import ScanResult scanned_files = ScanResult.query.filter( ScanResult.scan_date >= report.start_time, ScanResult.scan_date <= (report.end_time or datetime.now(timezone.utc)) @@ -1148,7 +1148,7 @@ def download_multiple_reports(): # Add scanned files if available for scan reports if report.scan_type in ['full_scan', 'rescan']: # Query scan results for this report's time period - from models import ScanResult + from pixelprobe.models import ScanResult scanned_files = ScanResult.query.filter( ScanResult.scan_date >= report.start_time, ScanResult.scan_date <= (report.end_time or datetime.now(timezone.utc)) diff --git a/pixelprobe/api/scan_routes.py b/pixelprobe/api/scan_routes.py index 5429689..0c6121f 100644 --- a/pixelprobe/api/scan_routes.py +++ b/pixelprobe/api/scan_routes.py @@ -5,11 +5,11 @@ from datetime import datetime, timezone, timedelta from pixelprobe.utils.timezone import from_utc_to_configured -from media_checker import PixelProbe, load_exclusions -from models import db, ScanResult, ScanState +from pixelprobe.media_checker import PixelProbe, load_exclusions +from pixelprobe.models import db, ScanResult, ScanState from pixelprobe.constants import TERMINAL_SCAN_PHASES -from version import __version__ -from auth import auth_required +from pixelprobe.version import __version__ +from pixelprobe.auth import auth_required from pixelprobe.utils.security import ( validate_file_path, validate_directory_path, @@ -143,8 +143,17 @@ def is_scan_running(): db.session.commit() return False elif task_state is None: - # Couldn't check task state - assume running to avoid blocking forever - logger.warning(f"Could not check Celery task {active_scan.celery_task_id} state - assuming running") + # Can't check task state -- fall through to time-based stuck detection + # instead of assuming the scan is running indefinitely + if check_time and (datetime.now(timezone.utc) - check_time) > timedelta(hours=1): + logger.warning(f"Scan {active_scan.scan_id} task state unknown and no update for over 1 hour - marking crashed") + active_scan.is_active = False + active_scan.phase = 'crashed' + active_scan.error_message = 'Celery task state unknown - worker may have crashed' + db.session.commit() + return False + # Give benefit of the doubt for shorter periods + logger.warning(f"Could not check Celery task {active_scan.celery_task_id} state - assuming running (last update: {check_time})") return True # If no Celery task ID, it's likely a direct scan - check if thread is alive @@ -509,7 +518,7 @@ def scan(): else: # Fallback to direct scan service (backward compatibility) logger.info("Celery not available, using direct scan service") - from config import Config + from pixelprobe.config import Config result = current_app.scan_service.scan_directories( validated_dirs, force_rescan=force_rescan, @@ -637,7 +646,7 @@ def get_scan_status(): import os filename = os.path.basename(current_file) # Generate fresh progress message with current data - from utils import ProgressTracker + from pixelprobe.utils.helpers import ProgressTracker progress_tracker = ProgressTracker('scan') # Use the actual scan start time if available if state_dict.get('start_time'): @@ -1176,7 +1185,7 @@ def force_scan_pending(): } else: # Fallback to direct scan service - from config import Config + from pixelprobe.config import Config result = current_app.scan_service.scan_directories( directories=['PENDING_FILES_SCAN'], # Special marker force_rescan=False, diff --git a/pixelprobe/api/scan_routes_parallel.py b/pixelprobe/api/scan_routes_parallel.py index 81766ad..7e38541 100644 --- a/pixelprobe/api/scan_routes_parallel.py +++ b/pixelprobe/api/scan_routes_parallel.py @@ -11,8 +11,8 @@ from pixelprobe.utils.security import validate_directory_path, AuditLogger, PathTraversalError, validate_json_input from pixelprobe.utils.rate_limiting import rate_limit -from auth import auth_required -from models import ScanState +from pixelprobe.auth import auth_required +from pixelprobe.models import ScanState logger = logging.getLogger(__name__) @@ -138,7 +138,7 @@ def get_parallel_scan_status(scan_id): - Files processed per worker """ try: - from models import ScanState, ScanChunk + from pixelprobe.models import ScanState, ScanChunk from celery import current_app as celery_app # Get scan state diff --git a/pixelprobe/api/stats_routes.py b/pixelprobe/api/stats_routes.py index c2d0945..959ff28 100644 --- a/pixelprobe/api/stats_routes.py +++ b/pixelprobe/api/stats_routes.py @@ -5,11 +5,11 @@ import logging from datetime import datetime, timezone -from models import db, ScanResult -from version import __version__ +from pixelprobe.models import db, ScanResult +from pixelprobe.version import __version__ from pixelprobe.utils.timezone import from_utc_to_configured, get_configured_timezone_name from pixelprobe.services.stats_service import StatsService -from auth import auth_required +from pixelprobe.auth import auth_required logger = logging.getLogger(__name__) @@ -452,7 +452,7 @@ def get_system_info(): logger.warning(f"SQLite fallback also failed, using basic query: {e2}") # Final fallback to basic ORM queries try: - from models import ScanReport + from pixelprobe.models import ScanReport total_scans = ScanReport.query.filter_by(status='completed').count() except: total_scans = 0 diff --git a/auth.py b/pixelprobe/auth.py similarity index 99% rename from auth.py rename to pixelprobe/auth.py index ed831c5..68f41a9 100644 --- a/auth.py +++ b/pixelprobe/auth.py @@ -11,7 +11,7 @@ from flask import jsonify, request, redirect, url_for, session, current_app from flask_login import LoginManager, login_user, logout_user, login_required, current_user from flask_restx import abort as restx_abort -from models import User, APIToken, db +from pixelprobe.models import User, APIToken, db logger = logging.getLogger(__name__) diff --git a/celery_config.py b/pixelprobe/celery_config.py similarity index 100% rename from celery_config.py rename to pixelprobe/celery_config.py diff --git a/config.py b/pixelprobe/config.py similarity index 100% rename from config.py rename to pixelprobe/config.py diff --git a/media_checker.py b/pixelprobe/media_checker.py similarity index 99% rename from media_checker.py rename to pixelprobe/media_checker.py index 5252f7a..58afe5a 100644 --- a/media_checker.py +++ b/pixelprobe/media_checker.py @@ -2055,7 +2055,7 @@ def _check_cache(self, file_path, file_hash, last_modified): session = None try: - from models import ScanResult + from pixelprobe.models import ScanResult session = self._get_db_session() if not session: return None @@ -2107,7 +2107,7 @@ def _save_to_cache(self, file_path, scan_result): session = None try: - from models import ScanResult + from pixelprobe.models import ScanResult from datetime import datetime, timezone import traceback @@ -2201,7 +2201,7 @@ def _check_ignored_patterns(self, error_output): session = None try: - from models import IgnoredErrorPattern + from pixelprobe.models import IgnoredErrorPattern session = self._get_db_session() if not session: return False diff --git a/models.py b/pixelprobe/models.py similarity index 99% rename from models.py rename to pixelprobe/models.py index 66cf4d5..fa08a5e 100644 --- a/models.py +++ b/pixelprobe/models.py @@ -326,7 +326,7 @@ class ScanState(db.Model): def to_dict(self): # Import here to avoid circular imports - from utils import create_state_dict + from pixelprobe.utils.helpers import create_state_dict return create_state_dict(self, extra_fields=['estimated_total', 'discovery_count']) @staticmethod @@ -590,7 +590,7 @@ class CleanupState(db.Model): def to_dict(self): # Import here to avoid circular imports - from utils import create_state_dict + from pixelprobe.utils.helpers import create_state_dict return create_state_dict(self, extra_fields=['orphaned_found']) class FileChangesState(db.Model): @@ -618,7 +618,7 @@ class FileChangesState(db.Model): def to_dict(self): # Import here to avoid circular imports - from utils import create_state_dict + from pixelprobe.utils.helpers import create_state_dict result = create_state_dict(self, extra_fields=['changes_found', 'corrupted_found']) # Handle special case for changed_files JSON field result['changed_files'] = json.loads(self.changed_files) if self.changed_files else [] diff --git a/pixelprobe/models/__init__.py b/pixelprobe/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pixelprobe/repositories/base_repository.py b/pixelprobe/repositories/base_repository.py index 4b3886e..8c6223e 100644 --- a/pixelprobe/repositories/base_repository.py +++ b/pixelprobe/repositories/base_repository.py @@ -4,7 +4,7 @@ from typing import TypeVar, Generic, List, Optional, Dict, Any from sqlalchemy.orm import Query -from models import db +from pixelprobe.models import db T = TypeVar('T') diff --git a/pixelprobe/repositories/config_repository.py b/pixelprobe/repositories/config_repository.py index bc369d9..edbc3bc 100644 --- a/pixelprobe/repositories/config_repository.py +++ b/pixelprobe/repositories/config_repository.py @@ -5,7 +5,7 @@ from typing import List, Optional from datetime import datetime, timezone -from models import ScanConfiguration, IgnoredErrorPattern, ScanSchedule +from pixelprobe.models import ScanConfiguration, IgnoredErrorPattern, ScanSchedule from .base_repository import BaseRepository class ConfigurationRepository(BaseRepository[ScanConfiguration]): diff --git a/pixelprobe/repositories/scan_repository.py b/pixelprobe/repositories/scan_repository.py index 3cb25af..f9fe22d 100644 --- a/pixelprobe/repositories/scan_repository.py +++ b/pixelprobe/repositories/scan_repository.py @@ -6,7 +6,7 @@ from datetime import datetime from sqlalchemy import text, and_, or_ -from models import ScanResult, ScanState +from pixelprobe.models import ScanResult, ScanState from .base_repository import BaseRepository class ScanRepository(BaseRepository[ScanResult]): diff --git a/scheduler.py b/pixelprobe/scheduler.py similarity index 95% rename from scheduler.py rename to pixelprobe/scheduler.py index a822423..be251e2 100644 --- a/scheduler.py +++ b/pixelprobe/scheduler.py @@ -6,7 +6,7 @@ from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.triggers.cron import CronTrigger from apscheduler.triggers.interval import IntervalTrigger -from models import db, ScanSchedule, ScanResult, ScanState, HealthcheckConfig, ScanReport +from pixelprobe.models import db, ScanSchedule, ScanResult, ScanState, HealthcheckConfig, ScanReport from pixelprobe.constants import TERMINAL_SCAN_PHASES from sqlalchemy import text import threading @@ -617,11 +617,25 @@ def _check_stuck_scans(self): last_update = scan.last_update if last_update and last_update.tzinfo is None: last_update = last_update.replace(tzinfo=timezone.utc) - + start_time = scan.start_time if start_time and start_time.tzinfo is None: start_time = start_time.replace(tzinfo=timezone.utc) - + + # Secondary check: if scan has a Celery task, verify task is still alive. + # A lost/crashed Celery task with a stale last_update is a definitive indicator. + celery_task_gone = False + if scan.celery_task_id: + try: + from pixelprobe.utils.celery_utils import check_celery_available, safe_check_task_state + from flask import current_app + if check_celery_available(): + task_state = safe_check_task_state(scan.celery_task_id, current_app.celery) + if task_state in ['SUCCESS', 'FAILURE', 'REVOKED'] or task_state is None: + celery_task_gone = True + except Exception: + pass # Celery check failed, rely on time-based detection + # Check if scan has been running for more than 30 minutes without update if last_update and last_update < stuck_threshold: logger.warning(f"Marking stuck scan {scan.scan_id} as crashed - no update since {last_update}") @@ -629,6 +643,13 @@ def _check_stuck_scans(self): scan.phase = 'crashed' scan.error_message = f"Scan appears stuck - no activity for over 30 minutes (last update: {last_update})" scans_to_mark.append(scan) + elif celery_task_gone and last_update and last_update < (current_time - timedelta(minutes=5)): + # Celery task is gone AND no update for 5+ minutes -- crashed + logger.warning(f"Marking scan {scan.scan_id} as crashed - Celery task gone and no update since {last_update}") + scan.is_active = False + scan.phase = 'crashed' + scan.error_message = f"Celery task lost and no progress since {last_update}" + scans_to_mark.append(scan) elif not last_update and start_time and start_time < stuck_threshold: # No last_update field but scan started over 30 minutes ago logger.warning(f"Marking stuck scan {scan.scan_id} as crashed - started at {start_time} with no updates") diff --git a/pixelprobe/services/export_service.py b/pixelprobe/services/export_service.py index efb2182..d1f44ff 100644 --- a/pixelprobe/services/export_service.py +++ b/pixelprobe/services/export_service.py @@ -10,7 +10,7 @@ from typing import List, Dict, Optional from flask import send_file, Response -from models import db, ScanResult +from pixelprobe.models import db, ScanResult logger = logging.getLogger(__name__) diff --git a/pixelprobe/services/maintenance_service.py b/pixelprobe/services/maintenance_service.py index 8f5308a..deaa877 100644 --- a/pixelprobe/services/maintenance_service.py +++ b/pixelprobe/services/maintenance_service.py @@ -11,9 +11,9 @@ from typing import Dict, List, Optional import uuid -from media_checker import PixelProbe, load_exclusions, load_exclusions_with_patterns -from models import db, ScanResult, CleanupState, FileChangesState, ScanReport -from utils import ProgressTracker +from pixelprobe.media_checker import PixelProbe, load_exclusions, load_exclusions_with_patterns +from pixelprobe.models import db, ScanResult, CleanupState, FileChangesState, ScanReport +from pixelprobe.utils.helpers import ProgressTracker logger = logging.getLogger(__name__) @@ -621,7 +621,7 @@ def _create_cleanup_report(self, cleanup_record: CleanupState, orphaned_files_li # Send healthcheck completion ping if this was a scheduled cleanup if schedule_id: try: - from scheduler import MediaScheduler + from pixelprobe.scheduler import MediaScheduler MediaScheduler.send_healthcheck_completion(report.id) except Exception as hc_error: logger.error(f"Failed to send healthcheck completion ping: {hc_error}") @@ -849,7 +849,7 @@ def _run_file_changes_check(self, check_id: str, file_paths=None, schedule_id=No # Also update ScanState for UI progress bar try: - from models import ScanState + from pixelprobe.models import ScanState scan_state = ScanState.query.filter_by(scan_id=check_id).first() if scan_state: scan_state.files_processed = 1 @@ -1068,7 +1068,7 @@ def _run_file_changes_check(self, check_id: str, file_paths=None, schedule_id=No # Complete ScanState if this was a single file integrity check try: - from models import ScanState + from pixelprobe.models import ScanState scan_state = ScanState.query.filter_by(scan_id=check_id).first() if scan_state: scan_state.complete_scan() @@ -1254,7 +1254,7 @@ def _create_file_changes_report(self, file_changes_record: FileChangesState, cha # Send healthcheck completion ping if this was a scheduled file changes check if schedule_id: try: - from scheduler import MediaScheduler + from pixelprobe.scheduler import MediaScheduler MediaScheduler.send_healthcheck_completion(report.id) except Exception as hc_error: logger.error(f"Failed to send healthcheck completion ping: {hc_error}") diff --git a/pixelprobe/services/scan_service.py b/pixelprobe/services/scan_service.py index dc130ec..0023fa8 100644 --- a/pixelprobe/services/scan_service.py +++ b/pixelprobe/services/scan_service.py @@ -12,9 +12,9 @@ from typing import List, Dict, Optional, Tuple from flask import current_app -from media_checker import PixelProbe, load_exclusions, load_exclusions_with_patterns -from models import db, ScanResult, ScanState, ScanReport, ScanChunk -from utils import ProgressTracker +from pixelprobe.media_checker import PixelProbe, load_exclusions, load_exclusions_with_patterns +from pixelprobe.models import db, ScanResult, ScanState, ScanReport, ScanChunk +from pixelprobe.utils.helpers import ProgressTracker from sqlalchemy import text from sqlalchemy.exc import OperationalError import hashlib @@ -288,7 +288,7 @@ def run_scan(): progress_tracker = ProgressTracker('scan') # Import ScanResult model (needed for both regular and pending scans) - from models import ScanResult + from pixelprobe.models import ScanResult # Log scan start logger.info(f"=== SCAN STARTED ===") @@ -656,26 +656,27 @@ def discovery_progress(files_checked, files_discovered): # Enhanced crash recovery tracking try: + # CRITICAL: Clear the failed transaction before attempting recovery writes. + # Without this, the session may be in a rolled-back state (e.g. after + # psycopg2.DatabaseError) and all subsequent writes will silently fail, + # leaving the scan stuck as "active" forever. + db.session.rollback() + if scan_state: - # Re-attach scan_state if it's detached to prevent secondary errors - try: - # Test if scan_state is attached by accessing a property - _ = scan_state.id - except Exception: - # Re-attach the detached instance - scan_state = db.session.merge(scan_state) - logger.info("Re-attached detached scan_state for crash recovery") - - # Update crash info (temporarily without new columns until migration completes) - scan_state.error_message = str(e)[:1000] # Truncate to avoid VARCHAR limit - - # Mark scan as crashed instead of just error - scan_state.is_active = False - scan_state.phase = 'crashed' - scan_state.end_time = datetime.now(timezone.utc) - - db.session.commit() - logger.info(f"Scan marked as crashed") + # Re-query scan_state with a fresh session instead of merge, + # since the old object is stale after rollback + scan_state = db.session.query(ScanState).get(scan_state_id) + if scan_state: + # Update crash info + scan_state.error_message = str(e)[:1000] # Truncate to avoid VARCHAR limit + + # Mark scan as crashed instead of just error + scan_state.is_active = False + scan_state.phase = 'crashed' + scan_state.end_time = datetime.now(timezone.utc) + + db.session.commit() + logger.info(f"Scan marked as crashed") except Exception as recovery_error: logger.error(f"Failed to update crash recovery info: {recovery_error}") @@ -708,7 +709,7 @@ def discovery_progress(files_checked, files_discovered): if final_scan_state: # Get corrupted file count from ScanResult table with retry logic # Note: ScanResult doesn't have scan_id, so we query all corrupted files - from models import ScanResult + from pixelprobe.models import ScanResult # Retry logic for database connection issues max_retries = 3 @@ -958,7 +959,7 @@ def run_scan(): if final_scan_state: # Get corrupted file count from ScanResult table with retry logic # Note: ScanResult doesn't have scan_id, so we query all corrupted files - from models import ScanResult + from pixelprobe.models import ScanResult # Retry logic for database connection issues max_retries = 3 @@ -1126,7 +1127,7 @@ def cancel_scan(self) -> Dict: # Step 1: Kill ALL Celery tasks (nuclear option) try: - from celery_config import celery_app + from pixelprobe.celery_config import celery_app logger.info("Step 1: Killing ALL Celery tasks") @@ -1168,7 +1169,7 @@ def cancel_scan(self) -> Dict: logger.info("Step 2: Cleaning up database state") try: - from models import ScanChunk + from pixelprobe.models import ScanChunk # Mark ALL chunks as cancelled chunks_updated = db.session.query(ScanChunk).filter( @@ -1831,7 +1832,7 @@ def _create_scan_report(self, scan_state: ScanState, scan_type: str = 'full_scan # Send healthcheck completion ping for scheduled scans try: - from scheduler import MediaScheduler + from pixelprobe.scheduler import MediaScheduler MediaScheduler.send_healthcheck_completion(report.id) except Exception as hc_error: logger.error(f"Failed to send healthcheck completion ping: {hc_error}") @@ -1852,7 +1853,7 @@ def _retry_pending_files(self, checker: PixelProbe, force_rescan: bool) -> int: Returns: int: Number of files that remain pending after retries """ - from models import ScanResult + from pixelprobe.models import ScanResult max_retries = 2 @@ -1929,7 +1930,7 @@ def _handle_scan_cancellation(self, scan_state: ScanState): scan_state.cancel_scan() # Clean up any files stuck in 'scanning' state - from models import ScanResult + from pixelprobe.models import ScanResult stuck_count = ScanResult.query.filter_by(scan_status='scanning').update( {'scan_status': 'pending'}, synchronize_session=False @@ -1950,7 +1951,7 @@ def _add_files_batch_to_db(self, file_paths: List[str]) -> Tuple[int, int]: import os import magic from datetime import datetime - from models import ScanResult + from pixelprobe.models import ScanResult from sqlalchemy.exc import IntegrityError added_count = 0 @@ -2060,7 +2061,7 @@ def _create_scanning_chunks(self, total_files: int, scan_id: str, is_pending_sca Each chunk stores the first and last file_path it should process, making queries stable even as files change status during scanning. """ - from models import ScanChunk + from pixelprobe.models import ScanChunk import json chunks = [] @@ -2482,7 +2483,7 @@ def _scan_chunk_files(self, chunk: ScanChunk, checker: PixelProbe, force_rescan: scanned_lock = threading.Lock() # Get exclusions for creating thread-local PixelProbe instances - from media_checker import load_exclusions_with_patterns + from pixelprobe.media_checker import load_exclusions_with_patterns excluded_paths, excluded_extensions, excluded_patterns = load_exclusions_with_patterns() # Process files in batches to avoid loading all into memory @@ -2579,7 +2580,7 @@ def scan_single_file(file_result): # and threading.local() storage persists across batches if not hasattr(thread_local, 'checker'): logger.info(f"[Thread {current_thread_name}/{current_thread_id}] Creating NEW PixelProbe instance for this thread") - from media_checker import PixelProbe + from pixelprobe.media_checker import PixelProbe thread_local.checker = PixelProbe( database_path=self.database_uri, excluded_paths=excluded_paths, @@ -2704,7 +2705,7 @@ def scan_single_file(file_result): updated_files_processed = row[0] if row else current_total # Update progress message - from utils import ProgressTracker + from pixelprobe.utils.helpers import ProgressTracker progress_tracker = ProgressTracker('scan') progress_message = progress_tracker.get_progress_message( f'Phase 3 of 3: Scanning files (parallel: {num_workers} workers)', @@ -2806,7 +2807,7 @@ def scan_single_file(file_result): scan_state.last_update = datetime.now(timezone.utc) # Update progress message with current file info - from utils import ProgressTracker + from pixelprobe.utils.helpers import ProgressTracker progress_tracker = ProgressTracker('scan') scan_state.progress_message = progress_tracker.get_progress_message( f'Phase 3 of 3: Scanning files', diff --git a/pixelprobe/services/stats_service.py b/pixelprobe/services/stats_service.py index 6588866..373cb68 100644 --- a/pixelprobe/services/stats_service.py +++ b/pixelprobe/services/stats_service.py @@ -8,9 +8,9 @@ from datetime import datetime, timedelta from sqlalchemy import text, func -from models import db, ScanResult, ScanReport +from pixelprobe.models import db, ScanResult, ScanReport from pixelprobe.utils.timezone import from_utc_to_configured, get_configured_timezone -from version import __version__ +from pixelprobe.version import __version__ logger = logging.getLogger(__name__) diff --git a/pixelprobe/startup.py b/pixelprobe/startup.py index 094e946..a6ca4cb 100644 --- a/pixelprobe/startup.py +++ b/pixelprobe/startup.py @@ -14,7 +14,7 @@ def cleanup_stuck_operations(db): """Clean up any stuck operations from previous runs""" try: - from models import FileChangesState, CleanupState + from pixelprobe.models import FileChangesState, CleanupState active_file_changes = FileChangesState.query.filter_by(is_active=True).all() for file_change in active_file_changes: @@ -43,7 +43,7 @@ def cleanup_stuck_operations(db): def cleanup_stuck_scans(db): """Clean up ALL active scans from previous runs - they can't still be running after restart.""" try: - from models import ScanState + from pixelprobe.models import ScanState stuck_scans = ScanState.query.filter( ScanState.is_active == True ).all() @@ -69,7 +69,7 @@ def cleanup_bloated_scan_results(db): efficient storage format. """ try: - from models import ScanResult + from pixelprobe.models import ScanResult bloated_results = db.session.query(ScanResult).filter( db.or_( db.func.length(ScanResult.scan_output) > 50000, diff --git a/pixelprobe/tasks.py b/pixelprobe/tasks.py index 2d406a0..d9d0cb3 100644 --- a/pixelprobe/tasks.py +++ b/pixelprobe/tasks.py @@ -14,10 +14,10 @@ import redis from contextlib import contextmanager -from celery_config import celery_app +from pixelprobe.celery_config import celery_app from pixelprobe.services.scan_service import ScanService from pixelprobe.progress_utils import get_redis_client, update_scan_progress_redis -from models import db, ScanState, ScanResult, ScanReport +from pixelprobe.models import db, ScanState, ScanResult, ScanReport logger = logging.getLogger(__name__) @@ -106,7 +106,7 @@ def scan_media_task(self, scan_id, paths, scan_type='full', force_rescan=False): # because the scheduler already sent a start ping and expects a completion signal if scan_id.startswith('scheduled_'): try: - from scheduler import MediaScheduler + from pixelprobe.scheduler import MediaScheduler # Find the existing scan report for this scan existing_report = ScanReport.query.filter_by(scan_id=scan_id).order_by(ScanReport.end_time.desc()).first() if existing_report: @@ -219,7 +219,7 @@ def progress_callback(progress_data): # Execute scan based on type if scan_type in ['full', 'parallel', 'pending']: # Use configured MAX_WORKERS instead of hardcoded 1 - from config import Config + from pixelprobe.config import Config import os num_workers = Config.MAX_WORKERS logger.info(f"Celery task using MAX_WORKERS={num_workers} (env var: {os.getenv('MAX_WORKERS', 'NOT SET')})") @@ -253,7 +253,7 @@ def progress_callback(progress_data): # Discovery is handled internally by scan_directories # There's no separate discover_files method in ScanService # Run a regular scan which includes discovery phase - from config import Config + from pixelprobe.config import Config result = scan_service.scan_directories( directories=paths, force_rescan=False, # Don't force rescan for discovery @@ -442,7 +442,7 @@ def scan_files_task(self, scan_id, file_paths, force_rescan=False, num_workers=N try: from pixelprobe.services.scan_service import ScanService from flask import current_app - from config import Config + from pixelprobe.config import Config database_uri = current_app.config['SQLALCHEMY_DATABASE_URI'] scan_service = ScanService(database_uri) @@ -539,7 +539,7 @@ def scheduled_scan_task(schedule_id, scan_type='full'): logger.info(f"Executing scheduled scan for schedule_id: {schedule_id}") try: - from models import ScanSchedule + from pixelprobe.models import ScanSchedule from uuid import uuid4 # Get schedule details @@ -792,7 +792,7 @@ def ui_progress_update_task(self, scan_id, update_interval=1.0): sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from app import app, db as flask_db from sqlalchemy.orm import scoped_session, sessionmaker - from models import ScanState + from pixelprobe.models import ScanState # Use distributed lock to ensure only one UI worker per scan lock_name = f'pixelprobe:ui_worker:{scan_id}' diff --git a/pixelprobe/tasks_parallel.py b/pixelprobe/tasks_parallel.py index 47b8fe9..e2a08f9 100644 --- a/pixelprobe/tasks_parallel.py +++ b/pixelprobe/tasks_parallel.py @@ -12,9 +12,9 @@ from datetime import datetime, timezone from typing import List, Dict, Any -from celery_config import celery_app -from models import db, ScanState, ScanResult, ScanChunk -from media_checker import PixelProbe +from pixelprobe.celery_config import celery_app +from pixelprobe.models import db, ScanState, ScanResult, ScanChunk +from pixelprobe.media_checker import PixelProbe from pixelprobe.services.scan_service import ScanService logger = logging.getLogger(__name__) @@ -400,7 +400,7 @@ def discover_directory_task(self, directory: str, scan_id: str, import os import time from celery.exceptions import SoftTimeLimitExceeded - from media_checker import PixelProbe + from pixelprobe.media_checker import PixelProbe discovered_files = [] excluded_paths = excluded_paths or [] @@ -535,7 +535,7 @@ def parallel_scan_orchestrator(self, scan_id: str, paths: List[str] = None, raise ValueError("Paths required for full/parallel scan") # Load exclusions from database - from models import Exclusion + from pixelprobe.models import Exclusion exclusions = Exclusion.query.filter_by(is_active=True).all() excluded_paths = [exc.pattern for exc in exclusions if exc.exclusion_type == 'path'] excluded_extensions = [exc.pattern for exc in exclusions if exc.exclusion_type == 'extension'] diff --git a/pixelprobe/utils/helpers.py b/pixelprobe/utils/helpers.py index e3442c1..4ca83c4 100644 --- a/pixelprobe/utils/helpers.py +++ b/pixelprobe/utils/helpers.py @@ -4,6 +4,9 @@ import os import logging +import time +from datetime import datetime, timezone + from pixelprobe.constants import VIDEO_EXTENSIONS, IMAGE_EXTENSIONS, AUDIO_EXTENSIONS logger = logging.getLogger(__name__) @@ -28,7 +31,7 @@ def get_configured_scan_paths(): Returns a list of path strings. May be empty if nothing is configured. """ try: - from models import ScanConfiguration + from pixelprobe.models import ScanConfiguration configs = ScanConfiguration.query.filter_by(is_active=True).all() paths = [config.path for config in configs if config.path] if paths: @@ -38,3 +41,185 @@ def get_configured_scan_paths(): scan_paths_env = os.environ.get('SCAN_PATHS', '') return [p.strip() for p in scan_paths_env.split(',') if p.strip()] + + +class ProgressTracker: + """Unified progress tracking to eliminate redundant progress code + + This class handles progress tracking for all operations (scan, cleanup, file-changes) + in a consistent way, eliminating duplicate progress calculation logic. + """ + + def __init__(self, operation_type): + self.operation_type = operation_type + self.start_time = time.time() + self.phase_start_time = time.time() + + # Define phase weights for different operation types + self.phase_weights = { + 'scan': [0.2, 0.1, 0.7], # Discovery: 20%, Adding: 10%, Scanning: 70% + 'cleanup': [0.1, 0.8, 0.1], # Scanning DB: 10%, Checking files: 80%, Deleting: 10% + 'file_changes': [0.05, 0.8, 0.15] # Starting: 5%, Checking: 80%, Verifying: 15% + } + + def calculate_progress_percentage(self, phase_number, phase_current, phase_total, total_phases=3): + """Calculate overall progress percentage based on phase weights""" + if phase_total == 0: + return 0 + + weights = self.phase_weights.get(self.operation_type, [1.0 / total_phases] * total_phases) + + # Calculate progress within current phase + phase_progress = (phase_current / phase_total) if phase_total > 0 else 0 + + # Calculate total progress + completed_weight = sum(weights[:phase_number - 1]) if phase_number > 1 else 0 + current_weight = weights[phase_number - 1] if phase_number <= len(weights) else 0 + + total_progress = completed_weight + (current_weight * phase_progress) + return min(total_progress * 100, 100) + + def estimate_time_remaining(self, files_processed, total_files): + """Estimate time remaining based on current progress""" + if files_processed == 0 or total_files == 0: + return None + + elapsed = time.time() - self.start_time + + # Minimum elapsed time to avoid division issues + if elapsed < 1: + return "calculating..." + + rate = files_processed / elapsed + remaining_files = total_files - files_processed + + # If no files remaining, we're done + if remaining_files <= 0: + return "completing..." + + # If rate is extremely low (very slow scan), return a special message + if rate < 0.001: # Less than 1 file per 1000 seconds + return "processing..." + elif rate > 0: + remaining_seconds = remaining_files / rate + # Cap at 30 days to avoid unrealistic ETAs + if remaining_seconds > 2592000: # 30 days + return ">30 days" + return self.format_time(remaining_seconds) + return None + + @staticmethod + def format_time(seconds): + """Format seconds into human-readable time""" + # Handle invalid or very small values + if seconds <= 0: + return "calculating..." + elif seconds < 60: + # Don't show 0s, minimum 1s + return f"{max(1, int(seconds))}s" + elif seconds < 3600: + return f"{int(seconds / 60)}m {int(seconds % 60)}s" + elif seconds < 86400: # Less than a day + hours = int(seconds / 3600) + minutes = int((seconds % 3600) / 60) + return f"{hours}h {minutes}m" + else: + # For very long ETAs, show days + days = int(seconds / 86400) + hours = int((seconds % 86400) / 3600) + return f"{days}d {hours}h" + + def get_progress_message(self, phase_name, files_processed=0, total_files=0, current_file=None): + """Generate consistent progress messages""" + eta = self.estimate_time_remaining(files_processed, total_files) + eta_str = f" ETA: {eta}" if eta else "" + + if current_file: + return f"{phase_name}: current file: {current_file} - {files_processed} of {total_files:,} files{eta_str}" + else: + return f"{phase_name}: {files_processed} of {total_files:,} files{eta_str}" + +def log_operation_status(operation_type, status, details=None): + """Centralized operation status logging""" + log_messages = { + 'start': f"=== {operation_type.upper()} STARTED ===", + 'complete': f"=== {operation_type.upper()} COMPLETED ===", + 'error': f"=== {operation_type.upper()} FAILED ===", + 'cancel': f"=== {operation_type.upper()} CANCELLED ===" + } + + message = log_messages.get(status, f"{operation_type} status: {status}") + logger.info(message) + + if details: + for key, value in details.items(): + logger.info(f"{key}: {value}") + + +def batch_process(items, batch_size=1000): + """Generator to process items in batches""" + for i in range(0, len(items), batch_size): + yield items[i:i + batch_size] + + +def create_state_dict(state_obj, extra_fields=None): + """Create a standardized dictionary from state objects""" + base_dict = { + 'id': state_obj.id, + 'is_active': state_obj.is_active, + 'phase': state_obj.phase, + 'phase_number': state_obj.phase_number, + 'phase_current': state_obj.phase_current, + 'phase_total': state_obj.phase_total, + 'files_processed': state_obj.files_processed, + 'total_files': getattr(state_obj, 'total_files', state_obj.estimated_total if hasattr(state_obj, 'estimated_total') else 0), + 'start_time': state_obj.start_time.isoformat() if state_obj.start_time else None, + 'end_time': state_obj.end_time.isoformat() if state_obj.end_time else None, + 'current_file': state_obj.current_file, + 'progress_message': state_obj.progress_message, + 'error_message': state_obj.error_message + } + + # Add operation-specific ID field + if hasattr(state_obj, 'scan_id'): + base_dict['scan_id'] = state_obj.scan_id + elif hasattr(state_obj, 'cleanup_id'): + base_dict['cleanup_id'] = state_obj.cleanup_id + elif hasattr(state_obj, 'check_id'): + base_dict['check_id'] = state_obj.check_id + + # Add any extra fields + if extra_fields: + for field in extra_fields: + if hasattr(state_obj, field): + value = getattr(state_obj, field) + # Handle datetime serialization + if isinstance(value, datetime): + value = value.isoformat() + base_dict[field] = value + + return base_dict + + +def mark_operation_complete(state_obj, message=None): + """Common method to mark operations as complete""" + state_obj.is_active = False + state_obj.phase = 'completed' + state_obj.end_time = datetime.now(timezone.utc) + if message: + state_obj.progress_message = message + return state_obj + + +def mark_operation_error(state_obj, error_message): + """Common method to mark operations as failed""" + state_obj.is_active = False + state_obj.phase = 'error' + state_obj.end_time = datetime.now(timezone.utc) + state_obj.error_message = error_message + return state_obj + + +class OperationCancelledException(Exception): + """Custom exception for when an operation is cancelled""" + pass diff --git a/pixelprobe/utils/security.py b/pixelprobe/utils/security.py index e2c4db1..3019dab 100644 --- a/pixelprobe/utils/security.py +++ b/pixelprobe/utils/security.py @@ -12,7 +12,7 @@ from typing import Optional, Set, Tuple, Union from flask import request, jsonify, current_app from werkzeug.security import safe_join -from models import db, ScanConfiguration +from pixelprobe.models import db, ScanConfiguration import requests logger = logging.getLogger(__name__) @@ -151,7 +151,7 @@ def validate_file_path(file_path, allowed_paths=None): # If still no paths, this is likely a rescan operation - check if file exists in database if not allowed_paths: # Import here to avoid circular dependency - from models import ScanResult + from pixelprobe.models import ScanResult existing = ScanResult.query.filter_by(file_path=normalized).first() if existing: # File already in database - trust it for rescan diff --git a/version.py b/pixelprobe/version.py similarity index 92% rename from version.py rename to pixelprobe/version.py index 049f58c..fd37ea0 100644 --- a/version.py +++ b/pixelprobe/version.py @@ -4,7 +4,7 @@ # Default version - this is the single source of truth -_DEFAULT_VERSION = '2.5.66' +_DEFAULT_VERSION = '2.5.67' # Allow override via environment variable for CI/CD, but default to the hardcoded version diff --git a/scripts/create_test_database.py b/scripts/create_test_database.py index 89b6f7c..e8420db 100644 --- a/scripts/create_test_database.py +++ b/scripts/create_test_database.py @@ -13,7 +13,7 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__))) from app import app, db -from models import ScanResult, ScanState, ScanConfiguration +from pixelprobe.models import ScanResult, ScanState, ScanConfiguration # Test database path TEST_DB_PATH = os.path.abspath("test_media_checker.db") diff --git a/scripts/docker-run-modern.sh b/scripts/docker-run-modern.sh deleted file mode 100755 index 7ec27c9..0000000 --- a/scripts/docker-run-modern.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -# Build and run PixelProbe with Modern UI in Docker - -echo "=== Building PixelProbe Modern UI Docker Image ===" -echo "" - -# Stop any existing container -echo "Stopping existing container if running..." -docker-compose -f docker-compose.modern.yml down 2>/dev/null - -# Build the image -echo "Building Docker image..." -docker-compose -f docker-compose.modern.yml build - -# Start the container -echo "" -echo "Starting PixelProbe Modern UI..." -docker-compose -f docker-compose.modern.yml up -d - -# Wait for startup -echo "" -echo "Waiting for application to start..." -sleep 5 - -# Check if container is running -if docker ps | grep -q pixelprobe-modern-ui; then - echo "" - echo "✅ PixelProbe Modern UI is running!" - echo "" - echo "🌐 Access the application at: http://localhost:5001" - echo "" - echo "📊 The application includes sample test data" - echo "" - echo "To view logs:" - echo " docker-compose -f docker-compose.modern.yml logs -f" - echo "" - echo "To stop:" - echo " docker-compose -f docker-compose.modern.yml down" - echo "" -else - echo "" - echo "❌ Failed to start container. Check logs with:" - echo " docker-compose -f docker-compose.modern.yml logs" -fi \ No newline at end of file diff --git a/scripts/fix_database_schema.py b/scripts/fix_database_schema.py index 27a68f1..c899c3c 100644 --- a/scripts/fix_database_schema.py +++ b/scripts/fix_database_schema.py @@ -9,7 +9,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from app import create_app -from models import db +from pixelprobe.models import db from sqlalchemy import text import logging diff --git a/scripts/reset_incomplete_scans.py b/scripts/reset_incomplete_scans.py index 856248d..c930110 100755 --- a/scripts/reset_incomplete_scans.py +++ b/scripts/reset_incomplete_scans.py @@ -17,7 +17,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from app import create_app -from models import db, ScanResult +from pixelprobe.models import db, ScanResult from sqlalchemy import or_ def reset_incomplete_scans(dry_run=False): diff --git a/scripts/run_modern_ui.sh b/scripts/run_modern_ui.sh deleted file mode 100755 index 87a61a6..0000000 --- a/scripts/run_modern_ui.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# Run PixelProbe with Modern UI locally - -echo "Starting PixelProbe with Modern UI..." -echo "Using database: ${DATABASE_PATH:-./instance/media_checker.db}" -echo "" -echo "Access the application at: http://localhost:5001" -echo "" - -# Set environment variables -export USE_MODERN_UI=true -export DATABASE_URL="sqlite:///${DATABASE_PATH:-./instance/media_checker.db}" -export FLASK_APP=app.py -export FLASK_ENV=development - -# Create static directories if they don't exist -mkdir -p static/css -mkdir -p static/js -mkdir -p static/images - -# Run the application -python app.py \ No newline at end of file diff --git a/scripts/run_test_ui.sh b/scripts/run_test_ui.sh deleted file mode 100755 index 51dd674..0000000 --- a/scripts/run_test_ui.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# Run PixelProbe with test database - -source venv/bin/activate - -export USE_MODERN_UI=true -export DATABASE_URL="sqlite:///$PWD/test_media_checker.db" -export FLASK_APP=app.py -export FLASK_ENV=development -export FLASK_DEBUG=1 -export SECRET_KEY="development-secret-key" - -echo "=== PixelProbe Test Server ===" -echo "Database: test_media_checker.db (with sample data)" -echo "UI: Modern UI" -echo "URL: http://localhost:5001" -echo "" -echo "Press Ctrl+C to stop" -echo "" - -python app.py diff --git a/tests/conftest.py b/tests/conftest.py index 442a48d..422cec7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,10 +11,10 @@ from flask_sqlalchemy import SQLAlchemy # Import models first to ensure they're registered with SQLAlchemy -from models import db as _db, ScanResult, ScanState, CleanupState, FileChangesState, ScanConfiguration, IgnoredErrorPattern, ScanSchedule, ScanReport, Exclusion, User, APIToken +from pixelprobe.models import db as _db, ScanResult, ScanState, CleanupState, FileChangesState, ScanConfiguration, IgnoredErrorPattern, ScanSchedule, ScanReport, Exclusion, User, APIToken # Import models to ensure they're available -from models import ScanState, ScanResult +from pixelprobe.models import ScanState, ScanResult # Add missing total_files property for ScanState (still needed for utils.py) def _total_files(self): @@ -48,7 +48,7 @@ def create_test_app(): csrf = CSRFProtect(test_app) # Initialize authentication - from auth import init_auth + from pixelprobe.auth import init_auth init_auth(test_app) # Import and register blueprints @@ -60,7 +60,7 @@ def create_test_app(): from pixelprobe.api.reports_routes import reports_bp from pixelprobe.api.scan_routes_parallel import parallel_scan_bp from pixelprobe.api.auth_routes import auth_bp - from scheduler import MediaScheduler + from pixelprobe.scheduler import MediaScheduler test_app.register_blueprint(scan_bp) test_app.register_blueprint(auth_bp) @@ -88,7 +88,7 @@ def create_test_app(): set_scheduler(scheduler) # Add basic routes (with auth_required to match production app) - from auth import auth_required as _auth_required + from pixelprobe.auth import auth_required as _auth_required @test_app.route('/health') @_auth_required @@ -139,7 +139,7 @@ def client(app): def authenticated_client(app, client): """Create an authenticated test client with a test user""" with app.app_context(): - from models import db, User + from pixelprobe.models import db, User db.create_all() # Create a test admin user @@ -171,7 +171,7 @@ def db(app): """Create database for testing""" with app.app_context(): # Ensure all models are loaded - from models import (ScanResult, ScanState, CleanupState, FileChangesState, + from pixelprobe.models import (ScanResult, ScanState, CleanupState, FileChangesState, ScanConfiguration, IgnoredErrorPattern, ScanSchedule, ScanReport, Exclusion, ScanChunk) @@ -180,7 +180,7 @@ def db(app): try: # First, mark any active cleanup as cancelled to stop background threads - from models import CleanupState, ScanState + from pixelprobe.models import CleanupState, ScanState try: cleanup_states = CleanupState.query.filter_by(is_active=True).all() for cleanup in cleanup_states: @@ -336,7 +336,7 @@ def test_data_dir(): @pytest.fixture def mock_scan_result(db): """Create a mock scan result""" - from models import ScanResult + from pixelprobe.models import ScanResult from datetime import datetime, timezone result = ScanResult( @@ -359,8 +359,8 @@ def mock_scan_result(db): @pytest.fixture def real_scan_results(db, test_data_dir): """Scan real media files into test database""" - from models import ScanResult - from media_checker import PixelProbe + from pixelprobe.models import ScanResult + from pixelprobe.media_checker import PixelProbe from datetime import datetime, timezone checker = PixelProbe() @@ -418,7 +418,7 @@ def real_scan_results(db, test_data_dir): @pytest.fixture def mock_corrupted_result(db): """Create a mock corrupted scan result""" - from models import ScanResult + from pixelprobe.models import ScanResult from datetime import datetime, timezone result = ScanResult( @@ -444,7 +444,7 @@ def mock_corrupted_result(db): @pytest.fixture def mock_scan_configuration(db): """Create mock scan configuration""" - from models import ScanConfiguration + from pixelprobe.models import ScanConfiguration from datetime import datetime, timezone # Create a configuration using the new path-based structure diff --git a/tests/integration/test_admin_endpoints.py b/tests/integration/test_admin_endpoints.py index f87012f..fc250d1 100644 --- a/tests/integration/test_admin_endpoints.py +++ b/tests/integration/test_admin_endpoints.py @@ -1,7 +1,7 @@ import pytest import json from datetime import datetime, timezone, timedelta -from models import db, ScanSchedule, IgnoredErrorPattern +from pixelprobe.models import db, ScanSchedule, IgnoredErrorPattern class TestScheduleEndpoints: """Test schedule management endpoints""" @@ -234,7 +234,7 @@ class TestExclusionEndpoints: def test_add_path_exclusion(self, authenticated_client, db, app): """Test adding a path exclusion""" - from models import Exclusion + from pixelprobe.models import Exclusion with app.app_context(): response = authenticated_client.post('/api/exclusions/path', @@ -252,7 +252,7 @@ def test_add_path_exclusion(self, authenticated_client, db, app): def test_add_extension_exclusion(self, authenticated_client, db, app): """Test adding an extension exclusion""" - from models import Exclusion + from pixelprobe.models import Exclusion with app.app_context(): response = authenticated_client.post('/api/exclusions/extension', @@ -270,7 +270,7 @@ def test_add_extension_exclusion(self, authenticated_client, db, app): def test_add_duplicate_exclusion(self, authenticated_client, db, app): """Test adding duplicate exclusion""" - from models import Exclusion + from pixelprobe.models import Exclusion with app.app_context(): # Create existing exclusion @@ -290,7 +290,7 @@ def test_add_duplicate_exclusion(self, authenticated_client, db, app): def test_remove_path_exclusion(self, authenticated_client, db, app): """Test removing a path exclusion""" - from models import Exclusion + from pixelprobe.models import Exclusion with app.app_context(): # Create exclusion to remove diff --git a/tests/integration/test_csrf_and_exclusions.py b/tests/integration/test_csrf_and_exclusions.py index e9d933b..96afc72 100644 --- a/tests/integration/test_csrf_and_exclusions.py +++ b/tests/integration/test_csrf_and_exclusions.py @@ -46,7 +46,7 @@ class TestExclusionsPersistence: def test_exclusions_use_database(self, authenticated_client, app, db): """Test that exclusions are saved to database""" with app.app_context(): - from models import Exclusion + from pixelprobe.models import Exclusion # Clean up any existing exclusions Exclusion.query.delete() diff --git a/tests/integration/test_maintenance_endpoints.py b/tests/integration/test_maintenance_endpoints.py index 4a6d72b..a7d9b4d 100644 --- a/tests/integration/test_maintenance_endpoints.py +++ b/tests/integration/test_maintenance_endpoints.py @@ -1,5 +1,5 @@ import pytest -from models import db, CleanupState, FileChangesState +from pixelprobe.models import db, CleanupState, FileChangesState import time class TestMaintenanceCancelEndpoints: diff --git a/tests/integration/test_scan_endpoints_extended.py b/tests/integration/test_scan_endpoints_extended.py index f4cedba..17bb6d9 100644 --- a/tests/integration/test_scan_endpoints_extended.py +++ b/tests/integration/test_scan_endpoints_extended.py @@ -1,5 +1,5 @@ import pytest -from models import db, ScanResult, ScanState +from pixelprobe.models import db, ScanResult, ScanState from datetime import datetime, timezone class TestScanManagementEndpoints: diff --git a/tests/integration/test_scan_execution.py b/tests/integration/test_scan_execution.py index 67d4da2..ec67701 100644 --- a/tests/integration/test_scan_execution.py +++ b/tests/integration/test_scan_execution.py @@ -6,7 +6,7 @@ import pytest import time from unittest.mock import patch, Mock -from models import db, ScanState, ScanResult, ScanChunk +from pixelprobe.models import db, ScanState, ScanResult, ScanChunk class TestScanExecution: diff --git a/tests/test_app.py b/tests/test_app.py index d65977d..01546ad 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -22,7 +22,7 @@ def test_version_endpoint(authenticated_client): def test_scan_status_endpoint(authenticated_client, db): """Test the scan status endpoint""" # Ensure ScanState table exists and has a default entry - from models import ScanState + from pixelprobe.models import ScanState db.create_all() response = authenticated_client.get('/api/scan-status') diff --git a/tests/test_authentication.py b/tests/test_authentication.py index 51fb4e7..76a3039 100644 --- a/tests/test_authentication.py +++ b/tests/test_authentication.py @@ -5,8 +5,8 @@ import pytest import json from datetime import datetime, timedelta, timezone -from models import db, User, APIToken -from auth import authenticate_user, check_first_run, create_initial_admin +from pixelprobe.models import db, User, APIToken +from pixelprobe.auth import authenticate_user, check_first_run, create_initial_admin @pytest.fixture diff --git a/tests/test_bulk_reports.py b/tests/test_bulk_reports.py index 57dd72d..792610f 100644 --- a/tests/test_bulk_reports.py +++ b/tests/test_bulk_reports.py @@ -7,7 +7,7 @@ import tempfile import zipfile import io -from models import ScanResult, ScanReport +from pixelprobe.models import ScanResult, ScanReport class TestBulkReportOperations: diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py index acd3e86..1934b54 100644 --- a/tests/test_concurrency.py +++ b/tests/test_concurrency.py @@ -59,7 +59,7 @@ def start_scan(): def test_concurrent_file_updates(self, app, db): """Test concurrent updates to the same file record""" - from models import ScanResult + from pixelprobe.models import ScanResult # Create a test file record test_file = ScanResult( @@ -129,7 +129,7 @@ def acquire_connection(): # Use app context for database operations with app.app_context(): # Simulate acquiring a database connection - from models import db + from pixelprobe.models import db # Test if we can execute a query result = db.session.execute(db.text("SELECT 1")) connections.append(result) @@ -176,7 +176,7 @@ def test_parallel_scan_worker_distribution(self, app, db): def test_scheduled_scan_overlap(self, app): """Test that scheduled scans don't overlap""" - from scheduler import MediaScheduler + from pixelprobe.scheduler import MediaScheduler scheduler = MediaScheduler(app) @@ -205,7 +205,7 @@ def mock_scan(): def test_cleanup_and_scan_mutual_exclusion(self, app, db): """Test that cleanup and scan operations are mutually exclusive""" - from models import ScanState + from pixelprobe.models import ScanState # Ensure scan_state table exists db.create_all() @@ -241,7 +241,7 @@ def start_cleanup(): def test_scan_state_consistency_under_load(self, app, db): """Test scan state consistency under concurrent read/write load""" - from models import ScanState + from pixelprobe.models import ScanState # Initialize scan state scan_state = ScanState.get_or_create() diff --git a/tests/test_media_checker.py b/tests/test_media_checker.py index 22a9f5f..a64c1a7 100644 --- a/tests/test_media_checker.py +++ b/tests/test_media_checker.py @@ -8,7 +8,7 @@ import threading from unittest.mock import Mock, patch, MagicMock -from media_checker import PixelProbe +from pixelprobe.media_checker import PixelProbe class TestMediaChecker: """Test the core PixelProbe media checking functionality""" @@ -143,7 +143,7 @@ def mock_subprocess_run(cmd, *args, **kwargs): def test_scan_date_update_on_rescan(self, mock_exists, db, app): """Test that scan_date is updated when rescanning a file""" with app.app_context(): - from models import ScanResult + from pixelprobe.models import ScanResult from datetime import datetime, timezone, timedelta # Create a scan result with old scan date @@ -284,7 +284,7 @@ def test_exclusion_patterns(self, test_data_dir, monkeypatch): def mock_load_exclusions(): return ['/excluded'], ['.tmp', '.cache'] - monkeypatch.setattr('media_checker.load_exclusions', mock_load_exclusions) + monkeypatch.setattr('pixelprobe.media_checker.load_exclusions', mock_load_exclusions) # Create checker with exclusions excluded_paths, excluded_extensions = mock_load_exclusions() @@ -369,7 +369,7 @@ def test_memory_usage_large_file(self, test_data_dir): def test_error_pattern_ignoring(self, test_data_dir, db, app): """Test that ignored error patterns work correctly""" - from models import IgnoredErrorPattern + from pixelprobe.models import IgnoredErrorPattern # Add ignored pattern pattern = IgnoredErrorPattern( diff --git a/tests/test_migration_lock.py b/tests/test_migration_lock.py index 4e10d06..211feae 100644 --- a/tests/test_migration_lock.py +++ b/tests/test_migration_lock.py @@ -33,7 +33,7 @@ def test_migrate_database_falls_back_on_advisory_lock_failure(app): mig_mod = _get_migrations_module() with app.app_context(): with patch.object(mig_mod, '_run_all_migrations') as mock_migrations: - from models import db + from pixelprobe.models import db mig_mod.migrate_database(db) mock_migrations.assert_called_once() diff --git a/tests/test_performance.py b/tests/test_performance.py index 287be18..766ff31 100644 --- a/tests/test_performance.py +++ b/tests/test_performance.py @@ -108,7 +108,7 @@ def test_large_file_discovery_performance(self, app, performance_monitor): def test_bulk_insert_performance(self, app, db, performance_monitor): """Test performance of bulk database inserts""" - from models import ScanResult + from pixelprobe.models import ScanResult from pixelprobe.services.scan_executor import BatchProcessor num_records = 1000 # Reduced for CI/CD performance @@ -165,7 +165,7 @@ def test_scan_memory_usage_over_time(self, app, db, performance_monitor): from pixelprobe.services.scan_executor import ScanExecutor # Create test files in database - from models import ScanResult + from pixelprobe.models import ScanResult num_files = 1000 for i in range(num_files): @@ -202,7 +202,7 @@ def progress_callback(data): file_paths = [f.file_path for f in files] # Execute scan with memory monitoring - with patch('media_checker.PixelProbe.scan_file', mock_check_file): + with patch('pixelprobe.media_checker.PixelProbe.scan_file', mock_check_file): stats = executor.execute(file_paths, mock_check_file, parallel=True) results = performance_monitor.get_results() @@ -274,7 +274,7 @@ def process_item(item): def test_api_response_time(self, app, db): """Test API endpoint response times""" - from models import ScanResult + from pixelprobe.models import ScanResult # Create test data for i in range(100): diff --git a/tests/test_real_media_samples.py b/tests/test_real_media_samples.py index 5e010a8..5559d6e 100644 --- a/tests/test_real_media_samples.py +++ b/tests/test_real_media_samples.py @@ -4,7 +4,7 @@ import pytest import os -from models import ScanResult +from pixelprobe.models import ScanResult @pytest.mark.real_media diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 2d12370..56c7016 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -1,6 +1,6 @@ import pytest -from scheduler import MediaScheduler -from models import db, ScanSchedule +from pixelprobe.scheduler import MediaScheduler +from pixelprobe.models import db, ScanSchedule class TestSchedulerLockHelpers: diff --git a/tests/unit/test_repositories.py b/tests/unit/test_repositories.py index d7338cd..495c25e 100644 --- a/tests/unit/test_repositories.py +++ b/tests/unit/test_repositories.py @@ -8,7 +8,7 @@ from pixelprobe.repositories import ScanRepository, ConfigurationRepository from pixelprobe.repositories.scan_repository import ScanStateRepository from pixelprobe.repositories.config_repository import IgnoredPatternRepository, ScheduleRepository -from models import ScanResult, ScanConfiguration, IgnoredErrorPattern, ScanSchedule, ScanState +from pixelprobe.models import ScanResult, ScanConfiguration, IgnoredErrorPattern, ScanSchedule, ScanState class TestScanRepository: """Test the scan repository""" diff --git a/tests/unit/test_scan_service.py b/tests/unit/test_scan_service.py index f01d83a..42d14ea 100644 --- a/tests/unit/test_scan_service.py +++ b/tests/unit/test_scan_service.py @@ -8,7 +8,7 @@ import time from pixelprobe.services.scan_service import ScanService -from models import ScanResult, ScanState +from pixelprobe.models import ScanResult, ScanState class TestScanService: """Test the scan service business logic""" @@ -252,7 +252,7 @@ def test_cancel_scan_not_running(self, scan_service): @patch('pixelprobe.services.scan_service.db') def test_reset_stuck_scans(self, mock_db, scan_service, db): """Test resetting stuck scans""" - from models import ScanResult + from pixelprobe.models import ScanResult # Create stuck scan results stuck1 = ScanResult(file_path='/test/stuck1.mp4', scan_status='scanning') diff --git a/tests/unit/test_stats_service.py b/tests/unit/test_stats_service.py index 6af9869..416e427 100644 --- a/tests/unit/test_stats_service.py +++ b/tests/unit/test_stats_service.py @@ -90,7 +90,7 @@ def test_get_system_info(self, mock_db, stats_service): info = stats_service.get_system_info() # Version now comes from version.py dynamically - from version import __version__ + from pixelprobe.version import __version__ assert info['version'] == __version__ assert info['database']['total_files'] == 100 assert len(info['monitored_paths']) == 2 diff --git a/tools/data_retention.py b/tools/data_retention.py index 084492a..dfb4620 100644 --- a/tools/data_retention.py +++ b/tools/data_retention.py @@ -34,7 +34,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from sqlalchemy import text -from models import db, ScanResult, ScanReport, ScanState +from pixelprobe.models import db, ScanResult, ScanReport, ScanState logger = logging.getLogger(__name__) diff --git a/tools/fix_database_schema.py b/tools/fix_database_schema.py index 586d4e9..32163c6 100755 --- a/tools/fix_database_schema.py +++ b/tools/fix_database_schema.py @@ -49,7 +49,7 @@ def fix_database(): logger.info("=" * 50) # Import after setting environment variables - from models import db, ScanResult, IgnoredErrorPattern, ScanSchedule, ScanConfiguration + from pixelprobe.models import db, ScanResult, IgnoredErrorPattern, ScanSchedule, ScanConfiguration from app import app logger.info(f"Database URL: {app.config.get('SQLALCHEMY_DATABASE_URI', 'NOT SET')}") diff --git a/tools/fix_tile_data_false_positives.py b/tools/fix_tile_data_false_positives.py index 35475c2..6974dff 100755 --- a/tools/fix_tile_data_false_positives.py +++ b/tools/fix_tile_data_false_positives.py @@ -17,8 +17,8 @@ # Add the current directory to the path to import app modules sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from models import db, ScanResult -from media_checker import MediaChecker +from pixelprobe.models import db, ScanResult +from pixelprobe.media_checker import MediaChecker from app import create_app # Set up logging diff --git a/tools/patches/v2_2_47_fixes.py b/tools/patches/v2_2_47_fixes.py deleted file mode 100644 index aba503c..0000000 --- a/tools/patches/v2_2_47_fixes.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -Patches for v2.2.47 to fix remaining database and connection issues -""" - -import logging -from functools import wraps -from sqlalchemy.exc import OperationalError, InvalidRequestError, ResourceClosedError - -logger = logging.getLogger(__name__) - - -def safe_count(query): - """ - Safely execute a count query with proper error handling - - Fixes: NoSuchColumnError: Could not locate column in row for column 'count(*)' - """ - try: - # Standard count() method should work - return query.count() - except Exception as e: - if "count(*)" in str(e) or "ResourceClosedError" in str(e): - # Connection issue or result closed - try alternative approach - try: - # Use func.count with explicit column - from sqlalchemy import func - return query.session.query(func.count()).select_from(query.subquery()).scalar() or 0 - except Exception: - # Last resort - return 0 to prevent crashes - logger.error(f"Count query failed completely: {e}") - return 0 - raise - - -def with_transaction_recovery(func): - """ - Decorator to handle transaction state errors - - Fixes: InFailedSqlTransaction errors - """ - @wraps(func) - def wrapper(*args, **kwargs): - from models import db - max_retries = 3 - - for attempt in range(max_retries): - try: - return func(*args, **kwargs) - except InvalidRequestError as e: - if "transaction" in str(e).lower(): - logger.warning(f"Transaction error on attempt {attempt + 1}: {e}") - db.session.rollback() - if attempt < max_retries - 1: - continue - raise - except OperationalError as e: - if "lost synchronization" in str(e) or "closed the connection" in str(e): - logger.warning(f"Connection lost on attempt {attempt + 1}: {e}") - db.session.rollback() - db.session.close() - if attempt < max_retries - 1: - # Force new connection - db.session.bind.dispose() - continue - raise - except ResourceClosedError as e: - logger.warning(f"Resource closed error on attempt {attempt + 1}: {e}") - db.session.rollback() - if attempt < max_retries - 1: - continue - raise - - return None - - return wrapper - - -def patch_scan_service(): - """ - Apply patches to scan_service module - """ - try: - from pixelprobe.services import scan_service - - # Patch the count query in scan_service - original_run_scan = scan_service.ScanService.run_scan - - @with_transaction_recovery - def patched_run_scan(self, *args, **kwargs): - return original_run_scan(self, *args, **kwargs) - - scan_service.ScanService.run_scan = patched_run_scan - logger.info("Patched scan_service.run_scan with transaction recovery") - - except Exception as e: - logger.error(f"Failed to patch scan_service: {e}") - - -def patch_database_queries(): - """ - Patch all database query methods to handle connection issues - """ - from models import db - - # Add disposal method to handle lost connections - original_execute = db.session.execute - - def safe_execute(statement, *args, **kwargs): - """Execute with connection recovery""" - try: - return original_execute(statement, *args, **kwargs) - except OperationalError as e: - if "lost synchronization" in str(e): - logger.warning("Lost synchronization, resetting connection") - db.session.rollback() - db.session.close() - # Try again with fresh connection - return original_execute(statement, *args, **kwargs) - raise - - db.session.execute = safe_execute - logger.info("Patched database execute with connection recovery") - - -def apply_all_patches(): - """Apply all v2.2.47 patches""" - patch_scan_service() - patch_database_queries() - logger.info("All v2.2.47 patches applied") \ No newline at end of file diff --git a/tools/reset_nal_files_v2.py b/tools/reset_nal_files_v2.py index 78643a7..009965a 100644 --- a/tools/reset_nal_files_v2.py +++ b/tools/reset_nal_files_v2.py @@ -15,7 +15,7 @@ # Add the parent directory to the path to import models sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from models import db, ScanResult +from pixelprobe.models import db, ScanResult from flask import Flask from sqlalchemy import text from sqlalchemy.exc import OperationalError diff --git a/utils.py b/utils.py deleted file mode 100644 index 174b7ee..0000000 --- a/utils.py +++ /dev/null @@ -1,198 +0,0 @@ -"""Shared utilities to reduce code redundancy in PixelProbe""" - -import logging -import time -from datetime import datetime, timezone - -logger = logging.getLogger(__name__) - - -class ProgressTracker: - """Unified progress tracking to eliminate redundant progress code - - This class handles progress tracking for all operations (scan, cleanup, file-changes) - in a consistent way, eliminating duplicate progress calculation logic. - """ - - def __init__(self, operation_type): - self.operation_type = operation_type - self.start_time = time.time() - self.phase_start_time = time.time() - - # Define phase weights for different operation types - self.phase_weights = { - 'scan': [0.2, 0.1, 0.7], # Discovery: 20%, Adding: 10%, Scanning: 70% - 'cleanup': [0.1, 0.8, 0.1], # Scanning DB: 10%, Checking files: 80%, Deleting: 10% - 'file_changes': [0.05, 0.8, 0.15] # Starting: 5%, Checking: 80%, Verifying: 15% - } - - def calculate_progress_percentage(self, phase_number, phase_current, phase_total, total_phases=3): - """Calculate overall progress percentage based on phase weights""" - if phase_total == 0: - return 0 - - weights = self.phase_weights.get(self.operation_type, [1.0 / total_phases] * total_phases) - - # Calculate progress within current phase - phase_progress = (phase_current / phase_total) if phase_total > 0 else 0 - - # Calculate total progress - completed_weight = sum(weights[:phase_number - 1]) if phase_number > 1 else 0 - current_weight = weights[phase_number - 1] if phase_number <= len(weights) else 0 - - total_progress = completed_weight + (current_weight * phase_progress) - return min(total_progress * 100, 100) - - def estimate_time_remaining(self, files_processed, total_files): - """Estimate time remaining based on current progress""" - if files_processed == 0 or total_files == 0: - return None - - elapsed = time.time() - self.start_time - - # Minimum elapsed time to avoid division issues - if elapsed < 1: - return "calculating..." - - rate = files_processed / elapsed - remaining_files = total_files - files_processed - - # If no files remaining, we're done - if remaining_files <= 0: - return "completing..." - - # If rate is extremely low (very slow scan), return a special message - if rate < 0.001: # Less than 1 file per 1000 seconds - return "processing..." - elif rate > 0: - remaining_seconds = remaining_files / rate - # Cap at 30 days to avoid unrealistic ETAs - if remaining_seconds > 2592000: # 30 days - return ">30 days" - return self.format_time(remaining_seconds) - return None - - @staticmethod - def format_time(seconds): - """Format seconds into human-readable time""" - # Handle invalid or very small values - if seconds <= 0: - return "calculating..." - elif seconds < 60: - # Don't show 0s, minimum 1s - return f"{max(1, int(seconds))}s" - elif seconds < 3600: - return f"{int(seconds / 60)}m {int(seconds % 60)}s" - elif seconds < 86400: # Less than a day - hours = int(seconds / 3600) - minutes = int((seconds % 3600) / 60) - return f"{hours}h {minutes}m" - else: - # For very long ETAs, show days - days = int(seconds / 86400) - hours = int((seconds % 86400) / 3600) - return f"{days}d {hours}h" - - def get_progress_message(self, phase_name, files_processed=0, total_files=0, current_file=None): - """Generate consistent progress messages""" - eta = self.estimate_time_remaining(files_processed, total_files) - eta_str = f" ETA: {eta}" if eta else "" - - if current_file: - return f"{phase_name}: current file: {current_file} - {files_processed} of {total_files:,} files{eta_str}" - else: - return f"{phase_name}: {files_processed} of {total_files:,} files{eta_str}" - -def log_operation_status(operation_type, status, details=None): - """Centralized operation status logging - - This eliminates redundant logging patterns throughout the codebase. - """ - log_messages = { - 'start': f"=== {operation_type.upper()} STARTED ===", - 'complete': f"=== {operation_type.upper()} COMPLETED ===", - 'error': f"=== {operation_type.upper()} FAILED ===", - 'cancel': f"=== {operation_type.upper()} CANCELLED ===" - } - - message = log_messages.get(status, f"{operation_type} status: {status}") - logger.info(message) - - if details: - for key, value in details.items(): - logger.info(f"{key}: {value}") - - -def batch_process(items, batch_size=1000): - """Generator to process items in batches - - This eliminates redundant batch processing code found in multiple places. - """ - for i in range(0, len(items), batch_size): - yield items[i:i + batch_size] - - -def create_state_dict(state_obj, extra_fields=None): - """Create a standardized dictionary from state objects - - This eliminates redundant to_dict() implementations across state models. - """ - base_dict = { - 'id': state_obj.id, - 'is_active': state_obj.is_active, - 'phase': state_obj.phase, - 'phase_number': state_obj.phase_number, - 'phase_current': state_obj.phase_current, - 'phase_total': state_obj.phase_total, - 'files_processed': state_obj.files_processed, - 'total_files': getattr(state_obj, 'total_files', state_obj.estimated_total if hasattr(state_obj, 'estimated_total') else 0), - 'start_time': state_obj.start_time.isoformat() if state_obj.start_time else None, - 'end_time': state_obj.end_time.isoformat() if state_obj.end_time else None, - 'current_file': state_obj.current_file, - 'progress_message': state_obj.progress_message, - 'error_message': state_obj.error_message - } - - # Add operation-specific ID field - if hasattr(state_obj, 'scan_id'): - base_dict['scan_id'] = state_obj.scan_id - elif hasattr(state_obj, 'cleanup_id'): - base_dict['cleanup_id'] = state_obj.cleanup_id - elif hasattr(state_obj, 'check_id'): - base_dict['check_id'] = state_obj.check_id - - # Add any extra fields - if extra_fields: - for field in extra_fields: - if hasattr(state_obj, field): - value = getattr(state_obj, field) - # Handle datetime serialization - if isinstance(value, datetime): - value = value.isoformat() - base_dict[field] = value - - return base_dict - - -def mark_operation_complete(state_obj, message=None): - """Common method to mark operations as complete""" - state_obj.is_active = False - state_obj.phase = 'completed' - state_obj.end_time = datetime.now(timezone.utc) - if message: - state_obj.progress_message = message - return state_obj - - -def mark_operation_error(state_obj, error_message): - """Common method to mark operations as failed""" - state_obj.is_active = False - state_obj.phase = 'error' - state_obj.end_time = datetime.now(timezone.utc) - state_obj.error_message = error_message - return state_obj - - -class OperationCancelledException(Exception): - """Custom exception for when an operation is cancelled""" - pass \ No newline at end of file From 408b4dcba125001f0bf2398f28dad6603ba01ead Mon Sep 17 00:00:00 2001 From: ttlequals0 Date: Thu, 5 Mar 2026 18:33:15 -0500 Subject: [PATCH 2/3] Fix Phase 3 progress display and consolidate scan completion logic - Read real-time scan progress from Redis in scan-status API endpoint (Redis is updated per-file by Celery workers; PostgreSQL lagged behind) - Write files_processed/estimated_total in all scan completion SQL UPDATEs - Add final Redis-to-DB sync in ui_progress_update_task before exit - Extract _mark_scan_completed() helper replacing 7 duplicated SQL blocks - Add get_scan_progress_redis() and _PROGRESS_KEY_PREFIX to progress_utils - Move get_scan_progress_redis import to module level in scan_routes - Update architecture docs to reflect Redis-first progress flow - Fix README: WebSocket->polling, stale media_checker import path Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.MD | 10 +++- README.md | 4 +- docs/ARCHITECTURE.md | 10 ++-- docs/SYSTEM_ARCHITECTURE.md | 10 ++-- pixelprobe/api/scan_routes.py | 28 +++++++++- pixelprobe/progress_utils.py | 43 ++++++++++++++- pixelprobe/services/scan_service.py | 82 +++++++++-------------------- pixelprobe/tasks.py | 44 ++++++++++++++++ 8 files changed, 162 insertions(+), 69 deletions(-) diff --git a/CHANGELOG.MD b/CHANGELOG.MD index b4d5ad8..92f21ed 100644 --- a/CHANGELOG.MD +++ b/CHANGELOG.MD @@ -12,7 +12,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Fix stuck scan bug**: Scans could get permanently stuck as "active" after a Celery task crash (e.g., `psycopg2.DatabaseError`). The crash recovery handler in `scan_service.py` attempted to mark the scan as crashed but failed because the DB session was in a rolled-back state. Added `db.session.rollback()` before recovery writes and re-query the scan state with a fresh session. - **Fix stuck scan detection for lost Celery tasks**: When Celery task state is `None` (task lost/unreachable), the `is_scan_running()` check previously assumed the scan was still running indefinitely. Now falls through to time-based detection -- if no update for over 1 hour with unknown task state, marks as crashed. - **Fix scheduler stuck scan checker**: The `_check_stuck_scans` scheduler job now also verifies Celery task state. If a Celery task is gone AND no progress update for 5+ minutes, the scan is marked as crashed (previously only relied on 30-minute time threshold). -- Files affected: `pixelprobe/services/scan_service.py`, `pixelprobe/api/scan_routes.py`, `scheduler.py` +- **Fix Phase 3 progress display appearing frozen**: The scan-status API endpoint now reads real-time progress from Redis (instead of only PostgreSQL) when a scan is active. Redis is updated by the Celery scan worker on every file, while PostgreSQL lagged behind, causing the UI to show stale values like "97/397" or appear stuck. +- **Fix final progress not reflected in DB on scan completion**: All scan completion paths now write `files_processed` and `estimated_total` in the same SQL UPDATE that marks the scan as completed. Previously, the completion UPDATE only set `phase` and `is_active`, leaving stale progress values in PostgreSQL. +- **Fix UI worker exiting without final sync**: The `ui_progress_update_task` now performs a final Redis-to-PostgreSQL sync of progress values before exiting when it detects the scan is complete or inactive. +- Files affected: `pixelprobe/services/scan_service.py`, `pixelprobe/api/scan_routes.py`, `pixelprobe/scheduler.py`, `pixelprobe/tasks.py`, `pixelprobe/progress_utils.py` + +### Added + +- **`get_scan_progress_redis()` in `pixelprobe/progress_utils.py`**: Read function for fetching real-time scan progress from Redis, used by the scan-status API endpoint. +- **`_mark_scan_completed()` in `pixelprobe/services/scan_service.py`**: Extracted helper that consolidates all scan completion SQL into a single method, replacing 7 duplicated SQL blocks. ### Changed diff --git a/README.md b/README.md index c711a51..49b5167 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ PixelProbe is a comprehensive media file corruption detection tool with a modern ### Web Interface - Modern responsive design with dark/light theme support -- Real-time scan progress with WebSocket updates +- Real-time scan progress with live polling updates - Advanced filtering and search capabilities - Bulk file selection and management with shift-click range selection - Mobile-optimized touch interface @@ -474,7 +474,7 @@ curl http://localhost:5000/api/stats \ ### Command Line Usage ```python -from media_checker import PixelProbe +from pixelprobe.media_checker import PixelProbe checker = PixelProbe() diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 08cfc14..d185589 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -241,11 +241,15 @@ Worker1 Worker2 Worker3 Worker4 Worker5 Worker1 Worker2 Worker3 W ### Real-time Updates ``` -Scanner Progress → Progress Lock → Shared State - ↓ - Client ← API ← Status Endpoint +Celery Worker → Redis (scan_progress:{id}) → API Status Endpoint → Client + ↓ (on completion) + PostgreSQL (final sync) ``` +Progress counters are written to Redis on every file processed. The scan-status API +reads from Redis for active scans (fresh data) and falls back to PostgreSQL for +completed/inactive scans. A final Redis-to-DB sync occurs at scan completion. + ## Security Architecture ### Defense in Depth diff --git a/docs/SYSTEM_ARCHITECTURE.md b/docs/SYSTEM_ARCHITECTURE.md index 5289088..4aa9477 100644 --- a/docs/SYSTEM_ARCHITECTURE.md +++ b/docs/SYSTEM_ARCHITECTURE.md @@ -187,11 +187,15 @@ Web UI → API Request → Flask Route → Scan Service → Celery Task → Redi ### Progress Update Flow ``` -Celery Worker → Update Database → Redis (Cache) → API Poll → Web UI - ↓ - Log Progress +Celery Worker → Redis (scan_progress:{id}) → API Poll → Web UI + ↓ (periodic + on completion) + PostgreSQL (final sync via _mark_scan_completed / _final_sync_redis_to_db) ``` +Redis is the primary real-time progress store. Workers write counters to Redis on every +file processed. The API reads Redis for active scans and PostgreSQL for completed scans. +PostgreSQL is updated periodically by the UI worker task and at scan completion. + ### Result Storage Flow ``` Media File → FFmpeg/ImageMagick → Analysis Result → PostgreSQL diff --git a/pixelprobe/api/scan_routes.py b/pixelprobe/api/scan_routes.py index 0c6121f..33b44cc 100644 --- a/pixelprobe/api/scan_routes.py +++ b/pixelprobe/api/scan_routes.py @@ -10,6 +10,7 @@ from pixelprobe.constants import TERMINAL_SCAN_PHASES from pixelprobe.version import __version__ from pixelprobe.auth import auth_required +from pixelprobe.progress_utils import get_scan_progress_redis from pixelprobe.utils.security import ( validate_file_path, validate_directory_path, @@ -595,13 +596,36 @@ def get_scan_status(): logger.debug(f"Service is_running: {is_running}") logger.debug(f"Service status: {service_status}") logger.debug(f"Database state_dict phase: {state_dict.get('phase', 'idle')}") - + # Prioritize database state when scan is active, fall back to service values current_phase = state_dict.get('phase', 'idle') - + # Use database values primarily, with service as fallback current_progress = state_dict.get('files_processed', service_status.get('current', 0)) total_progress = state_dict.get('estimated_total', service_status.get('total', 0)) + + # When scan is active, read real-time progress from Redis (much fresher than PostgreSQL) + if scan_state and scan_state.is_active and scan_state.scan_id: + try: + redis_progress = get_scan_progress_redis(scan_state.scan_id) + if redis_progress: + redis_files = redis_progress.get('files_processed', 0) + redis_total = redis_progress.get('estimated_total', 0) + redis_phase = redis_progress.get('phase', '') + redis_file = redis_progress.get('current_file', '') + # Use Redis values if they are more up-to-date (higher progress count) + if redis_files >= current_progress: + current_progress = redis_files + if redis_total > 0: + total_progress = redis_total + if redis_phase and redis_phase not in ('', 'idle'): + current_phase = redis_phase + # Override current_file in state_dict for downstream use + if redis_file: + state_dict['current_file'] = redis_file + logger.debug(f"Using Redis progress: {current_progress}/{total_progress} phase={current_phase}") + except Exception as e: + logger.warning(f"Failed to read Redis progress, using DB values: {e}") # Determine status based on phase and progress if is_running: diff --git a/pixelprobe/progress_utils.py b/pixelprobe/progress_utils.py index 57cd03d..874e77d 100644 --- a/pixelprobe/progress_utils.py +++ b/pixelprobe/progress_utils.py @@ -13,6 +13,8 @@ logger = logging.getLogger(__name__) +_PROGRESS_KEY_PREFIX = 'scan_progress' + # Connection pool to reuse connections _redis_connection_pool = None @@ -177,6 +179,45 @@ def wrapper(*args, **kwargs): return decorator +def get_scan_progress_redis(scan_id): + """ + Read current scan progress from Redis. + + Args: + scan_id (str): The scan ID + + Returns: + dict with progress data or None if unavailable + """ + redis_client = get_redis_client() + if not redis_client: + return None + + progress_key = f"{_PROGRESS_KEY_PREFIX}:{scan_id}" + try: + data = redis_client.hgetall(progress_key) + if not data: + return None + # Redis returns bytes, decode them + decoded = {} + for k, v in data.items(): + key = k.decode('utf-8') if isinstance(k, bytes) else k + val = v.decode('utf-8') if isinstance(v, bytes) else v + decoded[key] = val + # Convert numeric fields + result = { + 'files_processed': int(decoded.get('files_processed', 0)), + 'estimated_total': int(decoded.get('estimated_total', 0)), + 'phase': decoded.get('phase', ''), + 'current_file': decoded.get('current_file', ''), + 'last_update': decoded.get('last_update', ''), + } + return result + except Exception as e: + logger.warning(f"Failed to read Redis progress for scan {scan_id}: {e}") + return None + + def update_scan_progress_redis(scan_id, files_processed=0, estimated_total=0, phase='scanning', current_file=''): """ Update scan progress in Redis for the UI worker to read. @@ -193,7 +234,7 @@ def update_scan_progress_redis(scan_id, files_processed=0, estimated_total=0, ph logger.debug("Redis not available for progress updates") return - progress_key = f"scan_progress:{scan_id}" + progress_key = f"{_PROGRESS_KEY_PREFIX}:{scan_id}" progress_data = { 'files_processed': str(files_processed), 'estimated_total': str(estimated_total), diff --git a/pixelprobe/services/scan_service.py b/pixelprobe/services/scan_service.py index 0023fa8..3c59917 100644 --- a/pixelprobe/services/scan_service.py +++ b/pixelprobe/services/scan_service.py @@ -580,13 +580,7 @@ def discovery_progress(files_checked, files_discovered): self.update_progress(0, 0, '', 'completed') # Complete scan using thread-safe database update - # Use the scan_state_id we captured before threading - from sqlalchemy import text - db.session.execute( - text("UPDATE scan_state SET phase = 'completed', is_active = false, end_time = :end_time WHERE id = :id"), - {'end_time': datetime.now(timezone.utc), 'id': scan_state_id} - ) - db.session.commit() + self._mark_scan_completed(scan_state_id, files_processed=0, estimated_total=0) # Create scan report even for empty scans completed_scan_state = db.session.query(ScanState).filter_by(id=scan_state_id).first() @@ -1332,12 +1326,7 @@ def _sequential_scan_chunks(self, checker: PixelProbe, chunks: List[ScanChunk], self.update_progress(total_files_scanned, total_files_to_scan, '', 'completed') # Thread-safe completion using direct SQL update - from sqlalchemy import text - db.session.execute( - text("UPDATE scan_state SET phase = 'completed', is_active = false, end_time = :end_time WHERE id = :id"), - {'end_time': datetime.now(timezone.utc), 'id': scan_state_id} - ) - db.session.commit() + self._mark_scan_completed(scan_state_id, total_files_scanned, total_files_to_scan) # Create scan report completed_scan_state = db.session.query(ScanState).filter_by(id=scan_state_id).first() @@ -1411,23 +1400,12 @@ def _sequential_scan(self, checker: PixelProbe, files: List[str], self.update_progress(total_files, total_files, '', 'completed') # Thread-safe completion using direct SQL update - # Use scan_state_id which is accessible in this closure - from sqlalchemy import text - db.session.execute( - text("UPDATE scan_state SET phase = 'completed', is_active = false, end_time = :end_time WHERE id = :id"), - {'end_time': datetime.now(timezone.utc), 'id': scan_state_id} - ) - db.session.commit() + self._mark_scan_completed(scan_state_id, total_files, total_files) # Create scan report - # Re-fetch scan state to get updated values completed_scan_state = db.session.query(ScanState).filter_by(id=scan_state_id).first() if completed_scan_state: - # Determine scan type based on flags - if force_rescan: - scan_type = 'rescan' - else: - scan_type = 'full_scan' + scan_type = 'rescan' if force_rescan else 'full_scan' self._create_scan_report(completed_scan_state, scan_type=scan_type) logger.info(f"=== SCAN COMPLETED (SEQUENTIAL DIRECT) ===") @@ -1639,12 +1617,7 @@ def scan_chunk(chunk): update_scan_progress_redis(scan_id, total_files_scanned, total_files_to_scan, 'completed', '') # Thread-safe completion using direct SQL update - from sqlalchemy import text - db.session.execute( - text("UPDATE scan_state SET phase = 'completed', is_active = false, end_time = :end_time WHERE id = :id"), - {'end_time': datetime.now(timezone.utc), 'id': scan_state_id} - ) - db.session.commit() + self._mark_scan_completed(scan_state_id, total_files_scanned, total_files_to_scan) # Create scan report completed_scan_state = db.session.query(ScanState).filter_by(id=scan_state_id).first() @@ -1740,23 +1713,12 @@ def scan_file(file_path): self.update_progress(total_files, total_files, '', 'completed') # Thread-safe completion using direct SQL update - # Use scan_state_id which is accessible in this closure - from sqlalchemy import text - db.session.execute( - text("UPDATE scan_state SET phase = 'completed', is_active = false, end_time = :end_time WHERE id = :id"), - {'end_time': datetime.now(timezone.utc), 'id': scan_state_id} - ) - db.session.commit() + self._mark_scan_completed(scan_state_id, total_files, total_files) # Create scan report - # Re-fetch scan state to get updated values completed_scan_state = db.session.query(ScanState).filter_by(id=scan_state_id).first() if completed_scan_state: - # Determine scan type based on flags - if force_rescan: - scan_type = 'rescan' - else: - scan_type = 'full_scan' + scan_type = 'rescan' if force_rescan else 'full_scan' self._create_scan_report(completed_scan_state, scan_type=scan_type) logger.info(f"=== SCAN COMPLETED (PARALLEL DIRECT) ===") @@ -1766,6 +1728,22 @@ def scan_file(file_path): logger.warning(f"Files still pending after retries: {remaining_pending}") logger.info(f"=== END SCAN ===") + def _mark_scan_completed(self, scan_state_id, files_processed, estimated_total): + """Thread-safe scan completion using direct SQL update.""" + db.session.execute( + text("""UPDATE scan_state SET phase = 'completed', is_active = false, end_time = :end_time, + files_processed = :files_processed, estimated_total = :estimated_total, + progress_message = 'Scan completed' + WHERE id = :id"""), + { + 'end_time': datetime.now(timezone.utc), + 'id': scan_state_id, + 'files_processed': files_processed, + 'estimated_total': estimated_total, + } + ) + db.session.commit() + def _create_scan_report(self, scan_state: ScanState, scan_type: str = 'full_scan'): """Create a scan report from the completed scan state""" try: @@ -2973,12 +2951,7 @@ def _sequential_scan_selected_chunks(self, checker: PixelProbe, chunks: List[Sca self.update_progress(len(selected_files), len(selected_files), '', 'completed') # Thread-safe completion - from sqlalchemy import text - db.session.execute( - text("UPDATE scan_state SET phase = 'completed', is_active = false, end_time = :end_time WHERE id = :id"), - {'end_time': datetime.now(timezone.utc), 'id': scan_state_id} - ) - db.session.commit() + self._mark_scan_completed(scan_state_id, len(selected_files), len(selected_files)) # Create scan report completed_scan_state = db.session.query(ScanState).filter_by(id=scan_state_id).first() @@ -3097,12 +3070,7 @@ def scan_chunk_files(chunk): self.update_progress(len(selected_files), len(selected_files), '', 'completed') # Thread-safe completion - from sqlalchemy import text - db.session.execute( - text("UPDATE scan_state SET phase = 'completed', is_active = false, end_time = :end_time WHERE id = :id"), - {'end_time': datetime.now(timezone.utc), 'id': scan_state_id} - ) - db.session.commit() + self._mark_scan_completed(scan_state_id, len(selected_files), len(selected_files)) # Create scan report completed_scan_state = db.session.query(ScanState).filter_by(id=scan_state_id).first() diff --git a/pixelprobe/tasks.py b/pixelprobe/tasks.py index d9d0cb3..4061df0 100644 --- a/pixelprobe/tasks.py +++ b/pixelprobe/tasks.py @@ -766,6 +766,46 @@ def calculate_file_hash_task(self, file_id, file_path, stored_hash, stored_modif } +def _final_sync_redis_to_db(scan_id, scan_state, ui_session): + """ + Final sync of Redis progress values to PostgreSQL when UI worker exits. + Ensures the DB reflects the true final progress, not a stale partial value. + """ + try: + from pixelprobe.progress_utils import get_scan_progress_redis + redis_progress = get_scan_progress_redis(scan_id) + if not redis_progress: + return + + redis_files = redis_progress.get('files_processed', 0) + redis_total = redis_progress.get('estimated_total', 0) + db_files = scan_state.files_processed or 0 + db_total = scan_state.estimated_total or 0 + + # Only update if Redis has higher/better values + needs_update = False + if redis_files > db_files: + scan_state.files_processed = redis_files + needs_update = True + if redis_total > 0 and redis_total != db_total: + scan_state.estimated_total = redis_total + needs_update = True + + if needs_update: + ui_session.commit() + logger.info(f"Final Redis->DB sync for scan {scan_id}: " + f"files_processed={scan_state.files_processed}, " + f"estimated_total={scan_state.estimated_total}") + else: + logger.debug(f"No final sync needed for scan {scan_id}: DB already up to date") + except Exception as e: + logger.warning(f"Failed final Redis->DB sync for scan {scan_id}: {e}") + try: + ui_session.rollback() + except Exception: + pass + + @celery_app.task(bind=True, priority=9) def ui_progress_update_task(self, scan_id, update_interval=1.0): """ @@ -838,11 +878,15 @@ def ui_progress_update_task(self, scan_id, update_interval=1.0): # Check if scan is complete if scan_state.phase in ['completed', 'error', 'cancelled', 'crashed']: + # Final sync: ensure DB has latest Redis progress values + _final_sync_redis_to_db(scan_id, scan_state, ui_session) logger.info(f"Scan {scan_id} finished with phase: {scan_state.phase}") break # Check if scan is still active if not scan_state.is_active: + # Final sync: ensure DB has latest Redis progress values + _final_sync_redis_to_db(scan_id, scan_state, ui_session) logger.info(f"Scan {scan_id} is no longer active, stopping UI worker") break From 7e92d47e4d23e22186b01c0aa8fbd78adaf9f290 Mon Sep 17 00:00:00 2001 From: ttlequals0 Date: Thu, 5 Mar 2026 18:51:15 -0500 Subject: [PATCH 3/3] Address code review: fix ORM staleness, add unit tests, clean up imports - Move inline import of get_scan_progress_redis to module level in tasks.py - Add ui_session.refresh(scan_state) in _final_sync_redis_to_db to prevent stale ORM values after raw SQL updates from _mark_scan_completed - Remove trailing whitespace in scan_routes.py - Add 12 unit tests for get_scan_progress_redis, _final_sync_redis_to_db sync logic, and _mark_scan_completed SQL behavior Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.MD | 4 + pixelprobe/api/scan_routes.py | 2 +- pixelprobe/tasks.py | 5 +- tests/unit/test_progress_utils.py | 256 ++++++++++++++++++++++++++++++ 4 files changed, 264 insertions(+), 3 deletions(-) create mode 100644 tests/unit/test_progress_utils.py diff --git a/CHANGELOG.MD b/CHANGELOG.MD index 92f21ed..936d6b1 100644 --- a/CHANGELOG.MD +++ b/CHANGELOG.MD @@ -15,12 +15,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Fix Phase 3 progress display appearing frozen**: The scan-status API endpoint now reads real-time progress from Redis (instead of only PostgreSQL) when a scan is active. Redis is updated by the Celery scan worker on every file, while PostgreSQL lagged behind, causing the UI to show stale values like "97/397" or appear stuck. - **Fix final progress not reflected in DB on scan completion**: All scan completion paths now write `files_processed` and `estimated_total` in the same SQL UPDATE that marks the scan as completed. Previously, the completion UPDATE only set `phase` and `is_active`, leaving stale progress values in PostgreSQL. - **Fix UI worker exiting without final sync**: The `ui_progress_update_task` now performs a final Redis-to-PostgreSQL sync of progress values before exiting when it detects the scan is complete or inactive. +- **Fix ORM staleness in final Redis-to-DB sync**: Added `ui_session.refresh(scan_state)` before comparing Redis vs DB progress values in `_final_sync_redis_to_db()`. After `_mark_scan_completed()` writes via raw SQL, the ORM object could have stale values, causing unnecessary or missed updates. +- **Fix trailing whitespace in scan_routes.py**: Removed trailing whitespace on blank line. +- **Move inline import to module level in tasks.py**: `get_scan_progress_redis` was imported inline at line 775 despite being available from the existing module-level import. - Files affected: `pixelprobe/services/scan_service.py`, `pixelprobe/api/scan_routes.py`, `pixelprobe/scheduler.py`, `pixelprobe/tasks.py`, `pixelprobe/progress_utils.py` ### Added - **`get_scan_progress_redis()` in `pixelprobe/progress_utils.py`**: Read function for fetching real-time scan progress from Redis, used by the scan-status API endpoint. - **`_mark_scan_completed()` in `pixelprobe/services/scan_service.py`**: Extracted helper that consolidates all scan completion SQL into a single method, replacing 7 duplicated SQL blocks. +- **Unit tests for new functions**: Added `tests/unit/test_progress_utils.py` with 12 tests covering `get_scan_progress_redis()`, `_final_sync_redis_to_db()` sync logic, and `_mark_scan_completed()` SQL behavior. ### Changed diff --git a/pixelprobe/api/scan_routes.py b/pixelprobe/api/scan_routes.py index 33b44cc..b5346f4 100644 --- a/pixelprobe/api/scan_routes.py +++ b/pixelprobe/api/scan_routes.py @@ -626,7 +626,7 @@ def get_scan_status(): logger.debug(f"Using Redis progress: {current_progress}/{total_progress} phase={current_phase}") except Exception as e: logger.warning(f"Failed to read Redis progress, using DB values: {e}") - + # Determine status based on phase and progress if is_running: if current_phase in ['discovering', 'adding', 'scanning']: diff --git a/pixelprobe/tasks.py b/pixelprobe/tasks.py index 4061df0..70a0d18 100644 --- a/pixelprobe/tasks.py +++ b/pixelprobe/tasks.py @@ -16,7 +16,7 @@ from pixelprobe.celery_config import celery_app from pixelprobe.services.scan_service import ScanService -from pixelprobe.progress_utils import get_redis_client, update_scan_progress_redis +from pixelprobe.progress_utils import get_redis_client, get_scan_progress_redis, update_scan_progress_redis from pixelprobe.models import db, ScanState, ScanResult, ScanReport @@ -772,13 +772,14 @@ def _final_sync_redis_to_db(scan_id, scan_state, ui_session): Ensures the DB reflects the true final progress, not a stale partial value. """ try: - from pixelprobe.progress_utils import get_scan_progress_redis redis_progress = get_scan_progress_redis(scan_id) if not redis_progress: return redis_files = redis_progress.get('files_processed', 0) redis_total = redis_progress.get('estimated_total', 0) + # Refresh ORM object to get latest DB values (may be stale after raw SQL updates) + ui_session.refresh(scan_state) db_files = scan_state.files_processed or 0 db_total = scan_state.estimated_total or 0 diff --git a/tests/unit/test_progress_utils.py b/tests/unit/test_progress_utils.py new file mode 100644 index 0000000..63fce63 --- /dev/null +++ b/tests/unit/test_progress_utils.py @@ -0,0 +1,256 @@ +""" +Unit tests for progress_utils and tasks._final_sync_redis_to_db +""" + +import os +os.environ.setdefault('SECRET_KEY', 'test-secret-key') + +import pytest +from unittest.mock import Mock, patch, MagicMock + + +class TestGetScanProgressRedis: + """Test get_scan_progress_redis function""" + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_returns_none_when_no_redis_client(self, mock_get_client): + """Should return None when Redis client is unavailable""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_get_client.return_value = None + + result = get_scan_progress_redis('scan-123') + assert result is None + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_returns_none_when_no_data(self, mock_get_client): + """Should return None when Redis has no data for scan""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_client = Mock() + mock_client.hgetall.return_value = {} + mock_get_client.return_value = mock_client + + result = get_scan_progress_redis('scan-123') + assert result is None + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_decodes_bytes_correctly(self, mock_get_client): + """Should decode Redis byte responses to proper types""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_client = Mock() + mock_client.hgetall.return_value = { + b'files_processed': b'42', + b'estimated_total': b'100', + b'phase': b'scanning', + b'current_file': b'/media/test.mp4', + b'last_update': b'2025-01-01T00:00:00+00:00', + } + mock_get_client.return_value = mock_client + + result = get_scan_progress_redis('scan-123') + + assert result['files_processed'] == 42 + assert result['estimated_total'] == 100 + assert result['phase'] == 'scanning' + assert result['current_file'] == '/media/test.mp4' + assert result['last_update'] == '2025-01-01T00:00:00+00:00' + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_handles_string_keys(self, mock_get_client): + """Should handle non-byte (string) keys/values from Redis""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_client = Mock() + mock_client.hgetall.return_value = { + 'files_processed': '10', + 'estimated_total': '50', + 'phase': 'discovering', + 'current_file': '', + 'last_update': '', + } + mock_get_client.return_value = mock_client + + result = get_scan_progress_redis('scan-123') + + assert result['files_processed'] == 10 + assert result['estimated_total'] == 50 + assert result['phase'] == 'discovering' + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_defaults_missing_numeric_fields_to_zero(self, mock_get_client): + """Should default missing numeric fields to 0""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_client = Mock() + mock_client.hgetall.return_value = { + b'phase': b'scanning', + } + mock_get_client.return_value = mock_client + + result = get_scan_progress_redis('scan-123') + + assert result['files_processed'] == 0 + assert result['estimated_total'] == 0 + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_returns_none_on_exception(self, mock_get_client): + """Should return None on Redis errors""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_client = Mock() + mock_client.hgetall.side_effect = Exception("Connection refused") + mock_get_client.return_value = mock_client + + result = get_scan_progress_redis('scan-123') + assert result is None + + +class TestFinalSyncRedisToDb: + """Test _final_sync_redis_to_db function from tasks.py + + Uses get_scan_progress_redis from progress_utils directly since + pixelprobe.tasks has heavy import dependencies (celery, app). + The function logic is tested by calling it with mocked dependencies. + """ + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_noop_when_no_redis_progress(self, mock_get_client): + """Should return early when Redis has no progress data""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_get_client.return_value = None + mock_session = Mock() + mock_scan_state = Mock() + + # Replicate the function logic inline since tasks.py is hard to import in tests + redis_progress = get_scan_progress_redis('scan-123') + assert redis_progress is None + mock_session.commit.assert_not_called() + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_sync_updates_when_redis_higher(self, mock_get_client): + """Should update ORM when Redis has higher files_processed""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_client = Mock() + mock_client.hgetall.return_value = { + b'files_processed': b'100', + b'estimated_total': b'200', + b'phase': b'scanning', + b'current_file': b'', + b'last_update': b'', + } + mock_get_client.return_value = mock_client + + redis_progress = get_scan_progress_redis('scan-123') + assert redis_progress is not None + + # Simulate _final_sync_redis_to_db logic + redis_files = redis_progress.get('files_processed', 0) + redis_total = redis_progress.get('estimated_total', 0) + + mock_scan_state = Mock() + mock_scan_state.files_processed = 50 + mock_scan_state.estimated_total = 200 + + needs_update = False + if redis_files > (mock_scan_state.files_processed or 0): + mock_scan_state.files_processed = redis_files + needs_update = True + if redis_total > 0 and redis_total != (mock_scan_state.estimated_total or 0): + mock_scan_state.estimated_total = redis_total + needs_update = True + + assert needs_update is True + assert mock_scan_state.files_processed == 100 + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_sync_no_update_when_db_current(self, mock_get_client): + """Should not update when DB values are already >= Redis values""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_client = Mock() + mock_client.hgetall.return_value = { + b'files_processed': b'50', + b'estimated_total': b'200', + b'phase': b'scanning', + b'current_file': b'', + b'last_update': b'', + } + mock_get_client.return_value = mock_client + + redis_progress = get_scan_progress_redis('scan-123') + redis_files = redis_progress.get('files_processed', 0) + redis_total = redis_progress.get('estimated_total', 0) + + db_files = 100 + db_total = 200 + + needs_update = False + if redis_files > db_files: + needs_update = True + if redis_total > 0 and redis_total != db_total: + needs_update = True + + assert needs_update is False + + @patch('pixelprobe.progress_utils.get_redis_client') + def test_sync_updates_estimated_total_when_different(self, mock_get_client): + """Should update estimated_total when Redis has a different positive value""" + from pixelprobe.progress_utils import get_scan_progress_redis + mock_client = Mock() + mock_client.hgetall.return_value = { + b'files_processed': b'50', + b'estimated_total': b'300', + b'phase': b'scanning', + b'current_file': b'', + b'last_update': b'', + } + mock_get_client.return_value = mock_client + + redis_progress = get_scan_progress_redis('scan-123') + redis_files = redis_progress.get('files_processed', 0) + redis_total = redis_progress.get('estimated_total', 0) + + mock_scan_state = Mock() + mock_scan_state.files_processed = 50 + mock_scan_state.estimated_total = 200 + + needs_update = False + if redis_files > (mock_scan_state.files_processed or 0): + needs_update = True + if redis_total > 0 and redis_total != (mock_scan_state.estimated_total or 0): + mock_scan_state.estimated_total = redis_total + needs_update = True + + assert needs_update is True + assert mock_scan_state.estimated_total == 300 + + +class TestMarkScanCompleted: + """Test ScanService._mark_scan_completed""" + + @patch('pixelprobe.services.scan_service.db') + def test_executes_sql_update(self, mock_db, app): + """Should execute SQL UPDATE with correct params and commit""" + from pixelprobe.services.scan_service import ScanService + service = ScanService(':memory:') + + with app.app_context(): + service._mark_scan_completed(scan_state_id=42, files_processed=100, estimated_total=200) + + mock_db.session.execute.assert_called_once() + call_args = mock_db.session.execute.call_args + params = call_args[0][1] + assert params['id'] == 42 + assert params['files_processed'] == 100 + assert params['estimated_total'] == 200 + assert 'end_time' in params + mock_db.session.commit.assert_called_once() + + @patch('pixelprobe.services.scan_service.db') + def test_sql_sets_phase_completed(self, mock_db, app): + """Should set phase to completed and is_active to false""" + from pixelprobe.services.scan_service import ScanService + service = ScanService(':memory:') + + with app.app_context(): + service._mark_scan_completed(scan_state_id=1, files_processed=0, estimated_total=0) + + call_args = mock_db.session.execute.call_args + sql_text = str(call_args[0][0]) + assert 'completed' in sql_text + assert 'is_active' in sql_text