diff --git a/.github/SECRETS.md b/.github/SECRETS.md index 8c3f960..4f601e4 100644 --- a/.github/SECRETS.md +++ b/.github/SECRETS.md @@ -13,10 +13,12 @@ All secrets needed for CI/CD pipelines. Configure in **Settings → Secrets and | `VPS_PORT` | SSH port | `22` | | `VPS_SSH_KEY` | Private SSH key for the deploy user | Full PEM key | -### Container Registry (GHCR) +### Docker Hub -> [!NOTE] -> GHCR uses `GITHUB_TOKEN` automatically — no additional secrets needed for pushing images. +| Secret | Description | Example | +| -------------------- | ------------------------------------- | ------------------ | +| `DOCKERHUB_USERNAME` | Your Docker Hub username | `utsavjoshi` | +| `DOCKERHUB_TOKEN` | Access token generated via Docker Hub | `dckr_pat_xxxxxxx` | ## How to Add Secrets @@ -31,8 +33,8 @@ Before the deploy workflow can succeed, ensure the VPS has: 1. Docker and Docker Compose installed 2. The `deploy` user with docker group access -3. Project directory at `/opt/postly` with `.env` file (`chmod 600`) -4. GHCR login configured: `docker login ghcr.io -u -p ` +3. Project directory at `/var/www/postly` with `.env` file (`chmod 600`) +4. Docker Hub login configured: `docker login -u <dockerhub-username> -p <dockerhub-token>` 5. SSH key added to `~/.ssh/authorized_keys` for the deploy user ## Branch Protection Rules (Recommended) @@ -50,9 +52,9 @@ Every deploy tags images with the Git SHA. 
To rollback: ```bash ssh deploy@<vps-host> -cd /opt/postly -export API_IMAGE=ghcr.io//api: -export SCRAPER_IMAGE=ghcr.io//scraper: -export BOT_IMAGE=ghcr.io//bot: -docker compose -f docker-compose.prod.yml up -d --no-deps api bot scraper +cd /var/www/postly +export API_IMAGE=<dockerhub-username>/postly-api:<git-sha> + +export BOT_IMAGE=<dockerhub-username>/postly-bot:<git-sha> +docker compose -f docker-compose.prod.yml up -d --no-deps api bot ``` diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 27451b6..0b637e7 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -1,18 +1,78 @@ -name: Continuous Deployment (CD) +name: CI/CD Pipeline on: push: branches: [main] + pull_request: + branches: [main] jobs: - deploy: - name: Deploy to VPS + build-and-push: + name: Build & Push to Docker Hub runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v6 + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Extract metadata for API + id: meta-api + uses: docker/metadata-action@v5 + with: + images: ${{ secrets.DOCKERHUB_USERNAME }}/postly-api + tags: | + type=ref,event=pr + type=raw,value=latest,enable={{is_default_branch}} + type=sha + + - name: Build and push API + uses: docker/build-push-action@v5 + with: + context: . 
+ file: apps/api/Dockerfile + push: true + tags: ${{ steps.meta-api.outputs.tags }} + labels: ${{ steps.meta-api.outputs.labels }} + cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/postly-api:buildcache + cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/postly-api:buildcache,mode=max + + - name: Extract metadata for Bot + id: meta-bot + uses: docker/metadata-action@v5 + with: + images: ${{ secrets.DOCKERHUB_USERNAME }}/postly-bot + tags: | + type=ref,event=pr + type=raw,value=latest,enable={{is_default_branch}} + type=sha + + - name: Build and push Bot + uses: docker/build-push-action@v5 + with: + context: . + file: apps/bot/Dockerfile + push: true + tags: ${{ steps.meta-bot.outputs.tags }} + labels: ${{ steps.meta-bot.outputs.labels }} + cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/postly-bot:buildcache + cache-to: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/postly-bot:buildcache,mode=max + + deploy: + name: Deploy to VPS + runs-on: ubuntu-latest + needs: build-and-push + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + + steps: - name: Deploy via SSH uses: appleboy/ssh-action@v1.2.5 with: @@ -20,19 +80,18 @@ jobs: username: ${{ secrets.REMOTE_USER }} port: ${{ secrets.REMOTE_PORT }} key: ${{ secrets.SSH_PRIVATE_KEY }} - script: | set -e # Exit immediately on any error # Navigate to project directory cd /var/www/postly - # Ensure clean state and pull latest changes (handles divergent branches) + # Ensure clean state and pull latest changes git fetch --all git reset --hard origin/main - # Rebuild and start all services - docker compose -f docker-compose.prod.yml build --no-cache + # Pull latest images and start services + docker compose -f docker-compose.prod.yml pull docker compose -f docker-compose.prod.yml up -d # Wait for API to become healthy @@ -57,9 +116,6 @@ jobs: # Run database migrations (if any) docker exec postly-api npm run migrate:up || echo "No migrations to run" - # Rebuild 
other services if needed - docker compose -f docker-compose.prod.yml up -d --build scraper bot - # Cleanup old images docker image prune -f diff --git a/README.md b/README.md index 85e1d0e..0073a8a 100644 --- a/README.md +++ b/README.md @@ -7,24 +7,22 @@ Complete runbook for deploying Postly to a VPS. ## Architecture Overview ``` -VPS Proxy (Nginx/Traefik) → API (Express) - → Static Web (Vite build) +VPS Proxy (Nginx/Traefik) → API (Express) → Static Web (Vite build) Internal Docker Network: API ←→ PostgreSQL (pgvector) API ←→ Redis (BullMQ + Caching) - Scraper ←→ PostgreSQL Bot ←→ PostgreSQL + Redis ``` -**Stack:** Node.js API · Python Scraper · Python Discord Bot · PostgreSQL 16 + pgvector · Redis 7 +**Stack:** Node.js API · Python Discord Bot · PostgreSQL 16 + pgvector · Redis 7 --- ## Quick-Start Checklist ``` -□ 1. Clone repo to /opt/postly, create .env +□ 1. Clone repo to /var/www/postly, create .env □ 2. docker compose -f docker-compose.prod.yml up -d □ 3. Verify all services healthy □ 4. Configure GitHub Actions secrets @@ -42,7 +40,7 @@ Internal Docker Network: Log into your VPS and run: ```bash -cd /opt/postly +cd /var/www/postly git clone https://github.com/<owner>/<repo>.git . # Create production .env from template @@ -60,11 +58,11 @@ nano .env - `DISCORD_BOT_TOKEN` — From Discord Developer Portal - `WEB_URL` — Your production domain (e.g., `https://postly.io`) -### 2. Login to GHCR +### 2. Login to Docker Hub ```bash -# Login to pull pre-built images from GitHub Container Registry -echo "" | docker login ghcr.io -u --password-stdin +# Login to pull pre-built images from Docker Hub +echo "<dockerhub-token>" | docker login -u <dockerhub-username> --password-stdin ``` ### 3. Start the Stack @@ -90,10 +88,10 @@ Expected health response: } ``` -### 4. Run HNSW Index Migration (One-time) +### 4. Run Database Migrations ```bash -docker exec -i postly-postgres psql -U postly -d postly < scripts/add-hnsw-indexes.sql +docker exec postly-api npm run migrate:up ``` ### 5. 
Setup Backups @@ -106,7 +104,7 @@ chmod +x scripts/backup.sh bash scripts/backup.sh # Add to cron (runs daily at 2 AM) -(crontab -l 2>/dev/null; echo "0 2 * * * /opt/postly/scripts/backup.sh >> /var/log/postly-backup.log 2>&1") | crontab - +(crontab -l 2>/dev/null; echo "0 2 * * * /var/www/postly/scripts/backup.sh >> /var/log/postly-backup.log 2>&1") | crontab - ``` ### 6. Configure GitHub Actions @@ -129,11 +127,11 @@ Push to `main` and verify the pipeline deploys successfully. Every deploy tags images with the Git SHA. To rollback: ```bash -cd /opt/postly -export API_IMAGE=ghcr.io//api: -export SCRAPER_IMAGE=ghcr.io//scraper: -export BOT_IMAGE=ghcr.io//bot: -docker compose -f docker-compose.prod.yml up -d --no-deps api bot scraper +cd /var/www/postly +export API_IMAGE=<dockerhub-username>/postly-api:<git-sha> + +export BOT_IMAGE=<dockerhub-username>/postly-bot:<git-sha> +docker compose -f docker-compose.prod.yml up -d --no-deps api bot ``` --- @@ -164,19 +162,18 @@ docker rm -f pg-restore-test | ------- | ----------------------------------------------- | ----------- | | 0–1K | Current setup, no changes | — | | 1K–10K | Add Postgres read replica (second VPS) | +€4.5/mo | -| 10K–50K | Extract scraper to own VPS, add pgBouncer | +€4.5/mo | +| 10K–50K | Add pgBouncer | +€4.5/mo | | 50K+ | Consider managed DB, split into domain services | Variable | --- ## File Reference -| File | Purpose | -| ------------------------------ | -------------------------------- | -| `docker-compose.prod.yml` | Main production stack | -| `scripts/backup.sh` | Daily PostgreSQL backup | -| `scripts/add-hnsw-indexes.sql` | pgvector HNSW indexes (run once) | -| `.env.production.example` | Production env template | -| `.github/workflows/deploy.yml` | CI/CD pipeline | -| `.github/workflows/ci.yml` | PR checks | -| `.github/SECRETS.md` | GitHub secrets reference | +| File | Purpose | +| ------------------------------ | ------------------------ | +| `docker-compose.prod.yml` | Main production stack | +| `scripts/backup.sh` | Daily PostgreSQL backup | +| 
`.env.production.example` | Production env template | +| `.github/workflows/deploy.yml` | CI/CD pipeline | +| `.github/workflows/ci.yml` | PR checks | +| `.github/SECRETS.md` | GitHub secrets reference | diff --git a/apps/api/.eslintrc.cjs b/apps/api/.eslintrc.cjs deleted file mode 100644 index 9db4d13..0000000 --- a/apps/api/.eslintrc.cjs +++ /dev/null @@ -1 +0,0 @@ -module.exports = { extends: ['@postly/eslint-config'] }; diff --git a/apps/api/eslint.config.js b/apps/api/eslint.config.js new file mode 100644 index 0000000..f79abec --- /dev/null +++ b/apps/api/eslint.config.js @@ -0,0 +1,36 @@ +import tsPlugin from "@typescript-eslint/eslint-plugin"; +import tsParser from "@typescript-eslint/parser"; + +export default [ + { + files: ["src/**/*.ts"], + languageOptions: { + parser: tsParser, + parserOptions: { + ecmaVersion: 2022, + sourceType: "module", + }, + }, + plugins: { + "@typescript-eslint": tsPlugin, + }, + rules: { + "@typescript-eslint/no-unused-vars": [ + "error", + { argsIgnorePattern: "^_" }, + ], + "@typescript-eslint/no-explicit-any": "warn", + "@typescript-eslint/explicit-function-return-type": "off", + "@typescript-eslint/explicit-module-boundary-types": "off", + }, + }, + { + files: ["src/__tests__/**/*.ts"], + rules: { + "@typescript-eslint/no-explicit-any": "off", + }, + }, + { + ignores: ["dist/**", "node_modules/**"], + }, +]; diff --git a/apps/api/package.json b/apps/api/package.json index 9132e02..05d6563 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -7,48 +7,50 @@ "dev": "tsx watch src/server.ts", "build": "tsc", "start": "node dist/server.js", - "lint": "ESLINT_USE_FLAT_CONFIG=false eslint src --ext ts", + "lint": "eslint src", "type-check": "tsc --noEmit", "test": "vitest", "test:watch": "vitest --watch" }, "dependencies": { - "@dodopayments/express": "^0.2.6", + "@dodopayments/express": "^0.2.8", "@postly/config": "*", "@postly/database": "*", "@postly/logger": "*", "@postly/shared-types": "*", - "bcrypt": "^5.1.1", 
- "bullmq": "^5.31.3", + "bcrypt": "^6.0.0", + "bullmq": "^5.76.4", "compression": "^1.8.1", "cors": "^2.8.6", - "dotenv": "^16.4.7", - "express": "^4.21.2", + "dotenv": "^17.4.2", + "express": "^5.2.1", "express-prom-bundle": "^8.0.0", - "express-rate-limit": "^8.2.1", - "helmet": "^8.0.0", - "ioredis": "^5.9.2", - "jsonwebtoken": "^9.0.2", - "mammoth": "^1.11.0", - "multer": "^2.0.2", + "express-rate-limit": "^8.4.1", + "helmet": "^8.1.0", + "ioredis": "^5.10.1", + "jsonwebtoken": "^9.0.3", + "mammoth": "^1.12.0", + "multer": "^2.1.1", "pdf-parse": "^2.4.5", "prom-client": "^15.1.3", - "resend": "^6.9.4", - "zod": "^3.24.1" + "resend": "^6.1.3", + "zod": "^4.4.1" }, "devDependencies": { "@postly/eslint-config": "*", "@postly/typescript-config": "*", - "@types/bcrypt": "^5.0.2", + "@types/bcrypt": "^6.0.0", "@types/compression": "^1.8.1", - "@types/cors": "^2.8.17", - "@types/express": "^5.0.0", - "@types/jsonwebtoken": "^9.0.7", - "@types/multer": "^1.4.12", - "@types/node": "^25.2.0", - "@types/pdf-parse": "^1.1.4", - "tsx": "^4.19.2", - "typescript": "^5.7.2", - "vitest": "^3.0.5" + "@types/cors": "^2.8.19", + "@types/express": "^5.0.6", + "@types/jsonwebtoken": "^9.0.10", + "@types/multer": "^2.1.0", + "@types/node": "^25.6.0", + "@types/pdf-parse": "^1.1.5", + "@types/supertest": "^7.2.0", + "supertest": "^7.2.2", + "tsx": "^4.21.0", + "typescript": "^6.0.3", + "vitest": "^4.1.5" } } diff --git a/apps/api/src/__tests__/application.test.ts b/apps/api/src/__tests__/application.test.ts new file mode 100644 index 0000000..3159343 --- /dev/null +++ b/apps/api/src/__tests__/application.test.ts @@ -0,0 +1,29 @@ +import { describe, it, expect, vi } from "vitest"; +import request from "supertest"; +import app from "../server.js"; +import { applicationQueries } from "@postly/database"; + +vi.mock("@postly/database", () => ({ + pool: { query: vi.fn(), end: vi.fn() }, + applicationQueries: { findBySeeker: vi.fn(), create: vi.fn() }, +})); + 
+vi.mock("../middleware/auth.js", () => ({ + authenticateToken: (req: any, _res: any, next: any) => { + req.user = { id: "test-id" }; + next(); + }, +})); + +describe("Application Routes (/api/v1/applications)", () => { + it("should get my applications", async () => { + vi.mocked(applicationQueries.findBySeeker).mockResolvedValueOnce([] as any); + const res = await request(app).get("/api/v1/applications"); + expect(res.status).toBe(200); + }); + + it("should reject apply without job_id", async () => { + const res = await request(app).post("/api/v1/applications").send({}); + expect(res.status).toBe(400); + }); +}); diff --git a/apps/api/src/__tests__/auth.test.ts b/apps/api/src/__tests__/auth.test.ts new file mode 100644 index 0000000..1f751e4 --- /dev/null +++ b/apps/api/src/__tests__/auth.test.ts @@ -0,0 +1,134 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import request from "supertest"; +import app from "../server.js"; +import { AuthService } from "../services/auth.service.js"; +import { AuthError } from "../services/auth.service.js"; + +// Mock dependencies to prevent side effects +vi.mock("@postly/database", () => ({ + pool: { query: vi.fn(), end: vi.fn() }, +})); +vi.mock("../lib/redis.js", () => ({ + redis: { ping: vi.fn(), disconnect: vi.fn() }, +})); + +// Mock AuthService +vi.mock("../services/auth.service.js", () => { + return { + AuthService: vi.fn().mockImplementation(() => ({ + register: vi.fn(), + login: vi.fn(), + })), + AuthError: class AuthError extends Error { + statusCode: number; + code?: string; + constructor(message: string, statusCode: number = 400, code?: string) { + super(message); + this.statusCode = statusCode; + this.code = code; + this.name = "AuthError"; + } + }, + }; +}); + +describe("Auth Routes (/api/v1/auth)", () => { + let authServiceMock: any; + + beforeEach(() => { + vi.clearAllMocks(); + // Get instance of the mocked service + authServiceMock = new AuthService(); + }); + + describe("POST /register", () => 
{ + it("should return 400 for invalid email", async () => { + const response = await request(app) + .post("/api/v1/auth/register") + .send({ email: "invalid-email", password: "password123" }); + + expect(response.status).toBe(400); + expect(response.body).toEqual({ + success: false, + error: { message: "Invalid email address" }, + }); + }); + + it("should return 400 for short password", async () => { + const response = await request(app) + .post("/api/v1/auth/register") + .send({ email: "test@example.com", password: "short" }); + + expect(response.status).toBe(400); + expect(response.body.error.message).toContain( + "Password must be at least 8 characters", + ); + }); + + it("should register successfully with valid data", async () => { + // Setup mock + authServiceMock.register.mockResolvedValueOnce({ + user: { id: "1", email: "test@example.com" }, + access_token: "mock_token", + }); + + const response = await request(app) + .post("/api/v1/auth/register") + .send({ email: "test@example.com", password: "securepassword123" }); + + expect(response.status).toBe(201); + expect(response.body.success).toBe(true); + expect(response.body.data).toHaveProperty("access_token"); + }); + + it("should handle AuthError correctly", async () => { + authServiceMock.register.mockRejectedValueOnce( + new AuthError("Email already in use", 409, "EMAIL_EXISTS"), + ); + + const response = await request(app) + .post("/api/v1/auth/register") + .send({ email: "test@example.com", password: "securepassword123" }); + + expect(response.status).toBe(409); + expect(response.body).toEqual({ + success: false, + error: { message: "Email already in use", code: "EMAIL_EXISTS" }, + }); + }); + }); + + describe("POST /login", () => { + it("should return 400 for missing credentials", async () => { + const response = await request(app).post("/api/v1/auth/login").send({}); + expect(response.status).toBe(400); + }); + + it("should login successfully", async () => { + 
authServiceMock.login.mockResolvedValueOnce({ + user: { id: "1", email: "test@example.com" }, + access_token: "mock_token", + }); + + const response = await request(app) + .post("/api/v1/auth/login") + .send({ email: "test@example.com", password: "securepassword123" }); + + expect(response.status).toBe(200); + expect(response.body.success).toBe(true); + }); + + it("should return 401 for invalid credentials", async () => { + authServiceMock.login.mockRejectedValueOnce( + new AuthError("Invalid email or password", 401, "INVALID_CREDENTIALS"), + ); + + const response = await request(app) + .post("/api/v1/auth/login") + .send({ email: "test@example.com", password: "wrongpassword" }); + + expect(response.status).toBe(401); + expect(response.body.error.message).toBe("Invalid email or password"); + }); + }); +}); diff --git a/apps/api/src/__tests__/chat.test.ts b/apps/api/src/__tests__/chat.test.ts new file mode 100644 index 0000000..f47966c --- /dev/null +++ b/apps/api/src/__tests__/chat.test.ts @@ -0,0 +1,27 @@ +import { describe, it, expect, vi } from "vitest"; +import request from "supertest"; +import app from "../server.js"; + +vi.mock("@postly/database", () => ({ + pool: { query: vi.fn(), end: vi.fn() }, + conversationQueries: { findByUser: vi.fn(), create: vi.fn() }, +})); + +vi.mock("../middleware/auth.js", () => ({ + authenticateToken: (req: any, _res: any, next: any) => { + req.user = { id: "test-id" }; + next(); + }, +})); + +describe("Chat Routes (/api/v1/chat)", () => { + it("should get conversations", async () => { + const res = await request(app).get("/api/v1/chat/conversations"); + expect(res.status).toBe(200); + }); + + it("should validate stream request", async () => { + const res = await request(app).post("/api/v1/chat/stream").send({}); + expect(res.status).toBe(400); // Missing required fields + }); +}); diff --git a/apps/api/src/__tests__/cors.test.ts b/apps/api/src/__tests__/cors.test.ts new file mode 100644 index 0000000..0ef4d1a --- /dev/null +++ 
b/apps/api/src/__tests__/cors.test.ts @@ -0,0 +1,31 @@ +import { describe, it, expect, vi } from "vitest"; +import request from "supertest"; +import app from "../server.js"; + +// Mock dependencies to prevent side effects during app boot +vi.mock("@postly/database", () => ({ + pool: { query: vi.fn(), end: vi.fn() }, +})); +vi.mock("../lib/redis.js", () => ({ + redis: { ping: vi.fn(), disconnect: vi.fn() }, +})); + +describe("CORS Configuration", () => { + it("should allow requests with no origin (like cURL or Postman)", async () => { + const response = await request(app).options("/health"); + // No Origin header is sent + expect(response.status).toBe(204); // preflight returns 204 + }); + + it("should block requests from unauthorized origins", async () => { + const response = await request(app) + .get("/health") + .set("Origin", "http://malicious-site.com"); + + // A disallowed origin is rejected with an Error by the cors middleware, + // which this test expects to surface as a 500 response. + // Our cors config throws `new Error("CORS: origin ... 
not allowed")` + expect(response.status).toBe(500); + expect(response.text).toContain("CORS"); + }); +}); diff --git a/apps/api/src/__tests__/health.test.ts b/apps/api/src/__tests__/health.test.ts new file mode 100644 index 0000000..5f54f6d --- /dev/null +++ b/apps/api/src/__tests__/health.test.ts @@ -0,0 +1,91 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import request from "supertest"; +import app from "../server.js"; +import { pool } from "@postly/database"; +import { redis } from "../lib/redis.js"; + +// Mock external dependencies +vi.mock("@postly/database", () => ({ + pool: { + query: vi.fn(), + end: vi.fn(), + }, +})); + +vi.mock("../lib/redis.js", () => ({ + redis: { + ping: vi.fn(), + disconnect: vi.fn(), + }, +})); + +describe("GET /health", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("should return 200 OK when both DB and Redis are healthy", async () => { + // Setup mocks to resolve successfully + vi.mocked(pool.query).mockResolvedValueOnce({ + rows: [{ "?column?": 1 }], + } as any); + vi.mocked(redis.ping).mockResolvedValueOnce("PONG"); + + const response = await request(app).get("/health"); + + expect(response.status).toBe(200); + expect(response.body).toEqual( + expect.objectContaining({ + status: "ok", + checks: { + db: "ok", + redis: "ok", + }, + }), + ); + }); + + it("should return 503 DEGRADED when DB fails", async () => { + // DB fails, Redis succeeds + vi.mocked(pool.query).mockRejectedValueOnce( + new Error("DB Connection Error"), + ); + vi.mocked(redis.ping).mockResolvedValueOnce("PONG"); + + const response = await request(app).get("/health"); + + expect(response.status).toBe(503); + expect(response.body).toEqual( + expect.objectContaining({ + status: "degraded", + checks: { + db: "failed", + redis: "ok", + }, + }), + ); + }); + + it("should return 503 DEGRADED when Redis fails", async () => { + // DB succeeds, Redis fails + vi.mocked(pool.query).mockResolvedValueOnce({ + rows: [{ "?column?": 1 }], + 
} as any); + vi.mocked(redis.ping).mockRejectedValueOnce( + new Error("Redis Connection Error"), + ); + + const response = await request(app).get("/health"); + + expect(response.status).toBe(503); + expect(response.body).toEqual( + expect.objectContaining({ + status: "degraded", + checks: { + db: "ok", + redis: "failed", + }, + }), + ); + }); +}); diff --git a/apps/api/src/__tests__/job.test.ts b/apps/api/src/__tests__/job.test.ts new file mode 100644 index 0000000..7ad65c0 --- /dev/null +++ b/apps/api/src/__tests__/job.test.ts @@ -0,0 +1,34 @@ +import { describe, it, expect, vi } from "vitest"; +import request from "supertest"; +import app from "../server.js"; +import { jobQueries } from "@postly/database"; + +vi.mock("@postly/database", () => ({ + pool: { query: vi.fn(), end: vi.fn() }, + jobQueries: { findActive: vi.fn(), countActive: vi.fn(), findById: vi.fn() }, +})); + +vi.mock("../middleware/auth.js", () => ({ + authenticateToken: (req: any, _res: any, next: any) => { + req.user = { id: "test-id" }; + next(); + }, +})); + +describe("Job Routes (/api/v1/jobs)", () => { + it("should list active jobs", async () => { + vi.mocked(jobQueries.findActive).mockResolvedValueOnce([ + { id: "job-1" }, + ] as any); + vi.mocked(jobQueries.countActive).mockResolvedValueOnce(1); + const res = await request(app).get("/api/v1/jobs?limit=10"); + expect(res.status).toBe(200); + expect(res.body.data.jobs).toHaveLength(1); + }); + + it("should return 404 for missing job", async () => { + vi.mocked(jobQueries.findById).mockResolvedValueOnce(undefined as any); + const res = await request(app).get("/api/v1/jobs/non-existent"); + expect(res.status).toBe(404); + }); +}); diff --git a/apps/api/src/__tests__/resume.test.ts b/apps/api/src/__tests__/resume.test.ts new file mode 100644 index 0000000..3a7c251 --- /dev/null +++ b/apps/api/src/__tests__/resume.test.ts @@ -0,0 +1,34 @@ +import { describe, it, expect, vi } from "vitest"; +import request from "supertest"; +import app from 
"../server.js"; + +vi.mock("@postly/database", () => ({ + pool: { query: vi.fn(), end: vi.fn() }, +})); + +vi.mock("../services/resume.service.js", () => ({ + resumeService: { + getUserResumes: vi.fn().mockResolvedValue([]), + processResume: vi.fn(), + }, +})); + +vi.mock("../middleware/auth.js", () => ({ + authenticateToken: (req: any, _res: any, next: any) => { + req.user = { id: "test-id" }; + next(); + }, +})); + +describe("Resume Routes (/api/v1/resumes)", () => { + it("should get user resumes", async () => { + const res = await request(app).get("/api/v1/resumes"); + expect(res.status).toBe(200); + expect(res.body.data).toEqual([]); + }); + + it("should reject upload without file", async () => { + const res = await request(app).post("/api/v1/resumes/upload"); + expect(res.status).toBe(400); // Handled by controller explicitly checking for req.file + }); +}); diff --git a/apps/api/src/__tests__/user.test.ts b/apps/api/src/__tests__/user.test.ts new file mode 100644 index 0000000..53c0136 --- /dev/null +++ b/apps/api/src/__tests__/user.test.ts @@ -0,0 +1,59 @@ +import { Request, Response, NextFunction } from "express"; +import { describe, it, expect, vi } from "vitest"; +import request from "supertest"; +import app from "../server.js"; +import { userQueries } from "@postly/database"; + +vi.mock("@postly/database", () => ({ + pool: { query: vi.fn(), end: vi.fn() }, + userQueries: { findById: vi.fn(), update: vi.fn() }, + seekerProfileQueries: { findByUserId: vi.fn() }, +})); + +vi.mock("../services/cache.service.js", () => ({ + CacheService: { + generateKey: vi.fn(), + getOrSet: vi.fn((_key, _ttl, fetcher) => fetcher()), + invalidate: vi.fn(), + }, +})); + +vi.mock("../middleware/auth.js", () => ({ + authenticateToken: (req: Request, _res: Response, next: NextFunction) => { + (req as any).user = { id: "test-id", email: "test@example.com" }; + next(); + }, +})); + +describe("User Routes (/api/v1/users)", () => { + describe("GET /profile", () => { + it("should 
return user profile", async () => { + vi.mocked(userQueries.findById).mockResolvedValueOnce({ + id: "test-id", + email: "test@example.com", + } as any); + const res = await request(app).get("/api/v1/users/profile"); + expect(res.status).toBe(200); + expect(res.body.data.email).toBe("test@example.com"); + }); + }); + + describe("PATCH /profile", () => { + it("should reject invalid avatar URLs", async () => { + const res = await request(app) + .patch("/api/v1/users/profile") + .send({ avatar_url: "not-a-url" }); + expect(res.status).toBe(400); + }); + + it("should update profile", async () => { + vi.mocked(userQueries.update).mockResolvedValueOnce({ + id: "test-id", + } as any); + const res = await request(app) + .patch("/api/v1/users/profile") + .send({ full_name: "Test Name" }); + expect(res.status).toBe(200); + }); + }); +}); diff --git a/apps/api/src/controllers/application.controller.ts b/apps/api/src/controllers/application.controller.ts index c2709d2..6b82468 100644 --- a/apps/api/src/controllers/application.controller.ts +++ b/apps/api/src/controllers/application.controller.ts @@ -61,7 +61,7 @@ export class ApplicationController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -137,7 +137,7 @@ export class ApplicationController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -206,7 +206,7 @@ export class ApplicationController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } diff --git a/apps/api/src/controllers/auth.controller.ts b/apps/api/src/controllers/auth.controller.ts index 5750e0f..a4852c8 100644 --- 
a/apps/api/src/controllers/auth.controller.ts +++ b/apps/api/src/controllers/auth.controller.ts @@ -73,7 +73,7 @@ export class AuthController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -103,7 +103,7 @@ export class AuthController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -129,7 +129,7 @@ export class AuthController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -173,7 +173,7 @@ export class AuthController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -200,7 +200,7 @@ export class AuthController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -228,7 +228,7 @@ export class AuthController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -256,7 +256,7 @@ export class AuthController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } diff --git a/apps/api/src/controllers/chat.controller.ts b/apps/api/src/controllers/chat.controller.ts index 9cc46b1..4cda939 100644 --- a/apps/api/src/controllers/chat.controller.ts 
+++ b/apps/api/src/controllers/chat.controller.ts @@ -70,7 +70,7 @@ export class ChatController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -223,7 +223,7 @@ export class ChatController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -297,7 +297,7 @@ export class ChatController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } diff --git a/apps/api/src/controllers/user.controller.ts b/apps/api/src/controllers/user.controller.ts index 5e21629..8797665 100644 --- a/apps/api/src/controllers/user.controller.ts +++ b/apps/api/src/controllers/user.controller.ts @@ -49,7 +49,7 @@ const updateEmployerProfileSchema = z.object({ company_size: z.string().optional(), industry: z.string().max(150).optional(), headquarters_location: z.string().max(255).optional(), - social_links: z.record(z.string()).optional(), + social_links: z.record(z.string(), z.string()).optional(), }); export class UserController { @@ -102,7 +102,7 @@ export class UserController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -149,7 +149,7 @@ export class UserController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -188,7 +188,7 @@ export class UserController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: 
validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } @@ -233,7 +233,7 @@ export class UserController { if (!validation.success) { res.status(400).json({ success: false, - error: { message: validation.error.errors[0].message }, + error: { message: validation.error.issues[0].message }, }); return; } diff --git a/apps/api/src/middleware/auth.ts b/apps/api/src/middleware/auth.ts index 122ef3f..fc7d83e 100644 --- a/apps/api/src/middleware/auth.ts +++ b/apps/api/src/middleware/auth.ts @@ -1,4 +1,3 @@ -/* eslint-disable @typescript-eslint/no-namespace */ import { Request, Response, NextFunction } from "express"; import jwt from "jsonwebtoken"; import { JWT_SECRET } from "../config/secrets.js"; diff --git a/apps/api/src/server.ts b/apps/api/src/server.ts index e3cbc34..1888f9f 100644 --- a/apps/api/src/server.ts +++ b/apps/api/src/server.ts @@ -191,23 +191,25 @@ app.use("/uploads", express.static(path.join(__dirname, "../uploads"))); app.use(notFoundHandler); app.use(errorHandler); -// Start server -app.listen(API_PORT, "0.0.0.0", async () => { - logger.info("API server started", { - port: API_PORT, - environment: NODE_ENV, - url: `http://0.0.0.0:${API_PORT}`, - }); - - // Initialize Bot Job Queue - try { - await queueService.initDailyCron(); - } catch (err) { - logger.error("Failed to initialize Bot Queue", { - error: err instanceof Error ? err.message : "Unknown", +// Start server only if not in test mode +if (process.env.NODE_ENV !== "test") { + app.listen(API_PORT, "0.0.0.0", async () => { + logger.info("API server started", { + port: API_PORT, + environment: NODE_ENV, + url: `http://0.0.0.0:${API_PORT}`, }); - } -}); + + // Initialize Bot Job Queue + try { + await queueService.initDailyCron(); + } catch (err) { + logger.error("Failed to initialize Bot Queue", { + error: err instanceof Error ? 
err.message : "Unknown", + }); + } + }); +} // Graceful shutdown — close all connections before exiting const shutdown = async (signal: string) => { diff --git a/apps/api/src/services/resume.service.ts b/apps/api/src/services/resume.service.ts index 432e062..1a4e911 100644 --- a/apps/api/src/services/resume.service.ts +++ b/apps/api/src/services/resume.service.ts @@ -12,7 +12,6 @@ export class ResumeService { * Removes control characters (including newlines) and normalizes to a single line. */ private sanitizeForLog(input: string): string { - // eslint-disable-next-line no-control-regex return String(input).replace(/[\x00-\x1F\x7F]/g, " "); } diff --git a/apps/scraper/Dockerfile b/apps/scraper/Dockerfile deleted file mode 100644 index 3954af5..0000000 --- a/apps/scraper/Dockerfile +++ /dev/null @@ -1,55 +0,0 @@ -# stage 1: builder - for python dependencies -FROM python:3.12-slim-bookworm AS builder - -WORKDIR /app -RUN python -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" - -COPY requirements.txt . 
-RUN pip install --no-cache-dir -r requirements.txt - -# stage 2: final - minimal runtime -FROM python:3.12-slim-bookworm - -WORKDIR /app -COPY --from=builder /opt/venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" - -ENV PLAYWRIGHT_BROWSERS_PATH=/opt/pw-browsers -RUN apt-get update && apt-get install -y --no-install-recommends \ - libnss3 \ - libnspr4 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - libcups2 \ - libdrm2 \ - libdbus-1-3 \ - libxkbcommon0 \ - libxcomposite1 \ - libxdamage1 \ - libxfixes3 \ - libxrandr2 \ - libgbm1 \ - libasound2 \ - libpango-1.0-0 \ - libcairo2 \ - && pip install playwright \ - && playwright install chromium \ - && chmod -R 755 /opt/pw-browsers \ - && apt-get clean && rm -rf /var/lib/apt/lists/* - -# Create non-root user -RUN useradd -m -u 1001 scraper && \ - chown -R scraper:scraper /app /opt/venv - -COPY --chown=scraper:scraper src/ ./src/ - -ENV PYTHONPATH=/app/src -USER scraper - -HEALTHCHECK --interval=60s --timeout=10s --start-period=60s --retries=2 \ - CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')" || exit 1 - -EXPOSE 8080 - -CMD ["python", "-m", "src.main"] diff --git a/apps/scraper/execute_smoke.py b/apps/scraper/execute_smoke.py deleted file mode 100644 index 0e6ba66..0000000 --- a/apps/scraper/execute_smoke.py +++ /dev/null @@ -1,16 +0,0 @@ -import asyncio -from playwright.async_api import async_playwright - -async def _smoke_test_browser(): - try: - async with async_playwright() as p: - browser = await p.chromium.launch(headless=False, channel="chrome") - page = await browser.new_page() - await page.goto("https://example.com", wait_until="domcontentloaded") - await browser.close() - print("SMOKE TEST PASSED: Bare Chromium works fine") - except Exception as e: - print(f"SMOKE TEST FAILED: {e}") - -if __name__ == '__main__': - asyncio.run(_smoke_test_browser()) diff --git a/apps/scraper/requirements.txt b/apps/scraper/requirements.txt deleted file mode 100644 index 0274f7a..0000000 --- 
a/apps/scraper/requirements.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Production Scraper Dependencies -# Python 3.9+ - -# Core -python-dotenv==1.0.1 -asyncpg==0.29.0 -apscheduler==3.10.4 -pydantic==2.6.1 - -# HTTP Client -aiohttp==3.9.3 - -# AI & Embeddings (Voyage AI) -voyageai -tenacity==8.2.3 - -# Logging & Monitoring -python-json-logger==2.0.7 -colorlog==6.8.2 - -# Testing -pytest==8.0.2 -pytest-asyncio==0.23.5 -playwright==1.41.0 -playwright-stealth==2.0.2 diff --git a/apps/scraper/scraper.pid b/apps/scraper/scraper.pid deleted file mode 100644 index 652b2d7..0000000 --- a/apps/scraper/scraper.pid +++ /dev/null @@ -1 +0,0 @@ -36647 diff --git a/apps/scraper/src/__init__.py b/apps/scraper/src/__init__.py deleted file mode 100644 index 33ff99a..0000000 --- a/apps/scraper/src/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -""" -Postly Job Scraper Package - -Production-grade multi-source job aggregator: -- Sources: Remotive, Arbeitnow, Greenhouse ATS, hiring.cafe -- Voyage AI embeddings (768-dim, matches Drizzle schema) -- Source-URL-based deduplication across all sources -- aiohttp async spiders (API-first) + Playwright (hiring.cafe fallback) -- Structured JSON logging -""" - -__version__ = "3.0.0" diff --git a/apps/scraper/src/database.py b/apps/scraper/src/database.py deleted file mode 100644 index c80e58d..0000000 --- a/apps/scraper/src/database.py +++ /dev/null @@ -1,419 +0,0 @@ -#!/usr/bin/env python3 -""" -database.py -Database layer writing to the Drizzle-managed `jobs` table. 
- -CHANGELOG: -- Added format_vector from utils to safely format pgvector strings -- Added index creation for source_url on startup -- Replaced str() embedding conversions with format_vector() -- Added get_existing_urls() for fast, targeted duplicate filtering -- Fixed remove_duplicates() to delete the older record using created_at -""" - -import logging -from typing import Optional, List, Dict, Any, Set -import asyncpg -from datetime import datetime, timezone - -from utils import format_vector - -logger = logging.getLogger(__name__) - - -class Database: - """ - Database layer with connection pooling and batch operations. - - Writes to Drizzle `jobs` table with columns: - id (uuid), title, company_name, description, location, - salary_min, salary_max, job_type, remote, source, - source_url, embedding (vector 768), skills_required (jsonb), - experience_required, posted_at, expires_at, is_active, - employer_id, created_at, updated_at - """ - - def __init__(self, database_url: str): - self.database_url = database_url - self.pool: Optional[asyncpg.Pool] = None - - async def connect(self): - """Initialize connection pool.""" - try: - self.pool = await asyncpg.create_pool( - self.database_url, - min_size=2, - max_size=10, - command_timeout=60, - ) - logger.info("Database connection pool created") - - # Verify the jobs table exists and set up index - async with self.pool.acquire() as conn: - exists = await conn.fetchval(""" - SELECT EXISTS( - SELECT 1 FROM information_schema.tables - WHERE table_name = 'jobs' - ) - """) - if not exists: - raise RuntimeError( - "Table 'jobs' does not exist — run Drizzle migrations first" - ) - - # Add index on source_url for O(1) existence checks during batch dedup - await conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_jobs_source_url ON jobs(source_url); - """) - - logger.info("Verified jobs table exists and source_url index is ready") - - except Exception as e: - logger.error(f"Failed to connect to database: {e}") - raise - - # ─── 
INSERT ─────────────────────────────────────────────────── - - async def insert_job(self, job_data: Dict[str, Any]) -> bool: - """ - Insert a single job. Returns True if inserted, False if duplicate. - Upserts on source_url to handle re-scrapes gracefully. - """ - try: - async with self.pool.acquire() as conn: - await conn.execute(""" - INSERT INTO jobs ( - id, title, company_name, description, location, - salary_min, salary_max, job_type, remote, source, - source_url, skills_required, experience_required, - posted_at, expires_at, is_active, embedding - ) - VALUES ( - $1, $2, $3, $4, $5, - $6, $7, $8, $9, $10, - $11, $12, $13, - $14, $15, $16, $17 - ) - ON CONFLICT (id) DO UPDATE SET - title = EXCLUDED.title, - company_name = EXCLUDED.company_name, - description = EXCLUDED.description, - salary_min = EXCLUDED.salary_min, - salary_max = EXCLUDED.salary_max, - is_active = TRUE, - updated_at = NOW() - """, - job_data.get("id"), - job_data.get("title"), - job_data.get("company_name"), - job_data.get("description"), - job_data.get("location"), - job_data.get("salary_min"), - job_data.get("salary_max"), - job_data.get("job_type"), - job_data.get("remote", False), - job_data.get("source", "hiring_cafe"), - job_data.get("source_url"), - job_data.get("skills_required"), # jsonb — pass as text - job_data.get("experience_required"), - job_data.get("posted_at"), - job_data.get("expires_at"), - job_data.get("is_active", True), - format_vector(job_data.get("embedding")), - ) - return True - except asyncpg.exceptions.UniqueViolationError: - logger.debug(f"Duplicate job: {job_data.get('title')}") - return False - except Exception as e: - logger.error(f"Failed to insert job: {e}") - return False - - async def insert_jobs_batch(self, jobs: List[Dict[str, Any]]) -> int: - """Batch insert jobs. 
Returns count of successfully inserted jobs.""" - if not jobs: - return 0 - - inserted = 0 - async with self.pool.acquire() as conn: - async with conn.transaction(): - for job in jobs: - try: - await conn.execute(""" - INSERT INTO jobs ( - id, title, company_name, description, location, - salary_min, salary_max, job_type, remote, source, - source_url, skills_required, experience_required, - posted_at, expires_at, is_active, embedding - ) - VALUES ( - $1, $2, $3, $4, $5, - $6, $7, $8, $9, $10, - $11, $12, $13, - $14, $15, $16, $17 - ) - ON CONFLICT (id) DO NOTHING - """, - job.get("id"), - job.get("title"), - job.get("company_name"), - job.get("description"), - job.get("location"), - job.get("salary_min"), - job.get("salary_max"), - job.get("job_type"), - job.get("remote", False), - job.get("source", "hiring_cafe"), - job.get("source_url"), - job.get("skills_required"), - job.get("experience_required"), - job.get("posted_at"), - job.get("expires_at"), - job.get("is_active", True), - format_vector(job.get("embedding")), - ) - inserted += 1 - except Exception as e: - logger.warning(f"Failed to insert job {job.get('id')}: {e}") - - logger.info(f"Batch inserted {inserted}/{len(jobs)} jobs") - return inserted - - # ─── EMBEDDING OPERATIONS ───────────────────────────────────── - - async def get_jobs_without_embeddings(self, limit: int = 100) -> List[Dict[str, Any]]: - """Fetch jobs that need embeddings.""" - async with self.pool.acquire() as conn: - rows = await conn.fetch(""" - SELECT id, title, description, skills_required, - company_name, location - FROM jobs - WHERE embedding IS NULL AND is_active = TRUE - LIMIT $1 - """, limit) - return [dict(row) for row in rows] - - async def update_embedding(self, job_id: str, embedding: List[float]): - """Update a single job's embedding.""" - async with self.pool.acquire() as conn: - await conn.execute(""" - UPDATE jobs - SET embedding = $1::vector, updated_at = NOW() - WHERE id = $2 - """, format_vector(embedding), job_id) - - 
async def update_embeddings_batch(self, updates: List[tuple]): - """Batch update embeddings. Each tuple is (job_id, embedding_list).""" - if not updates: - return - - success = 0 - async with self.pool.acquire() as conn: - async with conn.transaction(): - for job_id, emb in updates: - try: - vec_str = format_vector(emb) - await conn.execute(""" - UPDATE jobs - SET embedding = $2::vector, updated_at = NOW() - WHERE id = $1 - """, job_id, vec_str) - success += 1 - except Exception as e: - logger.warning(f"Failed to update embedding for {job_id}: {e}") - - logger.info(f"Batch updated {success}/{len(updates)} embeddings") - - # ─── SEARCH ─────────────────────────────────────────────────── - - async def vector_search( - self, - query_embedding: List[float], - limit: int = 20, - filters: Optional[Dict[str, Any]] = None, - ) -> List[Dict[str, Any]]: - """Perform vector similarity search.""" - async with self.pool.acquire() as conn: - query = """ - SELECT - id, title, company_name, location, job_type, - salary_min, salary_max, skills_required, source_url, - remote, posted_at, - 1 - (embedding <=> $1::vector) as similarity - FROM jobs - WHERE embedding IS NOT NULL AND is_active = TRUE - """ - params = [query_embedding] - param_idx = 2 - - if filters: - if filters.get("remote_only"): - query += " AND remote = TRUE" - if filters.get("job_type"): - query += f" AND job_type = ${param_idx}" - params.append(filters["job_type"]) - param_idx += 1 - - query += f" ORDER BY embedding <=> $1::vector LIMIT ${param_idx}" - params.append(limit) - - rows = await conn.fetch(query, *params) - return [dict(row) for row in rows] - - async def full_text_search( - self, query: str, limit: int = 20 - ) -> List[Dict[str, Any]]: - """Full-text search using PostgreSQL tsvector.""" - async with self.pool.acquire() as conn: - rows = await conn.fetch(""" - SELECT - id, title, company_name, location, job_type, - salary_min, salary_max, skills_required, source_url, - remote, posted_at, - ts_rank( - 
to_tsvector('english', - COALESCE(title, '') || ' ' || - COALESCE(description, '') || ' ' || - COALESCE(company_name, '') - ), - plainto_tsquery('english', $1) - ) as rank - FROM jobs - WHERE - is_active = TRUE AND - to_tsvector('english', - COALESCE(title, '') || ' ' || - COALESCE(description, '') || ' ' || - COALESCE(company_name, '') - ) @@ plainto_tsquery('english', $1) - ORDER BY rank DESC - LIMIT $2 - """, query, limit) - return [dict(row) for row in rows] - - # ─── DEDUPLICATION ──────────────────────────────────────────── - - async def get_existing_source_urls(self, source: str = "hiring_cafe") -> set: - """Get all existing source_urls for a given source — used for dedup.""" - async with self.pool.acquire() as conn: - rows = await conn.fetch(""" - SELECT source_url FROM jobs WHERE source = $1 - """, source) - return {row["source_url"] for row in rows if row["source_url"]} - - async def get_existing_urls(self, urls: List[str]) -> Set[str]: - """Check which of the provided URLs already exist in the database.""" - if not urls: - return set() - - async with self.pool.acquire() as conn: - rows = await conn.fetch(""" - SELECT source_url FROM jobs WHERE source_url = ANY($1) - """, urls) - return {row["source_url"] for row in rows if row["source_url"]} - - async def get_all_source_ids(self) -> Set[str]: - """Fetch all known source_urls for skipping already-scraped jobs across all sources.""" - async with self.pool.acquire() as conn: - rows = await conn.fetch( - "SELECT source_url, source FROM jobs WHERE source_url IS NOT NULL" - ) - ids = set() - for row in rows: - url = row["source_url"] - source = row["source"] - # Add the full URL for general dedup - ids.add(url) - # For hiring.cafe, also extract the requisition_id suffix - if source == "hiring_cafe" and "/viewjob/" in url: - ids.add(url.split("/viewjob/")[-1].split("/")[0]) - # For greenhouse, add the gh-{board}-{id} key - elif source == "greenhouse": - # URL format: 
https://boards.greenhouse.io/{board}/jobs/{id} - parts = url.rstrip("/").split("/") - if len(parts) >= 2: - try: - gh_id = parts[-1] - board = parts[-3] if len(parts) >= 4 else "" - ids.add(f"gh-{board}-{gh_id}") - except (IndexError, ValueError): - pass - return ids - - # ─── CLEANUP ────────────────────────────────────────────────── - - async def cleanup_old_jobs(self, days: int = 30) -> int: - async with self.pool.acquire() as conn: - result = await conn.execute(""" - DELETE FROM jobs - WHERE created_at < NOW() - MAKE_INTERVAL(days => $1) - """, days) - count = int(result.split()[-1]) - logger.info(f"Cleaned up {count} old jobs (> {days} days)") - return count - - async def cleanup_expired_jobs(self) -> int: - async with self.pool.acquire() as conn: - result = await conn.execute(""" - DELETE FROM jobs - WHERE expires_at IS NOT NULL AND expires_at < NOW() - """) - count = int(result.split()[-1]) - logger.info(f"Cleaned up {count} expired jobs") - return count - - async def deactivate_stale_jobs(self, days: int = 14) -> int: - """Mark jobs as inactive if not refreshed recently (across all sources).""" - async with self.pool.acquire() as conn: - result = await conn.execute(""" - UPDATE jobs - SET is_active = FALSE, updated_at = NOW() - WHERE updated_at < NOW() - MAKE_INTERVAL(days => $1) - AND is_active = TRUE - """, days) - count = int(result.split()[-1]) - logger.info(f"Deactivated {count} stale jobs (> {days} days)") - return count - - async def remove_duplicates(self) -> int: - """Remove duplicate jobs based on identical source_url.""" - async with self.pool.acquire() as conn: - result = await conn.execute(""" - -- Explaining decision: using created_at to preserve the new scrape data. - -- We delete the OLDER record (smaller created_at) when duplicate URLs exist - -- because id is a UUID type which cannot be ordered reliably by comparison operator. 
- DELETE FROM jobs a USING jobs b - WHERE a.created_at < b.created_at - AND a.source_url = b.source_url - AND a.source = b.source - """) - count = int(result.split()[-1]) - logger.info(f"Removed {count} duplicate jobs") - return count - - # ─── STATS ──────────────────────────────────────────────────── - - async def get_stats(self) -> Dict[str, Any]: - async with self.pool.acquire() as conn: - stats = {} - stats["total_jobs"] = await conn.fetchval("SELECT COUNT(*) FROM jobs") - stats["active_jobs"] = await conn.fetchval( - "SELECT COUNT(*) FROM jobs WHERE is_active = TRUE" - ) - stats["jobs_with_embeddings"] = await conn.fetchval( - "SELECT COUNT(*) FROM jobs WHERE embedding IS NOT NULL" - ) - stats["remote_jobs"] = await conn.fetchval( - "SELECT COUNT(*) FROM jobs WHERE remote = TRUE" - ) - # Per-source breakdown - source_rows = await conn.fetch( - "SELECT source, COUNT(*) as cnt FROM jobs GROUP BY source" - ) - stats["by_source"] = {row["source"]: row["cnt"] for row in source_rows} - return stats - - async def close(self): - if self.pool: - await self.pool.close() - logger.info("Database connection pool closed") diff --git a/apps/scraper/src/embedding_service.py b/apps/scraper/src/embedding_service.py deleted file mode 100644 index 9e36562..0000000 --- a/apps/scraper/src/embedding_service.py +++ /dev/null @@ -1,419 +0,0 @@ -#!/usr/bin/env python3 -""" -embedding_service.py -Voyage AI embedding client with rate-limiting and batch processing. -""" - -import asyncio -import logging -import time -import os -from typing import List, Optional, Dict, Any -from tenacity import ( - retry, - stop_after_attempt, - wait_exponential, - retry_if_exception_type, - before_sleep_log, -) - -logger = logging.getLogger(__name__) - -# Try to import voyageai - handle gracefully if not installed -try: - import voyageai - VOYAGE_AVAILABLE = True -except ImportError: - VOYAGE_AVAILABLE = False - logger.warning("voyageai package not installed. 
Run: pip install voyageai") - - -class VoyageEmbeddingService: - """ - Dedicated Voyage AI embedding client with production features. - - Features: - - voyage-4-lite model (1024 dimensions, matches Drizzle schema) - - Built-in rate limiting (300 RPM max) - - Batch processing (128 texts per request max) - - Tenacity retry with exponential backoff - - Token consumption tracking - """ - - # Voyage AI limits - MODEL = os.getenv("VOYAGE_MODEL", "voyage-4-lite") - EMBEDDING_DIM = 1024 - MAX_BATCH_SIZE = 128 - MAX_RPM = 300 - MAX_TOKENS_PER_REQUEST = 120000 - - def __init__( - self, - api_key: str, - model: Optional[str] = None, - max_rpm: int = 300, - max_batch_size: int = 128, - ): - """ - Initialize Voyage AI embedding service. - - Args: - api_key: Voyage AI API key - model: Model name (default: voyage-3.5-lite) - max_rpm: Maximum requests per minute - max_batch_size: Maximum texts per API call - """ - if not VOYAGE_AVAILABLE: - raise ImportError( - "voyageai package is required. Install with: pip install voyageai" - ) - - self.api_key = api_key - self.model = model or self.MODEL - self.max_rpm = max_rpm - self.max_batch_size = min(max_batch_size, self.MAX_BATCH_SIZE) - - # Initialize client - self.client = voyageai.Client(api_key=api_key) - - # Rate limiting - self.min_interval = 60.0 / max_rpm - self.last_request_time = 0 - self._semaphore = asyncio.Semaphore(5) # Max concurrent requests - - # Metrics - self.total_tokens_consumed = 0 - self.total_requests = 0 - self.total_embeddings = 0 - - logger.info({ - "event": "embedding_service_initialized", - "model": self.model, - "max_rpm": max_rpm, - "max_batch_size": self.max_batch_size, - }) - - async def _rate_limit(self): - """Enforce rate limiting between requests.""" - elapsed = time.time() - self.last_request_time - if elapsed < self.min_interval: - wait_time = self.min_interval - elapsed - await asyncio.sleep(wait_time) - self.last_request_time = time.time() - - def _prepare_text(self, job: Dict[str, Any]) -> str: - 
""" - Prepare comprehensive weighted text for embedding. - - Reads ALL available job fields for richer vector search: - - Job title (high weight — repeated 3x) - - Skills (high weight — repeated 2x) - - Description (medium weight — truncated to 3000 chars) - - Location, job type, remote status (medium weight) - - Salary range, experience (low-medium weight) - - Company name, industry (low weight) - - This produces embeddings that capture the full context of a job, - enabling more accurate vector search for queries like: - "remote React developer in New York $120k+" - """ - parts = [] - - # Job title (high weight — repeat 3x for emphasis) - title = job.get('job_title', '') or job.get('title', '') - if title: - parts.extend([f"Job Title: {title}"] * 3) - - # Skills (high weight — repeated 2x) - skills = job.get('skills_required', []) - if skills: - if isinstance(skills, list): - skills_text = ', '.join(str(s) for s in skills) - else: - skills_text = str(skills) - parts.extend([f"Skills: {skills_text}"] * 2) - - # Description (medium-high weight, truncated) - description = job.get('job_description', '') or job.get('description', '') - if description: - parts.append(description[:3000]) - - # Location (important for geo-based search) - location = job.get('location', '') - if location: - parts.append(f"Location: {location}") - - # Remote status (critical for modern job search) - remote = job.get('remote', False) - if remote: - parts.append("Work Type: Remote, Work from home, WFH") - - # Job type (full-time, part-time, contract, etc.) 
- job_type = job.get('job_type', '') - if job_type: - parts.append(f"Employment Type: {job_type}") - - # Salary range (important for salary-based queries) - salary_min = job.get('salary_min') - salary_max = job.get('salary_max') - if salary_min or salary_max: - salary_parts = [] - if salary_min: - salary_parts.append(f"${salary_min:,.0f}" if isinstance(salary_min, (int, float)) else f"${salary_min}") - if salary_max: - salary_parts.append(f"${salary_max:,.0f}" if isinstance(salary_max, (int, float)) else f"${salary_max}") - parts.append(f"Salary Range: {' - '.join(salary_parts)} per year") - - # Experience level - experience = job.get('experience_required', '') - if experience: - parts.append(f"Experience Required: {experience}") - - # Company name (low weight) - company = job.get('company_name', '') or job.get('company', '') - if company: - parts.append(f"Company: {company}") - - # Industry (low weight) - industry = job.get('industry', '') - if industry: - parts.append(f"Industry: {industry}") - - combined = '\n'.join(parts) - - # Truncate to safe limit (~30k chars ≈ 8k tokens) - return combined[:30000] - - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=2, min=4, max=120), - retry=retry_if_exception_type((Exception,)), - before_sleep=before_sleep_log(logger, logging.WARNING), - ) - async def _embed_batch_request( - self, - texts: List[str], - input_type: str = "document" - ) -> List[List[float]]: - """ - Make a single batch embedding request to Voyage AI (1024-dim). 
- - Args: - texts: List of texts to embed - input_type: "document" for job content, "query" for search - - Returns: - List of embedding vectors (1024-dim each) - """ - async with self._semaphore: - await self._rate_limit() - - try: - # Voyage AI client is sync, run in executor - loop = asyncio.get_event_loop() - result = await loop.run_in_executor( - None, - lambda: self.client.embed( - texts=texts, - model=self.model, - input_type=input_type, - ) - ) - - # Track metrics - self.total_requests += 1 - self.total_embeddings += len(texts) - if hasattr(result, 'total_tokens'): - self.total_tokens_consumed += result.total_tokens - - logger.debug({ - "event": "embedding_batch_complete", - "texts": len(texts), - "tokens": getattr(result, 'total_tokens', 0), - }) - - return result.embeddings - - except Exception as e: - logger.error(f"Voyage AI embedding error: {e}") - raise - - async def embed_single(self, text: str, input_type: str = "document") -> List[float]: - """ - Embed a single text. - - Args: - text: Text to embed - input_type: "document" or "query" - - Returns: - 1024-dimensional embedding vector - """ - embeddings = await self._embed_batch_request([text], input_type) - return embeddings[0] if embeddings else [] - - async def embed_batch( - self, - texts: List[str], - input_type: str = "document" - ) -> List[List[float]]: - """ - Embed a batch of texts with automatic chunking. - - Args: - texts: List of texts to embed - input_type: "document" or "query" - - Returns: - List of embedding vectors - """ - if not texts: - return [] - - all_embeddings = [] - - # Process in chunks respecting max batch size - for i in range(0, len(texts), self.max_batch_size): - chunk = texts[i:i + self.max_batch_size] - embeddings = await self._embed_batch_request(chunk, input_type) - all_embeddings.extend(embeddings) - - return all_embeddings - - async def embed_jobs( - self, - jobs: List[Dict[str, Any]] - ) -> List[Dict[str, Any]]: - """ - Embed a list of job dictionaries. 
- - Args: - jobs: List of job dicts with job_title, job_description, etc. - - Returns: - Jobs with 'embedding' field added - """ - if not jobs: - return [] - - # Prepare texts - texts = [self._prepare_text(job) for job in jobs] - - logger.info({ - "event": "embedding_jobs_start", - "count": len(jobs), - }) - - try: - embeddings = await self.embed_batch(texts) - - # Add embeddings to jobs - for job, embedding in zip(jobs, embeddings): - job['embedding'] = embedding if embedding else None - - successful = sum(1 for job in jobs if job.get('embedding')) - - logger.info({ - "event": "embedding_jobs_complete", - "total": len(jobs), - "successful": successful, - "tokens_consumed": self.total_tokens_consumed, - }) - - return jobs - - except Exception as e: - logger.error(f"Failed to embed jobs: {e}") - # Return jobs without embeddings rather than failing entirely - for job in jobs: - job['embedding'] = None - return jobs - - async def embed_query(self, query: str) -> List[float]: - """ - Embed a search query. - - Uses "query" input type for optimal search performance. - """ - return await self.embed_single(query, input_type="query") - - def get_metrics(self) -> Dict[str, Any]: - """Get service metrics.""" - return { - "model": self.model, - "total_requests": self.total_requests, - "total_embeddings": self.total_embeddings, - "total_tokens_consumed": self.total_tokens_consumed, - "embedding_dimension": self.EMBEDDING_DIM, - } - - -class EmbeddingWorker: - """ - Background worker that processes jobs without embeddings. - - Runs continuously to catch any jobs that were inserted - without embeddings due to errors or rate limits. - """ - - def __init__( - self, - database, - embedding_service: VoyageEmbeddingService, - batch_size: int = 50, - interval_seconds: int = 60, - ): - """ - Initialize embedding worker. 
- - Args: - database: Database instance - embedding_service: VoyageEmbeddingService instance - batch_size: Jobs to process per cycle - interval_seconds: Time between cycles - """ - self.db = database - self.embedder = embedding_service - self.batch_size = batch_size - self.interval = interval_seconds - self.running = False - - async def start(self): - """Start the embedding worker loop.""" - self.running = True - logger.info("Embedding worker started") - - while self.running: - try: - # Fetch jobs needing embeddings - jobs = await self.db.get_jobs_without_embeddings(self.batch_size) - - if jobs: - logger.info(f"Processing {len(jobs)} jobs for embeddings") - - # Generate embeddings - embedded_jobs = await self.embedder.embed_jobs(jobs) - - # Batch update database - updates = [ - (job['id'], job['embedding']) - for job in embedded_jobs - if job.get('embedding') - ] - - if updates: - await self.db.update_embeddings_batch(updates) - logger.info(f"Updated {len(updates)} embeddings") - else: - logger.debug("No jobs pending embeddings") - - await asyncio.sleep(self.interval) - - except Exception as e: - logger.error(f"Embedding worker error: {e}") - await asyncio.sleep(self.interval) - - def stop(self): - """Stop the embedding worker.""" - self.running = False - logger.info("Embedding worker stopped") diff --git a/apps/scraper/src/health.py b/apps/scraper/src/health.py deleted file mode 100644 index eceb340..0000000 --- a/apps/scraper/src/health.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python3 -""" -health.py -Health check HTTP endpoint for container orchestration. 
-""" - -import asyncio -import logging -from aiohttp import web -from datetime import datetime -from typing import Dict, Any - -logger = logging.getLogger(__name__) - - -class HealthCheckServer: - """Lightweight HTTP server for health checks.""" - - def __init__(self, port: int = 8080): - self.port = port - self.app = web.Application() - self.runner = None - self.site = None - self.start_time = datetime.utcnow() - - # Health status - self.status = { - "healthy": True, - "database_connected": False, - "last_scrape": None, - "jobs_in_db": 0, - "consecutive_failures": 0, - "errors": [], - } - - # Setup routes - self.app.router.add_get("/health", self.health_check) - self.app.router.add_get("/ready", self.readiness_check) - self.app.router.add_get("/metrics", self.metrics) - - async def health_check(self, request) -> web.Response: - """Liveness probe.""" - uptime = (datetime.utcnow() - self.start_time).total_seconds() - consecutive_failures = self.status.get("consecutive_failures", 0) - is_healthy = self.status["healthy"] and consecutive_failures < 3 - - return web.json_response( - { - "status": "healthy" if is_healthy else "unhealthy", - "uptime_seconds": uptime, - "consecutive_failures": consecutive_failures, - "timestamp": datetime.utcnow().isoformat(), - }, - status=200 if is_healthy else 503, - ) - - async def readiness_check(self, request) -> web.Response: - """Readiness probe — only requires DB connection.""" - ready = self.status["database_connected"] - return web.json_response( - { - "ready": ready, - "database": self.status["database_connected"], - "last_scrape": self.status["last_scrape"], - "timestamp": datetime.utcnow().isoformat(), - }, - status=200 if ready else 503, - ) - - async def metrics(self, request) -> web.Response: - """Prometheus-style metrics endpoint.""" - uptime = (datetime.utcnow() - self.start_time).total_seconds() - - metrics_text = f"""# HELP scraper_uptime_seconds Total uptime in seconds -# TYPE scraper_uptime_seconds gauge 
-scraper_uptime_seconds {uptime} - -# HELP scraper_healthy Health status (1=healthy, 0=unhealthy) -# TYPE scraper_healthy gauge -scraper_healthy {1 if self.status["healthy"] else 0} - -# HELP scraper_database_connected Database connection status -# TYPE scraper_database_connected gauge -scraper_database_connected {1 if self.status["database_connected"] else 0} - -# HELP scraper_jobs_in_db Total jobs in database -# TYPE scraper_jobs_in_db gauge -scraper_jobs_in_db {self.status.get("jobs_in_db", 0)} - -# HELP scraper_errors_total Total number of errors -# TYPE scraper_errors_total counter -scraper_errors_total {len(self.status["errors"])} -""" - return web.Response(text=metrics_text, content_type="text/plain") - - def update_status(self, **kwargs): - """Update health status.""" - self.status.update(kwargs) - - async def start(self): - """Start the health check server.""" - try: - self.runner = web.AppRunner(self.app) - await self.runner.setup() - self.site = web.TCPSite(self.runner, "0.0.0.0", self.port) - await self.site.start() - logger.info(f"Health check server started on port {self.port}") - except Exception as e: - logger.error(f"Failed to start health server: {e}") - - async def stop(self): - """Stop the health check server gracefully.""" - try: - if self.site: - await self.site.stop() - self.site = None - if self.runner: - await self.runner.cleanup() - self.runner = None - logger.info("Health check server stopped") - except Exception as e: - logger.warning(f"Error during health server shutdown: {e}") diff --git a/apps/scraper/src/janitor.py b/apps/scraper/src/janitor.py deleted file mode 100644 index 63a73d4..0000000 --- a/apps/scraper/src/janitor.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python3 -""" -janitor.py -Maintenance service for keeping the job database clean. -Works with the Drizzle-managed `jobs` table. 
-""" - -import asyncio -import logging -from typing import Dict, Any -from datetime import datetime - -logger = logging.getLogger(__name__) - - -class JanitorService: - """ - Scheduled maintenance tasks: - 1. Remove expired jobs (past expires_at) - 2. Deactivate stale jobs (not refreshed for N days) - 3. Remove old jobs (> max_age_days) - 4. Remove duplicates - """ - - def __init__( - self, - database, - max_age_days: int = 30, - stale_days: int = 14, - ): - self.db = database - self.max_age_days = max_age_days - self.stale_days = stale_days - - async def remove_expired_jobs(self) -> int: - """Remove jobs past their expiry date.""" - try: - count = await self.db.cleanup_expired_jobs() - logger.info(f"Removed {count} expired jobs") - return count - except Exception as e: - logger.error(f"Failed to remove expired jobs: {e}") - return 0 - - async def remove_old_jobs(self) -> int: - """Remove jobs older than max_age_days.""" - try: - count = await self.db.cleanup_old_jobs(self.max_age_days) - logger.info(f"Removed {count} old jobs (> {self.max_age_days} days)") - return count - except Exception as e: - logger.error(f"Failed to remove old jobs: {e}") - return 0 - - async def deactivate_stale(self) -> int: - """Mark jobs as inactive if not refreshed recently.""" - try: - count = await self.db.deactivate_stale_jobs(self.stale_days) - logger.info(f"Deactivated {count} stale jobs (> {self.stale_days} days)") - return count - except Exception as e: - logger.error(f"Failed to deactivate stale jobs: {e}") - return 0 - - async def remove_duplicates(self) -> int: - """Remove duplicate jobs.""" - try: - count = await self.db.remove_duplicates() - logger.info(f"Removed {count} duplicate jobs") - return count - except Exception as e: - logger.error(f"Failed to remove duplicates: {e}") - return 0 - - async def run_maintenance(self) -> Dict[str, Any]: - """Execute all maintenance tasks.""" - logger.info("=== Starting Janitor Maintenance ===") - start_time = datetime.now() - - summary 
= { - "started_at": start_time.isoformat(), - "tasks": {}, - } - - try: - summary["tasks"]["expired_removed"] = await self.remove_expired_jobs() - summary["tasks"]["stale_deactivated"] = await self.deactivate_stale() - summary["tasks"]["old_removed"] = await self.remove_old_jobs() - summary["tasks"]["duplicates_removed"] = await self.remove_duplicates() - - end_time = datetime.now() - summary["completed_at"] = end_time.isoformat() - summary["duration_seconds"] = (end_time - start_time).total_seconds() - summary["success"] = True - - logger.info( - f"=== Janitor complete ({summary['duration_seconds']:.1f}s) ===" - ) - - except Exception as e: - logger.error(f"Maintenance failed: {e}") - summary["success"] = False - summary["error"] = str(e) - - return summary diff --git a/apps/scraper/src/main.py b/apps/scraper/src/main.py deleted file mode 100644 index 747cb94..0000000 --- a/apps/scraper/src/main.py +++ /dev/null @@ -1,424 +0,0 @@ -#!/usr/bin/env python3 -""" -main.py -Entry-point for the hiring.cafe job scraper. 
- -CHANGELOG: -- Added _consecutive_failures logic to track crash loops without silent shutdown -- Updated spider call to pass known_ids for detail fetching bypass -- Changed scheduler.shutdown(wait=False) to wait=True to fix shutdown race -- Pushing _consecutive_failures to the health server payload -""" - -import asyncio -import logging -import os -import signal -import sys -import warnings -from datetime import datetime, timezone -from pathlib import Path - -# Suppress urllib3 NotOpenSSLWarning (common on macOS with LibreSSL) -try: - from urllib3.exceptions import NotOpenSSLWarning - warnings.filterwarnings("ignore", category=NotOpenSSLWarning) -except ImportError: - pass - -from dotenv import load_dotenv -from apscheduler.schedulers.asyncio import AsyncIOScheduler - -# Local imports -sys.path.insert(0, str(Path(__file__).parent)) - -from database import Database -from pipeline import JobProcessingPipeline -from embedding_service import VoyageEmbeddingService, EmbeddingWorker -from spiders.remotive import RemotiveSpider -from spiders.arbeitnow import ArbeitnowSpider -from spiders.greenhouse import GreenhouseSpider - -# HiringCafeSpider requires playwright — import conditionally -try: - from spiders.hiring_cafe import HiringCafeSpider - HIRING_CAFE_AVAILABLE = True -except ImportError: - HiringCafeSpider = None - HIRING_CAFE_AVAILABLE = False -from janitor import JanitorService -from health import HealthCheckServer - -# ─── Logging ────────────────────────────────────────────────────── - -def setup_logging(): - """Configure structured readable logging.""" - log_level = os.getenv("LOG_LEVEL", "INFO").upper() - - handler = logging.StreamHandler() - - try: - import colorlog - formatter = colorlog.ColoredFormatter( - "%(log_color)s%(asctime)s [%(levelname)s] %(cyan)s%(name)s:%(reset)s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S" - ) - except ImportError: - formatter = logging.Formatter( - "%(asctime)s [%(levelname)s] %(name)s: %(message)s", - datefmt="%Y-%m-%d 
%H:%M:%S" - ) - - handler.setFormatter(formatter) - - root = logging.getLogger() - root.setLevel(log_level) - # Remove existing handlers to avoid duplicates - if root.hasHandlers(): - root.handlers.clear() - root.addHandler(handler) - - # Quiet noisy loggers - for noisy in ("aiohttp", "asyncpg", "apscheduler"): - logging.getLogger(noisy).setLevel(logging.WARNING) - - -logger = logging.getLogger(__name__) - - -# ─── Scraper System ────────────────────────────────────────────── - -class ScraperSystem: - """ - Orchestrator that coordinates scraping, processing, and maintenance. - """ - - def __init__(self): - load_dotenv() - - # Config - self.database_url = os.getenv("DATABASE_URL", "") - self.voyage_api_key = os.getenv("VOYAGE_API_KEY", "") - self.health_port = int(os.getenv("HEALTH_PORT", "8080")) - self.scrape_interval_minutes = int(os.getenv("SCRAPE_INTERVAL_MINUTES", "60")) - self.requests_per_minute = int(os.getenv("REQUESTS_PER_MINUTE", "20")) - - if not self.database_url: - raise ValueError("DATABASE_URL is required") - - # Components (initialized in start()) - self.db: Database = None - self.spiders: list = [] # All spider instances - self.pipeline: JobProcessingPipeline = None - self.embedder: VoyageEmbeddingService = None - self.embedding_worker: EmbeddingWorker = None - self.janitor: JanitorService = None - self.health: HealthCheckServer = None - self.scheduler: AsyncIOScheduler = None - - # State - self._running = False - self._stopping = False - self._cycle_count = 0 - self._consecutive_failures: int = 0 - - async def start(self): - """Initialize all components and start the service.""" - logger.info("Starting scraper system...") - - # 1. Database - self.db = Database(self.database_url) - await self.db.connect() - - # 2. 
Embedding service (optional — runs without if no key) - if self.voyage_api_key: - try: - self.embedder = VoyageEmbeddingService(api_key=self.voyage_api_key) - self.embedding_worker = EmbeddingWorker( - database=self.db, - embedding_service=self.embedder, - batch_size=50, - interval_seconds=120, - ) - logger.info("Voyage AI embeddings enabled") - except Exception as e: - logger.warning(f"Embeddings disabled: {e}") - self.embedder = None - else: - logger.warning("No VOYAGE_API_KEY — embeddings disabled") - - # 3. Spiders — API-based sources (always available) + hiring.cafe (if playwright installed) - self.spiders = [ - RemotiveSpider(requests_per_minute=2), # TOS: max 2 req/min - ArbeitnowSpider(requests_per_minute=20), # Generous limits - GreenhouseSpider(requests_per_minute=30), # Per-board, very fast - ] - - if HIRING_CAFE_AVAILABLE and HiringCafeSpider: - self.spiders.append( - HiringCafeSpider(requests_per_minute=self.requests_per_minute) - ) - logger.info("HiringCafeSpider enabled (playwright found)") - else: - logger.warning("HiringCafeSpider disabled (playwright not installed)") - - logger.info(f"Initialized {len(self.spiders)} spiders: " - f"{[s.SOURCE_NAME if hasattr(s, 'SOURCE_NAME') else 'hiring_cafe' for s in self.spiders]}") - - # 4. Pipeline - self.pipeline = JobProcessingPipeline( - database=self.db, - embedding_service=self.embedder, - batch_size=100, - ) - - # 5. Janitor - self.janitor = JanitorService(database=self.db) - - # 6. Health check server - pass a reference to expose failures dynamically - self.health = HealthCheckServer(port=self.health_port) - self.health.scraper = self # Give health server access to self._consecutive_failures - self.health.update_status(database_connected=True) - await self.health.start() - - # 7. 
Scheduler - self.scheduler = AsyncIOScheduler() - self.scheduler.add_job( - self._scrape_cycle, - "interval", - minutes=self.scrape_interval_minutes, - id="scrape_cycle", - next_run_time=datetime.now(timezone.utc), # Run immediately - ) - self.scheduler.add_job( - self._maintenance_cycle, - "interval", - hours=24, - id="maintenance", - ) - self.scheduler.start() - - # 8. Embedding worker (background) - if self.embedding_worker: - asyncio.create_task(self.embedding_worker.start()) - - self._running = True - logger.info({ - "event": "system_started", - "scrape_interval_min": self.scrape_interval_minutes, - "embeddings_enabled": self.embedder is not None, - "health_port": self.health_port, - }) - - async def _run_pipeline(self): - """Isolated scraping orchestration logic — runs ALL spiders sequentially.""" - self._cycle_count += 1 - cycle = self._cycle_count - - logger.info({"event": "cycle_start", "cycle": cycle}) - - # Shared known IDs for cross-source dedup - known_ids = await self.db.get_all_source_ids() - logger.info(f"Loaded {len(known_ids)} known IDs for cross-source dedup") - - total_scraped = 0 - total_stored = 0 - source_results = {} - - for spider in self.spiders: - source_name = getattr(spider, 'SOURCE_NAME', 'hiring_cafe') - spider_scraped = 0 - spider_stored = 0 - batch = [] - - try: - logger.info(f"--- Starting spider: {source_name} ---") - - async for job in spider.scrape(known_ids=known_ids): - batch.append(job) - spider_scraped += 1 - - if len(batch) >= self.pipeline.batch_size: - logger.info(f"[{source_name}] Batch full ({len(batch)} jobs). 
Processing...") - metrics = await self.pipeline.process(batch) - spider_stored += metrics.jobs_stored - batch = [] - - # Update health periodically - self.health.update_status( - last_scrape=datetime.now(timezone.utc).isoformat(), - jobs_in_db=(await self.db.get_stats()).get("total_jobs", 0), - consecutive_failures=self._consecutive_failures, - ) - - # Process remaining jobs in the last batch - if batch: - metrics = await self.pipeline.process(batch) - spider_stored += metrics.jobs_stored - - total_scraped += spider_scraped - total_stored += spider_stored - source_results[source_name] = { - "scraped": spider_scraped, - "stored": spider_stored, - "errors": spider.errors, - } - - logger.info(f"--- Spider {source_name} complete: " - f"scraped={spider_scraped}, stored={spider_stored}, " - f"errors={spider.errors} ---") - - except Exception as e: - logger.error(f"Spider {source_name} crashed: {e}", exc_info=True) - source_results[source_name] = { - "scraped": spider_scraped, - "stored": spider_stored, - "error": str(e), - } - finally: - # Reset spider metrics for next cycle - spider.jobs_found = 0 - spider.pages_scraped = 0 - spider.errors = 0 - if hasattr(spider, 'detail_fetches'): - spider.detail_fetches = 0 - - logger.info({ - "event": "cycle_complete", - "cycle": cycle, - "total_scraped": total_scraped, - "total_stored": total_stored, - "sources": source_results, - }) - - # Final health update - stats = await self.db.get_stats() - self.health.update_status( - last_scrape=datetime.now(timezone.utc).isoformat(), - jobs_in_db=stats.get("total_jobs", 0), - consecutive_failures=self._consecutive_failures, - by_source=stats.get("by_source", {}), - ) - - - async def _scrape_cycle(self): - """Execute one full scrape → process → store cycle inside resilience wrapper.""" - try: - await self._run_pipeline() - self._consecutive_failures = 0 - # update health state on success explicitly - self.health.update_status(consecutive_failures=0) - except asyncio.CancelledError: - # 
Shutdown initiated, exit silently and return to avoid APScheduler error logging - return - except Exception as e: - self._consecutive_failures += 1 - logger.exception( - "Scrape cycle failed (%d consecutive)", - self._consecutive_failures - ) - - # Update health state failures - self.health.update_status( - errors=self.health.status.get("errors", []) + [str(e)], - consecutive_failures=self._consecutive_failures - ) - - if self._consecutive_failures >= 3: - logger.critical( - "3 consecutive failures — pipeline may be broken, " - "manual intervention required" - ) - - async def _maintenance_cycle(self): - """Run janitor maintenance tasks.""" - try: - summary = await self.janitor.run_maintenance() - logger.info({"event": "maintenance_complete", **summary}) - except Exception as e: - logger.error(f"Maintenance failed: {e}") - - async def stop(self): - """Gracefully shut down all components with defensive checks and idempotency.""" - if self._stopping: - return - self._stopping = True - - logger.info("Shutting down scraper system...") - self._running = False - - # Stop scheduler first to prevent new jobs from starting (with blocking wait) - if self.scheduler: - try: - if self.scheduler.running: - self.scheduler.shutdown(wait=True) - except Exception as e: - logger.warning(f"Scheduler shutdown interrupted — job may have been mid-run: {e}") - - # Stop other background tasks - if self.embedding_worker: - try: - self.embedding_worker.stop() - except Exception as e: - logger.warning(f"Error stopping embedding worker: {e}") - - # Close all spiders - for spider in self.spiders: - try: - await spider.close() - except Exception as e: - source_name = getattr(spider, 'SOURCE_NAME', 'unknown') - logger.warning(f"Error closing spider {source_name}: {e}") - - # Stop health server - if self.health: - try: - await self.health.stop() - except Exception as e: - logger.warning(f"Error stopping health server: {e}") - - # Final database cleanup - if self.db: - try: - await self.db.close() - 
except Exception as e: - logger.warning(f"Error closing database: {e}") - - logger.info("Scraper system stopped") - - -# ─── Main Entry ─────────────────────────────────────────────────── - -async def main(): - setup_logging() - - system = ScraperSystem() - - # Handle signals for graceful shutdown - # We simply set _running to False to break the loop; - # the finally block will handle the component shutdown. - loop = asyncio.get_event_loop() - def signal_handler(): - system._running = False - logger.info("Interrupt received, stopping...") - - for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, signal_handler) - - try: - await system.start() - - # Keep running until stopped - while system._running: - await asyncio.sleep(1) - - except KeyboardInterrupt: - pass - except Exception as e: - logger.error(f"Fatal error: {e}") - finally: - await system.stop() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/apps/scraper/src/middlewares/__init__.py b/apps/scraper/src/middlewares/__init__.py deleted file mode 100644 index f3693ca..0000000 --- a/apps/scraper/src/middlewares/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Deduplication middleware package.""" diff --git a/apps/scraper/src/middlewares/deduplication.py b/apps/scraper/src/middlewares/deduplication.py deleted file mode 100644 index 1716a81..0000000 --- a/apps/scraper/src/middlewares/deduplication.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 -""" -deduplication.py -Lightweight dedup — uses requisition_id / source_url for uniqueness. - -SHA-256 fingerprinting removed — hiring.cafe provides stable IDs. -""" - -import logging -from typing import Optional, Set - -logger = logging.getLogger(__name__) - - -class DeduplicationCache: - """ - In-memory + DB dedup for the current scrape cycle. - Checks source_url uniqueness before inserting. 
- """ - - def __init__(self, database=None): - self.db = database - self._seen: Set[str] = set() # URLs seen in current batch - self._db_urls: Optional[Set[str]] = None # Cached from DB - - async def load_from_db(self, source: str = "hiring_cafe"): - """Pre-load existing source_urls from the database.""" - if self.db and self._db_urls is None: - self._db_urls = await self.db.get_existing_source_urls(source) - logger.info(f"Loaded {len(self._db_urls)} existing URLs for dedup") - - def is_duplicate(self, source_url: str) -> bool: - """Check if a URL has already been seen or exists in DB.""" - if not source_url: - return False - - # Check batch cache - if source_url in self._seen: - return True - - # Check DB cache - if self._db_urls and source_url in self._db_urls: - return True - - return False - - def mark_seen(self, source_url: str): - """Mark a URL as seen in the current batch.""" - if source_url: - self._seen.add(source_url) - - def clear_batch(self): - """Clear the current batch cache (keep DB cache).""" - self._seen.clear() - - def clear_all(self): - """Clear all caches.""" - self._seen.clear() - self._db_urls = None diff --git a/apps/scraper/src/models.py b/apps/scraper/src/models.py deleted file mode 100644 index a6f9c28..0000000 --- a/apps/scraper/src/models.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 -""" -models.py -Pydantic V2 schemas aligned to the Drizzle `jobs` table. -""" - -from datetime import datetime -from decimal import Decimal -from typing import Optional -from pydantic import BaseModel, Field, field_validator -import re -import uuid - - -class ScrapedJob(BaseModel): - """ - Validated job data that maps 1:1 to the Drizzle `jobs` table. 
- - Drizzle columns: - id, title, company_name, description, location, - salary_min, salary_max, job_type, remote, source, - source_url, embedding, skills_required, experience_required, - posted_at, expires_at, is_active, employer_id, - created_at, updated_at - """ - - # Primary key — generated here, not by the DB, so we can dedup before insert - id: str = Field(default_factory=lambda: str(uuid.uuid4())) - - # Core required fields - title: str = Field(..., min_length=3, description="Job title") - company_name: str = Field(..., min_length=1, description="Company name") - description: str = Field(..., min_length=10, description="Job description (plain text)") - - # Optional fields - location: Optional[str] = None - salary_min: Optional[Decimal] = None - salary_max: Optional[Decimal] = None - job_type: Optional[str] = None - remote: bool = False - source: str = Field(..., description="Source identifier (e.g. hiring_cafe, remotive, arbeitnow, greenhouse)") - source_url: Optional[str] = None # External apply URL - skills_required: Optional[list[str]] = Field(default_factory=list) - experience_required: Optional[str] = None - posted_at: Optional[datetime] = None - expires_at: Optional[datetime] = None - is_active: bool = True - employer_id: Optional[str] = None # UUID — unused for scraped jobs - - # Embedding (768-dim for Drizzle schema) - embedding: Optional[list[float]] = None - - # Source-specific ID — used for dedup. Not stored in DB. - # hiring.cafe uses requisition_id, Greenhouse uses gh-{board}-{id}, etc. 
- requisition_id: str = Field(default="", description="Source-specific ID for dedup") - - # Metadata (not stored in DB) - meta: dict = Field(default_factory=dict, exclude=True) - - model_config = { - "str_strip_whitespace": True, - "validate_default": True, - } - - @field_validator("title", "company_name", mode="before") - @classmethod - def strip_whitespace(cls, v: Optional[str]) -> Optional[str]: - if v is None: - return v - return " ".join(v.split()) - - @field_validator("skills_required", mode="before") - @classmethod - def ensure_list(cls, v) -> list[str]: - if v is None: - return [] - if isinstance(v, str): - return [s.strip() for s in v.split(",") if s.strip()] - return list(v) - - @field_validator("source_url", mode="before") - @classmethod - def validate_url(cls, v: Optional[str]) -> Optional[str]: - if v and not re.match(r"^https?://", v): - return None - return v - - def to_db_dict(self) -> dict: - """Convert to dict matching Drizzle column names for INSERT.""" - import json - - return { - "id": self.id, - "title": self.title, - "company_name": self.company_name, - "description": self.description, - "location": self.location, - "salary_min": float(self.salary_min) if self.salary_min else None, - "salary_max": float(self.salary_max) if self.salary_max else None, - "job_type": self.job_type, - "remote": self.remote, - "source": self.source, - "source_url": self.source_url, - "skills_required": json.dumps(self.skills_required) if self.skills_required else None, - "experience_required": self.experience_required, - "posted_at": self.posted_at, - "expires_at": self.expires_at, - "is_active": self.is_active, - "embedding": self.embedding, - } - - -class ScrapingMetrics(BaseModel): - """Structured logging metrics for scraping operations.""" - - source: str = "hiring_cafe" - event: str = "metrics" - jobs_found: int = 0 - jobs_stored: int = 0 - jobs_embedded: int = 0 - duplicates_skipped: int = 0 - errors: int = 0 - duration_seconds: float = 0.0 - timestamp: 
datetime = Field(default_factory=datetime.utcnow) - - def to_log_dict(self) -> dict: - return { - "timestamp": self.timestamp.isoformat(), - "source": self.source, - "event": self.event, - "jobs_found": self.jobs_found, - "jobs_stored": self.jobs_stored, - "jobs_embedded": self.jobs_embedded, - "duplicates_skipped": self.duplicates_skipped, - "errors": self.errors, - "duration_seconds": round(self.duration_seconds, 2), - } diff --git a/apps/scraper/src/pipeline.py b/apps/scraper/src/pipeline.py deleted file mode 100644 index ab9aad6..0000000 --- a/apps/scraper/src/pipeline.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 -""" -pipeline.py -Simplified job processing pipeline: Scrape → Dedup → Embed → Store. - -CHANGELOG: -- Removed self._known_urls global state to prevent infinite RAM leak -- Implemented scoped database existence checking per batch -- Optimized embedding calls to only run for genuinely new jobs -""" - -import asyncio -import logging -import time -from typing import List, Dict, Any, Optional - -from models import ScrapedJob, ScrapingMetrics - -logger = logging.getLogger(__name__) - - -class JobProcessingPipeline: - """ - Processes scraped jobs through: - 1. Deduplication (by scoped DB check) - 2. Embedding generation (Voyage AI) - 3. 
Batch insertion to DB - """ - - def __init__( - self, - database, - embedding_service=None, - batch_size: int = 25, - ): - self.db = database - self.embedder = embedding_service - self.batch_size = batch_size - - # Metrics - self.metrics = ScrapingMetrics() - - async def _embed_batch(self, jobs: List[ScrapedJob]) -> List[ScrapedJob]: - """Generate embeddings for a batch of jobs.""" - if not self.embedder or not jobs: - return jobs - - # Pass ALL fields for comprehensive embeddings - job_dicts = [] - for job in jobs: - job_dicts.append({ - "job_title": job.title, - "company_name": job.company_name, - "job_description": job.description[:3000], - "skills_required": job.skills_required or [], - "location": job.location or "", - "remote": job.remote, - "job_type": job.job_type or "", - "salary_min": float(job.salary_min) if job.salary_min else None, - "salary_max": float(job.salary_max) if job.salary_max else None, - "experience_required": job.experience_required or "", - }) - - try: - embedded = await self.embedder.embed_jobs(job_dicts) - - for job, emb_dict in zip(jobs, embedded): - embedding = emb_dict.get("embedding") - if embedding: - job.embedding = embedding - self.metrics.jobs_embedded += 1 - - except Exception as e: - logger.error(f"Embedding batch failed: {e}") - - return jobs - - async def _store_batch(self, jobs: List[ScrapedJob]) -> int: - """Insert a batch of jobs to DB.""" - db_dicts = [job.to_db_dict() for job in jobs] - count = await self.db.insert_jobs_batch(db_dicts) - return count - - async def process(self, jobs: List[ScrapedJob]) -> ScrapingMetrics: - """ - Run the full pipeline on a list of scraped jobs. - - NOTE on deduplication: We previously held `_known_urls` in memory, - which caused unbounded RAM growth over weeks. The new approach queries - `source_url = ANY($1)` scoping existence checks strictly to the current batch. 
- """ - start = time.time() - self.metrics = ScrapingMetrics() - self.metrics.jobs_found = len(jobs) - - # Step 1: Dedup - urls = [j.source_url for j in jobs if j.source_url] - existing_urls = await self.db.get_existing_urls(urls) - - unique_jobs = [j for j in jobs if j.source_url not in existing_urls] - - self.metrics.duplicates_skipped += len(jobs) - len(unique_jobs) - - logger.info({ - "event": "dedup_complete", - "total": len(jobs), - "unique": len(unique_jobs), - "duped": self.metrics.duplicates_skipped, - }) - - if not unique_jobs: - self.metrics.duration_seconds = time.time() - start - return self.metrics - - # Step 2: Embed in batches - # We only embed `unique_jobs` to avoid re-embedding jobs that already exist in DB - for i in range(0, len(unique_jobs), self.batch_size): - batch = unique_jobs[i : i + self.batch_size] - await self._embed_batch(batch) - - # Step 3: Store in batches - for i in range(0, len(unique_jobs), self.batch_size): - batch = unique_jobs[i : i + self.batch_size] - stored = await self._store_batch(batch) - self.metrics.jobs_stored += stored - - self.metrics.duration_seconds = time.time() - start - - logger.info({ - "event": "pipeline_complete", - **self.metrics.to_log_dict(), - }) - - return self.metrics diff --git a/apps/scraper/src/spiders/__init__.py b/apps/scraper/src/spiders/__init__.py deleted file mode 100644 index 33e7533..0000000 --- a/apps/scraper/src/spiders/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Postly job spiders package.""" -from .base import BaseSpider -from .remotive import RemotiveSpider -from .arbeitnow import ArbeitnowSpider -from .greenhouse import GreenhouseSpider - -# HiringCafeSpider requires playwright — import conditionally -try: - from .hiring_cafe import HiringCafeSpider -except ImportError: - HiringCafeSpider = None - -__all__ = [ - "BaseSpider", - "RemotiveSpider", - "ArbeitnowSpider", - "GreenhouseSpider", - "HiringCafeSpider", -] diff --git a/apps/scraper/src/spiders/arbeitnow.py 
b/apps/scraper/src/spiders/arbeitnow.py deleted file mode 100644 index ccd055f..0000000 --- a/apps/scraper/src/spiders/arbeitnow.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 -""" -arbeitnow.py -Spider for Arbeitnow's public REST API. - -Endpoint: https://www.arbeitnow.com/api/job-board-api -- Free, no auth, no Cloudflare -- Focuses on EU + remote jobs -- Paginated results -""" - -import logging -from datetime import datetime, timezone -from typing import AsyncIterator, Optional, Set - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from models import ScrapedJob -from spiders.base import ( - BaseSpider, - html_to_text, - extract_yoe, - extract_salary, - detect_remote, - detect_job_type, - safe_decimal, -) - -logger = logging.getLogger(__name__) - - -class ArbeitnowSpider(BaseSpider): - """ - Production spider for Arbeitnow.com job listings. - - Features: - - Pure aiohttp — no browser, no Playwright, no Cloudflare - - Paginated API with clean JSON responses - - Aggregates jobs from multiple ATS (Greenhouse, SmartRecruiters, Join.com) - - Good coverage of EU + remote positions - """ - - SOURCE_NAME = "arbeitnow" - BASE_URL = "https://www.arbeitnow.com/api/job-board-api" - MAX_PAGES = 50 # Safety limit - - def __init__(self, requests_per_minute: int = 20): - super().__init__(requests_per_minute=requests_per_minute) - - def _parse_job(self, raw: dict) -> Optional[ScrapedJob]: - """Parse an Arbeitnow API job object into a ScrapedJob.""" - try: - title = (raw.get("title") or "").strip() - company = (raw.get("company_name") or "").strip() - - if not title or not company: - return None - - # Description — HTML - desc_html = raw.get("description", "") - description = html_to_text(desc_html) - if len(description) < 10: - return None - - # URL - source_url = raw.get("url", "") or raw.get("link", "") - if not source_url: - slug = raw.get("slug", "") - if slug: - source_url = f"https://www.arbeitnow.com/view/{slug}" - 
if not source_url: - return None - - # Location - location = raw.get("location", "") - - # Remote detection - is_remote = raw.get("remote", False) - if not is_remote: - is_remote = detect_remote(description, location) - - # Tags → skills - tags = raw.get("tags", []) or [] - skills = [str(t) for t in tags if t] - - # Job type — from tags or description - job_type = None - for tag in tags: - jt = detect_job_type(str(tag)) - if jt: - job_type = jt - break - if not job_type: - job_type = detect_job_type(description[:1000]) - - # Salary — extract from description - salary_min, salary_max = extract_salary(description[:3000]) - - # YOE — extract from description - yoe = extract_yoe(description) - - # Posted date - posted_at = None - created_at_ts = raw.get("created_at") - if created_at_ts: - try: - if isinstance(created_at_ts, (int, float)): - posted_at = datetime.fromtimestamp(created_at_ts, tz=timezone.utc) - elif isinstance(created_at_ts, str): - posted_at = datetime.fromisoformat(created_at_ts.replace("Z", "+00:00")) - except (ValueError, OSError): - pass - - # Generate a stable source_id for dedup - slug = raw.get("slug", "") - requisition_id = slug or source_url - - return ScrapedJob( - title=title, - company_name=company, - description=description, - location=location if location else ("Remote" if is_remote else None), - salary_min=salary_min, - salary_max=salary_max, - job_type=job_type, - remote=is_remote, - source=self.SOURCE_NAME, - source_url=source_url, - skills_required=skills, - experience_required=yoe, - posted_at=posted_at, - is_active=True, - requisition_id=requisition_id, - ) - - except Exception as e: - logger.error(f"[arbeitnow] Parse error: {e}", exc_info=True) - self.errors += 1 - return None - - async def scrape( - self, known_ids: Optional[Set[str]] = None - ) -> AsyncIterator[ScrapedJob]: - """ - Scrape all Arbeitnow jobs with pagination. 
- """ - known = known_ids or set() - logger.info({"event": "scrape_start", "source": self.SOURCE_NAME}) - - page = 1 - consecutive_empty = 0 - - try: - while page <= self.MAX_PAGES: - data = await self._get_json( - self.BASE_URL, - params={"page": page}, - ) - - if not data: - consecutive_empty += 1 - if consecutive_empty >= 2: - break - page += 1 - continue - - jobs_list = data.get("data", []) - if not jobs_list: - logger.info(f"[arbeitnow] No more jobs at page {page}") - break - - self.pages_scraped += 1 - consecutive_empty = 0 - new_count = 0 - - for raw_job in jobs_list: - slug = raw_job.get("slug", "") - url = raw_job.get("url", "") or raw_job.get("link", "") - dedup_key = slug or url - - if dedup_key in known: - continue - known.add(dedup_key) - if url: - known.add(url) - - job = self._parse_job(raw_job) - if job: - self.jobs_found += 1 - new_count += 1 - yield job - - if new_count > 0: - logger.info(f"[arbeitnow] Page {page}: {new_count} new jobs") - - # Check for next page - meta = data.get("meta", {}) or data.get("links", {}) - has_next = bool(meta.get("next")) if meta else len(jobs_list) > 0 - - if not has_next: - break - - page += 1 - - except Exception as e: - logger.error(f"[arbeitnow] Scrape failed: {e}", exc_info=True) - self.errors += 1 - finally: - await self.close() - - logger.info({ - "event": "scrape_complete", - "source": self.SOURCE_NAME, - "jobs_found": self.jobs_found, - "pages_scraped": self.pages_scraped, - "errors": self.errors, - }) diff --git a/apps/scraper/src/spiders/base.py b/apps/scraper/src/spiders/base.py deleted file mode 100644 index 4b84ecd..0000000 --- a/apps/scraper/src/spiders/base.py +++ /dev/null @@ -1,337 +0,0 @@ -#!/usr/bin/env python3 -""" -base.py -Abstract base spider with shared field extraction utilities. 
- -All spiders inherit from BaseSpider, which provides: -- HTML → plaintext conversion -- YOE (years of experience) regex extraction -- Salary range regex extraction from free text -- Remote/onsite/hybrid detection -- Job type detection (full-time, part-time, contract) -- Rate limiting -- Metrics tracking -""" - -import asyncio -import logging -import re -import time -from abc import ABC, abstractmethod -from decimal import Decimal, InvalidOperation -from html import unescape -from typing import AsyncIterator, Optional, Set, Tuple, Any - -import aiohttp - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from models import ScrapedJob - -logger = logging.getLogger(__name__) - -# ─── HTML Cleaning ──────────────────────────────────────────────── - -_BLOCK_TAG_RE = re.compile(r"", re.I) -_BR_TAG_RE = re.compile(r"", re.I) -_ALL_TAGS_RE = re.compile(r"<[^>]+>") -_MULTI_SPACE_RE = re.compile(r"[ \t]+") -_MULTI_NEWLINE_RE = re.compile(r"\n{3,}") - - -def html_to_text(html: str) -> str: - """Convert HTML to plain text without external dependencies.""" - if not html: - return "" - text = _BR_TAG_RE.sub("\n", html) - text = _BLOCK_TAG_RE.sub("\n", text) - text = _ALL_TAGS_RE.sub(" ", text) - text = unescape(text) - text = _MULTI_SPACE_RE.sub(" ", text) - text = _MULTI_NEWLINE_RE.sub("\n\n", text) - return text.strip() - - -# ─── Field Extraction ───────────────────────────────────────────── - -# Matches patterns like: "3+ years", "5-7 years", "2 years of experience", -# "minimum 3 years", "at least 5+ years", "3-5 yrs" -_YOE_PATTERNS = [ - # "3-5 years" range — MUST be before single-number patterns - re.compile(r"(\d+)\s*[-–—to]+\s*(\d+)\s*(?:years?|yrs?)", re.I), - # "minimum 3 years" / "at least 3 years" - re.compile(r"(?:minimum|min|at\s+least)\s*(\d+)\s*(?:years?|yrs?)", re.I), - # "5+ years of experience" - re.compile(r"(\d+)\+?\s*(?:years?|yrs?)\s*(?:of\s+)?(?:experience|exp)", re.I), - # "3+ years" standalone - 
re.compile(r"(\d+)\+\s*(?:years?|yrs?)", re.I), - # "experience: 3 years" or "experience required: 5 years" - re.compile(r"experience\s*(?:required)?\s*:?\s*(\d+)\s*(?:years?|yrs?)", re.I), -] - -# Salary patterns — matches "$80,000", "$80k", "$120,000 - $150,000", "80k-120k" -_SALARY_PATTERNS = [ - # "$80,000 - $150,000" or "$80,000-$150,000" or "$80k - $150k" - re.compile( - r"\$\s*([\d,]+(?:\.\d+)?)\s*[kK]?\s*[-–—to]+\s*\$?\s*([\d,]+(?:\.\d+)?)\s*[kK]?", - re.I, - ), - # "80k-150k" without dollar sign - re.compile( - r"([\d,]+)\s*[kK]\s*[-–—to]+\s*([\d,]+)\s*[kK]", - re.I, - ), - # Single salary "$120,000" or "$120k" - re.compile(r"\$\s*([\d,]+(?:\.\d+)?)\s*[kK]?", re.I), -] - -_REMOTE_KEYWORDS = { - "remote", "work from home", "wfh", "fully remote", - "100% remote", "remote-first", "remote first", - "work remotely", "anywhere", "distributed", -} - -_ONSITE_KEYWORDS = { - "on-site", "onsite", "on site", "in-office", "in office", - "office-based", "office based", -} - -_HYBRID_KEYWORDS = { - "hybrid", "flex", "flexible location", -} - - -def extract_yoe(text: str) -> Optional[str]: - """ - Extract years of experience from free text. - Returns strings like "3+ years", "5-7 years", or None. - """ - if not text: - return None - - for pattern in _YOE_PATTERNS: - match = pattern.search(text) - if match: - groups = match.groups() - if len(groups) == 2 and groups[1]: - return f"{groups[0]}-{groups[1]} years" - return f"{groups[0]}+ years" - - return None - - -def extract_salary(text: str) -> Tuple[Optional[Decimal], Optional[Decimal]]: - """ - Extract salary range from free text. - Returns (min, max) as Decimal, normalizing 'k' to thousands. 
- """ - if not text: - return None, None - - for pattern in _SALARY_PATTERNS: - match = pattern.search(text) - if match: - groups = match.groups() - try: - values = [] - for g in groups: - if g: - # Remove commas - clean = g.replace(",", "") - val = Decimal(clean) - # Check if the original text had 'k' after this number - pos = match.end() - suffix = text[match.start():pos + 5].lower() - if "k" in suffix and val < 1000: - val *= 1000 - values.append(val) - - if len(values) == 2: - return min(values), max(values) - elif len(values) == 1: - return values[0], None - except (InvalidOperation, ValueError): - continue - - return None, None - - -def safe_decimal(value: Any) -> Optional[Decimal]: - """Safely convert a value to Decimal, returning None on failure.""" - if value is None: - return None - try: - return Decimal(str(value)) - except (InvalidOperation, ValueError, TypeError): - return None - - -def detect_remote(text: str, location: Optional[str] = None) -> bool: - """ - Detect if a job is remote based on text content and location. - """ - combined = f"{text or ''} {location or ''}".lower() - - for keyword in _REMOTE_KEYWORDS: - if keyword in combined: - return True - - return False - - -def detect_job_type(text: str) -> Optional[str]: - """ - Detect job type from text. Returns normalized string. - Uses word boundary matching to avoid false positives - (e.g., 'international' should NOT match 'internship'). 
- """ - if not text: - return None - - lower = text.lower() - - if "full-time" in lower or "full time" in lower or "fulltime" in lower: - return "full_time" - if "part-time" in lower or "part time" in lower or "parttime" in lower: - return "part_time" - if "contract" in lower or "freelance" in lower: - return "contract" - # Use regex word boundary to avoid 'international' / 'internal' matching - if re.search(r'\binternship\b', lower): - return "internship" - if re.search(r'\bintern\b', lower) and not re.search(r'\bintern(al|ation)', lower): - return "internship" - if "temporary" in lower or re.search(r'\btemp\b', lower): - return "temporary" - - return None - - -def detect_workplace_type(text: str, location: Optional[str] = None) -> str: - """ - Detect workplace type: 'remote', 'hybrid', or 'onsite'. - """ - combined = f"{text or ''} {location or ''}".lower() - - for keyword in _REMOTE_KEYWORDS: - if keyword in combined: - return "remote" - - for keyword in _HYBRID_KEYWORDS: - if keyword in combined: - return "hybrid" - - for keyword in _ONSITE_KEYWORDS: - if keyword in combined: - return "onsite" - - return "onsite" # Default assumption - - -# ─── Base Spider ────────────────────────────────────────────────── - - -class BaseSpider(ABC): - """ - Abstract base for all API-based spiders. 
- - Provides: - - aiohttp session management - - Rate limiting - - Metrics tracking - - Shared field extraction methods - """ - - SOURCE_NAME: str = "unknown" - - def __init__(self, requests_per_minute: int = 20): - self._min_interval = 60.0 / requests_per_minute - self._last_request_at = 0.0 - self._session: Optional[aiohttp.ClientSession] = None - - # Metrics — reset between cycles by the orchestrator - self.jobs_found = 0 - self.pages_scraped = 0 - self.errors = 0 - - async def _ensure_session(self) -> aiohttp.ClientSession: - """Lazily create an aiohttp session.""" - if self._session is None or self._session.closed: - self._session = aiohttp.ClientSession( - headers={ - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/131.0.0.0 Safari/537.36" - ), - "Accept": "application/json", - }, - timeout=aiohttp.ClientTimeout(total=30), - ) - return self._session - - async def _throttle(self) -> None: - """Enforce minimum interval between outbound requests.""" - elapsed = time.monotonic() - self._last_request_at - if elapsed < self._min_interval: - await asyncio.sleep(self._min_interval - elapsed) - self._last_request_at = time.monotonic() - - async def _get_json(self, url: str, params: dict = None) -> Optional[dict]: - """GET a URL and return parsed JSON, with rate limiting and error handling.""" - await self._throttle() - session = await self._ensure_session() - - try: - async with session.get(url, params=params) as resp: - if resp.status == 429: - retry_after = int(resp.headers.get("Retry-After", 60)) - logger.warning(f"[{self.SOURCE_NAME}] Rate limited. 
Waiting {retry_after}s...") - await asyncio.sleep(retry_after) - return None - - if resp.status != 200: - logger.warning(f"[{self.SOURCE_NAME}] GET {url} returned {resp.status}") - return None - - return await resp.json() - - except asyncio.TimeoutError: - logger.warning(f"[{self.SOURCE_NAME}] Timeout on {url}") - self.errors += 1 - return None - except Exception as e: - logger.error(f"[{self.SOURCE_NAME}] Request failed: {e}") - self.errors += 1 - return None - - @abstractmethod - async def scrape( - self, known_ids: Optional[Set[str]] = None - ) -> AsyncIterator[ScrapedJob]: - """Scrape all jobs. Yields ScrapedJob objects.""" - ... - - async def scrape_all(self, known_ids: Optional[Set[str]] = None) -> list: - """Scrape all jobs and return as a list.""" - jobs = [] - async for job in self.scrape(known_ids): - jobs.append(job) - return jobs - - async def close(self) -> None: - """Close the aiohttp session.""" - if self._session and not self._session.closed: - await self._session.close() - self._session = None - - def get_metrics(self) -> dict: - """Return current cycle metrics.""" - return { - "source": self.SOURCE_NAME, - "jobs_found": self.jobs_found, - "pages_scraped": self.pages_scraped, - "errors": self.errors, - } diff --git a/apps/scraper/src/spiders/greenhouse.py b/apps/scraper/src/spiders/greenhouse.py deleted file mode 100644 index 02f5d0e..0000000 --- a/apps/scraper/src/spiders/greenhouse.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -""" -greenhouse.py -Spider for Greenhouse ATS public job board API. 
- -Endpoint: https://boards-api.greenhouse.io/v1/boards/{token}/jobs?content=true -- Free, no auth, no Cloudflare -- Returns richly structured job data per company -- Targets curated list of high-profile tech companies -""" - -import logging -from datetime import datetime, timezone -from typing import AsyncIterator, Optional, Set, List - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from models import ScrapedJob -from spiders.base import ( - BaseSpider, - html_to_text, - extract_yoe, - extract_salary, - detect_remote, - detect_job_type, - safe_decimal, -) - -logger = logging.getLogger(__name__) - -# Curated list of high-profile companies with public Greenhouse boards. -# Board tokens are typically the company subdomain on greenhouse. -# Add/remove companies as needed — these are all verified public boards. -COMPANY_BOARDS = [ - # Big Tech / Unicorns - "stripe", - "figma", - "notion", - "cloudflare", - "datadog", - "vercel", - "linear", - "supabase", - "dbt labs", - "airbyte", - "gitlabcom", - "hashicorp", - "confluent", - "snyk", - # Growth Stage - "postman", - "retool", - "airtable", - "mux", - "render", - "sentry", - "grafanalabs", - "planetscale", - "railway", - # Enterprise - "twilio", - "gusto", - "brex", - "ramp", - "navan", - "plaid", - "benchling", - "vanta", -] - - -class GreenhouseSpider(BaseSpider): - """ - Production spider for Greenhouse ATS public job boards. 
- - Features: - - Pure aiohttp — no browser, no Playwright, zero Cloudflare friction - - Iterates through curated list of company boards - - Returns richest structured data: departments, offices, content (HTML) - - Parses compensation, YOE, remote status from content - """ - - SOURCE_NAME = "greenhouse" - API_BASE = "https://boards-api.greenhouse.io/v1/boards" - - def __init__( - self, - requests_per_minute: int = 30, - company_boards: Optional[List[str]] = None, - ): - super().__init__(requests_per_minute=requests_per_minute) - self.boards = company_boards or COMPANY_BOARDS - - def _parse_job(self, raw: dict, board_token: str) -> Optional[ScrapedJob]: - """Parse a Greenhouse API job object into a ScrapedJob.""" - try: - gh_id = raw.get("id") - title = (raw.get("title") or "").strip() - - if not title or not gh_id: - return None - - # Content — Greenhouse provides rich HTML content - content_html = raw.get("content", "") - description = html_to_text(content_html) - if len(description) < 10: - return None - - # Company name — from the board token, capitalized - company_name = board_token.replace("-", " ").replace("_", " ").title() - - # Location — from offices and location fields - location_obj = raw.get("location", {}) or {} - location_name = location_obj.get("name", "") - - offices = raw.get("offices", []) or [] - if not location_name and offices: - office_names = [o.get("name", "") for o in offices if o.get("name")] - location_name = ", ".join(office_names[:3]) - - # Remote detection — from location and content - is_remote = detect_remote(description, location_name) - - # Departments → skills/categories - departments = raw.get("departments", []) or [] - dept_names = [d.get("name", "") for d in departments if d.get("name")] - - # Job URL - source_url = raw.get("absolute_url", "") - if not source_url: - source_url = f"https://boards.greenhouse.io/{board_token}/jobs/{gh_id}" - - # Salary — extract from content - salary_min, salary_max = 
extract_salary(description[:5000]) - - # Check metadata for compensation (some boards include it) - metadata = raw.get("metadata", []) or [] - for meta in metadata: - if meta.get("name", "").lower() in ("compensation", "salary", "pay"): - comp_text = str(meta.get("value", "")) - if comp_text: - s_min, s_max = extract_salary(comp_text) - if s_min: - salary_min = s_min - if s_max: - salary_max = s_max - - # YOE — extract from content - yoe = extract_yoe(description) - - # Job type — from content - job_type = detect_job_type(description[:2000]) - - # Posted date - posted_at = None - updated_at_str = raw.get("updated_at") or raw.get("first_published_at") - if updated_at_str: - try: - posted_at = datetime.fromisoformat( - updated_at_str.replace("Z", "+00:00") - ) - except (ValueError, AttributeError): - pass - - return ScrapedJob( - title=title, - company_name=company_name, - description=description, - location=location_name or ("Remote" if is_remote else None), - salary_min=salary_min, - salary_max=salary_max, - job_type=job_type, - remote=is_remote, - source=self.SOURCE_NAME, - source_url=source_url, - skills_required=dept_names, - experience_required=yoe, - posted_at=posted_at, - is_active=True, - requisition_id=f"gh-{board_token}-{gh_id}", - ) - - except Exception as e: - logger.error(f"[greenhouse] Parse error for {board_token}: {e}", exc_info=True) - self.errors += 1 - return None - - async def _scrape_board( - self, board_token: str, known: Set[str] - ) -> AsyncIterator[ScrapedJob]: - """Scrape all jobs from a single Greenhouse board.""" - url = f"{self.API_BASE}/{board_token}/jobs" - data = await self._get_json(url, params={"content": "true"}) - - if not data: - return - - jobs_list = data.get("jobs", []) - if not jobs_list: - return - - self.pages_scraped += 1 - new_count = 0 - - for raw_job in jobs_list: - gh_id = raw_job.get("id") - dedup_key = f"gh-{board_token}-{gh_id}" - - if dedup_key in known: - continue - known.add(dedup_key) - - job = 
self._parse_job(raw_job, board_token) - if job: - self.jobs_found += 1 - new_count += 1 - yield job - - if new_count > 0: - logger.info(f"[greenhouse] Board '{board_token}': {new_count} jobs") - - async def scrape( - self, known_ids: Optional[Set[str]] = None - ) -> AsyncIterator[ScrapedJob]: - """ - Scrape all jobs across all configured Greenhouse company boards. - """ - known = known_ids or set() - logger.info({ - "event": "scrape_start", - "source": self.SOURCE_NAME, - "boards": len(self.boards), - }) - - try: - for board_token in self.boards: - try: - async for job in self._scrape_board(board_token, known): - yield job - except Exception as e: - logger.warning(f"[greenhouse] Board '{board_token}' failed: {e}") - self.errors += 1 - - except Exception as e: - logger.error(f"[greenhouse] Scrape failed: {e}", exc_info=True) - self.errors += 1 - finally: - await self.close() - - logger.info({ - "event": "scrape_complete", - "source": self.SOURCE_NAME, - "jobs_found": self.jobs_found, - "boards_scraped": self.pages_scraped, - "errors": self.errors, - }) diff --git a/apps/scraper/src/spiders/hiring_cafe.py b/apps/scraper/src/spiders/hiring_cafe.py deleted file mode 100644 index f403800..0000000 --- a/apps/scraper/src/spiders/hiring_cafe.py +++ /dev/null @@ -1,840 +0,0 @@ -#!/usr/bin/env python3 -""" -hiring_cafe.py -Spider for hiring.cafe's JSON API. - -CHANGELOG: -- Implemented rotating User-Agents for robust Cloudflare bypassing -- Added pagination circuit breaker to prevent infinite loops on stale API offsets -- Skipping detail fetches automatically if job ID is in known_ids -- Migrated to Playwright and playwright-stealth to autonomously run JS and defeat Cloudflare Turnstile blocks, reusing cf_clearance cookies. 
-""" - -import asyncio -import logging -import re -import time -import random -from decimal import Decimal, InvalidOperation -from typing import AsyncIterator, Optional, Dict, Any, List, Set, Tuple -from datetime import datetime, timezone -from html import unescape - -import json -import os -import sys -from pathlib import Path - -from tenacity import ( - retry, - stop_after_attempt, - wait_exponential, - retry_if_exception_type, -) - -from playwright.async_api import async_playwright, Page, Error as PlaywrightError -from playwright_stealth import Stealth - -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from models import ScrapedJob - -logger = logging.getLogger(__name__) - -# ─── Helpers ────────────────────────────────────────────────────── - -_BLOCK_TAG_RE = re.compile(r"", re.I) -_BR_TAG_RE = re.compile(r"", re.I) -_ALL_TAGS_RE = re.compile(r"<[^>]+>") -_MULTI_SPACE_RE = re.compile(r"[ \t]+") -_MULTI_NEWLINE_RE = re.compile(r"\n{3,}") - - -def _html_to_text(html: str) -> str: - """Convert HTML to plain text without external dependencies.""" - if not html: - return "" - text = _BR_TAG_RE.sub("\n", html) - text = _BLOCK_TAG_RE.sub("\n", text) - text = _ALL_TAGS_RE.sub(" ", text) - text = unescape(text) - text = _MULTI_SPACE_RE.sub(" ", text) - text = _MULTI_NEWLINE_RE.sub("\n\n", text) - return text.strip() - - -def _safe_decimal(value: Any) -> Optional[Decimal]: - """Safely convert a value to Decimal, returning None on failure.""" - if value is None: - return None - try: - return Decimal(str(value)) - except (InvalidOperation, ValueError, TypeError): - return None - - -# ─── Spider ─────────────────────────────────────────────────────── - - -class HiringCafeSpider: - """ - Production spider for hiring.cafe using Playwright. 
- - Features: - - Autonomously bypasses Cloudflare JS Challenges using a stealth Chromium instance - - Reuses clearance cookies for raw headless HTTP fetches avoiding constant popups - - GET /api/search-jobs for paginated search (returns full job records) - - GET /api/search-jobs/get-total-count for total count - - GET /_next/data/{buildId}/viewjob/{id}.json for extra detail (optional) - - Configurable rate limiting (RPM) - - Exponential backoff on 429 / transient errors - - requisition_id as natural dedup key - """ - - BASE = "https://hiring.cafe" - SEARCH_URL = "https://hiring.cafe/api/search-jobs" - COUNT_URL = "https://hiring.cafe/api/search-jobs/get-total-count" - - _BUILD_ID_RE = re.compile(r'"buildId"\s*:\s*"([^"]+)"') - - def __init__( - self, - requests_per_minute: int = 20, - page_size: int = 50, - max_pages: int = 500, - ): - self._min_interval = 60.0 / requests_per_minute - self._page_size = page_size - self._max_pages = max_pages - self._last_request_at = 0.0 - - self._build_id: Optional[str] = None - - # Session path - self._abs_root = Path("/Users/apple/Desktop/Postly/apps/scraper") - self._session_path = self._abs_root / ".sessions/hiring_cafe" - self._cookies_file = self._session_path / "cookies.json" - - # Injected runtime via scrape() execution loop - self._page: Optional[Page] = None - - # Metrics — reset between cycles by the orchestrator - self.jobs_found = 0 - self.pages_scraped = 0 - self.detail_fetches = 0 - self.errors = 0 - - async def close(self) -> None: - """Compatibility signature.""" - pass - - # ─── Cloudflare Clearance & Session ─────────────────────────── - - def _get_chromium_args(self) -> List[str]: - """Return platform-specific Chromium arguments to avoid native segfaults.""" - if sys.platform == "darwin": # macOS - return [ - "--disable-blink-features=AutomationControlled", - "--disable-dev-shm-usage", - "--disable-infobars", - "--window-size=1920,1080", - "--start-maximized", - "--lang=en-US", - 
"--exclude-switches=enable-automation", - "--disable-extensions-except=", - "--disable-gpu-sandbox", - "--no-first-run", - "--no-default-browser-check", - "--disable-features=IsolateOrigins,site-per-process", - ] - - async def _simulate_human_behavior(self, page: Page): - """Simulate realistic human interaction during the CF challenge window.""" - try: - # Move mouse to a random spot - await page.mouse.move(random.randint(100, 700), random.randint(100, 500), steps=10) - - # Natural scroll (shorter, less blocking) - if random.random() > 0.5: - scroll = random.randint(100, 400) - await page.evaluate(f"window.scrollBy({{top: {scroll}, behavior: 'smooth'}});") - await asyncio.sleep(0.5) - await page.evaluate(f"window.scrollBy({{top: -{random.randint(50, 100)}, behavior: 'smooth'}});") - - await asyncio.sleep(random.uniform(0.1, 0.5)) - except Exception as e: - logger.debug(f"Behavior simulation partial failure: {e}") - - async def _save_session(self, context) -> None: - """Save cookies to the session file.""" - try: - self._session_path.mkdir(parents=True, exist_ok=True) - cookies = await context.cookies() - with open(self._cookies_file, "w") as f: - json.dump(cookies, f, indent=2) - logger.info(f"💾 Saved {len(cookies)} cookies to {self._cookies_file}") - except Exception as e: - logger.warning(f"Failed to save session: {e}") - - async def _load_session(self, context) -> bool: - """Load cookies from the session file. Returns True if restored.""" - try: - if self._cookies_file.exists(): - with open(self._cookies_file, "r") as f: - cookies = json.load(f) - await context.add_cookies(cookies) - logger.info(f"♻️ Restored {len(cookies)} cookies from session.") - return True - except Exception as e: - logger.warning(f"Failed to load session: {e}") - return False - - async def _wait_for_clearance(self, page: Page, timeout_ms: int = 60000) -> bool: - """ - Handles BOTH Managed (invisible) and Interactive (iframe) CF challenges. - Returns True if clearance was obtained. 
- """ - start_time = asyncio.get_event_loop().time() - timeout_sec = timeout_ms / 1000.0 - - TURNSTILE_SELECTORS = [ - "#AOzYg6", # Primary Turnstile container found by investigation - "iframe[src*='challenges.cloudflare.com']", - "iframe[src*='challenge-platform']", - "iframe[title*='Cloudflare']", - "iframe[id*='cf-chl-widget']", - "#cf-turnstile", - ".cf-turnstile", - ] - - logger.info(f"Executing behavioral simulation and waiting for challenge to settle ({timeout_sec}s)...") - - while (asyncio.get_event_loop().time() - start_time) < timeout_sec: - # 1. Behavioral simulation loop (partial) - await self._simulate_human_behavior(page) - - # 2. Check: Cookie set (Managed Challenge solved silently or cookie restored) - cookies = await page.context.cookies() - if any(c["name"] == "cf_clearance" for c in cookies): - logger.info("✅ cf_clearance cookie captured.") - return True - - # 3. Check: Page title cleared - try: - title = await page.title() - if "just a moment" not in title.lower() and "attention required" not in title.lower(): - logger.info(f"✅ Challenge passed based on title: {title}") - return True - except Exception: - pass - - # 4. Check: Turnstile Challenge (Advanced Frame Search) - solved_this_loop = False - for selector in TURNSTILE_SELECTORS: - try: - locator = page.locator(selector).first - if await locator.count() > 0: - # Fallback 1: Try to find the checkbox in ANY frame on the page - for frame in page.frames: - try: - checkbox = frame.locator("input[type='checkbox']").first - if await checkbox.count() > 0 and await checkbox.is_visible(): - logger.info(f"🔲 Turnstile checkbox found in frame '{frame.name or 'unnamed'}'. Clicking...") - await checkbox.click(timeout=3000) - solved_this_loop = True - break - except Exception: - continue - - if solved_this_loop: - break - - # Fallback 2: Pixel click the LEFT-CENTER of the container (where the box actually is) - logger.info(f"🔲 Turnstile container '{selector}' visible. 
Attempting left-side pixel click...") - box = await locator.bounding_box() - if box: - # The checkbox in Cloudflare Turnstile is typically on the left side. - # We click ~30px from the left and center vertically. - target_x = box["x"] + 30 - target_y = box["y"] + box["height"] / 2 - await page.mouse.click(target_x, target_y) - solved_this_loop = True - break - - logger.info(f"🔲 Turnstile element '{selector}' detected. Waiting for settlement...") - break - except Exception: - continue - - await asyncio.sleep(1) - - # Failure Diagnostics - try: - diag_path = self._abs_root / "debug_clearance.png" - await page.screenshot(path=str(diag_path)) - logger.warning(f"❌ Clearance timed out. Screenshot saved to {diag_path}") - except Exception as e: - logger.debug(f"Failed to capture diagnostic screenshot: {e}") - - return False - - async def _get_clearance(self, playwright) -> Tuple[Any, Any, Page, str]: - """ - Launch a real browser, solve the CF challenge, return browser, context, page, UA. - """ - is_headless = os.getenv("HEADLESS", "True").lower() in ("true", "1", "t") - logger.info(f"Initializing {'Headless' if is_headless else 'Headed'} Chromium/Chrome for Cloudflare Clearance...") - - launch_args = self._get_chromium_args() - - # Ensure no-sandbox for execution environment compatibility - if "--no-sandbox" not in launch_args: - launch_args.append("--no-sandbox") - - user_data_dir = self._session_path / "browser_data" - user_data_dir.mkdir(parents=True, exist_ok=True) - - async def launch_with_retry(): - # Attempt 1: Real Chrome (Best TLS) - # Attempt 2: Bundled Chromium (Fallback) - channels = ["chrome", None] if sys.platform == "darwin" else [None] - - last_err = None - for channel in channels: - try: - logger.info(f"Targeting channel: {channel or 'bundled chromium'}...") - return await playwright.chromium.launch_persistent_context( - user_data_dir=str(user_data_dir.absolute()), - headless=is_headless, - args=launch_args, - channel=channel, - 
ignore_default_args=["--enable-automation"], - user_agent=( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/131.0.0.0 Safari/537.36" - ), - viewport={"width": 1440, "height": 900}, - device_scale_factor=2, - locale="en-US", - ) - except Exception as e: - last_err = e - if "SingletonSocket" in str(e) or "ProcessSingleton" in str(e): - logger.warning(f"Browser isolation error on {channel}: {e}. Retrying with different engine...") - continue - raise e - raise last_err - - # Using 1440x900 Retina display scale for better fingerprint - context = await launch_with_retry() - - # Patching detection vectors aggressively - await context.add_init_script(""" - // 1. Remove webdriver - Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); - - // 2. Mock chrome runtime - window.chrome = { - runtime: {}, - loadTimes: function() {}, - csi: function() {}, - app: {} - }; - - // 3. Mock permissions - try { - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - } catch(e) {} - - // 4. Mock CPU cores (MacBook typically 8+) - Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 }); - - // 5. 
Languages - Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); - """) - - page = context.pages[0] if context.pages else await context.new_page() - - # High-fidelity Stealth CDP Injection - stealth = Stealth( - navigator_plugins=True, - navigator_permissions=True, - ) - try: - await stealth.apply_stealth_async(page) - logger.info("playwright-stealth patches applied successfully") - except Exception as e: - logger.warning(f"playwright-stealth partial failure (continuing): {e}") - - # Try restoring existing session cookies - await self._load_session(context) - - # Navigate - logger.info(f"Navigating to {self.BASE} to verify session or clear challenges...") - try: - await page.goto(self.BASE, wait_until="domcontentloaded", timeout=30000) - except Exception as e: - logger.warning(f"Navigation error: {e}") - - # Wait for clearance - success = await self._wait_for_clearance(page) - - if success: - await self._save_session(context) - else: - logger.warning("Proceeding without confirmed clearance (may fail with 403)") - - ua = await page.evaluate("navigator.userAgent") - return None, context, page, ua - - # ─── Rate Limiting ──────────────────────────────────────────── - - async def _throttle(self) -> None: - """Enforce minimum interval between outbound API requests.""" - elapsed = time.monotonic() - self._last_request_at - if elapsed < self._min_interval: - await asyncio.sleep(self._min_interval - elapsed) - self._last_request_at = time.monotonic() - - # ─── Build ID ───────────────────────────────────────────────── - - @retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=2, min=5, max=60), - retry=retry_if_exception_type((PlaywrightError, asyncio.TimeoutError)), - ) - async def _discover_build_id(self) -> str: - """Fetch the homepage and extract the Next.js buildId.""" - await self._throttle() - logger.info(f"Fetching homepage for buildId using authorized session...") - - resp = await self._page.request.get(self.BASE) - 
status = resp.status - logger.info(f"Homepage response: {status}") - - if status == 403: - logger.warning("Homepage 403 — Cloudflare block persists after clearance") - await asyncio.sleep(30) - raise PlaywrightError("Homepage 403 — Cloudflare block") - - if status != 200: - raise PlaywrightError(f"Homepage returned {status}") - - body_bytes = await resp.body() - html = body_bytes.decode('utf-8', errors='ignore') - - match = self._BUILD_ID_RE.search(html) - if not match: - logger.warning("Could not find buildId in homepage HTML.") - raise PlaywrightError("buildId not found in homepage") - - build_id = match.group(1) - logger.info({"event": "build_id_discovered", "build_id": build_id}) - return build_id - - async def _ensure_build_id(self) -> None: - """Try to get buildId, but don't fail the whole scrape if it doesn't work.""" - if not self._build_id: - try: - self._build_id = await self._discover_build_id() - except Exception as e: - logger.warning(f"BuildId discovery failed: {e}. Skipping detail fetches.") - self._build_id = None - - # ─── Search API ─────────────────────────────────────────────── - - @retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=2, min=3, max=120), - retry=retry_if_exception_type((PlaywrightError, asyncio.TimeoutError)), - ) - async def _search_page(self, offset: int) -> Dict[str, Any]: - """GET /api/search-jobs — returns JSON directly.""" - await self._throttle() - - url = f"{self.SEARCH_URL}?offset={offset}&limit={self._page_size}" - logger.debug(f"Searching offset {offset}...") - - resp = await self._page.request.get(url) - status = resp.status - - if status == 429: - retry_after = 60 - if "Retry-After" in resp.headers: - retry_after = int(resp.headers["Retry-After"]) - logger.warning({"event": "rate_limited", "retry_after": retry_after}) - await asyncio.sleep(retry_after) - raise PlaywrightError("Rate limited on search") - - if status == 403: - logger.warning("Search 403 — Cloudflare challenge expired or failed.") - 
await asyncio.sleep(30) - raise PlaywrightError("Search returned 403") - - if status != 200: - logger.error({"event": "search_error", "status": status}) - raise PlaywrightError(f"Search returned {status}") - - try: - data = await resp.json() - except Exception as e: - body_bytes = await resp.body() - text = body_bytes.decode('utf-8', errors='ignore') - logger.error(f"Failed to parse search JSON. Body preview: {text[:300]}") - raise PlaywrightError(f"Invalid JSON from search API: {e}") - - return data - - async def _get_total_count(self) -> int: - """GET /api/search-jobs/get-total-count → total available jobs.""" - await self._throttle() - - try: - resp = await self._page.request.get(self.COUNT_URL) - if resp.status == 200: - data = await resp.json() - if isinstance(data, int): - total = data - elif isinstance(data, dict): - total = data.get("total", data.get("count", 0)) - else: - total = 0 - logger.info({"event": "total_count", "total": total}) - return total - else: - logger.warning(f"Count API returned {resp.status}") - except Exception as exc: - logger.warning(f"Could not get total count: {exc}") - return 0 - - # ─── Job Detail (optional — needs buildId) ──────────────────── - - @retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=2, max=30), - retry=retry_if_exception_type((PlaywrightError, asyncio.TimeoutError)), - ) - async def _fetch_job_detail(self, requisition_id: str) -> Optional[Dict[str, Any]]: - """ - GET /_next/data/{buildId}/viewjob/{id}.json for full structured data. - Returns None if buildId is not available. 
- """ - if not self._build_id: - return None - - await self._throttle() - - url = f"{self.BASE}/_next/data/{self._build_id}/viewjob/{requisition_id}.json" - - resp = await self._page.request.get(url) - status = resp.status - - if status == 200: - self.detail_fetches += 1 - try: - data = await resp.json() - return data.get("pageProps", data) - except Exception: - return None - - if status == 404: - logger.debug(f"Detail 404 for {requisition_id} — buildId may be stale") - return None - - if status == 429: - await asyncio.sleep(30) - raise PlaywrightError("Rate limited on detail") - - if status == 403: - logger.warning("Detail 403 — blocked on individual fetch") - await asyncio.sleep(30) - raise PlaywrightError("Rate limited / blocked on detail") - - logger.debug(f"Detail {status} for {requisition_id}") - return None - - # ─── Parsing ────────────────────────────────────────────────── - - @staticmethod - def _extract_requisition_id(card: Dict[str, Any]) -> Optional[str]: - """Extract requisition_id from a search result card.""" - return card.get("requisition_id") or card.get("objectID") - - def _parse_job(self, raw: Dict[str, Any]) -> Optional[ScrapedJob]: - """ - Parse raw job data into ScrapedJob. - Handles both search result cards and detail page JSON. - """ - try: - if not raw: - return None - - data = raw.get("pageProps", raw) if "pageProps" in raw else raw - - for key in ["job", "job_information"]: - nested = data.get(key) - if isinstance(nested, dict): - merged = data.copy() - merged.update(nested) - data = merged - - title = ( - data.get("title") - or data.get("job_title") - or data.get("job_title_raw") - or "" - ) - requisition_id = ( - data.get("requisition_id") - or data.get("requisitionId") - or data.get("id") - or data.get("objectID") - or "" - ) - - if not title or not requisition_id: - logger.debug(f"Parsing failed: missing title or id. 
keys: {list(data.keys())}") - return None - - company_data = ( - data.get("enriched_company_data") - or data.get("company_data") - or {} - ) - company_name = ( - company_data.get("name") - or data.get("company_name") - or data.get("company") - or "Unknown" - ) - - description_html = ( - data.get("description") - or data.get("job_description_html") - or data.get("job_description", "") - ) - description = _html_to_text(description_html) - - if len(description) < 10: - description = data.get("description_clean") or data.get("job_description_text") or description - - if len(description) < 10: - logger.debug(f"Description too short for {requisition_id}. Content: {description[:50]}") - return None - - v5 = data.get("v5_processed_job_data") or data.get("processed_data") or {} - - salary_min = _safe_decimal(v5.get("yearly_min_compensation") or data.get("yearly_min_compensation")) - salary_max = _safe_decimal(v5.get("yearly_max_compensation") or data.get("yearly_max_compensation")) - - workplace_type = (v5.get("workplace_type") or "").lower() - is_remote = workplace_type == "remote" - location = ( - v5.get("formatted_workplace_location") - or data.get("location") - or ("Remote" if is_remote else None) - ) - - raw_tools = v5.get("technical_tools") or data.get("skills_required") or [] - skills = [str(t) for t in raw_tools if t] if isinstance(raw_tools, list) else [] - - min_yoe = v5.get("min_industry_and_role_yoe") - experience = f"{min_yoe}+ years" if min_yoe else None - - job_type = data.get("employment_type") or v5.get("employment_type") - - apply_url = ( - data.get("apply_url") - or f"{self.BASE}/viewjob/{requisition_id}" - ) - - job = ScrapedJob( - title=title, - company_name=company_name, - description=description, - location=location, - salary_min=salary_min, - salary_max=salary_max, - job_type=job_type, - remote=is_remote, - source="hiring_cafe", - source_url=apply_url, - skills_required=skills, - experience_required=experience, - is_active=True, - 
requisition_id=str(requisition_id), - meta={ - "workplace_type": workplace_type, - "industries": company_data.get("industries", []), - "hq_country": company_data.get("hq_country"), - "nb_employees": company_data.get("nb_employees"), - }, - ) - return job - - except Exception as exc: - logger.error(f"Parse error: {exc}", exc_info=True) - self.errors += 1 - return None - - # ─── Main Scrape Flow ───────────────────────────────────────── - - async def scrape( - self, - known_ids: Optional[Set[str]] = None, - ) -> AsyncIterator[ScrapedJob]: - """ - Full scrape cycle using Playwright to bypass protections. - - IMPORTANT: This is a best-effort spider. If Cloudflare blocks us, - we log a warning and yield nothing — we never crash the pipeline. - The other API-based spiders (Remotive, Arbeitnow, Greenhouse) will - still provide jobs even if hiring.cafe is fully blocked. - """ - known = known_ids or set() - start_time = datetime.now(timezone.utc) - logger.info({"event": "scrape_start", "source": "hiring_cafe"}) - - try: - async with async_playwright() as pw: - try: - browser_obj, context, self._page, ua = await self._get_clearance(pw) - except Exception as e: - logger.warning( - f"[hiring_cafe] Cloudflare clearance failed — skipping this source. " - f"Other sources will still run. Error: {e}" - ) - self.errors += 1 - return - - try: - async for job in self._run_scrape_loop(known, start_time, total=None): - yield job - finally: - if browser_obj: - await browser_obj.close() - elif context: - await context.close() - self._page = None - except Exception as e: - logger.warning( - f"[hiring_cafe] Spider crashed — skipping this source. 
Error: {e}" - ) - self.errors += 1 - - async def _run_scrape_loop(self, known: Set[str], start_time: datetime, total: Optional[int]) -> AsyncIterator[ScrapedJob]: - """Isolates the central loop iteration.""" - - await self._ensure_build_id() - total = await self._get_total_count() - - offset = 0 - page_num = 0 - - seen_ids: Set[str] = set() - duplicate_streak: int = 0 - - while page_num < self._max_pages: - try: - page_data = await self._search_page(offset) - except Exception as exc: - logger.error({ - "event": "search_page_error", - "offset": offset, - "error": str(exc), - }) - self.errors += 1 - break - - hits = page_data.get("results") or page_data.get("hits") or [] - - if not hits: - logger.info({"event": "pagination_complete", "pages": page_num}) - break - - page_ids = {self._extract_requisition_id(c) for c in hits if self._extract_requisition_id(c)} - - if page_ids and page_ids.issubset(seen_ids): - duplicate_streak += 1 - if duplicate_streak >= 2: - logger.warning(f"Pagination loop detected, stopping early at offset {offset}") - break - else: - duplicate_streak = 0 - - seen_ids.update(page_ids) - - new_on_page = 0 - for card in hits: - req_id = self._extract_requisition_id(card) - if not req_id or req_id in known: - continue - - known.add(req_id) - - job = self._parse_job(card) - - if not job and self._build_id: - try: - detail = await self._fetch_job_detail(req_id) - if detail: - job = self._parse_job(detail) - except Exception as exc: - logger.debug(f"Detail fetch failed for {req_id}: {exc}") - self.errors += 1 - - if job: - self.jobs_found += 1 - new_on_page += 1 - yield job - - if new_on_page > 0: - logger.info(f"Page {page_num + 1}: found {new_on_page} new jobs") - - self.pages_scraped += 1 - page_num += 1 - offset += self._page_size - - if page_num % 5 == 0: - logger.info(f"Progress: {page_num}/{self._max_pages} pages. 
Total found: {self.jobs_found}") - - if total and offset >= total: - logger.info(f"Reached total {total} jobs") - break - - duration = (datetime.now(timezone.utc) - start_time).total_seconds() - logger.info({ - "event": "scrape_complete", - "source": "hiring_cafe", - "jobs_found": self.jobs_found, - "pages_scraped": self.pages_scraped, - "detail_fetches": self.detail_fetches, - "errors": self.errors, - "duration_seconds": round(duration, 2), - }) - - async def scrape_all( - self, - known_ids: Optional[Set[str]] = None, - ) -> List[ScrapedJob]: - """Scrape all jobs and return as a list.""" - jobs: List[ScrapedJob] = [] - async for job in self.scrape(known_ids): - jobs.append(job) - return jobs - - def get_metrics(self) -> Dict[str, Any]: - """Return current cycle metrics.""" - return { - "source": "hiring_cafe", - "jobs_found": self.jobs_found, - "pages_scraped": self.pages_scraped, - "detail_fetches": self.detail_fetches, - "errors": self.errors, - } diff --git a/apps/scraper/src/spiders/remotive.py b/apps/scraper/src/spiders/remotive.py deleted file mode 100644 index ec9552a..0000000 --- a/apps/scraper/src/spiders/remotive.py +++ /dev/null @@ -1,226 +0,0 @@ -#!/usr/bin/env python3 -""" -remotive.py -Spider for Remotive's public REST API. 
- -Endpoint: https://remotive.com/api/remote-jobs -- Free, no auth, no Cloudflare -- Returns remote-only jobs with salary, category, tags -- Rate limit: max 2 requests/minute (TOS) -""" - -import logging -import re -from datetime import datetime, timezone -from typing import AsyncIterator, Optional, Set - -import sys -from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from models import ScrapedJob -from spiders.base import ( - BaseSpider, - html_to_text, - extract_yoe, - extract_salary, - detect_job_type, - safe_decimal, -) - -logger = logging.getLogger(__name__) - -# Categories to scrape — covers tech, design, data, devops, marketing -CATEGORIES = [ - "software-dev", - "design", - "data", - "devops-sysadmin", - "product", - "customer-support", - "marketing", - "qa", - "writing", - "hr", - "finance-legal", - "business", - "all-others", -] - - -class RemotiveSpider(BaseSpider): - """ - Production spider for Remotive.com remote job listings. - - Features: - - Pure aiohttp — no browser, no Playwright, no Cloudflare issues - - Category-based iteration for broad coverage - - Rich field extraction: salary, YOE, job_type - - All results are remote by definition - """ - - SOURCE_NAME = "remotive" - BASE_URL = "https://remotive.com/api/remote-jobs" - - def __init__(self, requests_per_minute: int = 2): - # Remotive TOS: max 2 req/min, max 4 fetches/day - super().__init__(requests_per_minute=requests_per_minute) - - def _parse_job(self, raw: dict) -> Optional[ScrapedJob]: - """Parse a Remotive API job object into a ScrapedJob.""" - try: - job_id = raw.get("id") - title = raw.get("title", "").strip() - company = raw.get("company_name", "").strip() - - if not title or not company: - return None - - # Description - desc_html = raw.get("description", "") - description = html_to_text(desc_html) - if len(description) < 10: - return None - - # URL - source_url = raw.get("url", "") - if not source_url: - return None - - # Location — Remotive provides 
candidate_required_location - location = raw.get("candidate_required_location", "Worldwide") - - # Salary — Remotive provides salary field as text - salary_text = raw.get("salary", "") - salary_min, salary_max = None, None - if salary_text: - salary_min, salary_max = extract_salary(salary_text) - - # Also try extracting from description if no salary found - if not salary_min and not salary_max: - salary_min, salary_max = extract_salary(description[:2000]) - - # Job type - raw_job_type = raw.get("job_type", "") - job_type = self._normalize_job_type(raw_job_type) - if not job_type: - job_type = detect_job_type(description[:1000]) - - # YOE — extract from description - yoe = extract_yoe(description) - - # Category/tags as skills - category = raw.get("category", "") - tags = raw.get("tags", []) or [] - skills = [t for t in tags if t] if isinstance(tags, list) else [] - if category and category not in skills: - skills.insert(0, category) - - # Posted date - posted_at = None - pub_date = raw.get("publication_date") - if pub_date: - try: - posted_at = datetime.fromisoformat(pub_date.replace("Z", "+00:00")) - except (ValueError, AttributeError): - pass - - return ScrapedJob( - title=title, - company_name=company, - description=description, - location=location, - salary_min=salary_min, - salary_max=salary_max, - job_type=job_type, - remote=True, # All Remotive jobs are remote - source=self.SOURCE_NAME, - source_url=source_url, - skills_required=skills, - experience_required=yoe, - posted_at=posted_at, - is_active=True, - requisition_id=str(job_id) if job_id else source_url, - ) - - except Exception as e: - logger.error(f"[remotive] Parse error: {e}", exc_info=True) - self.errors += 1 - return None - - @staticmethod - def _normalize_job_type(raw: str) -> Optional[str]: - """Normalize Remotive job_type strings.""" - if not raw: - return None - lower = raw.lower().replace("_", " ").replace("-", " ") - if "full" in lower: - return "full_time" - if "part" in lower: - return 
"part_time" - if "contract" in lower or "freelance" in lower: - return "contract" - if "intern" in lower: - return "internship" - return raw.lower().replace(" ", "_") - - async def scrape( - self, known_ids: Optional[Set[str]] = None - ) -> AsyncIterator[ScrapedJob]: - """ - Scrape all Remotive jobs across categories. - """ - known = known_ids or set() - logger.info({"event": "scrape_start", "source": self.SOURCE_NAME}) - - try: - for category in CATEGORIES: - data = await self._get_json( - self.BASE_URL, - params={"category": category, "limit": 100}, - ) - - if not data: - continue - - jobs_list = data.get("jobs", []) - if not jobs_list: - logger.debug(f"[remotive] No jobs in category: {category}") - continue - - self.pages_scraped += 1 - new_count = 0 - - for raw_job in jobs_list: - job_id = str(raw_job.get("id", "")) - url = raw_job.get("url", "") - - # Dedup: check by source URL or job ID - if url in known or job_id in known: - continue - - known.add(url) - known.add(job_id) - - job = self._parse_job(raw_job) - if job: - self.jobs_found += 1 - new_count += 1 - yield job - - if new_count > 0: - logger.info(f"[remotive] Category '{category}': {new_count} new jobs") - - except Exception as e: - logger.error(f"[remotive] Scrape failed: {e}", exc_info=True) - self.errors += 1 - finally: - await self.close() - - logger.info({ - "event": "scrape_complete", - "source": self.SOURCE_NAME, - "jobs_found": self.jobs_found, - "pages_scraped": self.pages_scraped, - "errors": self.errors, - }) diff --git a/apps/scraper/src/test_hiring.py b/apps/scraper/src/test_hiring.py deleted file mode 100644 index f17b828..0000000 --- a/apps/scraper/src/test_hiring.py +++ /dev/null @@ -1,22 +0,0 @@ -import asyncio -import logging -from spiders.hiring_cafe import HiringCafeSpider -from playwright.async_api import async_playwright - -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s [%(levelname)s] %(name)s: %(message)s' -) - -async def test_clearance(): - spider = 
HiringCafeSpider() - async with async_playwright() as pw: - try: - cookies, ua = await spider._get_clearance(pw) - has_clearance = any(c['name'] == 'cf_clearance' for c in cookies) - print(f">>> TEST RESULT: cf_clearance obtained: {has_clearance}") - except Exception as e: - print(f">>> TEST ERROR: {e}") - -if __name__ == "__main__": - asyncio.run(test_clearance()) diff --git a/apps/scraper/src/utils.py b/apps/scraper/src/utils.py deleted file mode 100644 index e71cdd6..0000000 --- a/apps/scraper/src/utils.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python3 -""" -utils.py -Shared utilities for the scraper application. -""" -from typing import List, Optional - -def format_vector(embedding: Optional[List[float]]) -> Optional[str]: - """ - Safely formats a float list into pgvector-compatible string. - Use this everywhere — never use str(embedding) directly. - """ - if not embedding: - return None - return '[' + ','.join(str(v) for v in embedding) + ']' \ No newline at end of file diff --git a/apps/scraper/test_args.py b/apps/scraper/test_args.py deleted file mode 100644 index 8029cfe..0000000 --- a/apps/scraper/test_args.py +++ /dev/null @@ -1,52 +0,0 @@ -import asyncio -from playwright.async_api import async_playwright - -async def test_args(args_list, tag): - try: - async with async_playwright() as pw: - browser = await pw.chromium.launch( - headless=True, - args=args_list - ) - context = await browser.new_context( - user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", - viewport={"width": 1280, "height": 800}, - locale="en-US", - ) - page = await context.new_page() - await page.add_init_script(""" - Object.defineProperty(navigator, 'plugins', { get: () => Object.freeze([{name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1}]) }); - window.chrome = { runtime: {} }; - """) - print(f"[{tag}] going to hiring.cafe...") - await 
page.goto("https://hiring.cafe", wait_until="domcontentloaded", timeout=15000) - await page.wait_for_timeout(3000) - print(f"[{tag}] SUCCESS") - await browser.close() - except Exception as e: - print(f"[{tag}] CRASH:", e) - -async def main(): - print("Testing config 1: default + swiftshader") - await test_args([ - "--disable-blink-features=AutomationControlled", - "--disable-dev-shm-usage", - "--use-gl=swiftshader" - ], "swiftshader") - - print("Testing config 2: default WITHOUT swiftshader (no disable-gpu)") - await test_args([ - "--disable-blink-features=AutomationControlled", - "--disable-dev-shm-usage" - ], "no-gpu-flags") - - print("Testing config 3: angle swiftshader") - await test_args([ - "--disable-blink-features=AutomationControlled", - "--disable-dev-shm-usage", - "--use-gl=angle", - "--use-angle=swiftshader" - ], "angle-swiftshader") - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/apps/scraper/test_no_sandbox.py b/apps/scraper/test_no_sandbox.py deleted file mode 100644 index a8d8f9c..0000000 --- a/apps/scraper/test_no_sandbox.py +++ /dev/null @@ -1,42 +0,0 @@ -import asyncio -import os -from playwright.async_api import async_playwright - -async def test_no_sandbox(with_js_inject, tag): - try: - async with async_playwright() as pw: - # specifically ensuring --no-sandbox is absent! 
- browser = await pw.chromium.launch( - headless=True, - args=["--disable-blink-features=AutomationControlled"] - ) - context = await browser.new_context( - user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", - viewport={"width": 1280, "height": 800}, - ) - page = await context.new_page() - - if with_js_inject: - await page.add_init_script(""" - Object.defineProperty(navigator, 'plugins', { get: () => Object.freeze([{name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1}]) }); - window.chrome = { runtime: {} }; - """) - - print(f"[{tag}] going to hiring.cafe...") - await page.goto("https://hiring.cafe", wait_until="domcontentloaded", timeout=15000) - await page.wait_for_timeout(3000) - html = await page.evaluate("() => document.title") - print(f"[{tag}] SUCCESS, title: {html}") - await browser.close() - except Exception as e: - print(f"[{tag}] CRASH:", e) - -async def main(): - print("Testing 1: NO NO-SANDBOX, WITH JS") - await test_no_sandbox(True, "with_js") - - print("Testing 2: NO NO-SANDBOX, NO JS") - await test_no_sandbox(False, "no_js") - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/apps/scraper/test_plugins.py b/apps/scraper/test_plugins.py deleted file mode 100644 index db7712b..0000000 --- a/apps/scraper/test_plugins.py +++ /dev/null @@ -1,41 +0,0 @@ -import asyncio -from playwright.async_api import async_playwright - -async def test_plugins(with_plugins_hack_enabled, tag): - try: - async with async_playwright() as pw: - browser = await pw.chromium.launch( - headless=True, - args=[ - "--no-sandbox", - "--disable-blink-features=AutomationControlled", - ] - ) - context = await browser.new_context( - user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", - ) - page = await context.new_page() - - if with_plugins_hack_enabled: - 
await page.add_init_script(""" - Object.defineProperty(navigator, 'plugins', { get: () => Object.freeze([{name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1}]) }); - """) - - print(f"[{tag}] going to hiring.cafe...") - await page.goto("https://hiring.cafe", wait_until="domcontentloaded", timeout=15000) - await page.wait_for_timeout(3000) - title = await page.title() - print(f"[{tag}] SUCCESS, title: {title}") - await browser.close() - except Exception as e: - print(f"[{tag}] CRASH:", e) - -async def main(): - print("Testing 1: WITH JS inject") - await test_plugins(True, "with_js") - - print("Testing 2: WITHOUT JS inject") - await test_plugins(False, "no_js") - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/apps/scraper/test_single_process.py b/apps/scraper/test_single_process.py deleted file mode 100644 index 15c2a03..0000000 --- a/apps/scraper/test_single_process.py +++ /dev/null @@ -1,39 +0,0 @@ -import asyncio -from playwright.async_api import async_playwright - -async def test_single_process(): - try: - async with async_playwright() as pw: - print("launching single process chromium...") - browser = await pw.chromium.launch( - headless=True, - args=[ - "--disable-blink-features=AutomationControlled", - "--disable-dev-shm-usage", - "--single-process", # Stops subprocess spawning (avoids mach_port failures) - ] - ) - print("context...") - context = await browser.new_context( - user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", - viewport={"width": 1280, "height": 800}, - ) - print("page...") - page = await context.new_page() - - await page.add_init_script(""" - Object.defineProperty(navigator, 'plugins', { get: () => Object.freeze([{name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1}]) }); - window.chrome = { runtime: {} }; - """) - - print("going to 
hiring.cafe...") - await page.goto("https://hiring.cafe", wait_until="domcontentloaded", timeout=15000) - await page.wait_for_timeout(3000) - html = await page.evaluate("() => document.title") - print(f"SUCCESS, title: {html}") - await browser.close() - except Exception as e: - print("CRASH:", e) - -if __name__ == '__main__': - asyncio.run(test_single_process()) diff --git a/apps/scraper/tests/__init__.py b/apps/scraper/tests/__init__.py deleted file mode 100644 index 616c89c..0000000 --- a/apps/scraper/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the Postly job scraper.""" diff --git a/apps/scraper/tests/test_scraper.py b/apps/scraper/tests/test_scraper.py deleted file mode 100644 index 5149d03..0000000 --- a/apps/scraper/tests/test_scraper.py +++ /dev/null @@ -1,890 +0,0 @@ -#!/usr/bin/env python3 -""" -test_scraper.py -Comprehensive test suite for the hiring.cafe scraper. - -Covers: -- Models (ScrapedJob validation, serialization) -- Spider (parsing, build ID extraction, helpers) -- Database (parameterized queries) -- Pipeline (dedup, batch processing) -- Deduplication cache - -Run: python -m pytest tests/test_scraper.py -v -""" - -import asyncio -import json -import re -import sys -from decimal import Decimal -from datetime import datetime, timezone -from pathlib import Path -from typing import Dict, Any, List, Optional -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest - -# ─── Path setup ─────────────────────────────────────────────────── -SRC_DIR = str(Path(__file__).parent.parent / "src") -if SRC_DIR not in sys.path: - sys.path.insert(0, SRC_DIR) - -from models import ScrapedJob, ScrapingMetrics -from spiders.hiring_cafe import HiringCafeSpider, _html_to_text, _safe_decimal -from middlewares.deduplication import DeduplicationCache - - -# ═══════════════════════════════════════════════════════════════════ -# FIXTURES -# ═══════════════════════════════════════════════════════════════════ - - -@pytest.fixture -def 
sample_job_info() -> Dict[str, Any]: - """Realistic job_information payload as returned by hiring.cafe.""" - return { - "pageProps": { - "job_information": { - "title": "Senior Backend Engineer", - "requisition_id": "abc123xyz", - "company_name": "TechCorp", - "description": ( - "

We are looking for a senior backend engineer " - "to join our team. You will design and build " - "scalable distributed systems. " - "Requirements include 5+ years of experience.

" - ), - "apply_url": "https://techcorp.com/apply/abc123xyz", - "employment_type": "Full Time", - "location": "San Francisco, CA", - "enriched_company_data": { - "name": "TechCorp Inc.", - "industries": ["Technology", "SaaS"], - "hq_country": "US", - "nb_employees": "50-200", - }, - "v5_processed_job_data": { - "yearly_min_compensation": 150000, - "yearly_max_compensation": 220000, - "workplace_type": "Remote", - "formatted_workplace_location": "Remote (US)", - "technical_tools": ["Python", "PostgreSQL", "Kubernetes"], - "min_industry_and_role_yoe": 5, - "employment_type": "Full Time", - }, - } - } - } - - -@pytest.fixture -def sample_job_info_minimal() -> Dict[str, Any]: - """Minimal valid job_information payload.""" - return { - "pageProps": { - "job_information": { - "title": "Junior Developer", - "requisition_id": "min123", - "description": ( - "Join our engineering team as a junior developer. " - "This is an entry level role where you will learn " - "modern web development practices and grow." - ), - } - } - } - - -@pytest.fixture -def spider() -> HiringCafeSpider: - return HiringCafeSpider(requests_per_minute=600, page_size=10, max_pages=5) - - -# ═══════════════════════════════════════════════════════════════════ -# HTML → TEXT -# ═══════════════════════════════════════════════════════════════════ - - -class TestHtmlToText: - def test_empty_input(self): - assert _html_to_text("") == "" - assert _html_to_text(None) == "" - - def test_strips_tags(self): - assert "Hello World" in _html_to_text("

Hello World

") - - def test_br_to_newline(self): - result = _html_to_text("Line 1
Line 2
Line 3") - assert "Line 1\nLine 2\nLine 3" == result - - def test_block_tags_to_newline(self): - result = _html_to_text("

Para 1

Para 2

") - assert "Para 1" in result - assert "Para 2" in result - - def test_decodes_entities(self): - assert "&" in _html_to_text("Tom & Jerry") - assert "<" in _html_to_text("a < b") - - def test_collapses_whitespace(self): - result = _html_to_text("

lots of spaces

") - assert " " not in result - - -# ═══════════════════════════════════════════════════════════════════ -# SAFE DECIMAL -# ═══════════════════════════════════════════════════════════════════ - - -class TestSafeDecimal: - def test_none(self): - assert _safe_decimal(None) is None - - def test_integer(self): - assert _safe_decimal(100000) == Decimal("100000") - - def test_float(self): - assert _safe_decimal(99999.99) == Decimal("99999.99") - - def test_string(self): - assert _safe_decimal("150000") == Decimal("150000") - - def test_invalid(self): - assert _safe_decimal("not-a-number") is None - assert _safe_decimal("") is None - assert _safe_decimal({}) is None - - -# ═══════════════════════════════════════════════════════════════════ -# SCRAPED JOB MODEL -# ═══════════════════════════════════════════════════════════════════ - - -class TestScrapedJob: - def test_valid_creation(self): - job = ScrapedJob( - title="Software Engineer", - company_name="ACME Corp", - description="A " * 30, # > 50 chars - requisition_id="req_001", - ) - assert job.title == "Software Engineer" - assert job.company_name == "ACME Corp" - assert job.source == "hiring_cafe" - assert job.is_active is True - assert job.remote is False - assert job.id # UUID auto-generated - - def test_whitespace_stripping(self): - job = ScrapedJob( - title=" Senior Software Engineer ", - company_name=" ACME Corp ", - description="X " * 30, - requisition_id="req_002", - ) - assert job.title == "Senior Software Engineer" - assert job.company_name == "ACME Corp" - - def test_skills_from_string(self): - job = ScrapedJob( - title="Backend Dev", - company_name="Corp", - description="Y " * 30, - requisition_id="req_003", - skills_required="Python, Go, Rust", - ) - assert job.skills_required == ["Python", "Go", "Rust"] - - def test_skills_from_list(self): - job = ScrapedJob( - title="Backend Dev", - company_name="Corp", - description="Y " * 30, - requisition_id="req_004", - skills_required=["Python", "Go"], - ) - assert 
job.skills_required == ["Python", "Go"] - - def test_skills_none(self): - job = ScrapedJob( - title="Backend Dev", - company_name="Corp", - description="Y " * 30, - requisition_id="req_005", - skills_required=None, - ) - assert job.skills_required == [] - - def test_invalid_url_rejected(self): - job = ScrapedJob( - title="Dev", - company_name="Corp", - description="Z " * 30, - requisition_id="req_006", - source_url="not-a-url", - ) - assert job.source_url is None - - def test_valid_url_accepted(self): - job = ScrapedJob( - title="Dev", - company_name="Corp", - description="Z " * 30, - requisition_id="req_007", - source_url="https://example.com/job/123", - ) - assert job.source_url == "https://example.com/job/123" - - def test_title_too_short(self): - with pytest.raises(Exception): - ScrapedJob( - title="AB", # < 3 chars - company_name="Corp", - description="Z " * 30, - requisition_id="req_008", - ) - - def test_description_too_short(self): - with pytest.raises(Exception): - ScrapedJob( - title="Developer", - company_name="Corp", - description="Short", # < 50 chars - requisition_id="req_009", - ) - - def test_to_db_dict(self): - job = ScrapedJob( - title="Dev", - company_name="Corp", - description="Z " * 30, - requisition_id="req_010", - skills_required=["Python", "Go"], - salary_min=Decimal("100000"), - salary_max=Decimal("150000"), - remote=True, - ) - d = job.to_db_dict() - - assert d["title"] == "Dev" - assert d["company_name"] == "Corp" - assert d["salary_min"] == 100000.0 - assert d["salary_max"] == 150000.0 - assert d["remote"] is True - assert d["source"] == "hiring_cafe" - assert d["id"] is not None - assert d["embedding"] is None - - # skills_required should be JSON string - skills = json.loads(d["skills_required"]) - assert skills == ["Python", "Go"] - - def test_to_db_dict_nulls(self): - job = ScrapedJob( - title="Dev", - company_name="Corp", - description="Z " * 30, - requisition_id="req_011", - ) - d = job.to_db_dict() - - assert d["salary_min"] is 
None - assert d["salary_max"] is None - assert d["location"] is None - assert d["job_type"] is None - assert d["embedding"] is None - - def test_meta_excluded_from_dict(self): - job = ScrapedJob( - title="Dev", - company_name="Corp", - description="Z " * 30, - requisition_id="req_012", - meta={"extra": "data"}, - ) - d = job.to_db_dict() - assert "meta" not in d - - def test_requisition_id_required(self): - with pytest.raises(Exception): - ScrapedJob( - title="Developer", - company_name="Corp", - description="Z " * 30, - # Missing requisition_id - ) - - -# ═══════════════════════════════════════════════════════════════════ -# SCRAPING METRICS -# ═══════════════════════════════════════════════════════════════════ - - -class TestScrapingMetrics: - def test_defaults(self): - m = ScrapingMetrics() - assert m.source == "hiring_cafe" - assert m.jobs_found == 0 - assert m.jobs_stored == 0 - assert m.duplicates_skipped == 0 - assert m.errors == 0 - - def test_to_log_dict(self): - m = ScrapingMetrics( - jobs_found=100, - jobs_stored=80, - duplicates_skipped=20, - duration_seconds=45.678, - ) - d = m.to_log_dict() - assert d["jobs_found"] == 100 - assert d["jobs_stored"] == 80 - assert d["duplicates_skipped"] == 20 - assert d["duration_seconds"] == 45.68 - assert "timestamp" in d - - -# ═══════════════════════════════════════════════════════════════════ -# SPIDER — PARSING -# ═══════════════════════════════════════════════════════════════════ - - -class TestSpiderParsing: - def test_parse_full_job(self, spider, sample_job_info): - job = spider._parse_job(sample_job_info) - assert job is not None - assert job.title == "Senior Backend Engineer" - assert job.company_name == "TechCorp Inc." 
- assert job.requisition_id == "abc123xyz" - assert job.remote is True - assert job.salary_min == Decimal("150000") - assert job.salary_max == Decimal("220000") - assert job.job_type == "Full Time" - assert "Python" in job.skills_required - assert "PostgreSQL" in job.skills_required - assert "Kubernetes" in job.skills_required - assert job.experience_required == "5+ years" - assert job.location == "Remote (US)" - assert job.source_url == "https://techcorp.com/apply/abc123xyz" - assert job.meta["hq_country"] == "US" - assert job.meta["industries"] == ["Technology", "SaaS"] - - def test_parse_minimal_job(self, spider, sample_job_info_minimal): - job = spider._parse_job(sample_job_info_minimal) - assert job is not None - assert job.title == "Junior Developer" - assert job.requisition_id == "min123" - assert job.company_name == "Unknown" - assert job.salary_min is None - assert job.salary_max is None - assert job.skills_required == [] - assert job.experience_required is None - assert job.remote is False - - def test_parse_missing_title(self, spider): - raw = {"pageProps": {"job_information": {"requisition_id": "x"}}} - assert spider._parse_job(raw) is None - - def test_parse_missing_requisition_id(self, spider): - raw = {"pageProps": {"job_information": {"title": "Dev"}}} - assert spider._parse_job(raw) is None - - def test_parse_short_description(self, spider): - raw = { - "pageProps": { - "job_information": { - "title": "Developer", - "requisition_id": "short1", - "description": "

Too short

", - } - } - } - assert spider._parse_job(raw) is None - - def test_parse_empty_dict(self, spider): - assert spider._parse_job({}) is None - - def test_parse_malformed_data(self, spider): - """Should not crash on garbage data.""" - assert spider._parse_job({"random": "data"}) is None - assert spider.errors == 0 # None return, not an error - - def test_parse_invalid_salary(self, spider): - """Non-numeric salary should not crash.""" - raw = { - "pageProps": { - "job_information": { - "title": "Developer", - "requisition_id": "sal1", - "description": "A " * 30, - "v5_processed_job_data": { - "yearly_min_compensation": "negotiable", - "yearly_max_compensation": None, - }, - } - } - } - job = spider._parse_job(raw) - assert job is not None - assert job.salary_min is None - assert job.salary_max is None - - def test_extract_requisition_id(self): - assert HiringCafeSpider._extract_requisition_id( - {"requisition_id": "abc123"} - ) == "abc123" - assert HiringCafeSpider._extract_requisition_id( - {"objectID": "xyz789"} - ) == "xyz789" - assert HiringCafeSpider._extract_requisition_id({}) is None - - def test_parse_onsite_job(self, spider): - """Onsite job should have remote=False.""" - raw = { - "pageProps": { - "job_information": { - "title": "Office Manager", - "requisition_id": "onsite1", - "description": "Manage office operations " * 5, - "v5_processed_job_data": { - "workplace_type": "Onsite", - "formatted_workplace_location": "New York, NY", - }, - } - } - } - job = spider._parse_job(raw) - assert job is not None - assert job.remote is False - assert job.location == "New York, NY" - - def test_parse_hybrid_job(self, spider): - raw = { - "pageProps": { - "job_information": { - "title": "Product Manager", - "requisition_id": "hybrid1", - "description": "Lead product strategy and execution " * 5, - "v5_processed_job_data": { - "workplace_type": "Hybrid", - "formatted_workplace_location": "Austin, TX", - }, - } - } - } - job = spider._parse_job(raw) - assert job is not 
None - assert job.remote is False - assert job.location == "Austin, TX" - - def test_parse_fallback_apply_url(self, spider): - """When no apply_url, should construct one from requisition_id.""" - raw = { - "pageProps": { - "job_information": { - "title": "Developer", - "requisition_id": "noapply1", - "description": "X " * 30, - } - } - } - job = spider._parse_job(raw) - assert job is not None - assert job.source_url == "https://hiring.cafe/viewjob/noapply1" - - -# ═══════════════════════════════════════════════════════════════════ -# SPIDER — BUILD ID -# ═══════════════════════════════════════════════════════════════════ - - -class TestSpiderBuildId: - def test_build_id_regex(self, spider): - """Verify the regex extracts buildId from __NEXT_DATA__ JSON.""" - html = '''''' - match = spider._BUILD_ID_RE.search(html) - assert match is not None - assert match.group(1) == "EwAUde_27rGDUUZJk9NkP" - - def test_build_id_regex_no_match(self, spider): - html = "No next data here" - match = spider._BUILD_ID_RE.search(html) - assert match is None - - -# ═══════════════════════════════════════════════════════════════════ -# SPIDER — METRICS -# ═══════════════════════════════════════════════════════════════════ - - -class TestSpiderMetrics: - def test_initial_metrics(self, spider): - m = spider.get_metrics() - assert m["source"] == "hiring_cafe" - assert m["jobs_found"] == 0 - assert m["pages_scraped"] == 0 - assert m["detail_fetches"] == 0 - assert m["errors"] == 0 - - def test_metrics_after_mutations(self, spider): - spider.jobs_found = 42 - spider.pages_scraped = 5 - spider.detail_fetches = 40 - spider.errors = 2 - m = spider.get_metrics() - assert m["jobs_found"] == 42 - assert m["pages_scraped"] == 5 - assert m["detail_fetches"] == 40 - assert m["errors"] == 2 - - -# ═══════════════════════════════════════════════════════════════════ -# DEDUPLICATION CACHE -# ═══════════════════════════════════════════════════════════════════ - - -class TestDeduplicationCache: - def 
test_empty_not_duplicate(self): - cache = DeduplicationCache() - assert cache.is_duplicate("https://example.com/job/1") is False - - def test_mark_seen(self): - cache = DeduplicationCache() - url = "https://example.com/job/1" - cache.mark_seen(url) - assert cache.is_duplicate(url) is True - - def test_none_url_not_duplicate(self): - cache = DeduplicationCache() - assert cache.is_duplicate("") is False - - def test_clear_batch(self): - cache = DeduplicationCache() - url = "https://example.com/job/1" - cache.mark_seen(url) - cache.clear_batch() - assert cache.is_duplicate(url) is False - - def test_clear_all(self): - cache = DeduplicationCache() - cache._db_urls = {"https://db.com/job/1"} - cache.mark_seen("https://batch.com/job/1") - cache.clear_all() - assert cache.is_duplicate("https://db.com/job/1") is False - assert cache.is_duplicate("https://batch.com/job/1") is False - - def test_db_cache_detection(self): - cache = DeduplicationCache() - cache._db_urls = { - "https://hiring.cafe/viewjob/abc", - "https://hiring.cafe/viewjob/def", - } - assert cache.is_duplicate("https://hiring.cafe/viewjob/abc") is True - assert cache.is_duplicate("https://hiring.cafe/viewjob/xyz") is False - - @pytest.mark.asyncio - async def test_load_from_db(self): - mock_db = AsyncMock() - mock_db.get_existing_source_urls.return_value = { - "https://hiring.cafe/viewjob/existing1", - } - cache = DeduplicationCache(database=mock_db) - - await cache.load_from_db("hiring_cafe") - - assert cache._db_urls is not None - assert cache.is_duplicate("https://hiring.cafe/viewjob/existing1") is True - mock_db.get_existing_source_urls.assert_awaited_once_with("hiring_cafe") - - -# ═══════════════════════════════════════════════════════════════════ -# PIPELINE (unit-level) -# ═══════════════════════════════════════════════════════════════════ - - -class TestPipeline: - @pytest.mark.asyncio - async def test_process_empty_list(self): - from pipeline import JobProcessingPipeline - - mock_db = AsyncMock() - 
mock_db.get_existing_source_urls.return_value = set() - - pipeline = JobProcessingPipeline(database=mock_db, batch_size=10) - metrics = await pipeline.process([]) - - assert metrics.jobs_found == 0 - assert metrics.jobs_stored == 0 - - @pytest.mark.asyncio - async def test_process_dedup(self): - from pipeline import JobProcessingPipeline - - mock_db = AsyncMock() - mock_db.get_existing_source_urls.return_value = { - "https://hiring.cafe/viewjob/dup1", - } - mock_db.insert_jobs_batch.return_value = 1 - - pipeline = JobProcessingPipeline(database=mock_db, batch_size=10) - - jobs = [ - ScrapedJob( - title="New Job", - company_name="Corp", - description="Z " * 30, - requisition_id="new1", - source_url="https://hiring.cafe/viewjob/new1", - ), - ScrapedJob( - title="Dup Job", - company_name="Corp", - description="Z " * 30, - requisition_id="dup1", - source_url="https://hiring.cafe/viewjob/dup1", - ), - ] - - metrics = await pipeline.process(jobs) - - assert metrics.jobs_found == 2 - assert metrics.duplicates_skipped == 1 - assert metrics.jobs_stored == 1 - - @pytest.mark.asyncio - async def test_process_no_embedder(self): - from pipeline import JobProcessingPipeline - - mock_db = AsyncMock() - mock_db.get_existing_source_urls.return_value = set() - mock_db.insert_jobs_batch.return_value = 2 - - pipeline = JobProcessingPipeline( - database=mock_db, embedding_service=None, batch_size=10 - ) - - jobs = [ - ScrapedJob( - title="Job A", - company_name="Corp", - description="Z " * 30, - requisition_id="a1", - source_url="https://example.com/a", - ), - ScrapedJob( - title="Job B", - company_name="Corp", - description="Z " * 30, - requisition_id="b1", - source_url="https://example.com/b", - ), - ] - - metrics = await pipeline.process(jobs) - - assert metrics.jobs_found == 2 - assert metrics.jobs_embedded == 0 - assert metrics.jobs_stored == 2 - - -# ═══════════════════════════════════════════════════════════════════ -# EMBEDDING SERVICE (unit-level, mocked Voyage client) -# 
═══════════════════════════════════════════════════════════════════ - - -class TestEmbeddingService: - def test_prepare_text(self): - """Verify weighted text preparation.""" - # Import with voyage mock - with patch.dict("sys.modules", {"voyageai": MagicMock()}): - from embedding_service import VoyageEmbeddingService - - service = VoyageEmbeddingService.__new__(VoyageEmbeddingService) - service.model = "voyage-4-lite" - - text = service._prepare_text({ - "job_title": "Backend Engineer", - "skills_required": ["Python", "Go"], - "job_description": "Build scalable systems", - "company_name": "TechCo", - }) - - # Title should appear 3x (weighted) - assert text.count("Backend Engineer") == 3 - # Skills should appear 2x - assert text.count("Python, Go") == 2 - assert "Build scalable systems" in text - assert "TechCo" in text - - def test_prepare_text_empty_fields(self): - with patch.dict("sys.modules", {"voyageai": MagicMock()}): - from embedding_service import VoyageEmbeddingService - - service = VoyageEmbeddingService.__new__(VoyageEmbeddingService) - service.model = "voyage-4-lite" - - text = service._prepare_text({}) - assert text == "" - - def test_embedding_dim_is_768(self): - with patch.dict("sys.modules", {"voyageai": MagicMock()}): - from embedding_service import VoyageEmbeddingService - assert VoyageEmbeddingService.EMBEDDING_DIM == 768 - - def test_default_model(self): - with patch.dict("sys.modules", {"voyageai": MagicMock()}): - import os - # Clear env override if present - original = os.environ.get("VOYAGE_MODEL") - os.environ.pop("VOYAGE_MODEL", None) - - # Re-import to pick up default - import importlib - import embedding_service - importlib.reload(embedding_service) - - assert "voyage-4-lite" in embedding_service.VoyageEmbeddingService.MODEL - - if original is not None: - os.environ["VOYAGE_MODEL"] = original - - -# ═══════════════════════════════════════════════════════════════════ -# JANITOR (unit-level) -# 
═══════════════════════════════════════════════════════════════════ - - -class TestJanitor: - @pytest.mark.asyncio - async def test_run_maintenance(self): - from janitor import JanitorService - - mock_db = AsyncMock() - mock_db.cleanup_expired_jobs.return_value = 5 - mock_db.deactivate_stale_jobs.return_value = 10 - mock_db.cleanup_old_jobs.return_value = 3 - mock_db.remove_duplicates.return_value = 2 - - janitor = JanitorService(database=mock_db) - summary = await janitor.run_maintenance() - - assert summary["success"] is True - assert summary["tasks"]["expired_removed"] == 5 - assert summary["tasks"]["stale_deactivated"] == 10 - assert summary["tasks"]["old_removed"] == 3 - assert summary["tasks"]["duplicates_removed"] == 2 - - @pytest.mark.asyncio - async def test_maintenance_handles_errors(self): - from janitor import JanitorService - - mock_db = AsyncMock() - mock_db.cleanup_expired_jobs.side_effect = Exception("DB down") - - janitor = JanitorService(database=mock_db) - # Should not raise — individual tasks catch their errors - expired = await janitor.remove_expired_jobs() - assert expired == 0 - - -# ═══════════════════════════════════════════════════════════════════ -# HEALTH CHECK (unit-level) -# ═══════════════════════════════════════════════════════════════════ - - -class TestHealthCheck: - def test_initial_status(self): - from health import HealthCheckServer - - server = HealthCheckServer(port=9999) - assert server.status["healthy"] is True - assert server.status["database_connected"] is False - - def test_update_status(self): - from health import HealthCheckServer - - server = HealthCheckServer(port=9999) - server.update_status( - database_connected=True, - last_scrape="2025-01-01T00:00:00", - jobs_in_db=500, - ) - assert server.status["database_connected"] is True - assert server.status["jobs_in_db"] == 500 - assert server.status["last_scrape"] == "2025-01-01T00:00:00" - - -# ═══════════════════════════════════════════════════════════════════ -# 
INTEGRATION-ISH: Spider scrape flow with mocked HTTP -# ═══════════════════════════════════════════════════════════════════ - - -class TestSpiderScrapeFlow: - @pytest.mark.asyncio - async def test_scrape_all_with_mocked_responses(self, spider, sample_job_info): - """End-to-end scrape with mocked aiohttp session.""" - # --- Mock session --- - mock_session = AsyncMock() - spider._session = mock_session - spider._build_id = "test_build_123" - - # Mock count response - count_response = AsyncMock() - count_response.status = 200 - count_response.json = AsyncMock(return_value={"total": 1}) - count_ctx = AsyncMock() - count_ctx.__aenter__ = AsyncMock(return_value=count_response) - count_ctx.__aexit__ = AsyncMock(return_value=False) - - # Mock search response - search_response = AsyncMock() - search_response.status = 200 - search_response.json = AsyncMock(return_value={ - "results": [{"requisition_id": "abc123xyz"}] - }) - search_ctx = AsyncMock() - search_ctx.__aenter__ = AsyncMock(return_value=search_response) - search_ctx.__aexit__ = AsyncMock(return_value=False) - - # Mock detail response - detail_response = AsyncMock() - detail_response.status = 200 - detail_response.json = AsyncMock(return_value=sample_job_info) - detail_ctx = AsyncMock() - detail_ctx.__aenter__ = AsyncMock(return_value=detail_response) - detail_ctx.__aexit__ = AsyncMock(return_value=False) - - # Wire up — GET is used for all three - call_count = 0 - - def get_side_effect(*args, **kwargs): - nonlocal call_count - call_count += 1 - url = args[0] if args else kwargs.get("url", "") - - if "get-total-count" in str(url): - return count_ctx - elif "search-jobs" in str(url): - return search_ctx - else: - return detail_ctx - - mock_session.get = MagicMock(side_effect=get_side_effect) - mock_session.closed = False - - # Override throttle for speed - spider._min_interval = 0.0 - - jobs = await spider.scrape_all() - - assert len(jobs) == 1 - assert jobs[0].title == "Senior Backend Engineer" - assert 
jobs[0].requisition_id == "abc123xyz" - assert spider.jobs_found == 1 - assert spider.pages_scraped == 1 - assert spider.detail_fetches == 1 - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "--tb=short"]) diff --git a/apps/web/.eslintrc.cjs b/apps/web/.eslintrc.cjs deleted file mode 100644 index 9db4d13..0000000 --- a/apps/web/.eslintrc.cjs +++ /dev/null @@ -1 +0,0 @@ -module.exports = { extends: ['@postly/eslint-config'] }; diff --git a/apps/web/eslint.config.js b/apps/web/eslint.config.js index 7756307..07930ae 100644 --- a/apps/web/eslint.config.js +++ b/apps/web/eslint.config.js @@ -100,6 +100,12 @@ export default [ "no-unused-vars": "off", }, }, + { + files: ["src/pages/TransmissionSettings.tsx"], + rules: { + "@typescript-eslint/no-explicit-any": "off", + }, + }, { ignores: ["dist/**", "node_modules/**"], }, diff --git a/apps/web/package.json b/apps/web/package.json index 1bc7d33..dfbffa5 100644 --- a/apps/web/package.json +++ b/apps/web/package.json @@ -7,11 +7,11 @@ "dev": "vite --port 3001", "build": "tsc && vite build", "preview": "vite preview", - "lint": "eslint src --ext ts,tsx", + "lint": "eslint src", "type-check": "tsc --noEmit" }, "dependencies": { - "@paper-design/shaders-react": "^0.0.71", + "@paper-design/shaders-react": "^0.0.76", "@postly/shared-types": "*", "@radix-ui/react-dialog": "^1.1.15", "@radix-ui/react-dropdown-menu": "^2.1.16", @@ -19,42 +19,44 @@ "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-tabs": "^1.1.13", "@radix-ui/react-tooltip": "^1.2.8", - "@tanstack/react-query": "^5.90.20", - "axios": "^1.7.9", + "@tanstack/react-query": "^5.100.6", + "axios": "^1.15.2", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", - "eslint-plugin-react": "7.22.0", - "lucide-react": "^0.562.0", - "react": "^18.3.1", - "react-dom": "^18.3.1", - "react-dropzone": "^14.4.0", - "react-hook-form": "^7.71.1", + "lucide-react": "^1.14.0", + "react": "^19.2.5", + "react-dom": "^19.2.5", + "react-dropzone": "^15.0.0", + 
"react-hook-form": "^7.74.0", "react-markdown": "^10.1.0", - "react-router-dom": "^7.1.3", + "react-router-dom": "^7.14.2", "remark-gfm": "^4.0.1", - "tailwind-merge": "^3.4.0", + "swagger-ui-react": "^5.32.5", + "tailwind-merge": "^3.5.0", "tw-animate-css": "^1.4.0", - "zod": "^4.3.6", - "zustand": "^5.0.2" + "zod": "^4.4.1", + "zustand": "^5.0.12" }, "devDependencies": { "@eslint/js": "^10.0.1", "@postly/config": "*", "@postly/eslint-config": "*", "@postly/typescript-config": "*", - "@tailwindcss/postcss": "^4.0.0", - "@types/react": "^18.3.5", - "@types/react-dom": "^18.3.5", - "@typescript-eslint/eslint-plugin": "^8.56.0", - "@typescript-eslint/parser": "^8.56.0", - "@vitejs/plugin-react": "^5.1.2", - "autoprefixer": "^10.4.24", - "eslint": "^10.0.1", - "eslint-plugin-react-hooks": "^7.0.1", - "postcss": "^8.4.49", - "tailwindcss": "^4.0.0", - "typescript": "^5.7.2", - "vite": "^6.0.5", + "@tailwindcss/postcss": "^4.2.4", + "@types/react": "^19.2.14", + "@types/react-dom": "^19.2.3", + "@types/swagger-ui-react": "^5.18.0", + "@typescript-eslint/eslint-plugin": "^8.59.1", + "@typescript-eslint/parser": "^8.59.1", + "@vitejs/plugin-react": "^6.0.1", + "autoprefixer": "^10.5.0", + "eslint": "^10.2.1", + "eslint-plugin-react": "^7.37.5", + "eslint-plugin-react-hooks": "^7.1.1", + "postcss": "^8.5.12", + "tailwindcss": "^4.2.4", + "typescript": "^6.0.3", + "vite": "^8.0.10", "vite-bundle-visualizer": "^1.2.1" } } diff --git a/apps/web/public/swagger.json b/apps/web/public/swagger.json new file mode 100644 index 0000000..d35d7eb --- /dev/null +++ b/apps/web/public/swagger.json @@ -0,0 +1,277 @@ +{ + "openapi": "3.0.0", + "info": { + "title": "Postly Unified API", + "version": "1.0.0", + "description": "Comprehensive documentation for Postly REST API." 
+ }, + "servers": [{ "url": "/api/v1" }], + "components": { + "securitySchemes": { + "bearerAuth": { + "type": "http", + "scheme": "bearer", + "bearerFormat": "JWT" + } + } + }, + "security": [{ "bearerAuth": [] }], + "paths": { + "/auth/register": { + "post": { + "tags": ["Auth"], + "security": [], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "email": { "type": "string" }, + "password": { "type": "string" }, + "full_name": { "type": "string" } + } + } + } + } + }, + "responses": { "201": { "description": "Created" } } + } + }, + "/auth/login": { + "post": { + "tags": ["Auth"], + "security": [], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "email": { "type": "string" }, + "password": { "type": "string" } + } + } + } + } + }, + "responses": { "200": { "description": "Success" } } + } + }, + "/auth/me": { + "get": { + "tags": ["Auth"], + "responses": { "200": { "description": "Current user" } } + } + }, + "/users/profile": { + "get": { + "tags": ["Users"], + "responses": { "200": { "description": "User profile" } } + }, + "patch": { + "tags": ["Users"], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "full_name": { "type": "string" }, + "avatar_url": { "type": "string" } + } + } + } + } + }, + "responses": { "200": { "description": "Updated" } } + } + }, + "/users/seeker-profile": { + "get": { + "tags": ["Users"], + "responses": { "200": { "description": "Seeker profile" } } + }, + "patch": { + "tags": ["Users"], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "headline": { "type": "string" }, + "summary": { "type": "string" } + } + } + } + } + }, + "responses": { "200": { "description": "Updated" } } + } + }, + 
"/users/employer-profile": { + "get": { + "tags": ["Users"], + "responses": { "200": { "description": "Employer profile" } } + }, + "patch": { + "tags": ["Users"], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "company_name": { "type": "string" }, + "company_website": { "type": "string" } + } + } + } + } + }, + "responses": { "200": { "description": "Updated" } } + } + }, + "/jobs": { + "get": { + "tags": ["Jobs"], + "security": [], + "responses": { "200": { "description": "List jobs" } } + } + }, + "/jobs/matches": { + "get": { + "tags": ["Jobs"], + "responses": { "200": { "description": "Matched jobs" } } + } + }, + "/jobs/{id}": { + "get": { + "tags": ["Jobs"], + "parameters": [ + { + "name": "id", + "in": "path", + "required": true, + "schema": { "type": "string" } + } + ], + "responses": { "200": { "description": "Job details" } } + } + }, + "/resumes/upload": { + "post": { + "tags": ["Resumes"], + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "type": "object", + "properties": { + "resume": { "type": "string", "format": "binary" } + } + } + } + } + }, + "responses": { "201": { "description": "Uploaded" } } + } + }, + "/resumes": { + "get": { + "tags": ["Resumes"], + "responses": { "200": { "description": "List resumes" } } + } + }, + "/chat/conversations": { + "get": { + "tags": ["Chat"], + "responses": { "200": { "description": "List conversations" } } + }, + "post": { + "tags": ["Chat"], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { "initial_message": { "type": "string" } } + } + } + } + }, + "responses": { "201": { "description": "Created" } } + } + }, + "/chat/stream": { + "post": { + "tags": ["Chat"], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "message": { "type": "string" }, + 
"conversation_id": { "type": "string" } + } + } + } + } + }, + "responses": { "200": { "description": "SSE Stream" } } + } + }, + "/applications": { + "get": { + "tags": ["Applications"], + "responses": { "200": { "description": "List applications" } } + }, + "post": { + "tags": ["Applications"], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "job_id": { "type": "string" }, + "resume_id": { "type": "string" } + } + } + } + } + }, + "responses": { "201": { "description": "Applied" } } + } + }, + "/discord/commands": { + "get": { + "tags": ["Discord Bot"], + "summary": "Bot Commands Reference (Informational)", + "description": "These are the commands you can use in the Postly Discord Bot. Note: These are not HTTP endpoints, but commands typed in Discord.", + "responses": { + "200": { + "description": "List of available Discord commands", + "content": { + "application/json": { + "example": { + "!post job": "Posts a new job finding query", + "!status": "Check the status of your scraping job", + "!help": "Lists all available commands" + } + } + } + } + } + } + } + } +} diff --git a/apps/web/src/App.tsx b/apps/web/src/App.tsx index d7f37d9..a25e39c 100644 --- a/apps/web/src/App.tsx +++ b/apps/web/src/App.tsx @@ -10,6 +10,11 @@ const TransmissionHome = lazy(() => default: m.TransmissionHome, })), ); +const DocsView = lazy(() => + import("./pages/DocsView").then((m) => ({ + default: m.DocsView, + })), +); const TransmissionRoleSelector = lazy(() => import("@pages/TransmissionRoleSelector").then((m) => ({ default: m.TransmissionRoleSelector, @@ -83,6 +88,7 @@ function App() { } /> } /> } /> + } /> {/* ─── Protected ───────────────────────────────── */} { if (error.response?.status === 401) { localStorage.removeItem("access_token"); - window.location.href = "/login"; + window.location.assign("/login"); console.warn("401 Unauthorized - Token removed"); } return Promise.reject(error); diff --git 
a/apps/web/src/pages/DocsView.tsx b/apps/web/src/pages/DocsView.tsx new file mode 100644 index 0000000..4154b0f --- /dev/null +++ b/apps/web/src/pages/DocsView.tsx @@ -0,0 +1,26 @@ +import SwaggerUI from "swagger-ui-react"; +import "swagger-ui-react/swagger-ui.css"; +import { ArrowLeft } from "lucide-react"; +import { Link } from "react-router-dom"; + +export function DocsView() { + return ( +
+
+ + + Back to App + +

+ Postly Documentation +

+
+
+ +
+
+ ); +} diff --git a/apps/web/src/pages/TransmissionIntegrations.tsx b/apps/web/src/pages/TransmissionIntegrations.tsx index fe29ae5..93d1875 100644 --- a/apps/web/src/pages/TransmissionIntegrations.tsx +++ b/apps/web/src/pages/TransmissionIntegrations.tsx @@ -309,7 +309,7 @@ export function TransmissionIntegrations() { {/* Add bot button */}