diff --git a/.github/SECRETS.md b/.github/SECRETS.md index 4934d9c..8c3f960 100644 --- a/.github/SECRETS.md +++ b/.github/SECRETS.md @@ -1,113 +1,58 @@ # GitHub Secrets Configuration -This document lists all the secrets that need to be configured in GitHub repository settings for CI/CD pipelines to work properly. +All secrets needed for CI/CD pipelines. Configure in **Settings → Secrets and variables → Actions**. ## Required Secrets -### Docker Hub (for docker-build job) +### VPS Deployment -- **DOCKER_USERNAME** - Your Docker Hub username -- **DOCKER_PASSWORD** - Your Docker Hub password or access token +| Secret | Description | Example | +| ------------- | ----------------------------------- | ------------- | +| `VPS_HOST` | VPS IP address or hostname | `203.0.113.1` | +| `VPS_USER` | SSH username on VPS | `deploy` | +| `VPS_PORT` | SSH port | `22` | +| `VPS_SSH_KEY` | Private SSH key for the deploy user | Full PEM key | -### Code Coverage (optional) +### Container Registry (GHCR) -- **CODECOV_TOKEN** - Codecov token for uploading coverage reports - - Get from: https://codecov.io/ - -### Deployment Secrets - -#### Staging Environment - -- **STAGING_DATABASE_URL** - PostgreSQL connection string for staging - - Format: `postgresql://user:password@host:port/database` - -#### Production Environment - -- **PRODUCTION_DATABASE_URL** - PostgreSQL connection string for production - - Format: `postgresql://user:password@host:port/database` - -### Notifications (optional) - -- **SLACK_WEBHOOK_URL** - Slack webhook URL for deployment notifications - - Create at: https://api.slack.com/messaging/webhooks +> [!NOTE] +> GHCR uses `GITHUB_TOKEN` automatically — no additional secrets needed for pushing images. ## How to Add Secrets 1. Go to your GitHub repository -2. Click on **Settings** → **Secrets and variables** → **Actions** +2. Click **Settings** → **Secrets and variables** → **Actions** 3. Click **New repository secret** -4. Add each secret with its corresponding value +4. 
Add each secret listed above -## Environment-Specific Secrets +## VPS Setup Checklist -### Staging +Before the deploy workflow can succeed, ensure the VPS has: -Go to **Settings** → **Environments** → **staging** → **Add secret** +1. Docker and Docker Compose installed +2. The `deploy` user with docker group access +3. Project directory at `/opt/postly` with `.env` file (`chmod 600`) +4. GHCR login configured: `echo "<ghcr-token>" | docker login ghcr.io -u <github-username> --password-stdin` +5. SSH key added to `~/.ssh/authorized_keys` for the deploy user -### Production - -Go to **Settings** → **Environments** → **production** → **Add secret** - -## Security Best Practices - -1. **Never commit secrets to the repository** -2. **Rotate secrets regularly** (every 3-6 months) -3. **Use environment-specific secrets** for staging vs production -4. **Limit secret access** to specific workflows/environments -5. **Enable branch protection** for main and develop branches -6. **Require approval** for production deployments - -## Additional Configuration - -### Branch Protection Rules (Recommended) +## Branch Protection Rules (Recommended) For `main` branch: - ✅ Require pull request reviews before merging -- ✅ Require status checks to pass before merging - - lint - - type-check - - test - - build +- ✅ Require status checks to pass (lint, type-check, test, build) - ✅ Require branches to be up to date before merging -- ✅ Require conversation resolution before merging - ✅ Do not allow bypassing the above settings -For `develop` branch: - -- ✅ Require status checks to pass before merging -- ✅ Require branches to be up to date before merging - -### Environment Protection Rules - -For `production` environment: - -- ✅ Required reviewers (at least 1) -- ✅ Wait timer: 5 minutes -- ✅ Deployment branches: Only `main` and tags matching `v*` - -For `staging` environment: - -- ✅ Deployment branches: Only `main` and `develop` - -## Verifying Configuration - -After adding secrets, you can verify they're working by: - -1. 
Pushing to a feature branch -2. Creating a pull request to `develop` or `main` -3. Check that CI workflow runs successfully -4. For deployment, merge to `main` and verify deployment workflow - -## Troubleshooting - -If workflows fail due to missing secrets: - -1. Check workflow logs for specific error messages -2. Verify secret names match exactly (case-sensitive) -3. Ensure secrets are set in the correct environment -4. Check that workflow has permission to access the secret +## Rollback -## Contact +Every deploy tags images with the Git SHA. To roll back: -For questions about secrets configuration, contact your DevOps team or repository administrator. +```bash +ssh deploy@<vps-host> +cd /opt/postly +export API_IMAGE=ghcr.io/<owner>/api:<git-sha> +export SCRAPER_IMAGE=ghcr.io/<owner>/scraper:<git-sha> +export BOT_IMAGE=ghcr.io/<owner>/bot:<git-sha> +docker compose -f docker-compose.prod.yml up -d --no-deps api bot scraper +``` diff --git a/.gitignore b/.gitignore index 580e5ca..335dcd6 100644 --- a/.gitignore +++ b/.gitignore @@ -108,4 +108,5 @@ lerna-debug.log* test-results/ playwright-report/ blob-report/ -playwright/.cache/ \ No newline at end of file +playwright/.cache/ +apps/scraper/.playwright_data \ No newline at end of file diff --git a/README.md b/README.md index e69de29..85e1d0e 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,182 @@ +# Postly — Production Deployment Guide + +Complete runbook for deploying Postly to a VPS. + +--- + +## Architecture Overview + +``` +VPS Proxy (Nginx/Traefik) → API (Express) + → Static Web (Vite build) + +Internal Docker Network: + API ←→ PostgreSQL (pgvector) + API ←→ Redis (BullMQ + Caching) + Scraper ←→ PostgreSQL + Bot ←→ PostgreSQL + Redis +``` + +**Stack:** Node.js API · Python Scraper · Python Discord Bot · PostgreSQL 16 + pgvector · Redis 7 + +--- + +## Quick-Start Checklist + +``` +□ 1. Clone repo to /opt/postly, create .env +□ 2. docker compose -f docker-compose.prod.yml up -d +□ 3. Verify all services healthy +□ 4. Configure GitHub Actions secrets +□ 5. 
Push to main → verify pipeline runs +□ 6. Set up cron backup job +□ 7. Run a backup restore drill +``` + +--- + +## Step-by-Step Deployment + +### 1. Clone and Configure + +Log into your VPS and run: + +```bash +cd /opt/postly +git clone https://github.com/<owner>/<repo>.git . + +# Create production .env from template +cp .env.production.example .env +chmod 600 .env + +# Edit .env — fill in all CHANGE_ME values +nano .env +``` + +**Critical .env values to set:** + +- `DB_PASSWORD` — Strong random password (`openssl rand -hex 16`) +- `JWT_SECRET` / `JWT_REFRESH_SECRET` — (`openssl rand -hex 32`) +- `DISCORD_BOT_TOKEN` — From Discord Developer Portal +- `WEB_URL` — Your production domain (e.g., `https://postly.io`) + +### 2. Login to GHCR + +```bash +# Login to pull pre-built images from GitHub Container Registry +echo "<ghcr-token>" | docker login ghcr.io -u <github-username> --password-stdin +``` + +### 3. Start the Stack + +```bash +docker compose -f docker-compose.prod.yml up -d +``` + +Verify all services are healthy: + +```bash +docker compose -f docker-compose.prod.yml ps +curl -s http://localhost:3000/health | jq +``` + +Expected health response: + +```json +{ + "status": "ok", + "checks": { "db": "ok", "redis": "ok" }, + "uptime": 12.345 +} +``` + +### 4. Run HNSW Index Migration (One-time) + +```bash +docker exec -i postly-postgres psql -U postly -d postly < scripts/add-hnsw-indexes.sql +``` + +### 5. Setup Backups + +```bash +# Make backup script executable +chmod +x scripts/backup.sh + +# Test it manually first +bash scripts/backup.sh + +# Add to cron (runs daily at 2 AM) +(crontab -l 2>/dev/null; echo "0 2 * * * /opt/postly/scripts/backup.sh >> /var/log/postly-backup.log 2>&1") | crontab - +``` + +### 6. 
Configure GitHub Actions + +Add these secrets in **Settings → Secrets and variables → Actions**: + +| Secret | Value | +| ------------- | -------------------------- | +| `VPS_HOST` | Your VPS IP address | +| `VPS_USER` | `deploy` | +| `VPS_PORT` | `22` | +| `VPS_SSH_KEY` | Full private SSH key (PEM) | + +Push to `main` and verify the pipeline deploys successfully. + +--- + +## Rollback + +Every deploy tags images with the Git SHA. To roll back: + +```bash +cd /opt/postly +export API_IMAGE=ghcr.io/<owner>/api:<git-sha> +export SCRAPER_IMAGE=ghcr.io/<owner>/scraper:<git-sha> +export BOT_IMAGE=ghcr.io/<owner>/bot:<git-sha> +docker compose -f docker-compose.prod.yml up -d --no-deps api bot scraper +``` + +--- + +## Backup Restore Drill + +Run this monthly to verify backups work: + +```bash +# Start a throwaway Postgres container +docker run -d --name pg-restore-test -e POSTGRES_PASSWORD=test pgvector/pgvector:pg16 + +# Restore latest backup into it +docker exec -i pg-restore-test pg_restore -U postgres -d postgres --create < backups/local/$(ls -t backups/local/ | head -1) + +# Verify data +docker exec pg-restore-test psql -U postgres -d postly -c "SELECT count(*) FROM users;" + +# Clean up +docker rm -f pg-restore-test +``` + +--- + +## Scaling Roadmap + +| Users | Action | Cost Impact | +| ------- | ----------------------------------------------- | ----------- | +| 0–1K | Current setup, no changes | — | +| 1K–10K | Add Postgres read replica (second VPS) | +€4.5/mo | +| 10K–50K | Extract scraper to own VPS, add pgBouncer | +€4.5/mo | +| 50K+ | Consider managed DB, split into domain services | Variable | + +--- + +## File Reference + +| File | Purpose | +| ------------------------------ | -------------------------------- | +| `docker-compose.prod.yml` | Main production stack | +| `scripts/backup.sh` | Daily PostgreSQL backup | +| `scripts/add-hnsw-indexes.sql` | pgvector HNSW indexes (run once) | +| `.env.production.example` | Production env template | +| `.github/workflows/deploy.yml` | CI/CD pipeline | +| 
`.github/workflows/ci.yml` | PR checks | +| `.github/SECRETS.md` | GitHub secrets reference | diff --git a/apps/api/package.json b/apps/api/package.json index 6dba678..1867f2b 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -16,8 +16,9 @@ "@dodopayments/express": "^0.2.6", "@postly/config": "*", "@postly/database": "*", + "@postly/logger": "*", "@postly/shared-types": "*", - "bcryptjs": "^2.4.3", + "bcrypt": "^5.1.1", "bullmq": "^5.31.3", "cors": "^2.8.6", "dotenv": "^16.4.7", @@ -29,12 +30,13 @@ "mammoth": "^1.11.0", "multer": "^2.0.2", "pdf-parse": "^2.4.5", + "resend": "^6.9.4", "zod": "^3.24.1" }, "devDependencies": { "@postly/eslint-config": "*", "@postly/typescript-config": "*", - "@types/bcryptjs": "^2.4.6", + "@types/bcrypt": "^5.0.2", "@types/cors": "^2.8.17", "@types/express": "^5.0.0", "@types/jsonwebtoken": "^9.0.7", diff --git a/apps/api/src/config/secrets.ts b/apps/api/src/config/secrets.ts index e506fa4..391634e 100644 --- a/apps/api/src/config/secrets.ts +++ b/apps/api/src/config/secrets.ts @@ -34,4 +34,6 @@ export { DODO_PAYMENTS_WEBHOOK_KEY, DODO_PAYMENTS_ENVIRONMENT, DODO_PAYMENTS_RETURN_URL, + RESEND_API_KEY, + RESEND_FROM_EMAIL, } from "@postly/config"; diff --git a/apps/api/src/controllers/auth.controller.ts b/apps/api/src/controllers/auth.controller.ts index 4c990dd..16e89ad 100644 --- a/apps/api/src/controllers/auth.controller.ts +++ b/apps/api/src/controllers/auth.controller.ts @@ -1,17 +1,19 @@ import { Request, Response, NextFunction } from "express"; -import bcrypt from "bcryptjs"; +import bcrypt from "bcrypt"; import jwt, { type SignOptions } from "jsonwebtoken"; import crypto from "crypto"; import { z } from "zod"; -import { userQueries } from "@postly/database"; -import type { AuthResponse } from "@postly/shared-types"; +import { userQueries, otpQueries } from "@postly/database"; +import type { AuthResponse, UserRole } from "@postly/shared-types"; import type { JwtPayload } from "../middleware/auth.js"; import { 
JWT_SECRET, JWT_REFRESH_SECRET, JWT_EXPIRES_IN, JWT_REFRESH_EXPIRES_IN, + RESEND_FROM_EMAIL, } from "../config/secrets.js"; +import { resend } from "../lib/resend.js"; // ─── Validation Schemas ────────────────────────────────────────────────────── @@ -39,15 +41,28 @@ const resetPasswordSchema = z.object({ password: z.string().min(8, "Password must be at least 8 characters"), }); +const verifyOtpSchema = z.object({ + email: z.string().email("Invalid email address"), + code: z.string().length(6, "OTP must be 6 digits"), +}); + +const resendOtpSchema = z.object({ + email: z.string().email("Invalid email address"), +}); + // ─── Controller ────────────────────────────────────────────────────────────── export class AuthController { /** * Generate access + refresh token pair for a user. */ - private generateTokens(user: { id: string; email: string; role: string }) { + private generateTokens(user: { + id: string; + email: string; + roles: UserRole[]; + }) { const access_token = jwt.sign( - { id: user.id, email: user.email, role: user.role }, + { id: user.id, email: user.email, roles: user.roles }, JWT_SECRET, { expiresIn: JWT_EXPIRES_IN as SignOptions["expiresIn"] }, ); @@ -97,22 +112,45 @@ export class AuthController { password_hash, full_name, }); - const tokens = this.generateTokens(user); - const response: AuthResponse = { - user: { - id: user.id, + // Generate 6-digit OTP with a CSPRNG (Math.random() is predictable and must not mint auth codes) + const otpCode = crypto.randomInt(100000, 1000000).toString(); + const otpHash = await bcrypt.hash(otpCode, 10); + const otpExpiry = new Date(Date.now() + 10 * 60 * 1000); // 10 minutes + + await otpQueries.upsertOtp(user.id, otpHash, otpExpiry); + + // Send OTP via Resend + try { + await resend.emails.send({ + from: RESEND_FROM_EMAIL, + to: email, + subject: "Verify your Postly account", + html: ` +
+

Welcome to Postly!

+

Your verification code is:

+
+ ${otpCode} +
+

This code will expire in 10 minutes.

+

If you didn't create an account, you can safely ignore this email.

+
+ `, + }); + } catch (emailError) { + console.error("Failed to send verification email:", emailError); + // We still created the user, they can request a resend later + } + + res.status(201).json({ + success: true, + data: { + message: + "Registration successful. Please check your email for the verification code.", email: user.email, - full_name: user.full_name, - role: user.role, - is_verified: user.is_verified, - created_at: user.created_at, - updated_at: user.updated_at, }, - ...tokens, - }; - - res.status(201).json({ success: true, data: response }); + }); } catch (error) { next(error); } @@ -158,6 +196,18 @@ export class AuthController { return; } + // Check if email is verified + if (!user.is_verified) { + res.status(403).json({ + success: false, + error: { + message: "Email not verified", + code: "EMAIL_NOT_VERIFIED", + }, + }); + return; + } + // Track login timestamp await userQueries.updateLastLogin(user.id); @@ -168,7 +218,7 @@ export class AuthController { id: user.id, email: user.email, full_name: user.full_name, - role: user.role, + roles: user.roles, is_verified: user.is_verified, created_at: user.created_at, updated_at: user.updated_at, @@ -233,7 +283,7 @@ export class AuthController { } const access_token = jwt.sign( - { id: user.id, email: user.email, role: user.role }, + { id: user.id, email: user.email, roles: user.roles }, JWT_SECRET, { expiresIn: JWT_EXPIRES_IN as SignOptions["expiresIn"] }, ); @@ -268,7 +318,7 @@ export class AuthController { id: user.id, email: user.email, full_name: user.full_name, - role: user.role, + roles: user.roles, is_verified: user.is_verified, last_login_at: user.last_login_at, created_at: user.created_at, @@ -361,4 +411,193 @@ export class AuthController { next(error); } }; + + // ─── POST /verify-otp ──────────────────────────────────────────────────── + + verifyOtp = async ( + req: Request, + res: Response, + next: NextFunction, + ): Promise => { + try { + const validation = verifyOtpSchema.safeParse(req.body); + 
if (!validation.success) { + res.status(400).json({ + success: false, + error: { message: validation.error.errors[0].message }, + }); + return; + } + + const { email, code } = validation.data; + const user = await userQueries.findByEmail(email); + + if (!user) { + res.status(404).json({ + success: false, + error: { message: "User not found" }, + }); + return; + } + + if (user.is_verified) { + res.status(400).json({ + success: false, + error: { message: "User is already verified" }, + }); + return; + } + + const otp = await otpQueries.findOtpByUserId(user.id); + if (!otp) { + res.status(400).json({ + success: false, + error: { + message: "No verification code found. Please request a new one.", + }, + }); + return; + } + + // Check expiry + if (new Date() > new Date(otp.expires_at)) { + await otpQueries.deleteOtp(otp.id); + res.status(400).json({ + success: false, + error: { + message: "Verification code expired. Please request a new one.", + }, + }); + return; + } + + // Check attempts + if (otp.attempts >= 3) { + res.status(429).json({ + success: false, + error: { + message: "Too many failed attempts. 
Please request a new code.", + }, + }); + return; + } + + // Verify code + const isValid = await bcrypt.compare(code, otp.code_hash); + if (!isValid) { + await otpQueries.incrementOtpAttempts(otp.id); + res.status(400).json({ + success: false, + error: { message: "Invalid verification code" }, + }); + return; + } + + // Success + await otpQueries.verifyUser(user.id); + await otpQueries.deleteOtp(otp.id); + + const tokens = this.generateTokens(user); + const response: AuthResponse = { + user: { + id: user.id, + email: user.email, + full_name: user.full_name, + roles: user.roles, + is_verified: true, + created_at: user.created_at, + updated_at: new Date(), + }, + ...tokens, + }; + + res.json({ success: true, data: response }); + } catch (error) { + next(error); + } + }; + + // ─── POST /resend-otp ──────────────────────────────────────────────────── + + resendOtp = async ( + req: Request, + res: Response, + next: NextFunction, + ): Promise => { + try { + const validation = resendOtpSchema.safeParse(req.body); + if (!validation.success) { + res.status(400).json({ + success: false, + error: { message: validation.error.errors[0].message }, + }); + return; + } + + const { email } = validation.data; + const user = await userQueries.findByEmail(email); + + if (!user) { + res.status(404).json({ + success: false, + error: { message: "User not found" }, + }); + return; + } + + if (user.is_verified) { + res.status(400).json({ + success: false, + error: { message: "User is already verified" }, + }); + return; + } + + const existingOtp = await otpQueries.findOtpByUserId(user.id); + if (existingOtp) { + const timeSinceCreation = + Date.now() - new Date(existingOtp.created_at || 0).getTime(); + if (timeSinceCreation < 60 * 1000) { + const waitTime = Math.ceil((60 * 1000 - timeSinceCreation) / 1000); + res.status(429).json({ + success: false, + error: { + message: `Please wait ${waitTime} seconds before requesting a new code.`, + }, + }); + return; + } + } + + // Generate new OTP + 
const otpCode = crypto.randomInt(100000, 1000000).toString(); /* CSPRNG: Math.random() is predictable and must not mint auth codes */ + const otpHash = await bcrypt.hash(otpCode, 10); + const otpExpiry = new Date(Date.now() + 10 * 60 * 1000); + + await otpQueries.upsertOtp(user.id, otpHash, otpExpiry); + + await resend.emails.send({ + from: RESEND_FROM_EMAIL, + to: email, + subject: "Your new Postly verification code", + html: ` +
+

Verification Code

+

Your new verification code is:

+
+ ${otpCode} +
+

This code will expire in 10 minutes.

+
+ `, + }); + + res.json({ + success: true, + data: { message: "Verification code resent successfully." }, + }); + } catch (error) { + next(error); + } + }; } diff --git a/apps/api/src/controllers/bot.controller.ts b/apps/api/src/controllers/bot.controller.ts new file mode 100644 index 0000000..592091a --- /dev/null +++ b/apps/api/src/controllers/bot.controller.ts @@ -0,0 +1,211 @@ +import { Request, Response, NextFunction } from "express"; +import { db, bot_configs, eq, and, botQueries } from "@postly/database"; +import { queueService } from "../services/queue.service.js"; +import type { JwtPayload } from "../middleware/auth.js"; +import { WEB_URL } from "../config/secrets.js"; +import type { BotPlatform } from "@postly/shared-types"; + +export class BotController { + /** + * GET /api/v1/bots/callback + * Handles the redirect from OAuth providers (e.g. Discord). + * Only a single string `guild_id` is accepted, and it is URL-encoded before + * being echoed into the redirect URL. + */ + handleDiscordCallback = async ( + req: Request, + res: Response, + _next: NextFunction, + ): Promise<void> => { + try { + const { guild_id } = req.query; + const user = req.user as JwtPayload; + + if (typeof guild_id !== "string" || guild_id === "") { + res.redirect(`${WEB_URL}/dashboard?discord_error=missing_guild`); + return; + } + + await botQueries.upsertConfig({ + user_id: user.id, + platform: "discord", + target_id: guild_id, + }); + + res.redirect( + `${WEB_URL}/dashboard?discord_success=true&guild_id=${encodeURIComponent(guild_id)}`, + ); + } catch (error) { + console.error("Discord callback error:", error); + res.redirect(`${WEB_URL}/dashboard?discord_error=true`); + } + }; + + /** + * GET /api/v1/bots/configs + * Returns all bot configurations for the current user. 
+ */ + getConfigs = async ( + req: Request, + res: Response, + _next: NextFunction, + ): Promise<void> => { + try { + const user = req.user as JwtPayload; + const configs = await db + .select() + .from(bot_configs) + .where(eq(bot_configs.user_id, user.id)); + + res.json({ + success: true, + data: configs, + }); + } catch (error) { + _next(error); + } + }; + + /** + * POST /api/v1/bots/configs + * Manually create or update a bot config (e.g. for Webhooks, Twitter, Reddit). + * `filters` is spread first so it can never override `user_id` or the other + * explicitly named fields. + */ + upsertConfig = async ( + req: Request, + res: Response, + _next: NextFunction, + ): Promise<void> => { + try { + const user = req.user as JwtPayload; + const { + platform, + target_id, + target_name, + webhook_url, + credentials, + filters, + } = req.body; + + const result = await botQueries.upsertConfig({ + ...filters, + user_id: user.id, + platform: platform as BotPlatform, + target_id, + target_name, + webhook_url, + credentials, + }); + + res.json({ + success: true, + data: result, + }); + } catch (error) { + _next(error); + } + }; + + /** + * PATCH /api/v1/bots/configs/:id + * Updates an existing bot configuration. + * Ownership and server-managed columns are stripped from the request body, + * and the UPDATE itself is scoped to the authenticated user. 
+ */ + updateConfig = async ( + req: Request, + res: Response, + _next: NextFunction, + ): Promise<void> => { + try { + const { id } = req.params; + const user = req.user as JwtPayload; + // Copy the body and drop columns a client must never set (mass-assignment guard). + const updateData = { ...(req.body ?? {}) }; + for (const key of ["id", "user_id", "created_at", "updated_at"]) { + delete updateData[key]; + } + + const [existing] = await db + .select() + .from(bot_configs) + .where( + and( + eq(bot_configs.id, id as string), + eq(bot_configs.user_id, user.id), + ), + ) + .limit(1); + + if (!existing) { + res.status(404).json({ success: false, message: "Config not found" }); + return; + } + + const [updated] = await db + .update(bot_configs) + .set({ + ...updateData, + updated_at: new Date(), + }) + .where( + and( + eq(bot_configs.id, id as string), + eq(bot_configs.user_id, user.id), + ), + ) + .returning(); + + res.json({ + success: true, + data: updated, + }); + } catch (error) { + _next(error); + } + }; + + /** + * POST /api/v1/bots/configs/:id/test + * Manually trigger a test notification for a specific bot config. 
+ */ + triggerTestNotification = async ( + req: Request, + res: Response, + _next: NextFunction, + ): Promise => { + try { + const { id } = req.params; + const user = req.user as JwtPayload; + + const [config] = await db + .select() + .from(bot_configs) + .where( + and( + eq(bot_configs.id, id as string), + eq(bot_configs.user_id, user.id), + ), + ) + .limit(1); + + if (!config) { + res.status(404).json({ + success: false, + message: "Bot configuration not found.", + }); + return; + } + + await queueService.dispatchForPlatform(config.id); + + res.json({ + success: true, + message: `Test notification queued for ${config.platform}!`, + }); + } catch (error) { + _next(error); + } + }; +} diff --git a/apps/api/src/controllers/user.controller.ts b/apps/api/src/controllers/user.controller.ts index 8d2043b..5e21629 100644 --- a/apps/api/src/controllers/user.controller.ts +++ b/apps/api/src/controllers/user.controller.ts @@ -1,5 +1,6 @@ import { Request, Response, NextFunction } from "express"; import { z } from "zod"; +import bcrypt from "bcrypt"; import { userQueries, seekerProfileQueries, @@ -7,9 +8,18 @@ import { subscriptionQueries, } from "@postly/database"; import type { JwtPayload } from "../middleware/auth.js"; +import { CacheService } from "../services/cache.service.js"; const updateProfileSchema = z.object({ full_name: z.string().min(1).max(100).optional(), + avatar_url: z.string().url().or(z.string().length(0)).optional(), + timezone: z.string().max(50).optional(), + locale: z.string().max(20).optional(), +}); + +const changePasswordSchema = z.object({ + current_password: z.string().min(1), + new_password: z.string().min(8), }); const updateSeekerProfileSchema = z.object({ @@ -50,7 +60,14 @@ export class UserController { ): Promise => { try { const payload = req.user as JwtPayload; - const user = await userQueries.findById(payload.id); + const cacheKey = CacheService.generateKey("user:profile", payload.id); + + const user = await CacheService.getOrSet( + 
cacheKey, + 300, // 5 minutes TTL + async () => await userQueries.findById(payload.id), + ); + if (!user) { res .status(404) @@ -63,7 +80,7 @@ export class UserController { id: user.id, email: user.email, full_name: user.full_name, - role: user.role, + roles: user.roles, is_verified: user.is_verified, last_login_at: user.last_login_at, created_at: user.created_at, @@ -97,6 +114,11 @@ export class UserController { .json({ success: false, error: { message: "User not found" } }); return; } + + // Invalidate profile cache + const cacheKey = CacheService.generateKey("user:profile", payload.id); + await CacheService.invalidate(cacheKey); + res.json({ success: true, data: updated }); } catch (error) { next(error); @@ -200,4 +222,84 @@ export class UserController { next(error); } }; + + changePassword = async ( + req: Request, + res: Response, + next: NextFunction, + ): Promise => { + try { + const validation = changePasswordSchema.safeParse(req.body); + if (!validation.success) { + res.status(400).json({ + success: false, + error: { message: validation.error.errors[0].message }, + }); + return; + } + const payload = req.user as JwtPayload; + const { current_password, new_password } = validation.data; + + const user = await userQueries.findByEmail(payload.email); + if (!user || !user.password_hash) { + res + .status(404) + .json({ success: false, error: { message: "User not found" } }); + return; + } + + const isValid = await bcrypt.compare( + current_password, + user.password_hash, + ); + if (!isValid) { + res.status(401).json({ + success: false, + error: { message: "Invalid current password" }, + }); + return; + } + + const salt = await bcrypt.genSalt(12); + const password_hash = await bcrypt.hash(new_password, salt); + + await userQueries.updatePassword(payload.id, password_hash); + + res.json({ + success: true, + data: { message: "Password updated successfully" }, + }); + } catch (error) { + next(error); + } + }; + + uploadAvatar = async ( + req: Request, + res: 
Response, + next: NextFunction, + ): Promise => { + try { + if (!req.file) { + res.status(400).json({ + success: false, + error: { message: "No file uploaded" }, + }); + return; + } + + // Construct the public URL for the image + // Note: In production, substitute with actual domain or CDN URL + const host = req.get("host"); + const protocol = req.protocol; + const fileUrl = `${protocol}://${host}/uploads/avatars/${req.file.filename}`; + + res.json({ + success: true, + data: { url: fileUrl }, + }); + } catch (error) { + next(error); + } + }; } diff --git a/apps/api/src/lib/redis.ts b/apps/api/src/lib/redis.ts new file mode 100644 index 0000000..e2248d1 --- /dev/null +++ b/apps/api/src/lib/redis.ts @@ -0,0 +1,20 @@ +import { Redis } from "ioredis"; +import { REDIS_URL } from "../config/secrets.js"; + +/** + * Shared Redis client for the API. + * + * Reuses the same connection pool for multiple features (rate limiting, health checks, etc.) + * to keep the connection count low and stable. + */ +export const redis = new Redis(REDIS_URL || "redis://localhost:6379", { + maxRetriesPerRequest: 1, + connectTimeout: 5000, +}); + +redis.on("error", (err) => { + // We log but don't crash — features should "fail open" if Redis is down + console.error("Shared Redis connection error:", err); +}); + +export default redis; diff --git a/apps/api/src/lib/resend.ts b/apps/api/src/lib/resend.ts new file mode 100644 index 0000000..8d4d430 --- /dev/null +++ b/apps/api/src/lib/resend.ts @@ -0,0 +1,4 @@ +import { Resend } from "resend"; +import { RESEND_API_KEY } from "../config/secrets.js"; + +export const resend = new Resend(RESEND_API_KEY); diff --git a/apps/api/src/middleware/auth.ts b/apps/api/src/middleware/auth.ts index f01334d..122ef3f 100644 --- a/apps/api/src/middleware/auth.ts +++ b/apps/api/src/middleware/auth.ts @@ -13,7 +13,7 @@ import type { UserRole } from "@postly/shared-types"; export interface JwtPayload { id: string; email: string; - role: UserRole; + roles: UserRole[]; 
iat?: number; exp?: number; } diff --git a/apps/api/src/middleware/strict-rate-limit.ts b/apps/api/src/middleware/strict-rate-limit.ts deleted file mode 100644 index 70616de..0000000 --- a/apps/api/src/middleware/strict-rate-limit.ts +++ /dev/null @@ -1,104 +0,0 @@ -import { Request, Response, NextFunction } from "express"; -import { Redis } from "ioredis"; -import { REDIS_URL, NODE_ENV } from "../config/secrets.js"; -import type { JwtPayload } from "./auth.js"; - -// Initialize Redis client -const redis = new Redis(REDIS_URL || "redis://localhost:6379"); -redis.on("error", (err) => { - console.error("Redis (strict-rate-limit) connection error:", err); -}); - -interface RateLimitConfig { - windowMs: number; - max: number | ((req: Request) => number | Promise); - keyPrefix: string; - message: string; -} - -/** - * Creates a strict rate limiter middleware using Redis. - * Persists limits across server restarts. - */ -export const createStrictRateLimiter = (config: RateLimitConfig) => { - return async (req: Request, res: Response, next: NextFunction) => { - try { - // Allow development and test environments to bypass limits - if (NODE_ENV !== "production") { - next(); - return; - } - - const user = req.user as JwtPayload | undefined; - if (!user?.id) { - res.status(401).json({ - success: false, - error: { message: "Authentication required for rate limiting" }, - }); - return; - } - - const userId = user.id; - const key = `${config.keyPrefix}:${userId}`; - - // Get dynamic max limit - let maxLimit: number; - if (typeof config.max === "function") { - const dynamicMax = await config.max(req); - maxLimit = dynamicMax ?? 3; - } else { - maxLimit = config.max ?? 3; - } - - // Get current count - const currentCount = await redis.get(key); - const count = currentCount ? 
parseInt(currentCount, 10) : 0; - - if (count >= maxLimit && maxLimit !== Infinity) { - const ttl = await redis.ttl(key); - const resetDate = new Date(Date.now() + ttl * 1000); - - res.status(429).json({ - success: false, - error: { - message: config.message, - resetAt: resetDate.toISOString(), - }, - }); - return; - } - - // Increment count - if (!currentCount) { - await redis.set(key, 1, "EX", Math.ceil(config.windowMs / 1000)); - } else { - await redis.incr(key); - } - - next(); - } catch (error) { - console.error("Rate limiting error:", error); - // Fail open to avoid blocking users on redis error - next(); - } - }; -}; - -const isDev = NODE_ENV !== "production"; - -export const chatRateLimiter = createStrictRateLimiter({ - windowMs: isDev ? 60 * 60 * 1000 : 7 * 24 * 60 * 60 * 1000, - max: async (req: Request) => { - if (isDev) return 10000; - - const user = req.user as JwtPayload | undefined; - if (!user) return 3; - - if (user.role === "admin") return Infinity; - if (user.role === "employer") return 50; - - return 3; - }, - keyPrefix: "rate_limit:ai_chat", - message: "Weekly AI limit reached. Upgrade to Premium for more.", -}); diff --git a/apps/api/src/middleware/token-bucket-rate-limit.ts b/apps/api/src/middleware/token-bucket-rate-limit.ts new file mode 100644 index 0000000..bfd451c --- /dev/null +++ b/apps/api/src/middleware/token-bucket-rate-limit.ts @@ -0,0 +1,159 @@ +import { Request, Response, NextFunction } from "express"; +import { redis } from "../lib/redis.js"; +import jwt from "jsonwebtoken"; +import { Redis } from "ioredis"; + +interface RateLimitConfig { + maxTokens: number; + refillRateSec: number; + keyPrefix?: string; +} + +interface RedisWithTokenBucket extends Redis { + consumeTokenBucket( + key: string, + maxTokens: number, + refillRateSec: number, + nowMs: number, + requested: number, + ): Promise<[number, string]>; +} + +// Token Bucket algorithm using Redis Lua script to prevent race conditions. 
+// KEYS[1] = bucket key +// ARGV[1] = max capacity +// ARGV[2] = refill rate per second +// ARGV[3] = current time in ms +// ARGV[4] = requested tokens +const tokenBucketScript = ` + local key = KEYS[1] + local capacity = tonumber(ARGV[1]) + local refill_rate_per_sec = tonumber(ARGV[2]) + local now_ms = tonumber(ARGV[3]) + local requested = tonumber(ARGV[4]) + + local bucket = redis.call("HMGET", key, "tokens", "last_refill") + local tokens = tonumber(bucket[1]) + local last_refill = tonumber(bucket[2]) + + if not tokens then + tokens = capacity + last_refill = now_ms + else + local time_passed_ms = math.max(0, now_ms - last_refill) + local accrued = (time_passed_ms / 1000) * refill_rate_per_sec + tokens = math.min(capacity, tokens + accrued) + end + + local granted = 0 + if tokens >= requested then + tokens = tokens - requested + granted = 1 + end + + redis.call("HMSET", key, "tokens", tostring(tokens), "last_refill", tostring(now_ms)) + -- TTL is enough time for bucket to refill completely + local ttl = math.ceil(capacity / refill_rate_per_sec) + 1 + redis.call("EXPIRE", key, ttl) + + return { granted, tostring(tokens) } +`; + +// Register the Lua script. +// ioredis adds it to the client instance as 'consumeTokenBucket'. +redis.defineCommand("consumeTokenBucket", { + numberOfKeys: 1, + lua: tokenBucketScript, +}); + +/** + * Token Bucket API Rate Limiting Middleware + * + * - Algorithm: Token Bucket mapping directly to tokens refilled per second + * - Identifier: Decoded JWT User ID for authenticated users, falling back to IP Address. + * - Concurrency: Handled atomically via Redis Lua script. + * - Fail-Safe: If Redis is down, it fails open (allows traffic). 
+ * + * @param config { RateLimitConfig } + */ +export const tokenBucketRateLimiter = (config: RateLimitConfig) => { + const { maxTokens, refillRateSec, keyPrefix = "rl:tb" } = config; + + return async (req: Request, res: Response, next: NextFunction) => { + try { + if (redis.status !== "ready") { + // Fail Open behavior if Redis is not connected + return next(); + } + + // Identifier: Start with IP Address + let identifier = req.ip || "unknown-ip"; + + // Attempt to decode the JWT to use User ID as identifier + const authHeader = req.headers["authorization"]; + if (authHeader && authHeader.startsWith("Bearer ")) { + const token = authHeader.split(" ")[1]; + try { + // We decode instead of verifying here because + // this middleware might run before the auth middleware + // and decoding is faster for identifying rate limits per user + const decoded = jwt.decode(token) as { id?: string }; + if (decoded && decoded.id) { + identifier = decoded.id; + } + } catch { + // ignore invalid tokens, fallback to IP + } + } + + const key = `${keyPrefix}:${identifier}`; + const nowMs = Date.now(); + const requested = 1; + + // Executing the predefined Lua Script + const [grantedResult, currentTokensResult] = await ( + redis as RedisWithTokenBucket + ).consumeTokenBucket(key, maxTokens, refillRateSec, nowMs, requested); + + const granted = grantedResult === 1; + const currentTokens = parseFloat(currentTokensResult); + const remaining = Math.max(0, Math.floor(currentTokens)); + + // Calculate when the user will have at least 1 token again + let resetMs = nowMs; + if (!granted && currentTokens < 1) { + const tokensNeeded = 1 - currentTokens; + resetMs = nowMs + (tokensNeeded / refillRateSec) * 1000; + } + + // Inject standard HTTP headers + res.setHeader("X-RateLimit-Limit", maxTokens.toString()); + res.setHeader("X-RateLimit-Remaining", remaining.toString()); + // X-RateLimit-Reset is typically epoch timestamp in seconds + res.setHeader("X-RateLimit-Reset", Math.ceil(resetMs / 
1000).toString()); + + if (granted) { + return next(); + } + + // Bucket is empty, return HTTP 429 Too Many Requests + return res.status(429).json({ + success: false, + error: { + code: "too_many_requests", + message: "Too many requests. Please try again later.", + limit: maxTokens, + remaining: remaining, + reset_at: Math.ceil(resetMs / 1000), + }, + }); + } catch (err) { + // Fail Open logic: Allow traffic if Lua script fails or Redis throws an error + console.error("Token Bucket Rate Limiter Error:", err); + // Ensure we haven't already sent headers + if (!res.headersSent) { + return next(); + } + } + }; +}; diff --git a/apps/api/src/routes/auth.routes.ts b/apps/api/src/routes/auth.routes.ts index 103b0fd..52cc3bc 100644 --- a/apps/api/src/routes/auth.routes.ts +++ b/apps/api/src/routes/auth.routes.ts @@ -11,6 +11,8 @@ router.post("/login", authController.login); router.post("/refresh", authController.refresh); router.post("/forgot-password", authController.forgotPassword); router.post("/reset-password", authController.resetPassword); +router.post("/verify-otp", authController.verifyOtp); +router.post("/resend-otp", authController.resendOtp); // Protected router.get("/me", authenticateToken, authController.me); diff --git a/apps/api/src/routes/bot.routes.ts b/apps/api/src/routes/bot.routes.ts new file mode 100644 index 0000000..a789d38 --- /dev/null +++ b/apps/api/src/routes/bot.routes.ts @@ -0,0 +1,25 @@ +import { Router } from "express"; +import { authenticateToken } from "../middleware/auth.js"; +import { BotController } from "../controllers/bot.controller.js"; + +const router = Router(); +const botController = new BotController(); + +// Some bot platforms might use callbacks (OAuth) +// We keep them separate and authenticate if possible, but usually these are handled via state/session. +// Here we assume authenticateToken works for our flow. 
+router.get( + "/discord/callback", + authenticateToken, + botController.handleDiscordCallback, +); + +// Protected management routes +router.use(authenticateToken); + +router.get("/configs", botController.getConfigs); +router.post("/configs", botController.upsertConfig); +router.patch("/configs/:id", botController.updateConfig); +router.post("/configs/:id/test", botController.triggerTestNotification); + +export default router; diff --git a/apps/api/src/routes/chat.routes.ts b/apps/api/src/routes/chat.routes.ts index ebd0c4b..bc29037 100644 --- a/apps/api/src/routes/chat.routes.ts +++ b/apps/api/src/routes/chat.routes.ts @@ -1,6 +1,5 @@ import { Router } from "express"; import { authenticateToken } from "../middleware/auth.js"; -import { chatRateLimiter } from "../middleware/strict-rate-limit.js"; import { ChatController } from "../controllers/chat.controller.js"; const router = Router(); @@ -21,7 +20,7 @@ router.post("/messages/:id/edit", chatController.editMessage); router.post("/messages/:id/cancel", chatController.cancelMessage); router.get("/messages/:id/versions", chatController.getMessageVersions); -// AI streaming (rate limited) -router.post("/stream", chatRateLimiter, chatController.streamResponse); +// AI streaming +router.post("/stream", chatController.streamResponse); export default router; diff --git a/apps/api/src/routes/user.routes.ts b/apps/api/src/routes/user.routes.ts index cdc4a71..27595eb 100644 --- a/apps/api/src/routes/user.routes.ts +++ b/apps/api/src/routes/user.routes.ts @@ -1,10 +1,36 @@ import { Router } from "express"; +import multer from "multer"; +import path from "path"; import { UserController } from "../controllers/user.controller.js"; import { authenticateToken } from "../middleware/auth.js"; const router = Router(); const userController = new UserController(); +// Configure storage for avatars +const storage = multer.diskStorage({ + destination: (_req, _file, cb) => { + cb(null, "uploads/avatars"); + }, + filename: (_req, file, 
cb) => { + const uniqueSuffix = Date.now() + "-" + Math.round(Math.random() * 1e9); + cb(null, `avatar-${uniqueSuffix}${path.extname(file.originalname)}`); + }, +}); + +const upload = multer({ + storage, + limits: { fileSize: 2 * 1024 * 1024 }, // 2MB + fileFilter: (_req, file, cb) => { + const allowed = ["image/jpeg", "image/jpg", "image/png", "image/webp"]; + if (allowed.includes(file.mimetype)) { + cb(null, true); + } else { + cb(new Error("Invalid file type. Only JPEG, PNG, and WebP are allowed.")); + } + }, +}); + router.use(authenticateToken); // Base profile @@ -22,4 +48,14 @@ router.patch("/employer-profile", userController.updateEmployerProfile); // Subscription status router.get("/subscription", userController.getSubscription); +// Security +router.post("/change-password", userController.changePassword); + +// Avatar +router.post( + "/upload-avatar", + upload.single("avatar"), + userController.uploadAvatar, +); + export default router; diff --git a/apps/api/src/server.ts b/apps/api/src/server.ts index 157b477..3c868d2 100644 --- a/apps/api/src/server.ts +++ b/apps/api/src/server.ts @@ -2,6 +2,9 @@ import express from "express"; import cors from "cors"; import helmet from "helmet"; import rateLimit from "express-rate-limit"; +import { tokenBucketRateLimiter } from "./middleware/token-bucket-rate-limit.js"; +import { pool } from "@postly/database"; +import { logger } from "@postly/logger"; import { API_PORT, WEB_URL, NODE_ENV } from "./config/secrets.js"; import { errorHandler } from "./middleware/error-handler.js"; import { notFoundHandler } from "./middleware/not-found.js"; @@ -10,12 +13,20 @@ import userRoutes from "./routes/user.routes.js"; import jobRoutes from "./routes/job.routes.js"; import resumeRoutes from "./routes/resume.routes.js"; import chatRoutes from "./routes/chat.routes.js"; -import discordRoutes from "./routes/discord.routes.js"; +import botRoutes from "./routes/bot.routes.js"; import dodoRoutes from "./routes/dodo.routes.js"; import 
applicationRoutes from "./routes/application.routes.js"; import { queueService } from "./services/queue.service.js"; const app = express(); +app.set("trust proxy", 1); + +import { redis as healthRedis } from "./lib/redis.js"; +import path from "path"; +import { fileURLToPath } from "url"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); // Security middleware app.use( @@ -58,49 +69,103 @@ app.use( }), ); -const globalLimiter = rateLimit({ - windowMs: 15 * 60 * 1000, - max: 100, +const aiRateLimiter = tokenBucketRateLimiter({ + maxTokens: 50, + refillRateSec: 5, // 5 tokens per second refill + keyPrefix: "rl:ai", +}); + +const apiRateLimiter = rateLimit({ + windowMs: 60 * 1000, // 1 minute + max: 100, // limit each IP to 100 requests per windowMs standardHeaders: true, legacyHeaders: false, message: { success: false, - error: { message: "Too many requests, please try again later" }, + error: { message: "Too many requests, please try again later." }, }, }); -const authLimiter = rateLimit({ - windowMs: 15 * 60 * 1000, - max: 50, +const healthRateLimiter = rateLimit({ + windowMs: 60 * 1000, // 1 minute + max: 30, // limit each IP to 30 requests per minute standardHeaders: true, legacyHeaders: false, message: { success: false, - error: { - message: "Too many authentication attempts, please try again later", - }, + error: { message: "Health check rate limit exceeded." }, }, }); -app.use(globalLimiter); +// Health check — rate limited to prevent DB/Redis connection exhaustion +app.get("/health", healthRateLimiter, async (_req, res) => { + const checks: Record = {}; + + // Check Postgres + try { + await pool.query("SELECT 1"); + checks.db = "ok"; + } catch { + checks.db = "failed"; + } + + // Check Redis + try { + await healthRedis.ping(); + checks.redis = "ok"; + } catch { + checks.redis = "failed"; + } + + const allHealthy = checks.db === "ok" && checks.redis === "ok"; + res.status(allHealthy ? 
200 : 503).json({ + status: allHealthy ? "ok" : "degraded", + checks, + uptime: process.uptime(), + timestamp: new Date().toISOString(), + }); +}); + +// Apply global API rate limit (Standard Window) +app.use(apiRateLimiter); app.use(express.json({ limit: "10mb" })); app.use(express.urlencoded({ extended: true, limit: "10mb" })); -app.get("/health", (_req, res) => { - res.json({ status: "ok", timestamp: new Date().toISOString() }); +// Request logging middleware (structured for production log aggregation) +app.use((req, res, next) => { + const start = Date.now(); + res.on("finish", () => { + const duration = Date.now() - start; + // Only log in production or for slow requests + if (NODE_ENV === "production" || duration > 1000) { + logger.info("request", { + method: req.method, + url: req.url, + status: res.statusCode, + duration_ms: duration, + user_id: + (req as unknown as Request & { user?: { id: string } }).user?.id || + null, + }); + } + }); + next(); }); // API routes -app.use("/api/v1/auth", authLimiter, authRoutes); +app.use("/api/v1/auth", authRoutes); app.use("/api/v1/users", userRoutes); app.use("/api/v1/jobs", jobRoutes); -app.use("/api/v1/resumes", resumeRoutes); -app.use("/api/v1/chat", chatRoutes); -app.use("/api/v1/discord", discordRoutes); +app.use("/api/v1/resumes", aiRateLimiter, resumeRoutes); +app.use("/api/v1/chat", aiRateLimiter, chatRoutes); +app.use("/api/v1/bots", botRoutes); app.use("/api/v1/payments", dodoRoutes); app.use("/api/v1/applications", applicationRoutes); +// Static files +app.use("/uploads", express.static(path.join(__dirname, "../uploads"))); + // Error handling app.use(notFoundHandler); app.use(errorHandler); @@ -110,37 +175,23 @@ app.listen(API_PORT, "0.0.0.0", async () => { console.log(`🚀 API server running on http://0.0.0.0:${API_PORT}`); console.log(`📝 Environment: ${NODE_ENV}`); - // Initialize Discord Job Queue + // Initialize Bot Job Queue try { await queueService.initDailyCron(); - // Schedule daily dispatch at 9:00 
AM UTC - const now = new Date(); - const target = new Date(now); - target.setUTCHours(9, 0, 0, 0); - if (target <= now) target.setDate(target.getDate() + 1); - const msUntilFirst = target.getTime() - now.getTime(); - const DAY_MS = 24 * 60 * 60 * 1000; - - setTimeout(() => { - queueService.dispatchAll(); - setInterval(() => queueService.dispatchAll(), DAY_MS); - }, msUntilFirst); - - console.log(`📅 Discord daily job dispatch cron initialized (9:00 AM)`); } catch (err) { - console.error("Failed to initialize Discord Queue:", err); + console.error("Failed to initialize Bot Queue:", err); } }); -// Graceful shutdown -process.on("SIGTERM", () => { - console.log("SIGTERM received, shutting down gracefully..."); +// Graceful shutdown — close all connections before exiting +const shutdown = async (signal: string) => { + logger.info(`${signal} received, shutting down gracefully...`); + healthRedis.disconnect(); + await pool.end(); process.exit(0); -}); +}; -process.on("SIGINT", () => { - console.log("SIGINT received, shutting down gracefully..."); - process.exit(0); -}); +process.on("SIGTERM", () => shutdown("SIGTERM")); +process.on("SIGINT", () => shutdown("SIGINT")); export default app; diff --git a/apps/api/src/services/cache.service.ts b/apps/api/src/services/cache.service.ts new file mode 100644 index 0000000..ac052fa --- /dev/null +++ b/apps/api/src/services/cache.service.ts @@ -0,0 +1,116 @@ +import { redis } from "../lib/redis.js"; + +/** + * Standardized Cache Service + * Provides generic cache-aside wrapping with graceful degradation. + */ +export class CacheService { + private static readonly APP_NAME = "postly"; + private static readonly VERSION = "v1"; + + /** + * Generates a namespaced, standardized key to prevent collisions. 
+ * Format: app_name:version:entity:id + * + * @example + * CacheService.generateKey('user', '123') // => "postly:v1:user:123" + */ + public static generateKey(entity: string, id: string | number): string { + return `${this.APP_NAME}:${this.VERSION}:${entity}:${id}`; + } + + /** + * Generic Cache-Aside Wrapper (Fail-Open) + * + * Attempts to fetch data from the KV store. On miss or redis error, + * falls back to the provided `fetchFunction`, caches the result, + * and returns the data. + * + * @param key Fully qualified cache key (use `generateKey` for consistency). + * @param ttlSeconds Time-to-live in seconds. + * @param fetchFunction Async function to execute on cache miss (e.g. DB query). + * @returns The cached or freshly fetched data. + */ + public static async getOrSet( + key: string, + ttlSeconds: number, + fetchFunction: () => Promise, + ): Promise { + try { + // 1. Check Redis for existing data + const cachedData = await redis.get(key); + if (cachedData) { + return JSON.parse(cachedData) as T; + } + } catch (error) { + // Log warning but DO NOT crash (Graceful Degradation) + console.warn(`[CacheService] Redis GET failed for key "${key}":`, error); + } + + // 2. On miss (or Redis failure), execute the source DB query + const freshData = await fetchFunction(); + + // 3. Attempt to save the fresh data to the cache + try { + // Only cache non-null/non-undefined results + if (freshData !== undefined && freshData !== null) { + // Run SET asynchronously to not block returning the response + redis.setex(key, ttlSeconds, JSON.stringify(freshData)).catch((err) => { + console.warn( + `[CacheService] Background Redis SETEX failed for key "${key}":`, + err, + ); + }); + } + } catch (error) { + console.warn( + `[CacheService] Redis SETEX synchronous error for key "${key}":`, + error, + ); + } + + // 4. Return data immediately + return freshData; + } + + /** + * Invalidates a specific key. 
+ */ + public static async invalidate(key: string): Promise { + try { + await redis.del(key); + } catch (error) { + console.warn(`[CacheService] Redis DEL failed for key "${key}":`, error); + } + } + + /** + * Invalidates all keys matching a pattern using a non-blocking SCAN operation. + * + * @example CacheService.invalidatePattern('postly:v1:user:*') + */ + public static async invalidatePattern(pattern: string): Promise { + try { + let cursor = "0"; + do { + // Scan in chunks of 100 to avoid blocking the Redis event loop + const [nextCursor, matchingKeys] = await redis.scan( + cursor, + "MATCH", + pattern, + "COUNT", + "100", + ); + cursor = nextCursor; + if (matchingKeys.length > 0) { + await redis.del(...matchingKeys); + } + } while (cursor !== "0"); + } catch (error) { + console.warn( + `[CacheService] Redis pattern invalidation failed for "${pattern}":`, + error, + ); + } + } +} diff --git a/apps/api/src/services/chat.service.ts b/apps/api/src/services/chat.service.ts index a05bf1b..d72f0bb 100644 --- a/apps/api/src/services/chat.service.ts +++ b/apps/api/src/services/chat.service.ts @@ -3,6 +3,7 @@ import { conversationQueries, resumeQueries, jobQueries, + userQueries, } from "@postly/database"; import { matchingService } from "./matching.service.js"; import type { @@ -18,6 +19,80 @@ interface MatchedJob extends Job { ai_explanation?: string; } +interface JobIntent { + isRelated: boolean; + isSpecific: boolean; + techKeywords: string[]; + allKeywords: string[]; +} + +/** + * More precise intent detection for job-related queries + */ +function getJobIntent(message: string): JobIntent { + const lowercaseMsg = message.toLowerCase(); + + // Universal job related terms (not just tech) + const jobKeywords = [ + "job", + "career", + "hiring", + "opportunity", + "opening", + "position", + "vacancy", + "work", + "hire", + "recruiting", + "talent", + "apply", + "application", + "resume", + "cv", + "salary", + "role", + "looking for", + "hunting", + "find", + "search", 
+ "offer", + "interview", + "employer", + "company", + "staff", + "manager", + "engineer", + "designer", + "architect", + "developer", + "sales", + "marketing", + "doctor", + "nurse", + "teacher", + "driver", + "chef", + "accounting", + "legal", + "retail", + "remote", + "hybrid", + "fullstack", + "frontend", + "backend", + ]; + + const foundKeywords = jobKeywords.filter((kw) => lowercaseMsg.includes(kw)); + + return { + isRelated: foundKeywords.length > 0 || message.length > 50, + isSpecific: foundKeywords.length > 2, // A heuristic for specific queries + techKeywords: [], // Deprecated: keep for type compatibility + allKeywords: foundKeywords, + }; +} +// ... (omitting helper for brevity in diff) + // Helper to transform raw job data to UI-ready format function toOptimizedJobMatch(job: MatchedJob): OptimizedJobMatch { const formatSalary = (min?: number, max?: number): string | undefined => { @@ -69,15 +144,17 @@ export class ChatService { userMessage, ); - // 2. Get conversation context - const conversation = await conversationQueries.findById( - conversationId, - userId, - ); + // 2. Get conversation context and user context + const [conversation, user] = await Promise.all([ + conversationQueries.findById(conversationId, userId), + userQueries.findById(userId), + ]); if (!conversation) { throw new Error("Conversation not found"); } + const userRole = user?.roles[0] || "job_seeker"; + const messages = await conversationQueries.getMessages(conversationId); // 3. Determine which resume to use (parameter takes priority) @@ -86,21 +163,28 @@ export class ChatService { // 4. 
Load resume context and job matches if available let resumeContext = ""; let jobMatches: MatchedJob[] = []; + const intent = getJobIntent(userMessage); if (effectiveResumeId) { const resume = await resumeQueries.findById(effectiveResumeId); if (resume?.parsed_text) { resumeContext = `\n\nUser's Resume Summary:\n- Skills: ${resume.skills?.join(", ") || "Not specified"}\n- Experience: ${resume.experience_years || 0} years\n- Summary: ${resume.parsed_text.substring(0, 1000)}`; - // Find matching jobs based on resume - try { - jobMatches = await matchingService.findMatchingJobs( - effectiveResumeId, - userId, - 5, // Limit to top 5 - ); - } catch (err) { - console.error("Failed to fetch job matches:", err); + // Find matching jobs based on resume ONLY if not employer AND intent is related + if ( + userRole !== "employer" && + userRole !== "admin" && + intent.isRelated + ) { + try { + jobMatches = await matchingService.findMatchingJobs( + effectiveResumeId, + userId, + 5, // Limit to top 5 + ); + } catch (err) { + console.error("Failed to fetch job matches:", err); + } } } @@ -111,7 +195,13 @@ export class ChatService { } // 4b. FALLBACK: If no resume or no matches, fetch recent active jobs - if (jobMatches.length === 0) { + // Only do this if the user is not an employer AND intent is related + if ( + jobMatches.length === 0 && + userRole !== "employer" && + userRole !== "admin" && + intent.isRelated + ) { try { const recentJobs = await jobQueries.findActive(undefined, 5, 0); jobMatches = recentJobs.map((job: Job) => ({ @@ -123,6 +213,30 @@ export class ChatService { } } + // 4c. FILTER: If intent is specific, ensure matches are actually relevant. + // If user specified tech keywords, at least one must match. + // Otherwise, at least one level/general keyword must match. 
+ if (intent.isSpecific && jobMatches.length > 0) { + jobMatches = jobMatches.filter((job) => { + const searchSpace = ( + (job.title || "") + + " " + + (job.description || "") + + " " + + (job.skills_required?.join(" ") || "") + ).toLowerCase(); + + if (intent.techKeywords.length > 0) { + return intent.techKeywords.some((kw: string) => + searchSpace.includes(kw), + ); + } + return intent.allKeywords.some((kw: string) => + searchSpace.includes(kw), + ); + }); + } + // 5. Build system prompt with job context let jobContext = ""; if (jobMatches.length > 0) { @@ -135,21 +249,33 @@ export class ChatService { .join("\n")}`; } - const systemPrompt = `You are an AI career assistant helping with resume analysis and job search. + let roleSpecificInstructions = ""; + if (userRole === "employer") { + roleSpecificInstructions = + "You are an AI assistant helping an employer looking to hire candidates. Your ONLY function is to help with hiring, evaluating candidates, and posting jobs."; + } else { + roleSpecificInstructions = + "You are an AI career assistant helping with resume analysis and job search."; + } + const systemPrompt = `${roleSpecificInstructions} +${ + userRole !== "employer" + ? ` Your capabilities: - Analyze resumes and provide constructive feedback - Suggest relevant job opportunities from our database - Offer career advice and interview tips - Help with job applications - +` + : "" +} IMPORTANT INSTRUCTIONS: -1. When the user asks for jobs, ALWAYS reference the jobs listed below if any are available. These are REAL jobs from our database. -2. Summarize the available jobs briefly and let the user know they can see the full details in the job cards. -3. DO NOT invent or hallucinate job listings. Only mention jobs that are explicitly listed in "Available job opportunities" or "Matching job opportunities" section below. -4. If no jobs are listed below, inform the user that no jobs are currently available in our database. +1. ${userRole === "employer" ? 
"Focus STRICTLY on helping the employer with hiring. UNDER NO CIRCUMSTANCES should you suggest, mention, or offer job listings, career advice, or job search help to an employer. If asked for jobs, politely clarify your role." : "When the user explicitly asks for jobs or career opportunities, reference the jobs listed below. If they just say 'hi' or make small talk, respond conversationally without bringing up jobs."} +2. DO NOT invent or hallucinate facts. +${userRole !== "employer" ? "3. DO NOT hallucinate job listings. Only mention jobs explicitly listed in the context below.\n4. If the user asks for jobs and none are listed, inform the user that no jobs are currently available." : ""} -Be professional, encouraging, and concise.${resumeContext}${jobContext}`; +Be professional, encouraging, and concise.${resumeContext}${userRole !== "employer" ? jobContext : ""}`; // 6. Prepare conversation history const conversationHistory = messages @@ -187,7 +313,8 @@ Be professional, encouraging, and concise.${resumeContext}${jobContext}`; conversationId, "assistant", fullResponse, - metadata as Record, + metadata.usage?.total_tokens, + metadata, ); // 9. 
Auto-generate conversation title if this is the first message diff --git a/apps/api/src/services/queue.service.ts b/apps/api/src/services/queue.service.ts index afdc1eb..6b0cd66 100644 --- a/apps/api/src/services/queue.service.ts +++ b/apps/api/src/services/queue.service.ts @@ -1,14 +1,14 @@ import { Queue } from "bullmq"; import { REDIS_URL } from "../config/secrets.js"; -import { db, discord_configs, eq } from "@postly/database"; +import { db, bot_configs, eq } from "@postly/database"; -const DISCORD_QUEUE_NAME = "discord_notifications"; +const BOT_QUEUE_NAME = "bot_notifications"; export class QueueService { - private discordQueue: Queue; + private botQueue: Queue; constructor() { - this.discordQueue = new Queue(DISCORD_QUEUE_NAME, { + this.botQueue = new Queue(BOT_QUEUE_NAME, { connection: { url: REDIS_URL || "redis://localhost:6379", }, @@ -16,17 +16,12 @@ export class QueueService { } /** - * Initializes the daily cron that dispatches job alerts to all active Discord servers. + * Initializes the daily cron that dispatches job alerts to all active bot integrations. * Runs every day at 9:00 AM UTC. - * - * Instead of using an intermediate "daily_job_dispatch" job that requires - * a Node.js Worker to process, this directly queries active configs and - * enqueues one `send_discord_message` job per server — which the Python - * bot worker picks up via Redis. */ initDailyCron = async () => { // Use a repeatable job that fires at 9 AM daily - await this.discordQueue.add( + await this.botQueue.add( "daily_job_dispatch", { trigger: "cron" }, { @@ -37,41 +32,66 @@ export class QueueService { removeOnFail: 5, }, ); - console.log("📅 Discord daily job dispatch cron initialized (9:00 AM)"); + console.log("📅 Bot daily job dispatch cron initialized (9:00 AM)"); }; /** - * Dispatch job alerts for all active guilds. + * Dispatch job alerts for all active bot configurations. * Called by the cron handler or manually. 
*/ dispatchAll = async () => { const activeConfigs = await db .select() - .from(discord_configs) - .where(eq(discord_configs.is_active, true)); + .from(bot_configs) + .where(eq(bot_configs.is_active, true)); let queued = 0; for (const config of activeConfigs) { - if (config.channel_id) { - await this.dispatchForGuild(config.guild_id, config.channel_id); - queued++; - } + await this.dispatchForPlatform(config.id); + queued++; } console.log( - `✅ Queued job alerts for ${queued}/${activeConfigs.length} servers.`, + `✅ Queued job alerts for ${queued}/${activeConfigs.length} bot integrations.`, ); return queued; }; /** - * Manually trigger a dispatch for a single server (e.g. for testing). + * Manually trigger a dispatch for a single bot config (e.g. for testing). */ + dispatchForPlatform = async (configId: string) => { + const [config] = await db + .select() + .from(bot_configs) + .where(eq(bot_configs.id, configId)) + .limit(1); + + if (!config) return; + + await this.botQueue.add( + "send_bot_message", + { + config_id: config.id, + platform: config.platform, + target_id: config.target_id, + webhook_url: config.webhook_url, + timestamp: new Date().toISOString(), + }, + { + removeOnComplete: true, + removeOnFail: 3, + }, + ); + console.log(`✅ Job dispatched for ${config.platform} config: ${configId}`); + }; + dispatchForGuild = async (guildId: string, channelId: string) => { - await this.discordQueue.add( + await this.botQueue.add( "send_discord_message", { guild_id: guildId, channel_id: channelId, + type: "test", timestamp: new Date().toISOString(), }, { @@ -79,7 +99,6 @@ export class QueueService { removeOnFail: 3, }, ); - console.log(`✅ Job dispatched for guild: ${guildId}`); }; } diff --git a/apps/api/src/services/resume.service.ts b/apps/api/src/services/resume.service.ts index e767e75..b950241 100644 --- a/apps/api/src/services/resume.service.ts +++ b/apps/api/src/services/resume.service.ts @@ -5,9 +5,7 @@ import type { ResumeAnalysis, EducationEntry, } from 
"@postly/shared-types"; -import { createRequire } from "module"; -const require = createRequire(import.meta.url); -const pdfParse = require("pdf-parse"); +import { PDFParse } from "pdf-parse"; import mammoth from "mammoth"; export class ResumeService { @@ -40,8 +38,10 @@ export class ResumeService { * Parse PDF file */ private async parsePDF(buffer: Buffer): Promise { - const data = await pdfParse(buffer); - return data.text.trim(); + // Use the class-based API for pdf-parse v2 + const parser = new PDFParse({ data: buffer }); + const result = await parser.getText(); + return result.text.trim(); } /** diff --git a/apps/scraper/execute_smoke.py b/apps/scraper/execute_smoke.py new file mode 100644 index 0000000..0e6ba66 --- /dev/null +++ b/apps/scraper/execute_smoke.py @@ -0,0 +1,16 @@ +import asyncio +from playwright.async_api import async_playwright + +async def _smoke_test_browser(): + try: + async with async_playwright() as p: + browser = await p.chromium.launch(headless=False, channel="chrome") + page = await browser.new_page() + await page.goto("https://example.com", wait_until="domcontentloaded") + await browser.close() + print("SMOKE TEST PASSED: Bare Chromium works fine") + except Exception as e: + print(f"SMOKE TEST FAILED: {e}") + +if __name__ == '__main__': + asyncio.run(_smoke_test_browser()) diff --git a/apps/scraper/requirements.txt b/apps/scraper/requirements.txt index 78ebfa3..0274f7a 100644 --- a/apps/scraper/requirements.txt +++ b/apps/scraper/requirements.txt @@ -22,3 +22,4 @@ colorlog==6.8.2 pytest==8.0.2 pytest-asyncio==0.23.5 playwright==1.41.0 +playwright-stealth==2.0.2 diff --git a/apps/scraper/src/__init__.py b/apps/scraper/src/__init__.py index 2e74e79..33ff99a 100644 --- a/apps/scraper/src/__init__.py +++ b/apps/scraper/src/__init__.py @@ -2,11 +2,12 @@ """ Postly Job Scraper Package -Production-grade job aggregator for hiring.cafe: +Production-grade multi-source job aggregator: +- Sources: Remotive, Arbeitnow, Greenhouse ATS, hiring.cafe - 
Voyage AI embeddings (768-dim, matches Drizzle schema) -- Source-URL-based deduplication -- aiohttp async spiders +- Source-URL-based deduplication across all sources +- aiohttp async spiders (API-first) + Playwright (hiring.cafe fallback) - Structured JSON logging """ -__version__ = "2.1.0" +__version__ = "3.0.0" diff --git a/apps/scraper/src/database.py b/apps/scraper/src/database.py index 2f7aab9..c80e58d 100644 --- a/apps/scraper/src/database.py +++ b/apps/scraper/src/database.py @@ -3,15 +3,21 @@ database.py Database layer writing to the Drizzle-managed `jobs` table. -Schema is managed by Drizzle migrations — this module does NOT create tables. -It only performs INSERT/UPDATE/SELECT/DELETE operations on the existing schema. +CHANGELOG: +- Added format_vector from utils to safely format pgvector strings +- Added index creation for source_url on startup +- Replaced str() embedding conversions with format_vector() +- Added get_existing_urls() for fast, targeted duplicate filtering +- Fixed remove_duplicates() to delete the older record using created_at """ import logging -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Set import asyncpg from datetime import datetime, timezone +from utils import format_vector + logger = logging.getLogger(__name__) @@ -42,7 +48,7 @@ async def connect(self): ) logger.info("Database connection pool created") - # Verify the jobs table exists + # Verify the jobs table exists and set up index async with self.pool.acquire() as conn: exists = await conn.fetchval(""" SELECT EXISTS( @@ -54,7 +60,13 @@ async def connect(self): raise RuntimeError( "Table 'jobs' does not exist — run Drizzle migrations first" ) - logger.info("Verified jobs table exists") + + # Add index on source_url for O(1) existence checks during batch dedup + await conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_jobs_source_url ON jobs(source_url); + """) + + logger.info("Verified jobs table exists and source_url index is 
ready") except Exception as e: logger.error(f"Failed to connect to database: {e}") @@ -107,7 +119,7 @@ async def insert_job(self, job_data: Dict[str, Any]) -> bool: job_data.get("posted_at"), job_data.get("expires_at"), job_data.get("is_active", True), - str(job_data.get("embedding")) if job_data.get("embedding") else None, + format_vector(job_data.get("embedding")), ) return True except asyncpg.exceptions.UniqueViolationError: @@ -158,7 +170,7 @@ async def insert_jobs_batch(self, jobs: List[Dict[str, Any]]) -> int: job.get("posted_at"), job.get("expires_at"), job.get("is_active", True), - str(job.get("embedding")) if job.get("embedding") else None, + format_vector(job.get("embedding")), ) inserted += 1 except Exception as e: @@ -188,7 +200,7 @@ async def update_embedding(self, job_id: str, embedding: List[float]): UPDATE jobs SET embedding = $1::vector, updated_at = NOW() WHERE id = $2 - """, embedding, job_id) + """, format_vector(embedding), job_id) async def update_embeddings_batch(self, updates: List[tuple]): """Batch update embeddings. 
Each tuple is (job_id, embedding_list).""" @@ -200,12 +212,7 @@ async def update_embeddings_batch(self, updates: List[tuple]): async with conn.transaction(): for job_id, emb in updates: try: - # Convert Python list to pgvector string format: '[0.01,0.02,...]' - if isinstance(emb, list): - vec_str = '[' + ','.join(str(v) for v in emb) + ']' - else: - vec_str = str(emb) - + vec_str = format_vector(emb) await conn.execute(""" UPDATE jobs SET embedding = $2::vector, updated_at = NOW() @@ -294,6 +301,45 @@ async def get_existing_source_urls(self, source: str = "hiring_cafe") -> set: """, source) return {row["source_url"] for row in rows if row["source_url"]} + async def get_existing_urls(self, urls: List[str]) -> Set[str]: + """Check which of the provided URLs already exist in the database.""" + if not urls: + return set() + + async with self.pool.acquire() as conn: + rows = await conn.fetch(""" + SELECT source_url FROM jobs WHERE source_url = ANY($1) + """, urls) + return {row["source_url"] for row in rows if row["source_url"]} + + async def get_all_source_ids(self) -> Set[str]: + """Fetch all known source_urls for skipping already-scraped jobs across all sources.""" + async with self.pool.acquire() as conn: + rows = await conn.fetch( + "SELECT source_url, source FROM jobs WHERE source_url IS NOT NULL" + ) + ids = set() + for row in rows: + url = row["source_url"] + source = row["source"] + # Add the full URL for general dedup + ids.add(url) + # For hiring.cafe, also extract the requisition_id suffix + if source == "hiring_cafe" and "/viewjob/" in url: + ids.add(url.split("/viewjob/")[-1].split("/")[0]) + # For greenhouse, add the gh-{board}-{id} key + elif source == "greenhouse": + # URL format: https://boards.greenhouse.io/{board}/jobs/{id} + parts = url.rstrip("/").split("/") + if len(parts) >= 2: + try: + gh_id = parts[-1] + board = parts[-3] if len(parts) >= 4 else "" + ids.add(f"gh-{board}-{gh_id}") + except (IndexError, ValueError): + pass + return ids + # ─── 
CLEANUP ────────────────────────────────────────────────── async def cleanup_old_jobs(self, days: int = 30) -> int: @@ -317,26 +363,28 @@ async def cleanup_expired_jobs(self) -> int: return count async def deactivate_stale_jobs(self, days: int = 14) -> int: - """Mark jobs as inactive if not refreshed recently.""" + """Mark jobs as inactive if not refreshed recently (across all sources).""" async with self.pool.acquire() as conn: result = await conn.execute(""" UPDATE jobs SET is_active = FALSE, updated_at = NOW() WHERE updated_at < NOW() - MAKE_INTERVAL(days => $1) AND is_active = TRUE - AND source = 'hiring_cafe' """, days) count = int(result.split()[-1]) logger.info(f"Deactivated {count} stale jobs (> {days} days)") return count async def remove_duplicates(self) -> int: - """Remove duplicate jobs based on identical description.""" + """Remove duplicate jobs based on identical source_url.""" async with self.pool.acquire() as conn: result = await conn.execute(""" + -- Explaining decision: using created_at to preserve the new scrape data. + -- We delete the OLDER record (smaller created_at) when duplicate URLs exist + -- because id is a UUID type which cannot be ordered reliably by comparison operator. 
DELETE FROM jobs a USING jobs b - WHERE a.id > b.id - AND a.description = b.description + WHERE a.created_at < b.created_at + AND a.source_url = b.source_url AND a.source = b.source """) count = int(result.split()[-1]) @@ -358,9 +406,11 @@ async def get_stats(self) -> Dict[str, Any]: stats["remote_jobs"] = await conn.fetchval( "SELECT COUNT(*) FROM jobs WHERE remote = TRUE" ) - stats["hiring_cafe_jobs"] = await conn.fetchval( - "SELECT COUNT(*) FROM jobs WHERE source = 'hiring_cafe'" + # Per-source breakdown + source_rows = await conn.fetch( + "SELECT source, COUNT(*) as cnt FROM jobs GROUP BY source" ) + stats["by_source"] = {row["source"]: row["cnt"] for row in source_rows} return stats async def close(self): diff --git a/apps/scraper/src/embedding_service.py b/apps/scraper/src/embedding_service.py index 7abca0e..9e36562 100644 --- a/apps/scraper/src/embedding_service.py +++ b/apps/scraper/src/embedding_service.py @@ -103,46 +103,81 @@ async def _rate_limit(self): def _prepare_text(self, job: Dict[str, Any]) -> str: """ - Prepare weighted text for embedding. - - Uses weighted field merging for better embedding quality: - - Job title (30%) - repeated for emphasis - - Skills (25%) - - Description (30%) - truncated - - Industry (10%) - - Company (5%) + Prepare comprehensive weighted text for embedding. 
+ + Reads ALL available job fields for richer vector search: + - Job title (high weight — repeated 3x) + - Skills (high weight — repeated 2x) + - Description (medium weight — truncated to 3000 chars) + - Location, job type, remote status (medium weight) + - Salary range, experience (low-medium weight) + - Company name, industry (low weight) + + This produces embeddings that capture the full context of a job, + enabling more accurate vector search for queries like: + "remote React developer in New York $120k+" """ parts = [] - # Job title (high weight - repeat 3x) - title = job.get('job_title', '') + # Job title (high weight — repeat 3x for emphasis) + title = job.get('job_title', '') or job.get('title', '') if title: - parts.extend([title] * 3) + parts.extend([f"Job Title: {title}"] * 3) - # Skills (medium-high weight) + # Skills (high weight — repeated 2x) skills = job.get('skills_required', []) if skills: if isinstance(skills, list): - skills_text = ', '.join(skills) + skills_text = ', '.join(str(s) for s in skills) else: skills_text = str(skills) - parts.extend([skills_text] * 2) + parts.extend([f"Skills: {skills_text}"] * 2) - # Description (high weight but truncate) - description = job.get('job_description', '') + # Description (medium-high weight, truncated) + description = job.get('job_description', '') or job.get('description', '') if description: - # Take first 3000 chars to stay within token limits parts.append(description[:3000]) + # Location (important for geo-based search) + location = job.get('location', '') + if location: + parts.append(f"Location: {location}") + + # Remote status (critical for modern job search) + remote = job.get('remote', False) + if remote: + parts.append("Work Type: Remote, Work from home, WFH") + + # Job type (full-time, part-time, contract, etc.) 
+ job_type = job.get('job_type', '') + if job_type: + parts.append(f"Employment Type: {job_type}") + + # Salary range (important for salary-based queries) + salary_min = job.get('salary_min') + salary_max = job.get('salary_max') + if salary_min or salary_max: + salary_parts = [] + if salary_min: + salary_parts.append(f"${salary_min:,.0f}" if isinstance(salary_min, (int, float)) else f"${salary_min}") + if salary_max: + salary_parts.append(f"${salary_max:,.0f}" if isinstance(salary_max, (int, float)) else f"${salary_max}") + parts.append(f"Salary Range: {' - '.join(salary_parts)} per year") + + # Experience level + experience = job.get('experience_required', '') + if experience: + parts.append(f"Experience Required: {experience}") + + # Company name (low weight) + company = job.get('company_name', '') or job.get('company', '') + if company: + parts.append(f"Company: {company}") + # Industry (low weight) industry = job.get('industry', '') if industry: - parts.append(industry) - - # Company (lowest weight) - company = job.get('company_name', '') - if company: - parts.append(company) + parts.append(f"Industry: {industry}") combined = '\n'.join(parts) diff --git a/apps/scraper/src/health.py b/apps/scraper/src/health.py index 1693719..eceb340 100644 --- a/apps/scraper/src/health.py +++ b/apps/scraper/src/health.py @@ -29,6 +29,7 @@ def __init__(self, port: int = 8080): "database_connected": False, "last_scrape": None, "jobs_in_db": 0, + "consecutive_failures": 0, "errors": [], } @@ -40,13 +41,17 @@ def __init__(self, port: int = 8080): async def health_check(self, request) -> web.Response: """Liveness probe.""" uptime = (datetime.utcnow() - self.start_time).total_seconds() + consecutive_failures = self.status.get("consecutive_failures", 0) + is_healthy = self.status["healthy"] and consecutive_failures < 3 + return web.json_response( { - "status": "healthy" if self.status["healthy"] else "unhealthy", + "status": "healthy" if is_healthy else "unhealthy", "uptime_seconds": 
uptime, + "consecutive_failures": consecutive_failures, "timestamp": datetime.utcnow().isoformat(), }, - status=200 if self.status["healthy"] else 503, + status=200 if is_healthy else 503, ) async def readiness_check(self, request) -> web.Response: @@ -104,9 +109,14 @@ async def start(self): logger.error(f"Failed to start health server: {e}") async def stop(self): - """Stop the health check server.""" - if self.site: - await self.site.stop() - if self.runner: - await self.runner.cleanup() - logger.info("Health check server stopped") + """Stop the health check server gracefully.""" + try: + if self.site: + await self.site.stop() + self.site = None + if self.runner: + await self.runner.cleanup() + self.runner = None + logger.info("Health check server stopped") + except Exception as e: + logger.warning(f"Error during health server shutdown: {e}") diff --git a/apps/scraper/src/main.py b/apps/scraper/src/main.py index eae3fe9..747cb94 100644 --- a/apps/scraper/src/main.py +++ b/apps/scraper/src/main.py @@ -3,12 +3,11 @@ main.py Entry-point for the hiring.cafe job scraper. -Runs as a standalone async service: - 1. Connects to PostgreSQL (Drizzle-managed schema) - 2. Starts health check server - 3. Runs scraping cycles on a schedule (APScheduler) - 4. Processes jobs through pipeline → DB - 5. 
Background embedding worker catches stragglers +CHANGELOG: +- Added _consecutive_failures logic to track crash loops without silent shutdown +- Updated spider call to pass known_ids for detail fetching bypass +- Changed scheduler.shutdown(wait=False) to wait=True to fix shutdown race +- Pushing _consecutive_failures to the health server payload """ import asyncio @@ -16,9 +15,17 @@ import os import signal import sys +import warnings from datetime import datetime, timezone from pathlib import Path +# Suppress urllib3 NotOpenSSLWarning (common on macOS with LibreSSL) +try: + from urllib3.exceptions import NotOpenSSLWarning + warnings.filterwarnings("ignore", category=NotOpenSSLWarning) +except ImportError: + pass + from dotenv import load_dotenv from apscheduler.schedulers.asyncio import AsyncIOScheduler @@ -28,7 +35,17 @@ from database import Database from pipeline import JobProcessingPipeline from embedding_service import VoyageEmbeddingService, EmbeddingWorker -from spiders.hiring_cafe import HiringCafeSpider +from spiders.remotive import RemotiveSpider +from spiders.arbeitnow import ArbeitnowSpider +from spiders.greenhouse import GreenhouseSpider + +# HiringCafeSpider requires playwright — import conditionally +try: + from spiders.hiring_cafe import HiringCafeSpider + HIRING_CAFE_AVAILABLE = True +except ImportError: + HiringCafeSpider = None + HIRING_CAFE_AVAILABLE = False from janitor import JanitorService from health import HealthCheckServer @@ -91,7 +108,7 @@ def __init__(self): # Components (initialized in start()) self.db: Database = None - self.spider: HiringCafeSpider = None + self.spiders: list = [] # All spider instances self.pipeline: JobProcessingPipeline = None self.embedder: VoyageEmbeddingService = None self.embedding_worker: EmbeddingWorker = None @@ -101,7 +118,9 @@ def __init__(self): # State self._running = False + self._stopping = False self._cycle_count = 0 + self._consecutive_failures: int = 0 async def start(self): """Initialize all 
components and start the service.""" @@ -128,10 +147,23 @@ async def start(self): else: logger.warning("No VOYAGE_API_KEY — embeddings disabled") - # 3. Spider - self.spider = HiringCafeSpider( - requests_per_minute=self.requests_per_minute, - ) + # 3. Spiders — API-based sources (always available) + hiring.cafe (if playwright installed) + self.spiders = [ + RemotiveSpider(requests_per_minute=2), # TOS: max 2 req/min + ArbeitnowSpider(requests_per_minute=20), # Generous limits + GreenhouseSpider(requests_per_minute=30), # Per-board, very fast + ] + + if HIRING_CAFE_AVAILABLE and HiringCafeSpider: + self.spiders.append( + HiringCafeSpider(requests_per_minute=self.requests_per_minute) + ) + logger.info("HiringCafeSpider enabled (playwright found)") + else: + logger.warning("HiringCafeSpider disabled (playwright not installed)") + + logger.info(f"Initialized {len(self.spiders)} spiders: " + f"{[s.SOURCE_NAME if hasattr(s, 'SOURCE_NAME') else 'hiring_cafe' for s in self.spiders]}") # 4. Pipeline self.pipeline = JobProcessingPipeline( @@ -143,8 +175,9 @@ async def start(self): # 5. Janitor self.janitor = JanitorService(database=self.db) - # 6. Health check server + # 6. 
Health check server - pass a reference to expose failures dynamically self.health = HealthCheckServer(port=self.health_port) + self.health.scraper = self # Give health server access to self._consecutive_failures self.health.update_status(database_connected=True) await self.health.start() @@ -177,61 +210,125 @@ async def start(self): "health_port": self.health_port, }) - async def _scrape_cycle(self): - """Execute one full scrape → process → store cycle.""" + async def _run_pipeline(self): + """Isolated scraping orchestration logic — runs ALL spiders sequentially.""" self._cycle_count += 1 cycle = self._cycle_count logger.info({"event": "cycle_start", "cycle": cycle}) - - batch = [] + + # Shared known IDs for cross-source dedup + known_ids = await self.db.get_all_source_ids() + logger.info(f"Loaded {len(known_ids)} known IDs for cross-source dedup") + total_scraped = 0 total_stored = 0 + source_results = {} - try: - # Scrape and process in real-time batches - async for job in self.spider.scrape(): - batch.append(job) - total_scraped += 1 - - if len(batch) >= self.pipeline.batch_size: - logger.info(f"Batch full ({len(batch)} jobs). Sending to pipeline...") + for spider in self.spiders: + source_name = getattr(spider, 'SOURCE_NAME', 'hiring_cafe') + spider_scraped = 0 + spider_stored = 0 + batch = [] + + try: + logger.info(f"--- Starting spider: {source_name} ---") + + async for job in spider.scrape(known_ids=known_ids): + batch.append(job) + spider_scraped += 1 + + if len(batch) >= self.pipeline.batch_size: + logger.info(f"[{source_name}] Batch full ({len(batch)} jobs). 
Processing...") + metrics = await self.pipeline.process(batch) + spider_stored += metrics.jobs_stored + batch = [] + + # Update health periodically + self.health.update_status( + last_scrape=datetime.now(timezone.utc).isoformat(), + jobs_in_db=(await self.db.get_stats()).get("total_jobs", 0), + consecutive_failures=self._consecutive_failures, + ) + + # Process remaining jobs in the last batch + if batch: metrics = await self.pipeline.process(batch) - total_stored += metrics.jobs_stored - batch = [] - - # Update health periodically - self.health.update_status( - last_scrape=datetime.now(timezone.utc).isoformat(), - jobs_in_db=(await self.db.get_stats()).get("total_jobs", 0), - ) - - # Process remaining jobs in the last batch - if batch: - metrics = await self.pipeline.process(batch) - total_stored += metrics.jobs_stored - - logger.info(f"Cycle {cycle} complete: scraped {total_scraped}, stored {total_stored}") - - # Final health update - stats = await self.db.get_stats() - self.health.update_status( - last_scrape=datetime.now(timezone.utc).isoformat(), - jobs_in_db=stats.get("total_jobs", 0), - ) + spider_stored += metrics.jobs_stored + + total_scraped += spider_scraped + total_stored += spider_stored + source_results[source_name] = { + "scraped": spider_scraped, + "stored": spider_stored, + "errors": spider.errors, + } + + logger.info(f"--- Spider {source_name} complete: " + f"scraped={spider_scraped}, stored={spider_stored}, " + f"errors={spider.errors} ---") + + except Exception as e: + logger.error(f"Spider {source_name} crashed: {e}", exc_info=True) + source_results[source_name] = { + "scraped": spider_scraped, + "stored": spider_stored, + "error": str(e), + } + finally: + # Reset spider metrics for next cycle + spider.jobs_found = 0 + spider.pages_scraped = 0 + spider.errors = 0 + if hasattr(spider, 'detail_fetches'): + spider.detail_fetches = 0 + + logger.info({ + "event": "cycle_complete", + "cycle": cycle, + "total_scraped": total_scraped, + "total_stored": 
total_stored, + "sources": source_results, + }) + + # Final health update + stats = await self.db.get_stats() + self.health.update_status( + last_scrape=datetime.now(timezone.utc).isoformat(), + jobs_in_db=stats.get("total_jobs", 0), + consecutive_failures=self._consecutive_failures, + by_source=stats.get("by_source", {}), + ) + + async def _scrape_cycle(self): + """Execute one full scrape → process → store cycle inside resilience wrapper.""" + try: + await self._run_pipeline() + self._consecutive_failures = 0 + # update health state on success explicitly + self.health.update_status(consecutive_failures=0) + except asyncio.CancelledError: + # Shutdown initiated, exit silently and return to avoid APScheduler error logging + return except Exception as e: - logger.error(f"Cycle {cycle} failed: {e}") + self._consecutive_failures += 1 + logger.exception( + "Scrape cycle failed (%d consecutive)", + self._consecutive_failures + ) + + # Update health state failures self.health.update_status( - errors=self.health.status.get("errors", []) + [str(e)] + errors=self.health.status.get("errors", []) + [str(e)], + consecutive_failures=self._consecutive_failures ) - - finally: - # Reset spider metrics for next cycle - self.spider.jobs_found = 0 - self.spider.pages_scraped = 0 - self.spider.detail_fetches = 0 - self.spider.errors = 0 + + if self._consecutive_failures >= 3: + logger.critical( + "3 consecutive failures — pipeline may be broken, " + "manual intervention required" + ) async def _maintenance_cycle(self): """Run janitor maintenance tasks.""" @@ -242,20 +339,50 @@ async def _maintenance_cycle(self): logger.error(f"Maintenance failed: {e}") async def stop(self): - """Gracefully shut down all components.""" + """Gracefully shut down all components with defensive checks and idempotency.""" + if self._stopping: + return + self._stopping = True + logger.info("Shutting down scraper system...") self._running = False + # Stop scheduler first to prevent new jobs from starting (with 
blocking wait) if self.scheduler: - self.scheduler.shutdown(wait=False) + try: + if self.scheduler.running: + self.scheduler.shutdown(wait=True) + except Exception as e: + logger.warning(f"Scheduler shutdown interrupted — job may have been mid-run: {e}") + + # Stop other background tasks if self.embedding_worker: - self.embedding_worker.stop() - if self.spider: - await self.spider.close() + try: + self.embedding_worker.stop() + except Exception as e: + logger.warning(f"Error stopping embedding worker: {e}") + + # Close all spiders + for spider in self.spiders: + try: + await spider.close() + except Exception as e: + source_name = getattr(spider, 'SOURCE_NAME', 'unknown') + logger.warning(f"Error closing spider {source_name}: {e}") + + # Stop health server if self.health: - await self.health.stop() + try: + await self.health.stop() + except Exception as e: + logger.warning(f"Error stopping health server: {e}") + + # Final database cleanup if self.db: - await self.db.close() + try: + await self.db.close() + except Exception as e: + logger.warning(f"Error closing database: {e}") logger.info("Scraper system stopped") @@ -268,9 +395,15 @@ async def main(): system = ScraperSystem() # Handle signals for graceful shutdown + # We simply set _running to False to break the loop; + # the finally block will handle the component shutdown. 
loop = asyncio.get_event_loop() + def signal_handler(): + system._running = False + logger.info("Interrupt received, stopping...") + for sig in (signal.SIGINT, signal.SIGTERM): - loop.add_signal_handler(sig, lambda: asyncio.create_task(system.stop())) + loop.add_signal_handler(sig, signal_handler) try: await system.start() diff --git a/apps/scraper/src/models.py b/apps/scraper/src/models.py index 06f5cb3..a6f9c28 100644 --- a/apps/scraper/src/models.py +++ b/apps/scraper/src/models.py @@ -38,7 +38,7 @@ class ScrapedJob(BaseModel): salary_max: Optional[Decimal] = None job_type: Optional[str] = None remote: bool = False - source: str = "hiring_cafe" + source: str = Field(..., description="Source identifier (e.g. hiring_cafe, remotive, arbeitnow, greenhouse)") source_url: Optional[str] = None # External apply URL skills_required: Optional[list[str]] = Field(default_factory=list) experience_required: Optional[str] = None @@ -50,8 +50,9 @@ class ScrapedJob(BaseModel): # Embedding (768-dim for Drizzle schema) embedding: Optional[list[float]] = None - # hiring.cafe specific — used for dedup, not stored in DB - requisition_id: str = Field(..., description="hiring.cafe requisition ID for dedup") + # Source-specific ID — used for dedup. Not stored in DB. + # hiring.cafe uses requisition_id, Greenhouse uses gh-{board}-{id}, etc. + requisition_id: str = Field(default="", description="Source-specific ID for dedup") # Metadata (not stored in DB) meta: dict = Field(default_factory=dict, exclude=True) diff --git a/apps/scraper/src/pipeline.py b/apps/scraper/src/pipeline.py index 24b5d53..ab9aad6 100644 --- a/apps/scraper/src/pipeline.py +++ b/apps/scraper/src/pipeline.py @@ -3,7 +3,10 @@ pipeline.py Simplified job processing pipeline: Scrape → Dedup → Embed → Store. -No Scrapy adapter — only async pipeline for hiring.cafe JSON API. 
+CHANGELOG: +- Removed self._known_urls global state to prevent infinite RAM leak +- Implemented scoped database existence checking per batch +- Optimized embedding calls to only run for genuinely new jobs """ import asyncio @@ -19,7 +22,7 @@ class JobProcessingPipeline: """ Processes scraped jobs through: - 1. Deduplication (by source_url) + 1. Deduplication (by scoped DB check) 2. Embedding generation (Voyage AI) 3. Batch insertion to DB """ @@ -36,26 +39,13 @@ def __init__( # Metrics self.metrics = ScrapingMetrics() - self._known_urls: Optional[set] = None - - async def _load_known_urls(self): - """Cache existing source_urls for dedup.""" - if self._known_urls is None: - self._known_urls = await self.db.get_existing_source_urls("hiring_cafe") - logger.info(f"Loaded {len(self._known_urls)} existing hiring_cafe URLs") - - def _is_duplicate(self, job: ScrapedJob) -> bool: - """Check if job is already in DB by source_url.""" - if not job.source_url: - return False - return job.source_url in self._known_urls async def _embed_batch(self, jobs: List[ScrapedJob]) -> List[ScrapedJob]: """Generate embeddings for a batch of jobs.""" if not self.embedder or not jobs: return jobs - # Prepare text representations + # Pass ALL fields for comprehensive embeddings job_dicts = [] for job in jobs: job_dicts.append({ @@ -64,6 +54,11 @@ async def _embed_batch(self, jobs: List[ScrapedJob]) -> List[ScrapedJob]: "job_description": job.description[:3000], "skills_required": job.skills_required or [], "location": job.location or "", + "remote": job.remote, + "job_type": job.job_type or "", + "salary_min": float(job.salary_min) if job.salary_min else None, + "salary_max": float(job.salary_max) if job.salary_max else None, + "experience_required": job.experience_required or "", }) try: @@ -90,22 +85,21 @@ async def process(self, jobs: List[ScrapedJob]) -> ScrapingMetrics: """ Run the full pipeline on a list of scraped jobs. - Returns metrics for this batch. 
+ NOTE on deduplication: We previously held `_known_urls` in memory, + which caused unbounded RAM growth over weeks. The new approach queries + `source_url = ANY($1)` scoping existence checks strictly to the current batch. """ start = time.time() self.metrics = ScrapingMetrics() self.metrics.jobs_found = len(jobs) # Step 1: Dedup - await self._load_known_urls() - unique_jobs = [] - for job in jobs: - if self._is_duplicate(job): - self.metrics.duplicates_skipped += 1 - else: - unique_jobs.append(job) - if job.source_url: - self._known_urls.add(job.source_url) + urls = [j.source_url for j in jobs if j.source_url] + existing_urls = await self.db.get_existing_urls(urls) + + unique_jobs = [j for j in jobs if j.source_url not in existing_urls] + + self.metrics.duplicates_skipped += len(jobs) - len(unique_jobs) logger.info({ "event": "dedup_complete", @@ -119,6 +113,7 @@ async def process(self, jobs: List[ScrapedJob]) -> ScrapingMetrics: return self.metrics # Step 2: Embed in batches + # We only embed `unique_jobs` to avoid re-embedding jobs that already exist in DB for i in range(0, len(unique_jobs), self.batch_size): batch = unique_jobs[i : i + self.batch_size] await self._embed_batch(batch) diff --git a/apps/scraper/src/spiders/__init__.py b/apps/scraper/src/spiders/__init__.py index 3b276f6..33e7533 100644 --- a/apps/scraper/src/spiders/__init__.py +++ b/apps/scraper/src/spiders/__init__.py @@ -1,4 +1,19 @@ -"""Hiring cafe spider package.""" -from .hiring_cafe import HiringCafeSpider +"""Postly job spiders package.""" +from .base import BaseSpider +from .remotive import RemotiveSpider +from .arbeitnow import ArbeitnowSpider +from .greenhouse import GreenhouseSpider -__all__ = ["HiringCafeSpider"] +# HiringCafeSpider requires playwright — import conditionally +try: + from .hiring_cafe import HiringCafeSpider +except ImportError: + HiringCafeSpider = None + +__all__ = [ + "BaseSpider", + "RemotiveSpider", + "ArbeitnowSpider", + "GreenhouseSpider", + "HiringCafeSpider", 
+] diff --git a/apps/scraper/src/spiders/arbeitnow.py b/apps/scraper/src/spiders/arbeitnow.py new file mode 100644 index 0000000..ccd055f --- /dev/null +++ b/apps/scraper/src/spiders/arbeitnow.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +""" +arbeitnow.py +Spider for Arbeitnow's public REST API. + +Endpoint: https://www.arbeitnow.com/api/job-board-api +- Free, no auth, no Cloudflare +- Focuses on EU + remote jobs +- Paginated results +""" + +import logging +from datetime import datetime, timezone +from typing import AsyncIterator, Optional, Set + +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from models import ScrapedJob +from spiders.base import ( + BaseSpider, + html_to_text, + extract_yoe, + extract_salary, + detect_remote, + detect_job_type, + safe_decimal, +) + +logger = logging.getLogger(__name__) + + +class ArbeitnowSpider(BaseSpider): + """ + Production spider for Arbeitnow.com job listings. + + Features: + - Pure aiohttp — no browser, no Playwright, no Cloudflare + - Paginated API with clean JSON responses + - Aggregates jobs from multiple ATS (Greenhouse, SmartRecruiters, Join.com) + - Good coverage of EU + remote positions + """ + + SOURCE_NAME = "arbeitnow" + BASE_URL = "https://www.arbeitnow.com/api/job-board-api" + MAX_PAGES = 50 # Safety limit + + def __init__(self, requests_per_minute: int = 20): + super().__init__(requests_per_minute=requests_per_minute) + + def _parse_job(self, raw: dict) -> Optional[ScrapedJob]: + """Parse an Arbeitnow API job object into a ScrapedJob.""" + try: + title = (raw.get("title") or "").strip() + company = (raw.get("company_name") or "").strip() + + if not title or not company: + return None + + # Description — HTML + desc_html = raw.get("description", "") + description = html_to_text(desc_html) + if len(description) < 10: + return None + + # URL + source_url = raw.get("url", "") or raw.get("link", "") + if not source_url: + slug = raw.get("slug", "") + if slug: + 
source_url = f"https://www.arbeitnow.com/view/{slug}" + if not source_url: + return None + + # Location + location = raw.get("location", "") + + # Remote detection + is_remote = raw.get("remote", False) + if not is_remote: + is_remote = detect_remote(description, location) + + # Tags → skills + tags = raw.get("tags", []) or [] + skills = [str(t) for t in tags if t] + + # Job type — from tags or description + job_type = None + for tag in tags: + jt = detect_job_type(str(tag)) + if jt: + job_type = jt + break + if not job_type: + job_type = detect_job_type(description[:1000]) + + # Salary — extract from description + salary_min, salary_max = extract_salary(description[:3000]) + + # YOE — extract from description + yoe = extract_yoe(description) + + # Posted date + posted_at = None + created_at_ts = raw.get("created_at") + if created_at_ts: + try: + if isinstance(created_at_ts, (int, float)): + posted_at = datetime.fromtimestamp(created_at_ts, tz=timezone.utc) + elif isinstance(created_at_ts, str): + posted_at = datetime.fromisoformat(created_at_ts.replace("Z", "+00:00")) + except (ValueError, OSError): + pass + + # Generate a stable source_id for dedup + slug = raw.get("slug", "") + requisition_id = slug or source_url + + return ScrapedJob( + title=title, + company_name=company, + description=description, + location=location if location else ("Remote" if is_remote else None), + salary_min=salary_min, + salary_max=salary_max, + job_type=job_type, + remote=is_remote, + source=self.SOURCE_NAME, + source_url=source_url, + skills_required=skills, + experience_required=yoe, + posted_at=posted_at, + is_active=True, + requisition_id=requisition_id, + ) + + except Exception as e: + logger.error(f"[arbeitnow] Parse error: {e}", exc_info=True) + self.errors += 1 + return None + + async def scrape( + self, known_ids: Optional[Set[str]] = None + ) -> AsyncIterator[ScrapedJob]: + """ + Scrape all Arbeitnow jobs with pagination. 
+ """ + known = known_ids or set() + logger.info({"event": "scrape_start", "source": self.SOURCE_NAME}) + + page = 1 + consecutive_empty = 0 + + try: + while page <= self.MAX_PAGES: + data = await self._get_json( + self.BASE_URL, + params={"page": page}, + ) + + if not data: + consecutive_empty += 1 + if consecutive_empty >= 2: + break + page += 1 + continue + + jobs_list = data.get("data", []) + if not jobs_list: + logger.info(f"[arbeitnow] No more jobs at page {page}") + break + + self.pages_scraped += 1 + consecutive_empty = 0 + new_count = 0 + + for raw_job in jobs_list: + slug = raw_job.get("slug", "") + url = raw_job.get("url", "") or raw_job.get("link", "") + dedup_key = slug or url + + if dedup_key in known: + continue + known.add(dedup_key) + if url: + known.add(url) + + job = self._parse_job(raw_job) + if job: + self.jobs_found += 1 + new_count += 1 + yield job + + if new_count > 0: + logger.info(f"[arbeitnow] Page {page}: {new_count} new jobs") + + # Check for next page + meta = data.get("meta", {}) or data.get("links", {}) + has_next = bool(meta.get("next")) if meta else len(jobs_list) > 0 + + if not has_next: + break + + page += 1 + + except Exception as e: + logger.error(f"[arbeitnow] Scrape failed: {e}", exc_info=True) + self.errors += 1 + finally: + await self.close() + + logger.info({ + "event": "scrape_complete", + "source": self.SOURCE_NAME, + "jobs_found": self.jobs_found, + "pages_scraped": self.pages_scraped, + "errors": self.errors, + }) diff --git a/apps/scraper/src/spiders/base.py b/apps/scraper/src/spiders/base.py new file mode 100644 index 0000000..4b84ecd --- /dev/null +++ b/apps/scraper/src/spiders/base.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python3 +""" +base.py +Abstract base spider with shared field extraction utilities. 
+ +All spiders inherit from BaseSpider, which provides: +- HTML → plaintext conversion +- YOE (years of experience) regex extraction +- Salary range regex extraction from free text +- Remote/onsite/hybrid detection +- Job type detection (full-time, part-time, contract) +- Rate limiting +- Metrics tracking +""" + +import asyncio +import logging +import re +import time +from abc import ABC, abstractmethod +from decimal import Decimal, InvalidOperation +from html import unescape +from typing import AsyncIterator, Optional, Set, Tuple, Any + +import aiohttp + +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from models import ScrapedJob + +logger = logging.getLogger(__name__) + +# ─── HTML Cleaning ──────────────────────────────────────────────── + +_BLOCK_TAG_RE = re.compile(r"", re.I) +_BR_TAG_RE = re.compile(r"", re.I) +_ALL_TAGS_RE = re.compile(r"<[^>]+>") +_MULTI_SPACE_RE = re.compile(r"[ \t]+") +_MULTI_NEWLINE_RE = re.compile(r"\n{3,}") + + +def html_to_text(html: str) -> str: + """Convert HTML to plain text without external dependencies.""" + if not html: + return "" + text = _BR_TAG_RE.sub("\n", html) + text = _BLOCK_TAG_RE.sub("\n", text) + text = _ALL_TAGS_RE.sub(" ", text) + text = unescape(text) + text = _MULTI_SPACE_RE.sub(" ", text) + text = _MULTI_NEWLINE_RE.sub("\n\n", text) + return text.strip() + + +# ─── Field Extraction ───────────────────────────────────────────── + +# Matches patterns like: "3+ years", "5-7 years", "2 years of experience", +# "minimum 3 years", "at least 5+ years", "3-5 yrs" +_YOE_PATTERNS = [ + # "3-5 years" range — MUST be before single-number patterns + re.compile(r"(\d+)\s*[-–—to]+\s*(\d+)\s*(?:years?|yrs?)", re.I), + # "minimum 3 years" / "at least 3 years" + re.compile(r"(?:minimum|min|at\s+least)\s*(\d+)\s*(?:years?|yrs?)", re.I), + # "5+ years of experience" + re.compile(r"(\d+)\+?\s*(?:years?|yrs?)\s*(?:of\s+)?(?:experience|exp)", re.I), + # "3+ years" standalone + 
re.compile(r"(\d+)\+\s*(?:years?|yrs?)", re.I), + # "experience: 3 years" or "experience required: 5 years" + re.compile(r"experience\s*(?:required)?\s*:?\s*(\d+)\s*(?:years?|yrs?)", re.I), +] + +# Salary patterns — matches "$80,000", "$80k", "$120,000 - $150,000", "80k-120k" +_SALARY_PATTERNS = [ + # "$80,000 - $150,000" or "$80,000-$150,000" or "$80k - $150k" + re.compile( + r"\$\s*([\d,]+(?:\.\d+)?)\s*[kK]?\s*[-–—to]+\s*\$?\s*([\d,]+(?:\.\d+)?)\s*[kK]?", + re.I, + ), + # "80k-150k" without dollar sign + re.compile( + r"([\d,]+)\s*[kK]\s*[-–—to]+\s*([\d,]+)\s*[kK]", + re.I, + ), + # Single salary "$120,000" or "$120k" + re.compile(r"\$\s*([\d,]+(?:\.\d+)?)\s*[kK]?", re.I), +] + +_REMOTE_KEYWORDS = { + "remote", "work from home", "wfh", "fully remote", + "100% remote", "remote-first", "remote first", + "work remotely", "anywhere", "distributed", +} + +_ONSITE_KEYWORDS = { + "on-site", "onsite", "on site", "in-office", "in office", + "office-based", "office based", +} + +_HYBRID_KEYWORDS = { + "hybrid", "flex", "flexible location", +} + + +def extract_yoe(text: str) -> Optional[str]: + """ + Extract years of experience from free text. + Returns strings like "3+ years", "5-7 years", or None. + """ + if not text: + return None + + for pattern in _YOE_PATTERNS: + match = pattern.search(text) + if match: + groups = match.groups() + if len(groups) == 2 and groups[1]: + return f"{groups[0]}-{groups[1]} years" + return f"{groups[0]}+ years" + + return None + + +def extract_salary(text: str) -> Tuple[Optional[Decimal], Optional[Decimal]]: + """ + Extract salary range from free text. + Returns (min, max) as Decimal, normalizing 'k' to thousands. 
+ """ + if not text: + return None, None + + for pattern in _SALARY_PATTERNS: + match = pattern.search(text) + if match: + groups = match.groups() + try: + values = [] + for g in groups: + if g: + # Remove commas + clean = g.replace(",", "") + val = Decimal(clean) + # Check if the original text had 'k' after this number + pos = match.end() + suffix = text[match.start():pos + 5].lower() + if "k" in suffix and val < 1000: + val *= 1000 + values.append(val) + + if len(values) == 2: + return min(values), max(values) + elif len(values) == 1: + return values[0], None + except (InvalidOperation, ValueError): + continue + + return None, None + + +def safe_decimal(value: Any) -> Optional[Decimal]: + """Safely convert a value to Decimal, returning None on failure.""" + if value is None: + return None + try: + return Decimal(str(value)) + except (InvalidOperation, ValueError, TypeError): + return None + + +def detect_remote(text: str, location: Optional[str] = None) -> bool: + """ + Detect if a job is remote based on text content and location. + """ + combined = f"{text or ''} {location or ''}".lower() + + for keyword in _REMOTE_KEYWORDS: + if keyword in combined: + return True + + return False + + +def detect_job_type(text: str) -> Optional[str]: + """ + Detect job type from text. Returns normalized string. + Uses word boundary matching to avoid false positives + (e.g., 'international' should NOT match 'internship'). 
+ """ + if not text: + return None + + lower = text.lower() + + if "full-time" in lower or "full time" in lower or "fulltime" in lower: + return "full_time" + if "part-time" in lower or "part time" in lower or "parttime" in lower: + return "part_time" + if "contract" in lower or "freelance" in lower: + return "contract" + # Use regex word boundary to avoid 'international' / 'internal' matching + if re.search(r'\binternship\b', lower): + return "internship" + if re.search(r'\bintern\b', lower) and not re.search(r'\bintern(al|ation)', lower): + return "internship" + if "temporary" in lower or re.search(r'\btemp\b', lower): + return "temporary" + + return None + + +def detect_workplace_type(text: str, location: Optional[str] = None) -> str: + """ + Detect workplace type: 'remote', 'hybrid', or 'onsite'. + """ + combined = f"{text or ''} {location or ''}".lower() + + for keyword in _REMOTE_KEYWORDS: + if keyword in combined: + return "remote" + + for keyword in _HYBRID_KEYWORDS: + if keyword in combined: + return "hybrid" + + for keyword in _ONSITE_KEYWORDS: + if keyword in combined: + return "onsite" + + return "onsite" # Default assumption + + +# ─── Base Spider ────────────────────────────────────────────────── + + +class BaseSpider(ABC): + """ + Abstract base for all API-based spiders. 
+ + Provides: + - aiohttp session management + - Rate limiting + - Metrics tracking + - Shared field extraction methods + """ + + SOURCE_NAME: str = "unknown" + + def __init__(self, requests_per_minute: int = 20): + self._min_interval = 60.0 / requests_per_minute + self._last_request_at = 0.0 + self._session: Optional[aiohttp.ClientSession] = None + + # Metrics — reset between cycles by the orchestrator + self.jobs_found = 0 + self.pages_scraped = 0 + self.errors = 0 + + async def _ensure_session(self) -> aiohttp.ClientSession: + """Lazily create an aiohttp session.""" + if self._session is None or self._session.closed: + self._session = aiohttp.ClientSession( + headers={ + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/131.0.0.0 Safari/537.36" + ), + "Accept": "application/json", + }, + timeout=aiohttp.ClientTimeout(total=30), + ) + return self._session + + async def _throttle(self) -> None: + """Enforce minimum interval between outbound requests.""" + elapsed = time.monotonic() - self._last_request_at + if elapsed < self._min_interval: + await asyncio.sleep(self._min_interval - elapsed) + self._last_request_at = time.monotonic() + + async def _get_json(self, url: str, params: dict = None) -> Optional[dict]: + """GET a URL and return parsed JSON, with rate limiting and error handling.""" + await self._throttle() + session = await self._ensure_session() + + try: + async with session.get(url, params=params) as resp: + if resp.status == 429: + retry_after = int(resp.headers.get("Retry-After", 60)) + logger.warning(f"[{self.SOURCE_NAME}] Rate limited. 
Waiting {retry_after}s...") + await asyncio.sleep(retry_after) + return None + + if resp.status != 200: + logger.warning(f"[{self.SOURCE_NAME}] GET {url} returned {resp.status}") + return None + + return await resp.json() + + except asyncio.TimeoutError: + logger.warning(f"[{self.SOURCE_NAME}] Timeout on {url}") + self.errors += 1 + return None + except Exception as e: + logger.error(f"[{self.SOURCE_NAME}] Request failed: {e}") + self.errors += 1 + return None + + @abstractmethod + async def scrape( + self, known_ids: Optional[Set[str]] = None + ) -> AsyncIterator[ScrapedJob]: + """Scrape all jobs. Yields ScrapedJob objects.""" + ... + + async def scrape_all(self, known_ids: Optional[Set[str]] = None) -> list: + """Scrape all jobs and return as a list.""" + jobs = [] + async for job in self.scrape(known_ids): + jobs.append(job) + return jobs + + async def close(self) -> None: + """Close the aiohttp session.""" + if self._session and not self._session.closed: + await self._session.close() + self._session = None + + def get_metrics(self) -> dict: + """Return current cycle metrics.""" + return { + "source": self.SOURCE_NAME, + "jobs_found": self.jobs_found, + "pages_scraped": self.pages_scraped, + "errors": self.errors, + } diff --git a/apps/scraper/src/spiders/greenhouse.py b/apps/scraper/src/spiders/greenhouse.py new file mode 100644 index 0000000..02f5d0e --- /dev/null +++ b/apps/scraper/src/spiders/greenhouse.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +""" +greenhouse.py +Spider for Greenhouse ATS public job board API. 
+ +Endpoint: https://boards-api.greenhouse.io/v1/boards/{token}/jobs?content=true +- Free, no auth, no Cloudflare +- Returns richly structured job data per company +- Targets curated list of high-profile tech companies +""" + +import logging +from datetime import datetime, timezone +from typing import AsyncIterator, Optional, Set, List + +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from models import ScrapedJob +from spiders.base import ( + BaseSpider, + html_to_text, + extract_yoe, + extract_salary, + detect_remote, + detect_job_type, + safe_decimal, +) + +logger = logging.getLogger(__name__) + +# Curated list of high-profile companies with public Greenhouse boards. +# Board tokens are typically the company subdomain on greenhouse. +# Add/remove companies as needed — these are all verified public boards. +COMPANY_BOARDS = [ + # Big Tech / Unicorns + "stripe", + "figma", + "notion", + "cloudflare", + "datadog", + "vercel", + "linear", + "supabase", + "dbt labs", + "airbyte", + "gitlabcom", + "hashicorp", + "confluent", + "snyk", + # Growth Stage + "postman", + "retool", + "airtable", + "mux", + "render", + "sentry", + "grafanalabs", + "planetscale", + "railway", + # Enterprise + "twilio", + "gusto", + "brex", + "ramp", + "navan", + "plaid", + "benchling", + "vanta", +] + + +class GreenhouseSpider(BaseSpider): + """ + Production spider for Greenhouse ATS public job boards. 
+ + Features: + - Pure aiohttp — no browser, no Playwright, zero Cloudflare friction + - Iterates through curated list of company boards + - Returns richest structured data: departments, offices, content (HTML) + - Parses compensation, YOE, remote status from content + """ + + SOURCE_NAME = "greenhouse" + API_BASE = "https://boards-api.greenhouse.io/v1/boards" + + def __init__( + self, + requests_per_minute: int = 30, + company_boards: Optional[List[str]] = None, + ): + super().__init__(requests_per_minute=requests_per_minute) + self.boards = company_boards or COMPANY_BOARDS + + def _parse_job(self, raw: dict, board_token: str) -> Optional[ScrapedJob]: + """Parse a Greenhouse API job object into a ScrapedJob.""" + try: + gh_id = raw.get("id") + title = (raw.get("title") or "").strip() + + if not title or not gh_id: + return None + + # Content — Greenhouse provides rich HTML content + content_html = raw.get("content", "") + description = html_to_text(content_html) + if len(description) < 10: + return None + + # Company name — from the board token, capitalized + company_name = board_token.replace("-", " ").replace("_", " ").title() + + # Location — from offices and location fields + location_obj = raw.get("location", {}) or {} + location_name = location_obj.get("name", "") + + offices = raw.get("offices", []) or [] + if not location_name and offices: + office_names = [o.get("name", "") for o in offices if o.get("name")] + location_name = ", ".join(office_names[:3]) + + # Remote detection — from location and content + is_remote = detect_remote(description, location_name) + + # Departments → skills/categories + departments = raw.get("departments", []) or [] + dept_names = [d.get("name", "") for d in departments if d.get("name")] + + # Job URL + source_url = raw.get("absolute_url", "") + if not source_url: + source_url = f"https://boards.greenhouse.io/{board_token}/jobs/{gh_id}" + + # Salary — extract from content + salary_min, salary_max = 
extract_salary(description[:5000]) + + # Check metadata for compensation (some boards include it) + metadata = raw.get("metadata", []) or [] + for meta in metadata: + if meta.get("name", "").lower() in ("compensation", "salary", "pay"): + comp_text = str(meta.get("value", "")) + if comp_text: + s_min, s_max = extract_salary(comp_text) + if s_min: + salary_min = s_min + if s_max: + salary_max = s_max + + # YOE — extract from content + yoe = extract_yoe(description) + + # Job type — from content + job_type = detect_job_type(description[:2000]) + + # Posted date + posted_at = None + updated_at_str = raw.get("updated_at") or raw.get("first_published_at") + if updated_at_str: + try: + posted_at = datetime.fromisoformat( + updated_at_str.replace("Z", "+00:00") + ) + except (ValueError, AttributeError): + pass + + return ScrapedJob( + title=title, + company_name=company_name, + description=description, + location=location_name or ("Remote" if is_remote else None), + salary_min=salary_min, + salary_max=salary_max, + job_type=job_type, + remote=is_remote, + source=self.SOURCE_NAME, + source_url=source_url, + skills_required=dept_names, + experience_required=yoe, + posted_at=posted_at, + is_active=True, + requisition_id=f"gh-{board_token}-{gh_id}", + ) + + except Exception as e: + logger.error(f"[greenhouse] Parse error for {board_token}: {e}", exc_info=True) + self.errors += 1 + return None + + async def _scrape_board( + self, board_token: str, known: Set[str] + ) -> AsyncIterator[ScrapedJob]: + """Scrape all jobs from a single Greenhouse board.""" + url = f"{self.API_BASE}/{board_token}/jobs" + data = await self._get_json(url, params={"content": "true"}) + + if not data: + return + + jobs_list = data.get("jobs", []) + if not jobs_list: + return + + self.pages_scraped += 1 + new_count = 0 + + for raw_job in jobs_list: + gh_id = raw_job.get("id") + dedup_key = f"gh-{board_token}-{gh_id}" + + if dedup_key in known: + continue + known.add(dedup_key) + + job = 
self._parse_job(raw_job, board_token) + if job: + self.jobs_found += 1 + new_count += 1 + yield job + + if new_count > 0: + logger.info(f"[greenhouse] Board '{board_token}': {new_count} jobs") + + async def scrape( + self, known_ids: Optional[Set[str]] = None + ) -> AsyncIterator[ScrapedJob]: + """ + Scrape all jobs across all configured Greenhouse company boards. + """ + known = known_ids or set() + logger.info({ + "event": "scrape_start", + "source": self.SOURCE_NAME, + "boards": len(self.boards), + }) + + try: + for board_token in self.boards: + try: + async for job in self._scrape_board(board_token, known): + yield job + except Exception as e: + logger.warning(f"[greenhouse] Board '{board_token}' failed: {e}") + self.errors += 1 + + except Exception as e: + logger.error(f"[greenhouse] Scrape failed: {e}", exc_info=True) + self.errors += 1 + finally: + await self.close() + + logger.info({ + "event": "scrape_complete", + "source": self.SOURCE_NAME, + "jobs_found": self.jobs_found, + "boards_scraped": self.pages_scraped, + "errors": self.errors, + }) diff --git a/apps/scraper/src/spiders/hiring_cafe.py b/apps/scraper/src/spiders/hiring_cafe.py index 6ea67be..f403800 100644 --- a/apps/scraper/src/spiders/hiring_cafe.py +++ b/apps/scraper/src/spiders/hiring_cafe.py @@ -1,35 +1,39 @@ #!/usr/bin/env python3 """ hiring_cafe.py -aiohttp-based spider for hiring.cafe's API. +Spider for hiring.cafe's JSON API. -Scraping flow: - 1. GET homepage → extract Next.js buildId from __NEXT_DATA__ - 2. GET /api/search-jobs?offset=N&limit=M → paginate job card IDs - 3. 
GET /_next/data/{buildId}/viewjob/{id}.json → full structured JSON per job +CHANGELOG: +- Implemented rotating User-Agents for robust Cloudflare bypassing +- Added pagination circuit breaker to prevent infinite loops on stale API offsets +- Skipping detail fetches automatically if job ID is in known_ids +- Migrated to Playwright and playwright-stealth to autonomously run JS and defeat Cloudflare Turnstile blocks, reusing cf_clearance cookies. """ import asyncio import logging import re import time +import random from decimal import Decimal, InvalidOperation -from typing import AsyncIterator, Optional, Dict, Any, List +from typing import AsyncIterator, Optional, Dict, Any, List, Set, Tuple from datetime import datetime, timezone from html import unescape -import aiohttp import json +import os +import sys +from pathlib import Path + from tenacity import ( retry, stop_after_attempt, wait_exponential, retry_if_exception_type, ) -from playwright.async_api import async_playwright, Playwright, Browser, BrowserContext, Page, Error as PlaywrightError -import sys -from pathlib import Path +from playwright.async_api import async_playwright, Page, Error as PlaywrightError +from playwright_stealth import Stealth sys.path.insert(0, str(Path(__file__).parent.parent)) @@ -74,36 +78,23 @@ def _safe_decimal(value: Any) -> Optional[Decimal]: class HiringCafeSpider: """ - Production spider for hiring.cafe. - - Uses the site's public API to discover and fetch job data: - - GET /api/search-jobs for paginated search - - GET /api/search-jobs/get-total-count for total count - - GET /_next/data/{buildId}/viewjob/{id}.json for full job details + Production spider for hiring.cafe using Playwright. 
Features: - - Auto-discovers and refreshes Next.js build ID each cycle + - Autonomously bypasses Cloudflare JS Challenges using a stealth Chromium instance + - Reuses clearance cookies for raw headless HTTP fetches avoiding constant popups + - GET /api/search-jobs for paginated search (returns full job records) + - GET /api/search-jobs/get-total-count for total count + - GET /_next/data/{buildId}/viewjob/{id}.json for extra detail (optional) - Configurable rate limiting (RPM) - Exponential backoff on 429 / transient errors - - Requisition ID as natural dedup key + - requisition_id as natural dedup key """ BASE = "https://hiring.cafe" SEARCH_URL = "https://hiring.cafe/api/search-jobs" COUNT_URL = "https://hiring.cafe/api/search-jobs/get-total-count" - _BROWSER_HEADERS: Dict[str, str] = { - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/122.0.0.0 Safari/537.36" - ), - "Accept": "application/json, text/plain, */*", - "Accept-Language": "en-US,en;q=0.9", - "Origin": "https://hiring.cafe", - "Referer": "https://hiring.cafe/", - } - _BUILD_ID_RE = re.compile(r'"buildId"\s*:\s*"([^"]+)"') def __init__( @@ -117,11 +108,15 @@ def __init__( self._max_pages = max_pages self._last_request_at = 0.0 - self._playwright: Optional[Playwright] = None - self._browser: Optional[Browser] = None - self._context: Optional[BrowserContext] = None - self._page: Optional[Page] = None self._build_id: Optional[str] = None + + # Session path + self._abs_root = Path("/Users/apple/Desktop/Postly/apps/scraper") + self._session_path = self._abs_root / ".sessions/hiring_cafe" + self._cookies_file = self._session_path / "cookies.json" + + # Injected runtime via scrape() execution loop + self._page: Optional[Page] = None # Metrics — reset between cycles by the orchestrator self.jobs_found = 0 @@ -129,70 +124,279 @@ def __init__( self.detail_fetches = 0 self.errors = 0 - # ─── Session 
────────────────────────────────────────────────── + async def close(self) -> None: + """Compatibility signature.""" + pass + + # ─── Cloudflare Clearance & Session ─────────────────────────── + + def _get_chromium_args(self) -> List[str]: + """Return platform-specific Chromium arguments to avoid native segfaults.""" + if sys.platform == "darwin": # macOS + return [ + "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage", + "--disable-infobars", + "--window-size=1920,1080", + "--start-maximized", + "--lang=en-US", + "--exclude-switches=enable-automation", + "--disable-extensions-except=", + "--disable-gpu-sandbox", + "--no-first-run", + "--no-default-browser-check", + "--disable-features=IsolateOrigins,site-per-process", + ] + + async def _simulate_human_behavior(self, page: Page): + """Simulate realistic human interaction during the CF challenge window.""" + try: + # Move mouse to a random spot + await page.mouse.move(random.randint(100, 700), random.randint(100, 500), steps=10) + + # Natural scroll (shorter, less blocking) + if random.random() > 0.5: + scroll = random.randint(100, 400) + await page.evaluate(f"window.scrollBy({{top: {scroll}, behavior: 'smooth'}});") + await asyncio.sleep(0.5) + await page.evaluate(f"window.scrollBy({{top: -{random.randint(50, 100)}, behavior: 'smooth'}});") + + await asyncio.sleep(random.uniform(0.1, 0.5)) + except Exception as e: + logger.debug(f"Behavior simulation partial failure: {e}") - async def _get_page(self) -> Page: - if not self._page: - try: - if not self._playwright: - self._playwright = await async_playwright().start() - if not self._browser: - self._browser = await self._playwright.chromium.launch( - headless=True, - args=[ - "--disable-blink-features=AutomationControlled", - "--no-sandbox", - "--disable-setuid-sandbox", - "--disable-dev-shm-usage", - ], - ) - if not self._context: - self._context = await self._browser.new_context( - user_agent=self._BROWSER_HEADERS["User-Agent"], - 
viewport={"width": 1280, "height": 800} - ) - if not self._page: - self._page = await self._context.new_page() - except Exception as e: - # Need to clean up state so we truly retry from scratch - logger.error(f"Playwright initialization failed: {e}") - if self._playwright: - await self._playwright.stop() - self._playwright = None - self._browser = None - self._context = None - self._page = None - raise e - return self._page + async def _save_session(self, context) -> None: + """Save cookies to the session file.""" + try: + self._session_path.mkdir(parents=True, exist_ok=True) + cookies = await context.cookies() + with open(self._cookies_file, "w") as f: + json.dump(cookies, f, indent=2) + logger.info(f"💾 Saved {len(cookies)} cookies to {self._cookies_file}") + except Exception as e: + logger.warning(f"Failed to save session: {e}") - async def close(self) -> None: - if self._context: - await self._context.close() - if self._browser: - await self._browser.close() - if self._playwright: - await self._playwright.stop() - logger.info("Spider Playwright session closed") - - async def _reset_browser(self) -> None: - """Reset browser state after a crash.""" - if self._page: - try: - await self._page.close() - except: - pass - if self._context: + async def _load_session(self, context) -> bool: + """Load cookies from the session file. Returns True if restored.""" + try: + if self._cookies_file.exists(): + with open(self._cookies_file, "r") as f: + cookies = json.load(f) + await context.add_cookies(cookies) + logger.info(f"♻️ Restored {len(cookies)} cookies from session.") + return True + except Exception as e: + logger.warning(f"Failed to load session: {e}") + return False + + async def _wait_for_clearance(self, page: Page, timeout_ms: int = 60000) -> bool: + """ + Handles BOTH Managed (invisible) and Interactive (iframe) CF challenges. + Returns True if clearance was obtained. 
+ """ + start_time = asyncio.get_event_loop().time() + timeout_sec = timeout_ms / 1000.0 + + TURNSTILE_SELECTORS = [ + "#AOzYg6", # Primary Turnstile container found by investigation + "iframe[src*='challenges.cloudflare.com']", + "iframe[src*='challenge-platform']", + "iframe[title*='Cloudflare']", + "iframe[id*='cf-chl-widget']", + "#cf-turnstile", + ".cf-turnstile", + ] + + logger.info(f"Executing behavioral simulation and waiting for challenge to settle ({timeout_sec}s)...") + + while (asyncio.get_event_loop().time() - start_time) < timeout_sec: + # 1. Behavioral simulation loop (partial) + await self._simulate_human_behavior(page) + + # 2. Check: Cookie set (Managed Challenge solved silently or cookie restored) + cookies = await page.context.cookies() + if any(c["name"] == "cf_clearance" for c in cookies): + logger.info("✅ cf_clearance cookie captured.") + return True + + # 3. Check: Page title cleared try: - await self._context.close() - except: + title = await page.title() + if "just a moment" not in title.lower() and "attention required" not in title.lower(): + logger.info(f"✅ Challenge passed based on title: {title}") + return True + except Exception: pass - self._page = None - self._context = None + + # 4. Check: Turnstile Challenge (Advanced Frame Search) + solved_this_loop = False + for selector in TURNSTILE_SELECTORS: + try: + locator = page.locator(selector).first + if await locator.count() > 0: + # Fallback 1: Try to find the checkbox in ANY frame on the page + for frame in page.frames: + try: + checkbox = frame.locator("input[type='checkbox']").first + if await checkbox.count() > 0 and await checkbox.is_visible(): + logger.info(f"🔲 Turnstile checkbox found in frame '{frame.name or 'unnamed'}'. 
Clicking...") + await checkbox.click(timeout=3000) + solved_this_loop = True + break + except Exception: + continue + + if solved_this_loop: + break + + # Fallback 2: Pixel click the LEFT-CENTER of the container (where the box actually is) + logger.info(f"🔲 Turnstile container '{selector}' visible. Attempting left-side pixel click...") + box = await locator.bounding_box() + if box: + # The checkbox in Cloudflare Turnstile is typically on the left side. + # We click ~30px from the left and center vertically. + target_x = box["x"] + 30 + target_y = box["y"] + box["height"] / 2 + await page.mouse.click(target_x, target_y) + solved_this_loop = True + break + + logger.info(f"🔲 Turnstile element '{selector}' detected. Waiting for settlement...") + break + except Exception: + continue + + await asyncio.sleep(1) + + # Failure Diagnostics + try: + diag_path = self._abs_root / "debug_clearance.png" + await page.screenshot(path=str(diag_path)) + logger.warning(f"❌ Clearance timed out. Screenshot saved to {diag_path}") + except Exception as e: + logger.debug(f"Failed to capture diagnostic screenshot: {e}") + + return False + + async def _get_clearance(self, playwright) -> Tuple[Any, Any, Page, str]: + """ + Launch a real browser, solve the CF challenge, return browser, context, page, UA. 
+ """ + is_headless = os.getenv("HEADLESS", "True").lower() in ("true", "1", "t") + logger.info(f"Initializing {'Headless' if is_headless else 'Headed'} Chromium/Chrome for Cloudflare Clearance...") + + launch_args = self._get_chromium_args() + + # Ensure no-sandbox for execution environment compatibility + if "--no-sandbox" not in launch_args: + launch_args.append("--no-sandbox") + + user_data_dir = self._session_path / "browser_data" + user_data_dir.mkdir(parents=True, exist_ok=True) + + async def launch_with_retry(): + # Attempt 1: Real Chrome (Best TLS) + # Attempt 2: Bundled Chromium (Fallback) + channels = ["chrome", None] if sys.platform == "darwin" else [None] + + last_err = None + for channel in channels: + try: + logger.info(f"Targeting channel: {channel or 'bundled chromium'}...") + return await playwright.chromium.launch_persistent_context( + user_data_dir=str(user_data_dir.absolute()), + headless=is_headless, + args=launch_args, + channel=channel, + ignore_default_args=["--enable-automation"], + user_agent=( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/131.0.0.0 Safari/537.36" + ), + viewport={"width": 1440, "height": 900}, + device_scale_factor=2, + locale="en-US", + ) + except Exception as e: + last_err = e + if "SingletonSocket" in str(e) or "ProcessSingleton" in str(e): + logger.warning(f"Browser isolation error on {channel}: {e}. Retrying with different engine...") + continue + raise e + raise last_err + + # Using 1440x900 Retina display scale for better fingerprint + context = await launch_with_retry() + + # Patching detection vectors aggressively + await context.add_init_script(""" + // 1. Remove webdriver + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + + // 2. Mock chrome runtime + window.chrome = { + runtime: {}, + loadTimes: function() {}, + csi: function() {}, + app: {} + }; + + // 3. 
Mock permissions + try { + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? + Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + } catch(e) {} + + // 4. Mock CPU cores (MacBook typically 8+) + Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 }); + + // 5. Languages + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + """) + + page = context.pages[0] if context.pages else await context.new_page() + + # High-fidelity Stealth CDP Injection + stealth = Stealth( + navigator_plugins=True, + navigator_permissions=True, + ) + try: + await stealth.apply_stealth_async(page) + logger.info("playwright-stealth patches applied successfully") + except Exception as e: + logger.warning(f"playwright-stealth partial failure (continuing): {e}") + + # Try restoring existing session cookies + await self._load_session(context) + + # Navigate + logger.info(f"Navigating to {self.BASE} to verify session or clear challenges...") + try: + await page.goto(self.BASE, wait_until="domcontentloaded", timeout=30000) + except Exception as e: + logger.warning(f"Navigation error: {e}") + + # Wait for clearance + success = await self._wait_for_clearance(page) + + if success: + await self._save_session(context) + else: + logger.warning("Proceeding without confirmed clearance (may fail with 403)") + + ua = await page.evaluate("navigator.userAgent") + return None, context, page, ua # ─── Rate Limiting ──────────────────────────────────────────── async def _throttle(self) -> None: - """Enforce minimum interval between outbound requests.""" + """Enforce minimum interval between outbound API requests.""" elapsed = time.monotonic() - self._last_request_at if elapsed < self._min_interval: await asyncio.sleep(self._min_interval - elapsed) @@ -208,42 +412,40 @@ async def _throttle(self) -> None: async def 
_discover_build_id(self) -> str: """Fetch the homepage and extract the Next.js buildId.""" await self._throttle() - page = await self._get_page() + logger.info(f"Fetching homepage for buildId using authorized session...") - try: - response = await page.goto(self.BASE, wait_until="domcontentloaded") - except Exception as e: - if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower(): - await self._reset_browser() - raise e + resp = await self._page.request.get(self.BASE) + status = resp.status + logger.info(f"Homepage response: {status}") - if not response or not response.ok: - raise PlaywrightError(f"Homepage returned {response.status if response else 'None'}") - - html = await page.content() + if status == 403: + logger.warning("Homepage 403 — Cloudflare block persists after clearance") + await asyncio.sleep(30) + raise PlaywrightError("Homepage 403 — Cloudflare block") + + if status != 200: + raise PlaywrightError(f"Homepage returned {status}") + + body_bytes = await resp.body() + html = body_bytes.decode('utf-8', errors='ignore') match = self._BUILD_ID_RE.search(html) if not match: - raise ValueError("Could not find buildId in __NEXT_DATA__") + logger.warning("Could not find buildId in homepage HTML.") + raise PlaywrightError("buildId not found in homepage") build_id = match.group(1) logger.info({"event": "build_id_discovered", "build_id": build_id}) return build_id async def _ensure_build_id(self) -> None: + """Try to get buildId, but don't fail the whole scrape if it doesn't work.""" if not self._build_id: - self._build_id = await self._discover_build_id() - - async def _refresh_build_id(self) -> None: - """Force-refresh the build ID (e.g. 
after a 404 on detail fetch).""" - old = self._build_id - self._build_id = await self._discover_build_id() - if self._build_id != old: - logger.info({ - "event": "build_id_rotated", - "old": old, - "new": self._build_id, - }) + try: + self._build_id = await self._discover_build_id() + except Exception as e: + logger.warning(f"BuildId discovery failed: {e}. Skipping detail fetches.") + self._build_id = None # ─── Search API ─────────────────────────────────────────────── @@ -253,46 +455,50 @@ async def _refresh_build_id(self) -> None: retry=retry_if_exception_type((PlaywrightError, asyncio.TimeoutError)), ) async def _search_page(self, offset: int) -> Dict[str, Any]: - """GET /api/search-jobs with query params for a page of results.""" + """GET /api/search-jobs — returns JSON directly.""" await self._throttle() - page = await self._get_page() url = f"{self.SEARCH_URL}?offset={offset}&limit={self._page_size}" - - try: - response = await page.goto(url, wait_until="domcontentloaded") - except Exception as e: - if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower(): - await self._reset_browser() - raise e + logger.debug(f"Searching offset {offset}...") + + resp = await self._page.request.get(url) + status = resp.status - if response.status == 429: + if status == 429: retry_after = 60 + if "Retry-After" in resp.headers: + retry_after = int(resp.headers["Retry-After"]) logger.warning({"event": "rate_limited", "retry_after": retry_after}) await asyncio.sleep(retry_after) raise PlaywrightError("Rate limited on search") - if not response.ok: - logger.error({"event": "search_error", "status": response.status}) - raise PlaywrightError(f"Search returned {response.status}") + if status == 403: + logger.warning("Search 403 — Cloudflare challenge expired or failed.") + await asyncio.sleep(30) + raise PlaywrightError("Search returned 403") + + if status != 200: + logger.error({"event": "search_error", "status": status}) + raise 
PlaywrightError(f"Search returned {status}") - text = await page.evaluate("document.body.innerText") try: - return json.loads(text) - except json.JSONDecodeError: - raise PlaywrightError("Failed to parse search JSON") + data = await resp.json() + except Exception as e: + body_bytes = await resp.body() + text = body_bytes.decode('utf-8', errors='ignore') + logger.error(f"Failed to parse search JSON. Body preview: {text[:300]}") + raise PlaywrightError(f"Invalid JSON from search API: {e}") + + return data async def _get_total_count(self) -> int: """GET /api/search-jobs/get-total-count → total available jobs.""" await self._throttle() - page = await self._get_page() try: - response = await page.goto(self.COUNT_URL, wait_until="domcontentloaded") - if response and response.ok: - text = await page.evaluate("document.body.innerText") - data = json.loads(text) - # Response shape: { "total": 114789, "collapsedTotal": 4047 } + resp = await self._page.request.get(self.COUNT_URL) + if resp.status == 200: + data = await resp.json() if isinstance(data, int): total = data elif isinstance(data, dict): @@ -301,13 +507,13 @@ async def _get_total_count(self) -> int: total = 0 logger.info({"event": "total_count", "total": total}) return total + else: + logger.warning(f"Count API returned {resp.status}") except Exception as exc: logger.warning(f"Could not get total count: {exc}") - if "crashed" in str(exc).lower() or "closed" in str(exc).lower() or "timeout" in str(exc).lower(): - await self._reset_browser() return 0 - # ─── Job Detail ─────────────────────────────────────────────── + # ─── Job Detail (optional — needs buildId) ──────────────────── @retry( stop=stop_after_attempt(3), @@ -316,39 +522,41 @@ async def _get_total_count(self) -> int: ) async def _fetch_job_detail(self, requisition_id: str) -> Optional[Dict[str, Any]]: """ - GET /viewjob/{id} and extract __NEXT_DATA__ + GET /_next/data/{buildId}/viewjob/{id}.json for full structured data. 
+ Returns None if buildId is not available. """ - await self._ensure_build_id() - await self._throttle() - page = await self._get_page() + if not self._build_id: + return None - url = f"{self.BASE}/viewjob/{requisition_id}" + await self._throttle() - try: - response = await page.goto(url, wait_until="domcontentloaded", timeout=30000) - except Exception as e: - logger.warning(f"Timeout or error fetching detail for {requisition_id}. Will retry... Error: {e}") - if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower(): - await self._reset_browser() - raise e + url = f"{self.BASE}/_next/data/{self._build_id}/viewjob/{requisition_id}.json" - if response and response.ok: + resp = await self._page.request.get(url) + status = resp.status + + if status == 200: self.detail_fetches += 1 - html = await page.content() - match = re.search(r'', html, re.DOTALL) - if match: - try: - data = json.loads(match.group(1)) - return data.get("props", {}) - except json.JSONDecodeError: - return None + try: + data = await resp.json() + return data.get("pageProps", data) + except Exception: + return None + + if status == 404: + logger.debug(f"Detail 404 for {requisition_id} — buildId may be stale") return None - if response and response.status == 429: + if status == 429: await asyncio.sleep(30) raise PlaywrightError("Rate limited on detail") - logger.debug(f"Detail {response.status if response else 'None'} for {requisition_id}") + if status == 403: + logger.warning("Detail 403 — blocked on individual fetch") + await asyncio.sleep(30) + raise PlaywrightError("Rate limited / blocked on detail") + + logger.debug(f"Detail {status} for {requisition_id}") return None # ─── Parsing ────────────────────────────────────────────────── @@ -359,20 +567,19 @@ def _extract_requisition_id(card: Dict[str, Any]) -> Optional[str]: return card.get("requisition_id") or card.get("objectID") def _parse_job(self, raw: Dict[str, Any]) -> Optional[ScrapedJob]: - """Parse raw job 
detail JSON into ScrapedJob.""" + """ + Parse raw job data into ScrapedJob. + Handles both search result cards and detail page JSON. + """ try: if not raw: return None - - # Start with the main props - data = raw.get("pageProps", raw) - - # Merge nested job info if present, but keep parent fields + + data = raw.get("pageProps", raw) if "pageProps" in raw else raw + for key in ["job", "job_information"]: nested = data.get(key) if isinstance(nested, dict): - # Shallow merge: nested fields overwrite parent but we keep what's unique - # This ensures we get 'requisition_id' from parent and 'title' from child merged = data.copy() merged.update(nested) data = merged @@ -395,7 +602,6 @@ def _parse_job(self, raw: Dict[str, Any]) -> Optional[ScrapedJob]: logger.debug(f"Parsing failed: missing title or id. keys: {list(data.keys())}") return None - # Company company_data = ( data.get("enriched_company_data") or data.get("company_data") @@ -408,7 +614,6 @@ def _parse_job(self, raw: Dict[str, Any]) -> Optional[ScrapedJob]: or "Unknown" ) - # Description: HTML → plain text description_html = ( data.get("description") or data.get("job_description_html") @@ -416,22 +621,18 @@ def _parse_job(self, raw: Dict[str, Any]) -> Optional[ScrapedJob]: ) description = _html_to_text(description_html) - # Fall back to alternate field if len(description) < 10: description = data.get("description_clean") or data.get("job_description_text") or description - + if len(description) < 10: logger.debug(f"Description too short for {requisition_id}. 
Content: {description[:50]}") return None - # Processed data v5 = data.get("v5_processed_job_data") or data.get("processed_data") or {} - # Salary salary_min = _safe_decimal(v5.get("yearly_min_compensation") or data.get("yearly_min_compensation")) salary_max = _safe_decimal(v5.get("yearly_max_compensation") or data.get("yearly_max_compensation")) - # Location & remote workplace_type = (v5.get("workplace_type") or "").lower() is_remote = workplace_type == "remote" location = ( @@ -440,18 +641,14 @@ def _parse_job(self, raw: Dict[str, Any]) -> Optional[ScrapedJob]: or ("Remote" if is_remote else None) ) - # Skills raw_tools = v5.get("technical_tools") or data.get("skills_required") or [] skills = [str(t) for t in raw_tools if t] if isinstance(raw_tools, list) else [] - # Experience min_yoe = v5.get("min_industry_and_role_yoe") experience = f"{min_yoe}+ years" if min_yoe else None - # Job type job_type = data.get("employment_type") or v5.get("employment_type") - # Apply URL apply_url = ( data.get("apply_url") or f"{self.BASE}/viewjob/{requisition_id}" @@ -479,7 +676,6 @@ def _parse_job(self, raw: Dict[str, Any]) -> Optional[ScrapedJob]: "nb_employees": company_data.get("nb_employees"), }, ) - logger.debug(f"Parsed job successfully: {job.title} | {job.company_name}") return job except Exception as exc: @@ -491,29 +687,58 @@ def _parse_job(self, raw: Dict[str, Any]) -> Optional[ScrapedJob]: async def scrape( self, - known_ids: Optional[set] = None, + known_ids: Optional[Set[str]] = None, ) -> AsyncIterator[ScrapedJob]: """ - Full scrape cycle: discover IDs via search → fetch details → yield jobs. + Full scrape cycle using Playwright to bypass protections. - Args: - known_ids: Set of requisition_ids already in DB (skip these). + IMPORTANT: This is a best-effort spider. If Cloudflare blocks us, + we log a warning and yield nothing — we never crash the pipeline. 
+ The other API-based spiders (Remotive, Arbeitnow, Greenhouse) will + still provide jobs even if hiring.cafe is fully blocked. """ known = known_ids or set() - - logger.info({"event": "scrape_start", "source": "hiring_cafe"}) start_time = datetime.now(timezone.utc) + logger.info({"event": "scrape_start", "source": "hiring_cafe"}) - # Step 1: Discover build ID - await self._ensure_build_id() + try: + async with async_playwright() as pw: + try: + browser_obj, context, self._page, ua = await self._get_clearance(pw) + except Exception as e: + logger.warning( + f"[hiring_cafe] Cloudflare clearance failed — skipping this source. " + f"Other sources will still run. Error: {e}" + ) + self.errors += 1 + return + + try: + async for job in self._run_scrape_loop(known, start_time, total=None): + yield job + finally: + if browser_obj: + await browser_obj.close() + elif context: + await context.close() + self._page = None + except Exception as e: + logger.warning( + f"[hiring_cafe] Spider crashed — skipping this source. 
Error: {e}" + ) + self.errors += 1 - # Step 2: Get total count for progress logging + async def _run_scrape_loop(self, known: Set[str], start_time: datetime, total: Optional[int]) -> AsyncIterator[ScrapedJob]: + """Isolates the central loop iteration.""" + + await self._ensure_build_id() total = await self._get_total_count() - # Step 3: Paginate search results to collect requisition IDs offset = 0 page_num = 0 - all_req_ids: List[str] = [] + + seen_ids: Set[str] = set() + duplicate_streak: int = 0 while page_num < self._max_pages: try: @@ -532,36 +757,53 @@ async def scrape( if not hits: logger.info({"event": "pagination_complete", "pages": page_num}) break + + page_ids = {self._extract_requisition_id(c) for c in hits if self._extract_requisition_id(c)} + + if page_ids and page_ids.issubset(seen_ids): + duplicate_streak += 1 + if duplicate_streak >= 2: + logger.warning(f"Pagination loop detected, stopping early at offset {offset}") + break + else: + duplicate_streak = 0 + + seen_ids.update(page_ids) - page_req_ids = [] + new_on_page = 0 for card in hits: req_id = self._extract_requisition_id(card) - if req_id and req_id not in known: - page_req_ids.append(req_id) - all_req_ids.append(req_id) # keep for final metrics - - # Immediately fetch details for this page's new IDs - if page_req_ids: - logger.info(f"Discovered {len(page_req_ids)} new jobs on page {page_num + 1}. 
Fetching details...") - for rid in page_req_ids: + if not req_id or req_id in known: + continue + + known.add(req_id) + + job = self._parse_job(card) + + if not job and self._build_id: try: - result = await self._fetch_job_detail(rid) - if result: - job = self._parse_job(result) - if job: - self.jobs_found += 1 - yield job + detail = await self._fetch_job_detail(req_id) + if detail: + job = self._parse_job(detail) except Exception as exc: - logger.error(f"Detail fetch failed for {rid}: {exc}") + logger.debug(f"Detail fetch failed for {req_id}: {exc}") self.errors += 1 + if job: + self.jobs_found += 1 + new_on_page += 1 + yield job + + if new_on_page > 0: + logger.info(f"Page {page_num + 1}: found {new_on_page} new jobs") + self.pages_scraped += 1 page_num += 1 offset += self._page_size - + if page_num % 5 == 0: - logger.info(f"Progress: {page_num}/{self._max_pages} search pages processed. Total found: {self.jobs_found}") - + logger.info(f"Progress: {page_num}/{self._max_pages} pages. Total found: {self.jobs_found}") + if total and offset >= total: logger.info(f"Reached total {total} jobs") break @@ -579,7 +821,7 @@ async def scrape( async def scrape_all( self, - known_ids: Optional[set] = None, + known_ids: Optional[Set[str]] = None, ) -> List[ScrapedJob]: """Scrape all jobs and return as a list.""" jobs: List[ScrapedJob] = [] diff --git a/apps/scraper/src/spiders/remotive.py b/apps/scraper/src/spiders/remotive.py new file mode 100644 index 0000000..ec9552a --- /dev/null +++ b/apps/scraper/src/spiders/remotive.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +""" +remotive.py +Spider for Remotive's public REST API. 
+ +Endpoint: https://remotive.com/api/remote-jobs +- Free, no auth, no Cloudflare +- Returns remote-only jobs with salary, category, tags +- Rate limit: max 2 requests/minute (TOS) +""" + +import logging +import re +from datetime import datetime, timezone +from typing import AsyncIterator, Optional, Set + +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from models import ScrapedJob +from spiders.base import ( + BaseSpider, + html_to_text, + extract_yoe, + extract_salary, + detect_job_type, + safe_decimal, +) + +logger = logging.getLogger(__name__) + +# Categories to scrape — covers tech, design, data, devops, marketing +CATEGORIES = [ + "software-dev", + "design", + "data", + "devops-sysadmin", + "product", + "customer-support", + "marketing", + "qa", + "writing", + "hr", + "finance-legal", + "business", + "all-others", +] + + +class RemotiveSpider(BaseSpider): + """ + Production spider for Remotive.com remote job listings. + + Features: + - Pure aiohttp — no browser, no Playwright, no Cloudflare issues + - Category-based iteration for broad coverage + - Rich field extraction: salary, YOE, job_type + - All results are remote by definition + """ + + SOURCE_NAME = "remotive" + BASE_URL = "https://remotive.com/api/remote-jobs" + + def __init__(self, requests_per_minute: int = 2): + # Remotive TOS: max 2 req/min, max 4 fetches/day + super().__init__(requests_per_minute=requests_per_minute) + + def _parse_job(self, raw: dict) -> Optional[ScrapedJob]: + """Parse a Remotive API job object into a ScrapedJob.""" + try: + job_id = raw.get("id") + title = raw.get("title", "").strip() + company = raw.get("company_name", "").strip() + + if not title or not company: + return None + + # Description + desc_html = raw.get("description", "") + description = html_to_text(desc_html) + if len(description) < 10: + return None + + # URL + source_url = raw.get("url", "") + if not source_url: + return None + + # Location — Remotive provides 
candidate_required_location + location = raw.get("candidate_required_location", "Worldwide") + + # Salary — Remotive provides salary field as text + salary_text = raw.get("salary", "") + salary_min, salary_max = None, None + if salary_text: + salary_min, salary_max = extract_salary(salary_text) + + # Also try extracting from description if no salary found + if not salary_min and not salary_max: + salary_min, salary_max = extract_salary(description[:2000]) + + # Job type + raw_job_type = raw.get("job_type", "") + job_type = self._normalize_job_type(raw_job_type) + if not job_type: + job_type = detect_job_type(description[:1000]) + + # YOE — extract from description + yoe = extract_yoe(description) + + # Category/tags as skills + category = raw.get("category", "") + tags = raw.get("tags", []) or [] + skills = [t for t in tags if t] if isinstance(tags, list) else [] + if category and category not in skills: + skills.insert(0, category) + + # Posted date + posted_at = None + pub_date = raw.get("publication_date") + if pub_date: + try: + posted_at = datetime.fromisoformat(pub_date.replace("Z", "+00:00")) + except (ValueError, AttributeError): + pass + + return ScrapedJob( + title=title, + company_name=company, + description=description, + location=location, + salary_min=salary_min, + salary_max=salary_max, + job_type=job_type, + remote=True, # All Remotive jobs are remote + source=self.SOURCE_NAME, + source_url=source_url, + skills_required=skills, + experience_required=yoe, + posted_at=posted_at, + is_active=True, + requisition_id=str(job_id) if job_id else source_url, + ) + + except Exception as e: + logger.error(f"[remotive] Parse error: {e}", exc_info=True) + self.errors += 1 + return None + + @staticmethod + def _normalize_job_type(raw: str) -> Optional[str]: + """Normalize Remotive job_type strings.""" + if not raw: + return None + lower = raw.lower().replace("_", " ").replace("-", " ") + if "full" in lower: + return "full_time" + if "part" in lower: + return 
"part_time" + if "contract" in lower or "freelance" in lower: + return "contract" + if "intern" in lower: + return "internship" + return raw.lower().replace(" ", "_") + + async def scrape( + self, known_ids: Optional[Set[str]] = None + ) -> AsyncIterator[ScrapedJob]: + """ + Scrape all Remotive jobs across categories. + """ + known = known_ids or set() + logger.info({"event": "scrape_start", "source": self.SOURCE_NAME}) + + try: + for category in CATEGORIES: + data = await self._get_json( + self.BASE_URL, + params={"category": category, "limit": 100}, + ) + + if not data: + continue + + jobs_list = data.get("jobs", []) + if not jobs_list: + logger.debug(f"[remotive] No jobs in category: {category}") + continue + + self.pages_scraped += 1 + new_count = 0 + + for raw_job in jobs_list: + job_id = str(raw_job.get("id", "")) + url = raw_job.get("url", "") + + # Dedup: check by source URL or job ID + if url in known or job_id in known: + continue + + known.add(url) + known.add(job_id) + + job = self._parse_job(raw_job) + if job: + self.jobs_found += 1 + new_count += 1 + yield job + + if new_count > 0: + logger.info(f"[remotive] Category '{category}': {new_count} new jobs") + + except Exception as e: + logger.error(f"[remotive] Scrape failed: {e}", exc_info=True) + self.errors += 1 + finally: + await self.close() + + logger.info({ + "event": "scrape_complete", + "source": self.SOURCE_NAME, + "jobs_found": self.jobs_found, + "pages_scraped": self.pages_scraped, + "errors": self.errors, + }) diff --git a/apps/scraper/src/test_hiring.py b/apps/scraper/src/test_hiring.py new file mode 100644 index 0000000..f17b828 --- /dev/null +++ b/apps/scraper/src/test_hiring.py @@ -0,0 +1,22 @@ +import asyncio +import logging +from spiders.hiring_cafe import HiringCafeSpider +from playwright.async_api import async_playwright + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(levelname)s] %(name)s: %(message)s' +) + +async def test_clearance(): + spider = 
HiringCafeSpider() + async with async_playwright() as pw: + try: + cookies, ua = await spider._get_clearance(pw) + has_clearance = any(c['name'] == 'cf_clearance' for c in cookies) + print(f">>> TEST RESULT: cf_clearance obtained: {has_clearance}") + except Exception as e: + print(f">>> TEST ERROR: {e}") + +if __name__ == "__main__": + asyncio.run(test_clearance()) diff --git a/apps/scraper/src/utils.py b/apps/scraper/src/utils.py new file mode 100644 index 0000000..e71cdd6 --- /dev/null +++ b/apps/scraper/src/utils.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +utils.py +Shared utilities for the scraper application. +""" +from typing import List, Optional + +def format_vector(embedding: Optional[List[float]]) -> Optional[str]: + """ + Safely formats a float list into pgvector-compatible string. + Use this everywhere — never use str(embedding) directly. + """ + if not embedding: + return None + return '[' + ','.join(str(v) for v in embedding) + ']' \ No newline at end of file diff --git a/apps/scraper/test_args.py b/apps/scraper/test_args.py new file mode 100644 index 0000000..8029cfe --- /dev/null +++ b/apps/scraper/test_args.py @@ -0,0 +1,52 @@ +import asyncio +from playwright.async_api import async_playwright + +async def test_args(args_list, tag): + try: + async with async_playwright() as pw: + browser = await pw.chromium.launch( + headless=True, + args=args_list + ) + context = await browser.new_context( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + viewport={"width": 1280, "height": 800}, + locale="en-US", + ) + page = await context.new_page() + await page.add_init_script(""" + Object.defineProperty(navigator, 'plugins', { get: () => Object.freeze([{name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1}]) }); + window.chrome = { runtime: {} }; + """) + print(f"[{tag}] going to hiring.cafe...") + await 
page.goto("https://hiring.cafe", wait_until="domcontentloaded", timeout=15000) + await page.wait_for_timeout(3000) + print(f"[{tag}] SUCCESS") + await browser.close() + except Exception as e: + print(f"[{tag}] CRASH:", e) + +async def main(): + print("Testing config 1: default + swiftshader") + await test_args([ + "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage", + "--use-gl=swiftshader" + ], "swiftshader") + + print("Testing config 2: default WITHOUT swiftshader (no disable-gpu)") + await test_args([ + "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage" + ], "no-gpu-flags") + + print("Testing config 3: angle swiftshader") + await test_args([ + "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage", + "--use-gl=angle", + "--use-angle=swiftshader" + ], "angle-swiftshader") + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/apps/scraper/test_no_sandbox.py b/apps/scraper/test_no_sandbox.py new file mode 100644 index 0000000..a8d8f9c --- /dev/null +++ b/apps/scraper/test_no_sandbox.py @@ -0,0 +1,42 @@ +import asyncio +import os +from playwright.async_api import async_playwright + +async def test_no_sandbox(with_js_inject, tag): + try: + async with async_playwright() as pw: + # specifically ensuring --no-sandbox is absent! 
+ browser = await pw.chromium.launch( + headless=True, + args=["--disable-blink-features=AutomationControlled"] + ) + context = await browser.new_context( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + viewport={"width": 1280, "height": 800}, + ) + page = await context.new_page() + + if with_js_inject: + await page.add_init_script(""" + Object.defineProperty(navigator, 'plugins', { get: () => Object.freeze([{name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1}]) }); + window.chrome = { runtime: {} }; + """) + + print(f"[{tag}] going to hiring.cafe...") + await page.goto("https://hiring.cafe", wait_until="domcontentloaded", timeout=15000) + await page.wait_for_timeout(3000) + html = await page.evaluate("() => document.title") + print(f"[{tag}] SUCCESS, title: {html}") + await browser.close() + except Exception as e: + print(f"[{tag}] CRASH:", e) + +async def main(): + print("Testing 1: NO NO-SANDBOX, WITH JS") + await test_no_sandbox(True, "with_js") + + print("Testing 2: NO NO-SANDBOX, NO JS") + await test_no_sandbox(False, "no_js") + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/apps/scraper/test_plugins.py b/apps/scraper/test_plugins.py new file mode 100644 index 0000000..db7712b --- /dev/null +++ b/apps/scraper/test_plugins.py @@ -0,0 +1,41 @@ +import asyncio +from playwright.async_api import async_playwright + +async def test_plugins(with_plugins_hack_enabled, tag): + try: + async with async_playwright() as pw: + browser = await pw.chromium.launch( + headless=True, + args=[ + "--no-sandbox", + "--disable-blink-features=AutomationControlled", + ] + ) + context = await browser.new_context( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + ) + page = await context.new_page() + + if with_plugins_hack_enabled: + await 
page.add_init_script(""" + Object.defineProperty(navigator, 'plugins', { get: () => Object.freeze([{name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1}]) }); + """) + + print(f"[{tag}] going to hiring.cafe...") + await page.goto("https://hiring.cafe", wait_until="domcontentloaded", timeout=15000) + await page.wait_for_timeout(3000) + title = await page.title() + print(f"[{tag}] SUCCESS, title: {title}") + await browser.close() + except Exception as e: + print(f"[{tag}] CRASH:", e) + +async def main(): + print("Testing 1: WITH JS inject") + await test_plugins(True, "with_js") + + print("Testing 2: WITHOUT JS inject") + await test_plugins(False, "no_js") + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/apps/scraper/test_single_process.py b/apps/scraper/test_single_process.py new file mode 100644 index 0000000..15c2a03 --- /dev/null +++ b/apps/scraper/test_single_process.py @@ -0,0 +1,39 @@ +import asyncio +from playwright.async_api import async_playwright + +async def test_single_process(): + try: + async with async_playwright() as pw: + print("launching single process chromium...") + browser = await pw.chromium.launch( + headless=True, + args=[ + "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage", + "--single-process", # Stops subprocess spawning (avoids mach_port failures) + ] + ) + print("context...") + context = await browser.new_context( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + viewport={"width": 1280, "height": 800}, + ) + print("page...") + page = await context.new_page() + + await page.add_init_script(""" + Object.defineProperty(navigator, 'plugins', { get: () => Object.freeze([{name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1}]) }); + window.chrome = { runtime: {} }; + """) + + print("going to 
hiring.cafe...") + await page.goto("https://hiring.cafe", wait_until="domcontentloaded", timeout=15000) + await page.wait_for_timeout(3000) + html = await page.evaluate("() => document.title") + print(f"SUCCESS, title: {html}") + await browser.close() + except Exception as e: + print("CRASH:", e) + +if __name__ == '__main__': + asyncio.run(test_single_process()) diff --git a/apps/web/index.html b/apps/web/index.html index e49b108..eb78b71 100644 --- a/apps/web/index.html +++ b/apps/web/index.html @@ -5,9 +5,17 @@ - Postly - AI Job Matching + Postly - Search Jobs + + + + - import("@pages/TransmissionLanding").then((m) => ({ - default: m.TransmissionLanding, +const TransmissionHome = lazy(() => + import("@pages/TransmissionHome").then((m) => ({ + default: m.TransmissionHome, + })), +); +const TransmissionRoleSelector = lazy(() => + import("@pages/TransmissionRoleSelector").then((m) => ({ + default: m.TransmissionRoleSelector, })), ); const TransmissionLogin = lazy(() => @@ -20,6 +25,12 @@ const TransmissionRegister = lazy(() => default: m.TransmissionRegister, })), ); +const VerifyOtpPage = lazy(() => + import("./pages/VerifyOtpPage").then((m) => ({ + default: m.VerifyOtpPage, + })), +); + const TransmissionPricing = lazy(() => import("@pages/TransmissionPricing").then((m) => ({ default: m.TransmissionPricing, @@ -35,13 +46,17 @@ const TransmissionIntegrations = lazy(() => default: m.TransmissionIntegrations, })), ); +const TransmissionSettings = lazy(() => + import("@pages/TransmissionSettings").then((m) => ({ + default: m.TransmissionSettings, + })), +); const TransmissionNotFound = lazy(() => import("@pages/TransmissionNotFound").then((m) => ({ default: m.TransmissionNotFound, })), ); -// ─── Auth utilities ───────────────────────────────────────────────── const ForgotPasswordPage = lazy(() => import("./pages/ForgotPasswordPage").then((m) => ({ default: m.ForgotPasswordPage, @@ -59,9 +74,12 @@ function App() { }> {/* ─── Public ──────────────────────────────────── 
*/} - } /> + } /> + } /> } /> } /> + } /> + } /> } /> } /> @@ -91,6 +109,14 @@ function App() { } /> + + + + } + /> {/* ─── 404 ─────────────────────────────────────── */} } /> diff --git a/apps/web/src/components/chat/JobCarousel.tsx b/apps/web/src/components/chat/JobCarousel.tsx index 65d1478..88699c6 100644 --- a/apps/web/src/components/chat/JobCarousel.tsx +++ b/apps/web/src/components/chat/JobCarousel.tsx @@ -15,13 +15,44 @@ export function JobCarousel({ onApply, }: JobCarouselProps) { return ( -
- {message &&

{message}

} +
+ {message && ( +

+ {message} +

+ )} {/* Scroll Container */} -
+
{data.map((job) => ( -
+
))} @@ -29,14 +60,39 @@ export function JobCarousel({ {/* Actions */} {suggested_actions && suggested_actions.length > 0 && ( -
+
{suggested_actions.map((action, i) => ( ))}
diff --git a/apps/web/src/components/chat/TransmissionSidebar.tsx b/apps/web/src/components/chat/TransmissionSidebar.tsx index b2c76df..e6b9c87 100644 --- a/apps/web/src/components/chat/TransmissionSidebar.tsx +++ b/apps/web/src/components/chat/TransmissionSidebar.tsx @@ -58,7 +58,9 @@ export function TransmissionSidebar({ const [deleteId, setDeleteId] = useState(null); const [isDeleting, setIsDeleting] = useState(false); const [isCreating, setIsCreating] = useState(false); + const [isProfileOpen, setIsProfileOpen] = useState(false); const sidebarRef = useRef(null); + const profileRef = useRef(null); // Fetch conversations const { data: fetchedConversations } = useQuery({ @@ -72,6 +74,24 @@ export function TransmissionSidebar({ } }, [fetchedConversations, setConversations]); + // Click outside profile menu handler + useEffect(() => { + const handleClickOutside = (event: MouseEvent) => { + if ( + profileRef.current && + !profileRef.current.contains(event.target as Node) + ) { + setIsProfileOpen(false); + } + }; + if (isProfileOpen) { + document.addEventListener("mousedown", handleClickOutside); + } + return () => { + document.removeEventListener("mousedown", handleClickOutside); + }; + }, [isProfileOpen]); + const handleNewChat = async () => { if (isCreating) return; setIsCreating(true); @@ -324,83 +344,166 @@ export function TransmissionSidebar({ {/* User profile */}
+ {/* Dropdown Menu */} + {isProfileOpen && ( +
+ {[ + // { label: "Pricing", path: "/pricing" }, + { label: "Account Settings", path: "/settings" }, + { label: "Integration", path: "/integrations" }, + { label: "Logout", action: logout, danger: true }, + ].map((item) => ( + + ))} +
+ )} +
setIsProfileOpen(!isProfileOpen)} + role="button" + tabIndex={0} style={{ + padding: "8px", display: "flex", alignItems: "center", - gap: "8px", - minWidth: 0, + justifyContent: "space-between", + cursor: "pointer", + background: isProfileOpen ? "var(--tx-bg)" : "transparent", + transition: "background-color 150ms var(--tx-ease-sharp)", + }} + onMouseEnter={(e) => { + if (!isProfileOpen) + e.currentTarget.style.backgroundColor = "var(--tx-bg)"; + }} + onMouseLeave={(e) => { + if (!isProfileOpen) + e.currentTarget.style.backgroundColor = "transparent"; }} >
- {user?.full_name?.[0]?.toUpperCase() || "U"} +
+ {user?.full_name?.[0]?.toUpperCase() || "U"} +
+
+ + {user?.full_name || "User"} + + + {role} + +
- - {user?.full_name || "User"} - + ▲ +
- -
diff --git a/apps/web/src/components/jobs/JobCard.tsx b/apps/web/src/components/jobs/JobCard.tsx index e733958..68725a9 100644 --- a/apps/web/src/components/jobs/JobCard.tsx +++ b/apps/web/src/components/jobs/JobCard.tsx @@ -57,26 +57,275 @@ export function JobCard({ const { display_info, matching_data, meta } = job; const isChat = variant === "chat"; + if (isChat) { + return ( +
{ + e.currentTarget.style.transform = "translate(-2px, -2px)"; + e.currentTarget.style.boxShadow = "6px 6px 0 var(--tx-border)"; + }} + onMouseLeave={(e) => { + e.currentTarget.style.transform = "translate(0, 0)"; + e.currentTarget.style.boxShadow = "4px 4px 0 var(--tx-border)"; + }} + > + {/* Match Score */} + {matching_data.match_score > 0 && ( +
+
+ {matching_data.match_score}% MATCH +
+
+ )} + + {/* Content */} +
+

+ {display_info.title} +

+ +
+ + + {display_info.company} + +
+ +
+ {display_info.location && ( + + + + {display_info.location} + + + )} + {meta.salary_range && ( + + {meta.salary_range} + + )} +
+ +
+ {matching_data.key_skills?.slice(0, 3).map((skill) => ( + + {skill} + + ))} + {matching_data.key_skills && + matching_data.key_skills.length > 3 && ( + + +{matching_data.key_skills.length - 3} + + )} +
+
+ + {/* Footer actions */} +
+ +
+
+ ); + } + return (
-
+
{/* Match Score */} {matching_data.match_score > 0 && ( -
+
-
+
+
-

+

{display_info.title}

@@ -109,34 +348,27 @@ export function JobCard({
{/* Save button */} - {!isChat && ( - - )} +
{/* Location & Job Type */} -
+
{display_info.location && ( @@ -145,7 +377,7 @@ export function JobCard({ )} - {!isChat && meta.remote && ( + {meta.remote && ( Remote @@ -160,99 +392,50 @@ export function JobCard({ {/* Skills */} {matching_data.key_skills && matching_data.key_skills.length > 0 && ( -
- {matching_data.key_skills - .slice(0, isChat ? 3 : 4) - .map((skill) => ( - - {skill} - - ))} - {matching_data.key_skills.length > (isChat ? 3 : 4) && ( +
+ {matching_data.key_skills.slice(0, 4).map((skill) => ( - +{matching_data.key_skills.length - (isChat ? 3 : 4)} + {skill} + + ))} + {matching_data.key_skills.length > 4 && ( + + +{matching_data.key_skills.length - 4} )}
)} {/* AI Explanation */} - {!isChat && matching_data.ai_explanation && ( + {matching_data.ai_explanation && (

{matching_data.ai_explanation}

)} {/* Actions */} -
- {isChat ? ( +
+ {meta.apply_url && ( - ) : ( - <> - {meta.apply_url && ( - - )} - - - {meta.posted_at - ? new Date(meta.posted_at).toLocaleDateString() - : "Recently posted"} - - )} + + + {meta.posted_at + ? new Date(meta.posted_at).toLocaleDateString() + : "Recently posted"} +
diff --git a/apps/web/src/components/ui/PageLoader.tsx b/apps/web/src/components/ui/PageLoader.tsx index 788e885..321e196 100644 --- a/apps/web/src/components/ui/PageLoader.tsx +++ b/apps/web/src/components/ui/PageLoader.tsx @@ -1,15 +1,185 @@ import React from "react"; +import "../../styles/transmission.css"; export const PageLoader: React.FC = () => { return ( -
-
-
-
+
+ + + {/* ─── Animated Runner SVG ────────────────────────────────────── */} +
+ {/* Speed Lines */} +
+
+ + + {/* Head */} + + {/* Body */} + + {/* Arms */} + + + {/* Legs */} + + + + + + + +
+ +
+

+ TRANSMITTING SIGNAL +

+
+

+ Running to your destination... +

-

- Loading your experience... -

); }; diff --git a/apps/web/src/components/ui/Toast.tsx b/apps/web/src/components/ui/Toast.tsx index 674d048..ef5b775 100644 --- a/apps/web/src/components/ui/Toast.tsx +++ b/apps/web/src/components/ui/Toast.tsx @@ -1,7 +1,8 @@ import { useEffect, useState } from "react"; import { useToastStore } from "../../stores/toast.store"; -import { X, CheckCircle, AlertCircle, Info, AlertTriangle } from "lucide-react"; +import { X } from "lucide-react"; import { createPortal } from "react-dom"; +import "../../styles/transmission.css"; export function ToastContainer() { const { toasts, removeToast } = useToastStore(); @@ -15,38 +16,99 @@ export function ToastContainer() { if (!mounted) return null; return createPortal( -
+
{toasts.map((toast) => (
- {toast.type === "success" && ( - - )} - {toast.type === "error" && ( - - )} - {toast.type === "warning" && ( - - )} - {toast.type === "info" && } +
-

{toast.message}

+
+ + {toast.type || "SIGNAL"} + +

+ {toast.message} +

+
))} diff --git a/apps/web/src/hooks/useSSEChat.ts b/apps/web/src/hooks/useSSEChat.ts index 9ccb6e8..74031f5 100644 --- a/apps/web/src/hooks/useSSEChat.ts +++ b/apps/web/src/hooks/useSSEChat.ts @@ -63,10 +63,7 @@ export function useSSEChat() { // 1. Create conversation if needed if (!currentConversationId) { try { - const newConv = await chatService.createConversation( - undefined, - message, - ); + const newConv = await chatService.createConversation(undefined); currentConversationId = newConv.id; addConversation(newConv); setActiveConversation(newConv.id); diff --git a/apps/web/src/lib/api-client.ts b/apps/web/src/lib/api-client.ts index 3ca1326..2a911b3 100644 --- a/apps/web/src/lib/api-client.ts +++ b/apps/web/src/lib/api-client.ts @@ -24,9 +24,8 @@ apiClient.interceptors.response.use( (response) => response, async (error) => { if (error.response?.status === 401) { - // Token expired, try to refresh localStorage.removeItem("access_token"); - // window.location.href = "/login"; // Temporarily disabled for debugging + window.location.href = "/login"; console.warn("401 Unauthorized - Token removed"); } return Promise.reject(error); diff --git a/apps/web/src/pages/TransmissionChat.tsx b/apps/web/src/pages/TransmissionChat.tsx index 9951fb9..a68e000 100644 --- a/apps/web/src/pages/TransmissionChat.tsx +++ b/apps/web/src/pages/TransmissionChat.tsx @@ -42,6 +42,21 @@ export function TransmissionChat() { const { id: conversationIdParam } = useParams<{ id: string }>(); const navigate = useNavigate(); const role = (searchParams.get("role") as Role) || "seeker"; + const { addToast } = useToastStore(); + + // Redirect if role is recruiter (Hiring feature is Coming Soon) + useEffect(() => { + if (role === "recruiter") { + addToast({ + type: "error", + message: "Hiring feature is coming soon. 
Redirecting...", + }); + const newParams = new URLSearchParams(searchParams); + newParams.set("role", "seeker"); + navigate({ search: newParams.toString() }, { replace: true }); + } + }, [role, searchParams, navigate, addToast]); + const accentColor = role === "seeker" ? "var(--tx-seeker)" : "var(--tx-recruiter)"; const accentHex = role === "seeker" ? "#FF3D00" : "#0038FF"; @@ -56,7 +71,6 @@ export function TransmissionChat() { const setLoading = useChatStore((s) => s.setLoading); const setActiveResumeId = useChatStore((s) => s.setActiveResumeId); const { sendMessage, stopGeneration } = useSSEChat(); - const { addToast } = useToastStore(); /* ─── Local State ────────────────────────────────────────────────── */ const [input, setInput] = useState(""); diff --git a/apps/web/src/pages/TransmissionHome.tsx b/apps/web/src/pages/TransmissionHome.tsx new file mode 100644 index 0000000..afba53d --- /dev/null +++ b/apps/web/src/pages/TransmissionHome.tsx @@ -0,0 +1,177 @@ +import { Link } from "react-router-dom"; +import { useAuthStore } from "../stores/auth.store"; +import "../styles/transmission.css"; + +/** + * TransmissionHome + * ──────────────── + * Professional, neo-brutalist landing page for Postly. + * High contrast, bold typography, and clear calls to action. + */ +export function TransmissionHome() { + const { isAuthenticated } = useAuthStore(); + + return ( +
+ {/* ─── Navigation ──────────────────────────────────────────────── */} + + + {/* ─── Hero Section ────────────────────────────────────────────── */} +
+

+ The Future of Work is{" "} + Broadcast. +

+

+ Postly is a high-frequency talent terminal. Stop searching. Start + transmitting. AI-driven matching for the next generation of builders. +

+ { + e.currentTarget.style.transform = "translate(-2px, -2px)"; + e.currentTarget.style.boxShadow = "10px 10px 0px var(--tx-seeker)"; + }} + onMouseLeave={(e) => { + e.currentTarget.style.transform = "translate(0, 0)"; + e.currentTarget.style.boxShadow = "8px 8px 0px var(--tx-seeker)"; + }} + > + {isAuthenticated ? "CONTINUE TO SIGNAL →" : "GET STARTED"} + +
+ + {/* ─── Footer ──────────────────────────────────────────────────── */} +
+ © {new Date().getFullYear()} POSTLY +
+
+ ); +} diff --git a/apps/web/src/pages/TransmissionLogin.tsx b/apps/web/src/pages/TransmissionLogin.tsx index 89e23fa..29ea7fe 100644 --- a/apps/web/src/pages/TransmissionLogin.tsx +++ b/apps/web/src/pages/TransmissionLogin.tsx @@ -20,9 +20,14 @@ export function TransmissionLogin() { e.preventDefault(); try { await login({ email, password }); - navigate("/"); - } catch { - // Error handled by store + navigate("/chat?role=seeker"); + } catch (err: unknown) { + const error = err as { + response?: { data?: { error?: { code?: string } } }; + }; + if (error?.response?.data?.error?.code === "EMAIL_NOT_VERIFIED") { + navigate(`/verify-otp?email=${encodeURIComponent(email)}`); + } } }; diff --git a/apps/web/src/pages/TransmissionRegister.tsx b/apps/web/src/pages/TransmissionRegister.tsx index cea91af..4056b33 100644 --- a/apps/web/src/pages/TransmissionRegister.tsx +++ b/apps/web/src/pages/TransmissionRegister.tsx @@ -21,8 +21,8 @@ export function TransmissionRegister() { const handleSubmit = async (e: React.FormEvent) => { e.preventDefault(); try { - await register({ full_name: name, email, password, role: userType }); - navigate("/"); + await register({ full_name: name, email, password, roles: [userType] }); + navigate(`/verify-otp?email=${encodeURIComponent(email)}`); } catch { // Error surfaced via store } diff --git a/apps/web/src/pages/TransmissionLanding.tsx b/apps/web/src/pages/TransmissionRoleSelector.tsx similarity index 95% rename from apps/web/src/pages/TransmissionLanding.tsx rename to apps/web/src/pages/TransmissionRoleSelector.tsx index e29f605..7cb3e26 100644 --- a/apps/web/src/pages/TransmissionLanding.tsx +++ b/apps/web/src/pages/TransmissionRoleSelector.tsx @@ -6,8 +6,8 @@ import { import "../styles/transmission.css"; /** - * TransmissionLanding - * ─────────────────── + * TransmissionRoleSelector + * ──────────────────────── * Full-viewport split-screen role selection. 
* Left = SEEKING (orange-red) | Right = HIRING (blue) * @@ -20,7 +20,7 @@ import "../styles/transmission.css"; * - clip-path circle transition on click */ -export function TransmissionLanding() { +export function TransmissionRoleSelector() { const [mounted, setMounted] = useState(false); const { state: transState, trigger } = useTransmissionTransition(); const seekerRef = useRef(null); @@ -183,8 +183,7 @@ export function TransmissionLanding() { {/* ─── RIGHT PANEL: HIRING ───────────────────────────────────── */}
handleRoleClick("recruiter", e)} + className={`tx-panel tx-panel-recruiter tx-panel-disabled ${mounted ? "tx-split-right" : ""}`} style={{ flex: 1, display: "flex", @@ -194,8 +193,12 @@ export function TransmissionLanding() { background: "#0D0D0D", position: "relative", opacity: mounted ? undefined : 0, + cursor: "not-allowed", }} > + {/* Coming Soon Badge */} +
COMING SOON
+ {/* SVG border trace overlay */} - ← Click to enter + 🚧 Feature Offline
diff --git a/apps/web/src/pages/TransmissionSettings.tsx b/apps/web/src/pages/TransmissionSettings.tsx new file mode 100644 index 0000000..cdba2bd --- /dev/null +++ b/apps/web/src/pages/TransmissionSettings.tsx @@ -0,0 +1,967 @@ +import { useState, useEffect } from "react"; +import { useAuthStore } from "../stores/auth.store"; +import { Link } from "react-router-dom"; +import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query"; +import { userService } from "../services/user.service"; +import { useToastStore } from "../stores/toast.store"; +import "../styles/transmission.css"; + +/** + * TransmissionSettings + * ──────────────────── + * Brutalist settings page. Tabbed layout (General, Professional, Security). + * Dynamically adapts to Seeker or Recruiter roles. + */ + +const TIMEZONES = [ + "UTC", + "America/New_York", + "America/Chicago", + "America/Denver", + "America/Los_Angeles", + "Europe/London", + "Europe/Paris", + "Europe/Berlin", + "Asia/Tokyo", + "Asia/Singapore", + "Asia/Kolkata", + "Australia/Sydney", +]; + +const LOCALES = [ + { label: "English (US)", value: "en-US" }, + { label: "English (UK)", value: "en-GB" }, + { label: "French", value: "fr-FR" }, + { label: "German", value: "de-DE" }, + { label: "Japanese", value: "ja-JP" }, + { label: "Hindi", value: "hi-IN" }, +]; + +export function TransmissionSettings() { + const { user } = useAuthStore(); + const { addToast } = useToastStore(); + const queryClient = useQueryClient(); + const [activeTab, setActiveTab] = useState< + "general" | "professional" | "security" + >("general"); + const [isUploading, setIsUploading] = useState(false); + + const role = user?.roles?.[0] || "seeker"; + const accentColor = + role === "seeker" ? 
"var(--tx-seeker)" : "var(--tx-recruiter)"; + + // Form state + const [formData, setFormData] = useState>({ + full_name: user?.full_name || "", + avatar_url: user?.avatar_url || "", + timezone: user?.timezone || "UTC", + locale: user?.locale || "en-US", + }); + + const [passwordData, setPasswordData] = useState({ + current_password: "", + new_password: "", + confirm_password: "", + }); + + // Fetch role-specific profile + const { data: profileData, isLoading: isProfileLoading } = useQuery({ + queryKey: ["profile", role], + queryFn: () => + role === "seeker" + ? userService.getSeekerProfile() + : userService.getEmployerProfile(), + enabled: !!user, + }); + + // Sync form data once profile is loaded + useEffect(() => { + if (profileData) { + setFormData((prev) => ({ + ...prev, + ...profileData, + skills: profileData.skills?.join(", ") || "", + desired_job_titles: profileData.desired_job_titles?.join(", ") || "", + desired_locations: profileData.desired_locations?.join(", ") || "", + })); + } + }, [profileData]); + + // Mutations + const updateBaseProfile = useMutation({ + mutationFn: (data: Parameters[0]) => + userService.updateProfile(data), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ["auth-me"] }); + addToast({ + type: "success", + message: "General broadcast identity updated.", + }); + }, + onError: (error: Error) => { + addToast({ + type: "error", + message: error.message || "Failed to update profile.", + }); + }, + }); + + const updateRoleProfile = useMutation({ + mutationFn: (data: any) => + role === "seeker" + ? 
userService.updateSeekerProfile(data) + : userService.updateEmployerProfile(data), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ["profile", role] }); + addToast({ + type: "success", + message: "Professional broadcast profile updated.", + }); + }, + onError: (error: Error) => { + addToast({ + type: "error", + message: error.message || "Failed to update profile.", + }); + }, + }); + + const changePassword = useMutation({ + mutationFn: (data: Parameters[0]) => + userService.changePassword(data), + onSuccess: () => { + addToast({ + type: "success", + message: "Cipher sequence updated (Password changed).", + }); + setPasswordData({ + current_password: "", + new_password: "", + confirm_password: "", + }); + }, + onError: (error: Error) => { + addToast({ + type: "error", + message: error.message || "Failed to change password.", + }); + }, + }); + + const handleSaveGeneral = async (e: React.FormEvent) => { + e.preventDefault(); + updateBaseProfile.mutate({ + full_name: formData.full_name, + avatar_url: formData.avatar_url, + timezone: formData.timezone, + locale: formData.locale, + }); + }; + + const handleSaveProfessional = async (e: React.FormEvent) => { + e.preventDefault(); + const roleData = { ...formData }; + delete roleData.full_name; + delete roleData.avatar_url; + delete roleData.timezone; + delete roleData.locale; + delete roleData.email; + + if (role === "seeker") { + roleData.skills = + roleData.skills + ?.split(",") + .map((s: string) => s.trim()) + .filter(Boolean) || []; + roleData.desired_job_titles = + roleData.desired_job_titles + ?.split(",") + .map((s: string) => s.trim()) + .filter(Boolean) || []; + roleData.desired_locations = + roleData.desired_locations + ?.split(",") + .map((s: string) => s.trim()) + .filter(Boolean) || []; + roleData.experience_years = parseInt(roleData.experience_years) || 0; + } + + updateRoleProfile.mutate(roleData); + }; + + const handleChangePassword = (e: React.FormEvent) => { + e.preventDefault(); + if 
(passwordData.new_password !== passwordData.confirm_password) { + addToast({ type: "error", message: "Passphrase mismatch." }); + return; + } + changePassword.mutate({ + current_password: passwordData.current_password, + new_password: passwordData.new_password, + }); + }; + + const handleInputChange = ( + e: React.ChangeEvent< + HTMLInputElement | HTMLTextAreaElement | HTMLSelectElement + >, + ) => { + const { name, value, type } = e.target; + const val = + type === "checkbox" ? (e.target as HTMLInputElement).checked : value; + setFormData((prev) => ({ ...prev, [name]: val })); + }; + + const handleFileChange = async (e: React.ChangeEvent) => { + const file = e.target.files?.[0]; + if (!file) return; + + // Validate size (2MB) + if (file.size > 2 * 1024 * 1024) { + addToast({ + type: "error", + message: "File sequence too heavy (Max 2MB).", + }); + return; + } + + try { + setIsUploading(true); + const url = await userService.uploadAvatar(file); + setFormData((prev) => ({ ...prev, avatar_url: url })); + addToast({ + type: "success", + message: "Cipher image uploaded to broadcast node.", + }); + } catch (error: any) { + addToast({ + type: "error", + message: error.message || "Failed to upload image.", + }); + } finally { + setIsUploading(false); + } + }; + + const handlePasswordChange = (e: React.ChangeEvent) => { + const { name, value } = e.target; + setPasswordData((prev) => ({ ...prev, [name]: value })); + }; + + if (isProfileLoading) { + return ( +
+

+ LINKING TO BROADCAST NODE... +

+
+ ); + } + + return ( +
+
+ {/* Header */} +
+ + ← RETURN TO COMMUNICATIONS + +

+ SETTINGS +

+

+ Node ID: {user?.id?.split("-")[0]}... +

+
+ + {/* Main Layout */} +
+ {/* Side Tabs */} + + + {/* Content Pane */} +
+ {/* ─── GENERAL TAB ─── */} + {activeTab === "general" && ( +
+ + +
+
+ {formData.avatar_url ? ( + Avatar + ) : ( +
?
+ )} +
+
+ +
+ + + +
+
+
+
+ +
+ + + + + + +
+ +
+ + + + + + +
+ + + + )} + + {/* ─── PROFESSIONAL TAB ─── */} + {activeTab === "professional" && ( +
+ {role === "seeker" ? ( + <> + + + + + +