diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile
index 6e3764c..17ce130 100644
--- a/apps/api/Dockerfile
+++ b/apps/api/Dockerfile
@@ -1,13 +1,12 @@
 # Stage 1: Build
-FROM --platform=$BUILDPLATFORM node:20-alpine AS builder
+FROM --platform=$BUILDPLATFORM node:20-bookworm-slim AS builder
 
 WORKDIR /app
 
-# Copy package files for dependency installation
-COPY package*.json ./
-COPY turbo.json ./
+# Copy root config files
+COPY package*.json turbo.json ./
 
-# Copy all package.json files for workspaces
+# Copy all workspace package.json files for installation
 COPY packages/database/package.json ./packages/database/
 COPY packages/ai-utils/package.json ./packages/ai-utils/
 COPY packages/logger/package.json ./packages/logger/
@@ -17,27 +16,27 @@ COPY packages/config/eslint-config/package.json ./packages/config/eslint-config/
 COPY packages/config/typescript-config/package.json ./packages/config/typescript-config/
 COPY apps/api/package*.json ./apps/api/
 
-# Install dependencies
-RUN --mount=type=cache,target=/root/.npm npm ci --ignore-scripts --legacy-peer-deps
+# Install dependencies allowing native scripts to run
+# HUSKY=0 prevents git hooks from failing in docker
+RUN --mount=type=cache,target=/root/.npm HUSKY=0 npm ci --legacy-peer-deps
 
-# Copy source code
+# Copy full source
 COPY . .
 
-# Build API and its dependencies using turbo
+# Build specific app
 RUN npx turbo run build --filter=api
 
 # Stage 2: Production
-FROM node:20-alpine AS runner
+FROM node:20-bookworm-slim AS runner
 
 WORKDIR /app
 
 ENV NODE_ENV=production
 
-# Copy package files for production install
-COPY package*.json ./
-COPY turbo.json ./
+# Copy root config files
+COPY package*.json turbo.json ./
 
-# Copy package.json files for production workspaces
+# Copy workspace package.json files for production install
 COPY packages/database/package.json ./packages/database/
 COPY packages/ai-utils/package.json ./packages/ai-utils/
 COPY packages/logger/package.json ./packages/logger/
@@ -47,10 +46,13 @@ COPY packages/config/eslint-config/package.json ./packages/config/eslint-config/
 COPY packages/config/typescript-config/package.json ./packages/config/typescript-config/
 COPY apps/api/package*.json ./apps/api/
 
-# Install production dependencies only
-RUN --mount=type=cache,target=/root/.npm npm ci --omit=dev --ignore-scripts --legacy-peer-deps
+# Production-only install with native scripts allowed
+# We delete the prepare script to avoid husky failing since it's a devDependency
+RUN --mount=type=cache,target=/root/.npm \
+    npm pkg delete scripts.prepare && \
+    HUSKY=0 npm ci --omit=dev --legacy-peer-deps
 
-# Copy built files from builder
+# Copy built artifacts from builder
 COPY --from=builder /app/apps/api/dist ./apps/api/dist
 COPY --from=builder /app/packages/database/dist ./packages/database/dist
 COPY --from=builder /app/packages/ai-utils/dist ./packages/ai-utils/dist
@@ -58,19 +60,14 @@ COPY --from=builder /app/packages/logger/dist ./packages/logger/dist
 COPY --from=builder /app/packages/shared-types/dist ./packages/shared-types/dist
 COPY --from=builder /app/packages/config/app-config/dist ./packages/config/app-config/dist
 
-# Create non-root user for security
-RUN addgroup -g 1001 -S nodejs && \
-    adduser -S nodejs -u 1001
+# Run as the unprivileged "node" user that ships with the official node image
+USER node
 
-USER nodejs
-
-# Expose port
 EXPOSE 3000
 
-# Health check (architecture check)
-# Using wget which is more robust in Alpine and avoids unmatched quote issues
+# Pure Node healthcheck (slim image has no wget/curl); exits explicitly on every path so the probe cannot hang
 HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
-    CMD wget --no-verbose --tries=1 --spider http://localhost:3000/health || exit 1
+    CMD node -e "require('http').get('http://127.0.0.1:3000/health', (r) => process.exit(r.statusCode === 200 ? 0 : 1)).on('error', () => process.exit(1))" || exit 1
 
-# Start API with memory optimization
-CMD ["node", "--max-old-space-size=80", "apps/api/dist/server.js"]
+# Start API; keep the V8 old-space cap below the container memory limit set in docker-compose.prod.yml
+CMD ["node", "--max-old-space-size=200", "apps/api/dist/server.js"]
\ No newline at end of file
diff --git a/apps/api/src/middleware/strict-rate-limit.ts b/apps/api/src/middleware/strict-rate-limit.ts
index b624f90..70616de 100644
--- a/apps/api/src/middleware/strict-rate-limit.ts
+++ b/apps/api/src/middleware/strict-rate-limit.ts
@@ -5,6 +5,9 @@ import type { JwtPayload } from "./auth.js";
 
 // Initialize Redis client
 const redis = new Redis(REDIS_URL || "redis://localhost:6379");
+redis.on("error", (err) => {
+  console.error("Redis (strict-rate-limit) connection error:", err);
+});
 
 interface RateLimitConfig {
   windowMs: number;
diff --git a/apps/api/src/server.ts b/apps/api/src/server.ts
index 932e342..fde3e76 100644
--- a/apps/api/src/server.ts
+++ b/apps/api/src/server.ts
@@ -71,8 +71,8 @@ app.use(notFoundHandler);
 app.use(errorHandler);
 
 // Start server
-app.listen(API_PORT, () => {
-  console.log(`🚀 API server running on http://localhost:${API_PORT}`);
+app.listen(API_PORT, "0.0.0.0", () => {
+  console.log(`🚀 API server running on http://0.0.0.0:${API_PORT}`);
   console.log(`📝 Environment: ${NODE_ENV}`);
 });
 
diff --git a/apps/scraper/Dockerfile b/apps/scraper/Dockerfile
index 8d3ea06..75b8d12 100644
--- a/apps/scraper/Dockerfile
+++ b/apps/scraper/Dockerfile
@@ -18,6 +18,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # Install Chromium and minimal dependencies
 # We use playwright to install the browser but manually handle dependencies
 # to keep the image size low and avoid problematic packages.
+ENV PLAYWRIGHT_BROWSERS_PATH=/opt/pw-browsers
 RUN apt-get update && apt-get install -y --no-install-recommends \
     libnss3 \
     libnspr4 \
@@ -37,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libcairo2 \
     && pip install playwright \
     && playwright install chromium \
+    && chmod -R a+rX /opt/pw-browsers \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # Create non-root user
diff --git a/apps/scraper/src/database.py b/apps/scraper/src/database.py
index 3b976a5..5243e45 100644
--- a/apps/scraper/src/database.py
+++ b/apps/scraper/src/database.py
@@ -107,7 +107,7 @@ async def insert_job(self, job_data: Dict[str, Any]) -> bool:
                     job_data.get("posted_at"),
                     job_data.get("expires_at"),
                     job_data.get("is_active", True),
-                    job_data.get("embedding"),
+                    str(job_data.get("embedding")) if job_data.get("embedding") else None,
                 )
             return True
         except asyncpg.exceptions.UniqueViolationError:
@@ -158,7 +158,7 @@ async def insert_jobs_batch(self, jobs: List[Dict[str, Any]]) -> int:
                         job.get("posted_at"),
                         job.get("expires_at"),
                         job.get("is_active", True),
-                        job.get("embedding"),
+                        str(job.get("embedding")) if job.get("embedding") else None,
                     )
                     inserted += 1
                 except Exception as e:
@@ -192,12 +192,13 @@ async def update_embedding(self, job_id: str, embedding: List[float]):
 
     async def update_embeddings_batch(self, updates: List[tuple]):
         """Batch update embeddings. Each tuple is (job_id, embedding)."""
+        formatted_updates = [(job_id, str(emb)) for job_id, emb in updates]
         async with self.pool.acquire() as conn:
             await conn.executemany("""
                 UPDATE jobs
                 SET embedding = $2::vector, updated_at = NOW()
                 WHERE id = $1
-            """, updates)
+            """, formatted_updates)
         logger.info(f"Batch updated {len(updates)} embeddings")
 
     # ─── SEARCH ───────────────────────────────────────────────────
diff --git a/apps/scraper/src/spiders/hiring_cafe.py b/apps/scraper/src/spiders/hiring_cafe.py
index 32698b7..6ea67be 100644
--- a/apps/scraper/src/spiders/hiring_cafe.py
+++ b/apps/scraper/src/spiders/hiring_cafe.py
@@ -132,22 +132,37 @@ def __init__(
     # ─── Session ──────────────────────────────────────────────────
 
     async def _get_page(self) -> Page:
-        if not self._playwright:
-            self._playwright = await async_playwright().start()
-            self._browser = await self._playwright.chromium.launch(
-                headless=True,
-                args=[
-                    "--disable-blink-features=AutomationControlled",
-                    "--no-sandbox",
-                    "--disable-setuid-sandbox",
-                    "--disable-dev-shm-usage",
-                ],
-            )
-            self._context = await self._browser.new_context(
-                user_agent=self._BROWSER_HEADERS["User-Agent"],
-                viewport={"width": 1280, "height": 800}
-            )
-            self._page = await self._context.new_page()
+        if not self._page:
+            try:
+                if not self._playwright:
+                    self._playwright = await async_playwright().start()
+                if not self._browser:
+                    self._browser = await self._playwright.chromium.launch(
+                        headless=True,
+                        args=[
+                            "--disable-blink-features=AutomationControlled",
+                            "--no-sandbox",
+                            "--disable-setuid-sandbox",
+                            "--disable-dev-shm-usage",
+                        ],
+                    )
+                if not self._context:
+                    self._context = await self._browser.new_context(
+                        user_agent=self._BROWSER_HEADERS["User-Agent"],
+                        viewport={"width": 1280, "height": 800}
+                    )
+                if not self._page:
+                    self._page = await self._context.new_page()
+            except Exception as e:
+                # Clear every cached handle (even if stop() itself fails) so the next call retries from scratch
+                logger.error(f"Playwright initialization failed: {e}")
+                try:
+                    if self._playwright:
+                        await self._playwright.stop()
+                finally:
+                    self._playwright = self._browser = None
+                    self._context = self._page = None
+                raise
         return self._page
 
     async def close(self) -> None:
@@ -159,6 +174,21 @@ async def close(self) -> None:
             await self._playwright.stop()
         logger.info("Spider Playwright session closed")
 
+    async def _reset_browser(self) -> None:
+        """Best-effort close of the page and context after a crash; both are cleared so _get_page() recreates them."""
+        if self._page:
+            try:
+                await self._page.close()
+            except Exception as exc:
+                logger.debug(f"Ignoring error while closing page: {exc}")
+        if self._context:
+            try:
+                await self._context.close()
+            except Exception as exc:
+                logger.debug(f"Ignoring error while closing context: {exc}")
+        self._page = None
+        self._context = None
+
     # ─── Rate Limiting ────────────────────────────────────────────
 
     async def _throttle(self) -> None:
@@ -180,7 +210,13 @@ async def _discover_build_id(self) -> str:
         await self._throttle()
         page = await self._get_page()
 
-        response = await page.goto(self.BASE, wait_until="domcontentloaded")
+        try:
+            response = await page.goto(self.BASE, wait_until="domcontentloaded")
+        except Exception as e:
+            if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower():
+                await self._reset_browser()
+            raise
+
         if not response or not response.ok:
             raise PlaywrightError(f"Homepage returned {response.status if response else 'None'}")
 
@@ -223,7 +259,13 @@ async def _search_page(self, offset: int) -> Dict[str, Any]:
 
         url = f"{self.SEARCH_URL}?offset={offset}&limit={self._page_size}"
 
-        response = await page.goto(url, wait_until="domcontentloaded")
+        try:
+            response = await page.goto(url, wait_until="domcontentloaded")
+        except Exception as e:
+            if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower():
+                await self._reset_browser()
+            raise
+
         if response.status == 429:
             retry_after = 60
             logger.warning({"event": "rate_limited", "retry_after": retry_after})
@@ -261,6 +303,8 @@ async def _get_total_count(self) -> int:
             return total
         except Exception as exc:
             logger.warning(f"Could not get total count: {exc}")
+            if "crashed" in str(exc).lower() or "closed" in str(exc).lower() or "timeout" in str(exc).lower():
+                await self._reset_browser()
             return 0
 
     # ─── Job Detail ───────────────────────────────────────────────
@@ -281,11 +325,12 @@ async def _fetch_job_detail(self, requisition_id: str) -> Optional[Dict[str, Any
         url = f"{self.BASE}/viewjob/{requisition_id}"
 
         try:
-            response = await page.goto(url, wait_until="domcontentloaded", timeout=15000)
+            response = await page.goto(url, wait_until="domcontentloaded", timeout=30000)
         except Exception as e:
-            logger.warning(f"Timeout or error fetching detail for {requisition_id}: {e}")
-            self.errors += 1
-            return None
+            logger.warning(f"Timeout or error fetching detail for {requisition_id}. Will retry... Error: {e}")
+            if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower():
+                await self._reset_browser()
+            raise
 
         if response and response.ok:
             self.detail_fetches += 1
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 173bf70..a42371a 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -8,6 +8,9 @@ services:
       POSTGRES_USER: ${DB_USER:-postly}
       POSTGRES_PASSWORD: ${DB_PASSWORD:?Database password required}
       POSTGRES_DB: ${DB_NAME:-postly}
+      DATABASE_URL: postgresql://${DB_USER:-postly}:${DB_PASSWORD}@postgres:5432/${DB_NAME:-postly}
+    ports:
+      - "127.0.0.1:5432:5432"  # loopback only: never publish the production database on public interfaces
     volumes:
       - postgres_data:/var/lib/postgresql/data
       - ./packages/database/migrations:/docker-entrypoint-initdb.d:ro
@@ -71,8 +74,10 @@ services:
       DB_NAME: ${DB_NAME:-postly}
       DB_USER: ${DB_USER:-postly}
       DB_PASSWORD: ${DB_PASSWORD:?Database password required}
+      DATABASE_URL: postgresql://${DB_USER:-postly}:${DB_PASSWORD}@postgres:5432/${DB_NAME:-postly}
       REDIS_HOST: redis
       REDIS_PORT: 6379
+      REDIS_URL: redis://redis:6379
       JWT_SECRET: ${JWT_SECRET:?JWT secret required}
       JWT_REFRESH_SECRET: ${JWT_REFRESH_SECRET:?JWT refresh secret required}
       JWT_EXPIRES_IN: ${JWT_EXPIRES_IN:-7d}
@@ -92,10 +97,15 @@ services:
           cpus: '0.3'
           memory: 220M
     healthcheck:
-      test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/health"]
+      test: ["CMD", "node", "-e", "require('http').get('http://127.0.0.1:3000/health', (r) => process.exit(r.statusCode === 200 ? 0 : 1)).on('error', () => process.exit(1))"]  # node probe: the bookworm-slim API image ships no wget
       interval: 30s
       timeout: 10s
       retries: 3
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
     networks:
       - postly-network
 
@@ -110,6 +120,7 @@ services:
       DB_NAME: ${DB_NAME:-postly}
       DB_USER: ${DB_USER:-postly}
       DB_PASSWORD: ${DB_PASSWORD:?Database password required}
+      DATABASE_URL: postgresql://${DB_USER:-postly}:${DB_PASSWORD}@postgres:5432/${DB_NAME:-postly}
       OPENAI_API_KEY: ${OPENAI_API_KEY:-}
       VOYAGE_API_KEY: ${VOYAGE_API_KEY:-}
       HEALTH_PORT: 8080
@@ -125,11 +136,16 @@ services:
           cpus: '0.3'
           memory: 400M
     healthcheck:
-      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/health"]
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
       interval: 60s
       timeout: 10s
       retries: 2
       start_period: 60s
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "3"
     networks:
       - postly-network
 
diff --git a/infrastructure/docker/bot-discord.Dockerfile b/infrastructure/docker/bot-discord.Dockerfile
deleted file mode 100644
index 359bfa4..0000000
--- a/infrastructure/docker/bot-discord.Dockerfile
+++ /dev/null
@@ -1,46 +0,0 @@
-# Stage 1: Build
-FROM node:20-alpine AS builder
-
-WORKDIR /app
-
-# Copy package files
-COPY package*.json ./
-COPY turbo.json ./
-COPY packages/package*.json ./packages/
-COPY apps/bot-discord/package*.json ./apps/bot-discord/
-
-# Install dependencies
-RUN npm ci
-
-# Copy source code
-COPY . .
-
-# Build packages and Discord bot
-RUN npm run build --filter=@postly/*
-RUN npm run build --filter=bot-discord
-
-# Stage 2: Production
-FROM node:20-alpine AS runner
-
-WORKDIR /app
-
-# Install production dependencies
-COPY package*.json ./
-COPY turbo.json ./
-COPY packages/package*.json ./packages/
-COPY apps/bot-discord/package*.json ./apps/bot-discord/
-
-RUN npm ci --omit=dev
-
-# Copy built files from builder
-COPY --from=builder /app/apps/bot-discord/dist ./apps/bot-discord/dist
-COPY --from=builder /app/packages/*/dist ./packages/
-
-# Create non-root user
-RUN addgroup -g 1001 -S nodejs && \
-    adduser -S nodejs -u 1001
-
-USER nodejs
-
-# Start Discord bot
-CMD ["node", "apps/bot-discord/dist/bot.js"]
diff --git a/infrastructure/docker/web.Dockerfile b/infrastructure/docker/web.Dockerfile
deleted file mode 100644
index 4bbef38..0000000
--- a/infrastructure/docker/web.Dockerfile
+++ /dev/null
@@ -1,58 +0,0 @@
-# Stage 1: Build
-FROM node:20-alpine AS builder
-
-WORKDIR /app
-
-# Copy package files
-COPY package*.json ./
-COPY turbo.json ./
-
-# Explicitly copy workspace packages
-COPY packages/database/package.json ./packages/database/
-COPY packages/ai-utils/package.json ./packages/ai-utils/
-COPY packages/logger/package.json ./packages/logger/
-COPY packages/shared-types/package.json ./packages/shared-types/
-COPY packages/config/app-config/package.json ./packages/config/app-config/
-COPY packages/config/eslint-config/package.json ./packages/config/eslint-config/
-COPY packages/config/typescript-config/package.json ./packages/config/typescript-config/
-COPY apps/web/package*.json ./apps/web/
-
-# Install dependencies
-RUN npm ci --ignore-scripts --legacy-peer-deps
-
-# Copy source code
-COPY . .
-
-# Build packages and web app
-RUN npx turbo run build --filter=web
-
-# Stage 2: Production
-FROM node:20-alpine AS runner
-
-WORKDIR /app
-
-RUN npm install -g serve
-
-# Copy built files from builder
-COPY --from=builder /app/apps/web/dist ./apps/web/dist
-
-# Create non-root user
-RUN addgroup -g 1001 -S nodejs && \
-    adduser -S nodejs -u 1001
-
-# Set ownership
-RUN chown -R nodejs:nodejs /app
-
-USER nodejs
-
-# Expose port
-EXPOSE 3001
-
-ENV NODE_ENV=production
-
-# Health check
-HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
-    CMD node -e "require('http').get('http://localhost:3001', (r) => {process.exit(r.statusCode === 200 ? 0 : 1)})"
-
-# Start web app
-CMD ["serve", "-s", "apps/web/dist", "-l", "3001"]