Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 25 additions & 28 deletions apps/api/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
# Stage 1: Build
FROM --platform=$BUILDPLATFORM node:20-alpine AS builder
FROM --platform=$BUILDPLATFORM node:20-bookworm-slim AS builder

WORKDIR /app

# Copy package files for dependency installation
COPY package*.json ./
COPY turbo.json ./
# Copy root config files
COPY package*.json turbo.json ./

# Copy all package.json files for workspaces
# Copy all workspace package.json files for installation
COPY packages/database/package.json ./packages/database/
COPY packages/ai-utils/package.json ./packages/ai-utils/
COPY packages/logger/package.json ./packages/logger/
Expand All @@ -17,27 +16,27 @@ COPY packages/config/eslint-config/package.json ./packages/config/eslint-config/
COPY packages/config/typescript-config/package.json ./packages/config/typescript-config/
COPY apps/api/package*.json ./apps/api/

# Install dependencies
RUN --mount=type=cache,target=/root/.npm npm ci --ignore-scripts --legacy-peer-deps
# Install dependencies allowing native scripts to run
# HUSKY=0 prevents git hooks from failing in docker
RUN --mount=type=cache,target=/root/.npm HUSKY=0 npm ci --legacy-peer-deps

# Copy source code
# Copy full source
COPY . .

# Build API and its dependencies using turbo
# Build specific app
RUN npx turbo run build --filter=api

# Stage 2: Production
FROM node:20-alpine AS runner
FROM node:20-bookworm-slim AS runner

WORKDIR /app

ENV NODE_ENV=production

# Copy package files for production install
COPY package*.json ./
COPY turbo.json ./
# Copy root config files
COPY package*.json turbo.json ./

# Copy package.json files for production workspaces
# Copy workspace package.json files for production install
COPY packages/database/package.json ./packages/database/
COPY packages/ai-utils/package.json ./packages/ai-utils/
COPY packages/logger/package.json ./packages/logger/
Expand All @@ -47,30 +46,28 @@ COPY packages/config/eslint-config/package.json ./packages/config/eslint-config/
COPY packages/config/typescript-config/package.json ./packages/config/typescript-config/
COPY apps/api/package*.json ./apps/api/

# Install production dependencies only
RUN --mount=type=cache,target=/root/.npm npm ci --omit=dev --ignore-scripts --legacy-peer-deps
# Production-only install with native scripts allowed
# We delete the prepare script to avoid husky failing since it's a devDependency
RUN --mount=type=cache,target=/root/.npm \
npm pkg delete scripts.prepare && \
HUSKY=0 npm ci --omit=dev --legacy-peer-deps

# Copy built files from builder
# Copy built artifacts from builder
COPY --from=builder /app/apps/api/dist ./apps/api/dist
COPY --from=builder /app/packages/database/dist ./packages/database/dist
COPY --from=builder /app/packages/ai-utils/dist ./packages/ai-utils/dist
COPY --from=builder /app/packages/logger/dist ./packages/logger/dist
COPY --from=builder /app/packages/shared-types/dist ./packages/shared-types/dist
COPY --from=builder /app/packages/config/app-config/dist ./packages/config/app-config/dist

# Create non-root user for security
RUN addgroup -g 1001 -S nodejs && \
adduser -S nodejs -u 1001
# Secure user built into node image
USER node

USER nodejs

# Expose port
EXPOSE 3000

# Health check: pure-Node probe, so the slim runtime image needs neither wget nor curl.
# The probe exits explicitly in every outcome. Without process.exit() the unread
# response keeps the keep-alive socket (and therefore the probe process) alive
# until the server closes it, which can outlast --timeout and make a healthy
# container report as unhealthy.
# Exec (JSON) form is used so no shell quoting/expansion touches the inline script.
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
  CMD ["node", "-e", "require('http').get('http://localhost:3000/health', (r) => process.exit(r.statusCode === 200 ? 0 : 1)).on('error', () => process.exit(1))"]

# Start API with memory optimization
CMD ["node", "--max-old-space-size=80", "apps/api/dist/server.js"]
# Start API with optimized memory
CMD ["node", "--max-old-space-size=200", "apps/api/dist/server.js"]
3 changes: 3 additions & 0 deletions apps/api/src/middleware/strict-rate-limit.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ import type { JwtPayload } from "./auth.js";

// Initialize Redis client
const redis = new Redis(REDIS_URL || "redis://localhost:6379");
redis.on("error", (err) => {
console.error("Redis (strict-rate-limit) connection error:", err);
});

interface RateLimitConfig {
windowMs: number;
Expand Down
4 changes: 2 additions & 2 deletions apps/api/src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ app.use(notFoundHandler);
app.use(errorHandler);

// Start server
app.listen(API_PORT, () => {
console.log(`🚀 API server running on http://localhost:${API_PORT}`);
app.listen(API_PORT, "0.0.0.0", () => {
console.log(`🚀 API server running on http://0.0.0.0:${API_PORT}`);
console.log(`📝 Environment: ${NODE_ENV}`);
});

Expand Down
2 changes: 2 additions & 0 deletions apps/scraper/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ ENV PATH="/opt/venv/bin:$PATH"
# Install Chromium and minimal dependencies
# We use playwright to install the browser but manually handle dependencies
# to keep the image size low and avoid problematic packages.
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/pw-browsers
RUN apt-get update && apt-get install -y --no-install-recommends \
libnss3 \
libnspr4 \
Expand All @@ -37,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libcairo2 \
&& pip install playwright \
&& playwright install chromium \
&& chmod -R 755 /opt/pw-browsers \
&& apt-get clean && rm -rf /var/lib/apt/lists/*

# Create non-root user
Expand Down
7 changes: 4 additions & 3 deletions apps/scraper/src/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ async def insert_job(self, job_data: Dict[str, Any]) -> bool:
job_data.get("posted_at"),
job_data.get("expires_at"),
job_data.get("is_active", True),
job_data.get("embedding"),
str(job_data.get("embedding")) if job_data.get("embedding") else None,
)
return True
except asyncpg.exceptions.UniqueViolationError:
Expand Down Expand Up @@ -158,7 +158,7 @@ async def insert_jobs_batch(self, jobs: List[Dict[str, Any]]) -> int:
job.get("posted_at"),
job.get("expires_at"),
job.get("is_active", True),
job.get("embedding"),
str(job.get("embedding")) if job.get("embedding") else None,
)
inserted += 1
except Exception as e:
Expand Down Expand Up @@ -192,12 +192,13 @@ async def update_embedding(self, job_id: str, embedding: List[float]):

async def update_embeddings_batch(self, updates: List[tuple]):
    """Batch update embeddings. Each tuple is (job_id, embedding).

    Each embedding is serialized to a ``[v1,v2,...]`` text literal because the
    SQL casts the parameter with ``$2::vector``. The literal is built
    element by element instead of via ``str(embedding)`` so the output does
    not depend on the container's or the elements' ``repr`` (e.g. NumPy
    arrays print without commas and NumPy scalars may print as
    ``np.float64(...)``).

    Args:
        updates: Iterable of ``(job_id, embedding)`` pairs, where
            ``embedding`` is an iterable of numbers.
    """
    formatted_updates = [
        (job_id, "[" + ",".join(str(float(value)) for value in emb) + "]")
        for job_id, emb in updates
    ]
    async with self.pool.acquire() as conn:
        await conn.executemany("""
            UPDATE jobs
            SET embedding = $2::vector, updated_at = NOW()
            WHERE id = $1
        """, formatted_updates)
    logger.info(f"Batch updated {len(updates)} embeddings")

# ─── SEARCH ───────────────────────────────────────────────────
Expand Down
89 changes: 67 additions & 22 deletions apps/scraper/src/spiders/hiring_cafe.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,22 +132,37 @@ def __init__(
# ─── Session ──────────────────────────────────────────────────

async def _get_page(self) -> Page:
    """Return the shared Playwright page, creating any missing layer lazily.

    The Playwright driver, browser, context and page are each created only
    if the corresponding attribute is still unset, so a partially
    initialized session is completed rather than rebuilt.

    Returns:
        The cached (or newly created) page.

    Raises:
        Exception: Whatever Playwright raised during initialization. Before
            re-raising, the driver is stopped (best effort) and all cached
            handles are cleared so the next call starts from scratch.
    """
    if self._page:
        return self._page

    try:
        if not self._playwright:
            self._playwright = await async_playwright().start()
        if not self._browser:
            self._browser = await self._playwright.chromium.launch(
                headless=True,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--no-sandbox",
                    "--disable-setuid-sandbox",
                    "--disable-dev-shm-usage",
                ],
            )
        if not self._context:
            self._context = await self._browser.new_context(
                user_agent=self._BROWSER_HEADERS["User-Agent"],
                viewport={"width": 1280, "height": 800}
            )
        self._page = await self._context.new_page()
    except Exception as e:
        # Need to clean up state so we truly retry from scratch
        logger.error(f"Playwright initialization failed: {e}")
        if self._playwright:
            try:
                await self._playwright.stop()
            except Exception as stop_err:
                # A failing stop() must not mask the original error or
                # prevent the state reset below.
                logger.warning(f"Failed to stop Playwright during cleanup: {stop_err}")
        self._playwright = None
        self._browser = None
        self._context = None
        self._page = None
        # Bare raise keeps the original traceback intact.
        raise
    return self._page

async def close(self) -> None:
Expand All @@ -159,6 +174,21 @@ async def close(self) -> None:
await self._playwright.stop()
logger.info("Spider Playwright session closed")

async def _reset_browser(self) -> None:
"""Reset browser state after a crash."""
if self._page:
try:
await self._page.close()
except:
pass
if self._context:
try:
await self._context.close()
except:
pass
self._page = None
self._context = None

# ─── Rate Limiting ────────────────────────────────────────────

async def _throttle(self) -> None:
Expand All @@ -180,7 +210,13 @@ async def _discover_build_id(self) -> str:
await self._throttle()
page = await self._get_page()

response = await page.goto(self.BASE, wait_until="domcontentloaded")
try:
response = await page.goto(self.BASE, wait_until="domcontentloaded")
except Exception as e:
if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower():
await self._reset_browser()
raise e

if not response or not response.ok:
raise PlaywrightError(f"Homepage returned {response.status if response else 'None'}")

Expand Down Expand Up @@ -223,7 +259,13 @@ async def _search_page(self, offset: int) -> Dict[str, Any]:

url = f"{self.SEARCH_URL}?offset={offset}&limit={self._page_size}"

response = await page.goto(url, wait_until="domcontentloaded")
try:
response = await page.goto(url, wait_until="domcontentloaded")
except Exception as e:
if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower():
await self._reset_browser()
raise e

if response.status == 429:
retry_after = 60
logger.warning({"event": "rate_limited", "retry_after": retry_after})
Expand Down Expand Up @@ -261,6 +303,8 @@ async def _get_total_count(self) -> int:
return total
except Exception as exc:
logger.warning(f"Could not get total count: {exc}")
if "crashed" in str(exc).lower() or "closed" in str(exc).lower() or "timeout" in str(exc).lower():
await self._reset_browser()
return 0

# ─── Job Detail ───────────────────────────────────────────────
Expand All @@ -281,11 +325,12 @@ async def _fetch_job_detail(self, requisition_id: str) -> Optional[Dict[str, Any
url = f"{self.BASE}/viewjob/{requisition_id}"

try:
response = await page.goto(url, wait_until="domcontentloaded", timeout=15000)
response = await page.goto(url, wait_until="domcontentloaded", timeout=30000)
except Exception as e:
logger.warning(f"Timeout or error fetching detail for {requisition_id}: {e}")
self.errors += 1
return None
logger.warning(f"Timeout or error fetching detail for {requisition_id}. Will retry... Error: {e}")
if "crashed" in str(e).lower() or "closed" in str(e).lower() or "timeout" in str(e).lower():
await self._reset_browser()
raise e

if response and response.ok:
self.detail_fetches += 1
Expand Down
20 changes: 18 additions & 2 deletions docker-compose.prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ services:
POSTGRES_USER: ${DB_USER:-postly}
POSTGRES_PASSWORD: ${DB_PASSWORD:?Database password required}
POSTGRES_DB: ${DB_NAME:-postly}
DATABASE_URL: postgresql://${DB_USER:-postly}:${DB_PASSWORD}@postgres:5432/${DB_NAME:-postly}
ports:
- "5432:5432"
volumes:
- postgres_data:/var/lib/postgresql/data
- ./packages/database/migrations:/docker-entrypoint-initdb.d:ro
Expand Down Expand Up @@ -71,8 +74,10 @@ services:
DB_NAME: ${DB_NAME:-postly}
DB_USER: ${DB_USER:-postly}
DB_PASSWORD: ${DB_PASSWORD:?Database password required}
DATABASE_URL: postgresql://${DB_USER:-postly}:${DB_PASSWORD}@postgres:5432/${DB_NAME:-postly}
REDIS_HOST: redis
REDIS_PORT: 6379
REDIS_URL: redis://redis:6379
JWT_SECRET: ${JWT_SECRET:?JWT secret required}
JWT_REFRESH_SECRET: ${JWT_REFRESH_SECRET:?JWT refresh secret required}
JWT_EXPIRES_IN: ${JWT_EXPIRES_IN:-7d}
Expand All @@ -92,10 +97,15 @@ services:
cpus: '0.3'
memory: 220M
healthcheck:
test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/health"]
test: ["CMD", "wget", "-q", "--spider", "http://127.0.0.1:3000/health"]
interval: 30s
timeout: 10s
retries: 3
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
networks:
- postly-network

Expand All @@ -110,6 +120,7 @@ services:
DB_NAME: ${DB_NAME:-postly}
DB_USER: ${DB_USER:-postly}
DB_PASSWORD: ${DB_PASSWORD:?Database password required}
DATABASE_URL: postgresql://${DB_USER:-postly}:${DB_PASSWORD}@postgres:5432/${DB_NAME:-postly}
OPENAI_API_KEY: ${OPENAI_API_KEY:-}
VOYAGE_API_KEY: ${VOYAGE_API_KEY:-}
HEALTH_PORT: 8080
Expand All @@ -125,11 +136,16 @@ services:
cpus: '0.3'
memory: 400M
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/health"]
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
interval: 60s
timeout: 10s
retries: 2
start_period: 60s
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
networks:
- postly-network

Expand Down
Loading
Loading