diff --git a/.claude/settings.json b/.claude/settings.json index c659be5..e47738f 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -5,13 +5,12 @@ "defaultMode": "default", "allow": [ "Bash(pnpm lint:*)", + "Bash(pnpm lint:fix:*)", "Bash(pnpm typecheck:*)", "Bash(pnpm build:*)", "Bash(pnpm format:*)", "Bash(pnpm format:check:*)", - "Bash(pnpm test:*)", - "Bash(pnpm agents:check:*)", - "Bash(pnpm agents:sync:*)" + "Bash(pnpm test:*)" ], "ask": [ "Bash(pnpm install:*)", diff --git a/.codex/skills/docs-sync/SKILL.md b/.codex/skills/docs-sync/SKILL.md index 5ba78b3..834fc8e 100644 --- a/.codex/skills/docs-sync/SKILL.md +++ b/.codex/skills/docs-sync/SKILL.md @@ -36,6 +36,7 @@ Documentation files to consider: - When adding new commands, include both the command and a brief explanation - Do not introduce instructions that conflict with `AGENTS.md` - Do not edit `CLAUDE.md` directly; update `AGENTS.md` instead +- Mermaid: wrap node text in quotes like `A["Label"]` and `B{"Question?"}` to avoid parse issues with punctuation ## Output Requirements diff --git a/.npmrc b/.npmrc new file mode 100644 index 0000000..019f618 --- /dev/null +++ b/.npmrc @@ -0,0 +1,5 @@ +# Don't use caret (^) or tilde (~) in package versions +save-prefix= + +# Always save exact versions +save-exact=true \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 8868f57..cb26a5a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,81 +1,115 @@ -## Repository overview - -- **Name:** cli-agent-sandbox -- **Purpose:** Minimal TypeScript CLI sandbox for testing agent workflows. -- **Entry points:** `src/cli/guestbook/main.ts`, `src/cli/name-explorer/main.ts`, `src/cli/scrape-publications/main.ts`. -- **Framework:** Uses `@openai/agents` with file tools scoped to `tmp`. - -## Setup - -1. Install Node.js and pnpm. -2. 
Install dependencies: `pnpm install` - -## Environment - -- Set `OPENAI_API_KEY` (export it or use a `.env`) to run the guestbook, name explorer (AI mode), and publication scraper. - -## Common commands - -Available pnpm scripts for development and testing: - -| Command | Description | -| ------------------------------ | ------------------------------------------------- | -| `pnpm run:guestbook` | Run the interactive guestbook CLI demo | -| `pnpm run:name-explorer` | Explore Finnish name statistics (AI Q&A or stats) | -| `pnpm run:scrape-publications` | Scrape publication links and build a review page | -| `pnpm typecheck` | Run TypeScript type checking | -| `pnpm lint` | Run ESLint for code quality | -| `pnpm format` | Format code with Prettier | -| `pnpm format:check` | Check code formatting | -| `pnpm test` | Run Vitest test suite | - -## Project layout - -| Path | Description | -| ----------------------------------------- | ----------------------------------------------- | -| `src/cli/guestbook/main.ts` | Guestbook CLI entry point | -| `src/cli/guestbook/README.md` | Guestbook CLI docs | -| `src/cli/name-explorer/main.ts` | Name Explorer CLI entry point | -| `src/cli/name-explorer/README.md` | Name Explorer CLI docs | -| `src/cli/scrape-publications/main.ts` | Publication scraping CLI entry point | -| `src/cli/scrape-publications/README.md` | Publication scraping CLI docs | -| `src/clients/*` | Publication scraping pipeline clients | -| `src/utils/parse-args.ts` | Shared CLI argument parsing helper | -| `src/utils/question-handler.ts` | Shared CLI prompt + validation helper | -| `src/tools/index.ts` | Tool exports | -| `src/tools/fetch-url/fetch-url-tool.ts` | Safe HTTP fetch tool with SSRF protection | -| `src/tools/read-file/read-file-tool.ts` | Agent tool for reading files under `tmp` | -| `src/tools/write-file/write-file-tool.ts` | Agent tool for writing files under `tmp` | -| `src/tools/list-files/list-files-tool.ts` | Agent tool for listing files under 
`tmp` | -| `src/tools/utils/fs.ts` | Path safety utilities | -| `src/tools/utils/html-processing.ts` | HTML sanitization + extraction helpers | -| `src/tools/utils/url-safety.ts` | URL safety + SSRF protection helpers | -| `src/tools/utils/test-utils.ts` | Shared test helpers | -| `src/tools/*/*.test.ts` | Vitest tests for tools and safety utils | -| `src/types/index.ts` | Zod schemas for publication pipeline | -| `eslint.config.ts` | ESLint configuration | -| `prettier.config.ts` | Prettier configuration | -| `tsconfig.json` | TypeScript configuration | -| `vitest.config.ts` | Vitest configuration | -| `tmp/` | Runtime scratch space for tool + scraper output | - -## Tools - -File tools provide operations sandboxed to the `tmp/` directory with path validation. The `fetchUrl` tool adds SSRF protection and sanitizes HTML content before conversion. - -| Tool | Location | Parameters | Description | -| ----------- | ----------------------------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------- | -| `fetchUrl` | `src/tools/fetch-url/fetch-url-tool.ts` | `url`, `timeoutMs?`, `maxBytes?`, `maxRedirects?`, `maxChars?`, `etag?`, `lastModified?` | Fetches URLs safely and returns sanitized Markdown/text | -| `readFile` | `src/tools/read-file/read-file-tool.ts` | `path` (string) | Reads file content from `tmp` | -| `writeFile` | `src/tools/write-file/write-file-tool.ts` | `path`, `content` (strings) | Writes content to file in `tmp` | -| `listFiles` | `src/tools/list-files/list-files-tool.ts` | `path` (string, optional) | Lists files under `tmp` | - -## Agent notes - -- Use pnpm for scripts and dependency changes. -- Keep changes small and focused; update tests when behavior changes. -- Do not run git operations that change repo state: no `git commit`, `git push`, or opening PRs. -- Read-only git commands are allowed (e.g., `git status`, `git diff`, `git log`). 
-- Do not read `.env` files or any other secrets. +# AGENTS.md — Operating Guide for AI Agents + +## 0) TL;DR (Agent quick start) + +**Goal:** Make small, safe, test-covered changes in this TypeScript CLI sandbox. + +**Repo:** `cli-agent-sandbox` — minimal TypeScript CLI sandbox built with `@openai/agents` and tool sandboxing under `tmp/`. + +1. Start at `src/cli/<cli-name>/main.ts` and the matching `src/cli/<cli-name>/README.md`. +2. Follow the pipeline classes under `src/cli/<cli-name>/clients/*` and schemas under `src/cli/<cli-name>/types/*`. +3. Reuse shared helpers: `src/utils/parse-args.ts`, `src/utils/question-handler.ts`, `src/clients/logger.ts`. +4. Keep changes minimal; add/update **Vitest** tests (`*.test.ts`) when behavior changes. +5. Run: `pnpm typecheck`, `pnpm lint`, `pnpm test` (and `pnpm format:check` if formatting changed). +6. All runtime artifacts go under `tmp/` (never commit them). + +**Scratch space:** Use `tmp/` for generated HTML/markdown/JSON/reports. + +--- + +## 1) Fast map (where to look first) + +- Entry points: `src/cli/*/main.ts` +- Shared clients: `src/clients/*` +- Shared helpers: `src/utils/*` +- Agent tools: `src/tools/*` + +--- + +## 2) Setup & commands + +- Install deps: `pnpm install` +- Set `OPENAI_API_KEY` via env or `.env` (humans do this; agents must not read secrets) +- If a task requires Playwright, follow the repo README for system deps + +**Common scripts (see `package.json` for all):** + +- `pnpm run:[cli-name-here]` +- `pnpm typecheck` +- `pnpm lint` (use `pnpm lint:fix` if errors are auto-fixable) +- `pnpm format` / `pnpm format:check` +- `pnpm test` + +--- + +## 3) Hard rules (security & repo safety) + +### MUST NOT + +- **Do not read** `.env` files or any secrets. +- **Do not run** git commands that change repo state: `git commit`, `git push`, PR creation. +- **Do not bypass** SSRF protections or URL/path safety utilities. + +### Allowed + +- Read-only git commands: `git status`, `git diff`, `git log`. +- Writing runtime artifacts under `tmp/`. 
+ +--- + +## 4) Agent tools (runtime tool catalog) + +All file tools are sandboxed to `tmp/` using path validation (`src/tools/utils/fs.ts`). + +### File tools + +- **`readFile`** (`src/tools/read-file/read-file-tool.ts`) + - Reads a file under `tmp/`. + - Params: `{ path: string }` (path is **relative to `tmp/`**) +- **`writeFile`** (`src/tools/write-file/write-file-tool.ts`) + - Writes a file under `tmp/`. + - Params: `{ path: string, content: string }` (path is **relative to `tmp/`**) +- **`listFiles`** (`src/tools/list-files/list-files-tool.ts`) + - Lists files/dirs under `tmp/`. + - Params: `{ path?: string }` (defaults to `tmp/` root) + +### Safe web fetch tool + +- **`fetchUrl`** (`src/tools/fetch-url/fetch-url-tool.ts`) + - SSRF protection + redirect validation + HTML sanitization + markdown/text conversion. + - Params: `{ url, timeoutMs?, maxBytes?, maxRedirects?, maxChars?, etag?, lastModified? }` + - Output: sanitized content, metadata, and warnings. + +--- + +## 5) Coding conventions (how changes should look) + - Initialize `Logger` in CLI entry points and pass it into clients/pipelines via constructor options. -- Prefer shared helpers in `src/utils` over custom parsing or prompt logic. +- Prefer shared helpers in `src/utils` (`parse-args`, `question-handler`) over custom logic. +- Prefer TypeScript path aliases over deep relative imports: `~tools/*`, `~clients/*`, `~utils/*`. +- Use Zod schemas for CLI args and tool IO. +- For HTTP fetching in code, prefer `Fetch` (sanitized) or `PlaywrightScraper` for JS-heavy pages. +- When adding tools that touch files, use `src/tools/utils/fs.ts` for path validation. +- Comments should capture invariants or subtle behavior, not restate code. +- Prefer a class over a function when state/lifecycle or shared dependencies make it appropriate. +- Avoid `index.ts` barrel exports; use explicit module paths. 
+ +### Comment guidance (short) + +- Use comments for intent/tradeoffs, contracts (inputs/outputs, invariants, side effects, errors), non-obvious behavior (ordering, caching, perf), or domain meanings. +- Avoid `@param`/`@returns` boilerplate and step-by-step narration that repeats the signature or body. +- Rule of thumb: each comment should say something the types cannot. + +--- + +## 6) Definition of Done (before finishing) + +- [ ] Change is minimal and localized +- [ ] Tests added/updated if behavior changed (`pnpm test`) +- [ ] Typecheck passes (`pnpm typecheck`) +- [ ] Lint passes (`pnpm lint`) +- [ ] Formatting is clean (`pnpm format:check` or `pnpm format`) +- [ ] No secrets accessed, no unsafe file/network behavior introduced +- [ ] Any generated artifacts are in `tmp/` only + +--- diff --git a/README.md b/README.md index 7d4066b..3ceff76 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ A minimal TypeScript CLI sandbox for testing agent workflows and safe web scrapi | `pnpm run:scrape-publications` | Scrape publication links and build a review page | | `pnpm typecheck` | Run TypeScript type checking | | `pnpm lint` | Run ESLint for code quality | +| `pnpm lint:fix` | Run ESLint and auto-fix issues | | `pnpm format` | Format code with Prettier | | `pnpm format:check` | Check code formatting | | `pnpm test` | Run Vitest test suite | @@ -78,48 +79,41 @@ src/ │ │ ├── main.ts # Name Explorer CLI entry point │ │ └── README.md # Name Explorer CLI docs │ └── scrape-publications/ -│ ├── main.ts # Publication scraping CLI -│ └── README.md # Publication scraping docs +│ ├── main.ts # Publication scraping CLI entry point +│ ├── README.md # Publication scraping docs +│ ├── clients/ # Publication-specific clients +│ │ ├── publication-pipeline.ts # Pipeline orchestration +│ │ ├── publication-scraper.ts # Link discovery + selector inference +│ │ └── review-page-generator.ts # Review HTML generator +│ └── types/ +│ └── index.ts # Publication Zod schemas ├── 
clients/ -│ ├── fetch.ts # HTTP fetch + sanitization helpers -│ ├── logger.ts # Console logger -│ ├── playwright-scraper.ts # Playwright-based scraper for JS-rendered pages -│ ├── publication-pipeline.ts # Pipeline orchestration -│ ├── publication-scraper.ts # Link discovery + selector inference -│ └── review-page-generator.ts # Review HTML generator +│ ├── fetch.ts # Shared HTTP fetch + sanitization +│ ├── logger.ts # Shared console logger +│ └── playwright-scraper.ts # Playwright-based web scraper ├── utils/ │ ├── parse-args.ts # Shared CLI arg parsing helper │ └── question-handler.ts # Shared CLI prompt + validation helper ├── tools/ -│ ├── fetch-url/ -│ │ ├── fetch-url-tool.ts # Safe fetch tool -│ │ └── fetch-url-tool.test.ts # Fetch tool tests -│ ├── index.ts # Tool exports -│ ├── list-files/ -│ │ ├── list-files-tool.ts # List tool implementation -│ │ └── list-files-tool.test.ts # List tool tests -│ ├── read-file/ -│ │ ├── read-file-tool.ts # Read tool implementation -│ │ └── read-file-tool.test.ts # Read tool tests -│ ├── write-file/ -│ │ ├── write-file-tool.ts # Write tool implementation -│ │ └── write-file-tool.test.ts # Write tool tests +│ ├── index.ts # Tool exports +│ ├── fetch-url/ # Safe fetch tool +│ ├── list-files/ # List files tool +│ ├── read-file/ # Read file tool +│ ├── write-file/ # Write file tool │ └── utils/ -│ ├── fs.ts # Path safety utilities -│ ├── html-processing.ts # HTML sanitization + extraction helpers -│ ├── html-processing.test.ts # HTML processing tests -│ ├── url-safety.ts # SSRF protection helpers -│ ├── url-safety.test.ts # URL safety tests -│ └── test-utils.ts # Shared test helpers -└── types/ - └── index.ts # Zod schemas for publication pipeline -tmp/ # Runtime scratch space (tool I/O) +│ ├── fs.ts # Path safety utilities +│ ├── html-processing.ts # HTML sanitization + extraction helpers +│ ├── url-safety.ts # SSRF protection helpers +│ └── test-utils.ts # Shared test helpers +tmp/ # Runtime scratch space (tool I/O) ``` ## CLI 
conventions - When using `Logger`, initialize it in the CLI entry point and pass it into clients/pipelines via constructor options. - Prefer shared helpers in `src/utils` (`parse-args`, `question-handler`) over custom argument parsing or prompt logic. +- Use the TypeScript path aliases for shared modules: `~tools/*`, `~clients/*`, `~utils/*`. + Example: `import { readFileTool } from "~tools/read-file/read-file-tool";` ## Security diff --git a/eslint.config.ts b/eslint.config.ts index 1e541d8..c8d2d0a 100644 --- a/eslint.config.ts +++ b/eslint.config.ts @@ -18,6 +18,7 @@ export default defineConfig( eslint.configs.recommended, ...tseslint.configs.recommended, ...tseslint.configs.recommendedTypeChecked, + ...tseslint.configs.strictTypeChecked, ...tseslint.configs.stylisticTypeChecked, ], rules: { @@ -39,8 +40,55 @@ export default defineConfig( allowConstantLoopConditions: true, }, ], + // Enforce arrow functions over function declarations + "func-style": ["error", "expression"], + "@typescript-eslint/no-floating-promises": [ + "error", + { ignoreVoid: true }, + ], + "@typescript-eslint/switch-exhaustiveness-check": "error", "@typescript-eslint/no-non-null-assertion": "error", + "@typescript-eslint/consistent-type-exports": "error", + "@typescript-eslint/consistent-type-definitions": ["error", "type"], + "@typescript-eslint/restrict-template-expressions": [ + "error", + { + allowAny: false, + allowBoolean: true, + allowNever: false, + allowNullish: false, + allowNumber: true, + allowRegExp: false, + }, + ], + "prefer-const": "error", + "no-var": "error", + // --- Async correctness --- + "@typescript-eslint/await-thenable": "error", + + // --- Safer error handling --- + "@typescript-eslint/only-throw-error": "error", + + // --- Better modern TS patterns --- + "@typescript-eslint/prefer-nullish-coalescing": "error", + "@typescript-eslint/prefer-optional-chain": "error", + eqeqeq: ["error", "smart"], + curly: ["error", "all"], + "import/no-default-export": "error", 
"import/consistent-type-specifier-style": ["error", "prefer-top-level"], + // Enforce path aliases for cross-module imports + "@typescript-eslint/no-restricted-imports": [ + "error", + { + patterns: [ + { + group: ["../../*", "../../../*", "../../../../*"], + message: + "Use path aliases (e.g. ~tools/...) instead of ../../ imports.", + }, + ], + }, + ], }, }, { diff --git a/package.json b/package.json index 4846b37..4543f75 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,7 @@ "node:tsx": "node --disable-warning=ExperimentalWarning --import tsx", "typecheck": "tsc --noEmit", "lint": "eslint .", + "lint:fix": "eslint . --fix", "format": "prettier --write .", "format:check": "prettier --check .", "test": "vitest" @@ -28,6 +29,7 @@ "devDependencies": { "@eslint/compat": "2.0.1", "@eslint/js": "9.39.2", + "@ianvs/prettier-plugin-sort-imports": "4.7.0", "@openai/agents": "0.3.7", "@types/jsdom": "27.0.0", "@types/node": "25.0.6", @@ -39,7 +41,7 @@ "jiti": "2.6.1", "jsdom": "27.4.0", "marked": "17.0.1", - "node-html-markdown": "^2.0.0", + "node-html-markdown": "2.0.0", "playwright": "1.57.0", "prettier": "3.7.4", "sanitize-html": "2.17.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d4bd146..9927fa0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -14,6 +14,9 @@ importers: '@eslint/js': specifier: 9.39.2 version: 9.39.2 + '@ianvs/prettier-plugin-sort-imports': + specifier: 4.7.0 + version: 4.7.0(prettier@3.7.4) '@openai/agents': specifier: 0.3.7 version: 0.3.7(hono@4.11.4)(ws@8.19.0)(zod@4.3.5) @@ -48,7 +51,7 @@ importers: specifier: 17.0.1 version: 17.0.1 node-html-markdown: - specifier: ^2.0.0 + specifier: 2.0.0 version: 2.0.0 playwright: specifier: 1.57.0 @@ -95,6 +98,43 @@ packages: '@asamuzakjp/nwsapi@2.3.9': resolution: {integrity: sha512-n8GuYSrI9bF7FFZ/SjhwevlHc8xaVlb/7HmHelnc/PZXBD2ZR49NnN9sMMuDdEGPeeRQ5d0hqlSlEpgCX3Wl0Q==} + '@babel/code-frame@7.28.6': + resolution: {integrity: 
sha512-JYgintcMjRiCvS8mMECzaEn+m3PfoQiyqukOMCCVQtoJGYJw8j/8LBJEiqkHLkfwCcs74E3pbAUFNg7d9VNJ+Q==} + engines: {node: '>=6.9.0'} + + '@babel/generator@7.28.6': + resolution: {integrity: sha512-lOoVRwADj8hjf7al89tvQ2a1lf53Z+7tiXMgpZJL3maQPDxh0DgLMN62B2MKUOFcoodBHLMbDM6WAbKgNy5Suw==} + engines: {node: '>=6.9.0'} + + '@babel/helper-globals@7.28.0': + resolution: {integrity: sha512-+W6cISkXFa1jXsDEdYA8HeevQT/FULhxzR99pxphltZcVaugps53THCeiWA8SguxxpSp3gKPiuYfSWopkLQ4hw==} + engines: {node: '>=6.9.0'} + + '@babel/helper-string-parser@7.27.1': + resolution: {integrity: sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==} + engines: {node: '>=6.9.0'} + + '@babel/helper-validator-identifier@7.28.5': + resolution: {integrity: sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==} + engines: {node: '>=6.9.0'} + + '@babel/parser@7.28.6': + resolution: {integrity: sha512-TeR9zWR18BvbfPmGbLampPMW+uW1NZnJlRuuHso8i87QZNq2JRF9i6RgxRqtEq+wQGsS19NNTWr2duhnE49mfQ==} + engines: {node: '>=6.0.0'} + hasBin: true + + '@babel/template@7.28.6': + resolution: {integrity: sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==} + engines: {node: '>=6.9.0'} + + '@babel/traverse@7.28.6': + resolution: {integrity: sha512-fgWX62k02qtjqdSNTAGxmKYY/7FSL9WAS1o2Hu5+I5m9T0yxZzr4cnrfXQ/MX0rIifthCSs6FKTlzYbJcPtMNg==} + engines: {node: '>=6.9.0'} + + '@babel/types@7.28.6': + resolution: {integrity: sha512-0ZrskXVEHSWIqZM/sQZ4EV3jZJXRkio/WCxaqKZP1g//CEWEPSfeZFcms4XeKBCHU0ZKnIkdJeU/kF+eRp5lBg==} + engines: {node: '>=6.9.0'} + '@csstools/color-helpers@5.1.0': resolution: {integrity: sha512-S11EXWJyy0Mz5SYvRmY8nJYTFFd1LCNV+7cXyAgQtOOuzb4EsgfqDufL+9esx72/eLhsRdGZwaldu/h+E4t4BA==} engines: {node: '>=18'} @@ -365,9 +405,37 @@ packages: resolution: {integrity: sha512-bV0Tgo9K4hfPCek+aMAn81RppFKv2ySDQeMoSZuvTASywNTnVJCArCZE2FWqpvIatKu7VMRLWlR1EazvVhDyhQ==} engines: {node: '>=18.18'} + 
'@ianvs/prettier-plugin-sort-imports@4.7.0': + resolution: {integrity: sha512-soa2bPUJAFruLL4z/CnMfSEKGznm5ebz29fIa9PxYtu8HHyLKNE1NXAs6dylfw1jn/ilEIfO2oLLN6uAafb7DA==} + peerDependencies: + '@prettier/plugin-oxc': ^0.0.4 + '@vue/compiler-sfc': 2.7.x || 3.x + content-tag: ^4.0.0 + prettier: 2 || 3 || ^4.0.0-0 + prettier-plugin-ember-template-tag: ^2.1.0 + peerDependenciesMeta: + '@prettier/plugin-oxc': + optional: true + '@vue/compiler-sfc': + optional: true + content-tag: + optional: true + prettier-plugin-ember-template-tag: + optional: true + + '@jridgewell/gen-mapping@0.3.13': + resolution: {integrity: sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==} + + '@jridgewell/resolve-uri@3.1.2': + resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==} + engines: {node: '>=6.0.0'} + '@jridgewell/sourcemap-codec@1.5.5': resolution: {integrity: sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==} + '@jridgewell/trace-mapping@0.3.31': + resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==} + '@modelcontextprotocol/sdk@1.25.2': resolution: {integrity: sha512-LZFeo4F9M5qOhC/Uc1aQSrBHxMrvxett+9KLHt7OhcExtoiRN9DKgbZffMP/nxjutWDQpfMDfP3nkHI4X9ijww==} engines: {node: '>=18'} @@ -1378,6 +1446,9 @@ packages: jose@6.1.3: resolution: {integrity: sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==} + js-tokens@4.0.0: + resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==} + js-yaml@4.1.1: resolution: {integrity: sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==} hasBin: true @@ -1391,6 +1462,11 @@ packages: canvas: optional: true + jsesc@3.1.0: + resolution: {integrity: 
sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==} + engines: {node: '>=6'} + hasBin: true + json-buffer@3.0.1: resolution: {integrity: sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==} @@ -2106,6 +2182,53 @@ snapshots: '@asamuzakjp/nwsapi@2.3.9': {} + '@babel/code-frame@7.28.6': + dependencies: + '@babel/helper-validator-identifier': 7.28.5 + js-tokens: 4.0.0 + picocolors: 1.1.1 + + '@babel/generator@7.28.6': + dependencies: + '@babel/parser': 7.28.6 + '@babel/types': 7.28.6 + '@jridgewell/gen-mapping': 0.3.13 + '@jridgewell/trace-mapping': 0.3.31 + jsesc: 3.1.0 + + '@babel/helper-globals@7.28.0': {} + + '@babel/helper-string-parser@7.27.1': {} + + '@babel/helper-validator-identifier@7.28.5': {} + + '@babel/parser@7.28.6': + dependencies: + '@babel/types': 7.28.6 + + '@babel/template@7.28.6': + dependencies: + '@babel/code-frame': 7.28.6 + '@babel/parser': 7.28.6 + '@babel/types': 7.28.6 + + '@babel/traverse@7.28.6': + dependencies: + '@babel/code-frame': 7.28.6 + '@babel/generator': 7.28.6 + '@babel/helper-globals': 7.28.0 + '@babel/parser': 7.28.6 + '@babel/template': 7.28.6 + '@babel/types': 7.28.6 + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + + '@babel/types@7.28.6': + dependencies: + '@babel/helper-string-parser': 7.27.1 + '@babel/helper-validator-identifier': 7.28.5 + '@csstools/color-helpers@5.1.0': {} '@csstools/css-calc@2.1.4(@csstools/css-parser-algorithms@3.0.5(@csstools/css-tokenizer@3.0.4))(@csstools/css-tokenizer@3.0.4)': @@ -2280,8 +2403,31 @@ snapshots: '@humanwhocodes/retry@0.4.3': {} + '@ianvs/prettier-plugin-sort-imports@4.7.0(prettier@3.7.4)': + dependencies: + '@babel/generator': 7.28.6 + '@babel/parser': 7.28.6 + '@babel/traverse': 7.28.6 + '@babel/types': 7.28.6 + prettier: 3.7.4 + semver: 7.7.3 + transitivePeerDependencies: + - supports-color + + '@jridgewell/gen-mapping@0.3.13': + dependencies: + '@jridgewell/sourcemap-codec': 
1.5.5 + '@jridgewell/trace-mapping': 0.3.31 + + '@jridgewell/resolve-uri@3.1.2': {} + '@jridgewell/sourcemap-codec@1.5.5': {} + '@jridgewell/trace-mapping@0.3.31': + dependencies: + '@jridgewell/resolve-uri': 3.1.2 + '@jridgewell/sourcemap-codec': 1.5.5 + '@modelcontextprotocol/sdk@1.25.2(hono@4.11.4)(zod@4.3.5)': dependencies: '@hono/node-server': 1.19.8(hono@4.11.4) @@ -3524,6 +3670,8 @@ snapshots: jose@6.1.3: optional: true + js-tokens@4.0.0: {} + js-yaml@4.1.1: dependencies: argparse: 2.0.1 @@ -3556,6 +3704,8 @@ - supports-color - utf-8-validate + jsesc@3.1.0: {} + json-buffer@3.0.1: {} json-schema-traverse@0.4.1: {} diff --git a/prettier.config.ts b/prettier.config.ts index 19f3d38..f9d4231 100644 --- a/prettier.config.ts +++ b/prettier.config.ts @@ -2,6 +2,8 @@ import type { Config } from "prettier"; const config: Config = { trailingComma: "es5", + plugins: ["@ianvs/prettier-plugin-sort-imports"], + importOrder: ["<BUILTIN_MODULES>", "<THIRD_PARTY_MODULES>", "^~(.*)$", "", "^[./]"], }; export default config; diff --git a/src/cli/guestbook/main.ts b/src/cli/guestbook/main.ts index 6021ab4..f17dda2 100644 --- a/src/cli/guestbook/main.ts +++ b/src/cli/guestbook/main.ts @@ -1,9 +1,12 @@ // pnpm run:guestbook import { Agent, run } from "@openai/agents"; + import "dotenv/config"; + +import { readFileTool } from "~tools/read-file/read-file-tool"; +import { writeFileTool } from "~tools/write-file/write-file-tool"; import { question } from "zx"; -import { readFileTool, writeFileTool } from "../../tools"; console.log("Guestbook running..."); diff --git a/src/cli/name-explorer/database.ts b/src/cli/name-explorer/clients/database.ts similarity index 55% rename from src/cli/name-explorer/database.ts rename to src/cli/name-explorer/clients/database.ts index a083fdc..84c5a47 100644 --- a/src/cli/name-explorer/database.ts +++ b/src/cli/name-explorer/clients/database.ts @@ -1,32 +1,43 @@ +import fs from "node:fs"; import { DatabaseSync } from "node:sqlite"; import type { SQLInputValue } from 
"node:sqlite"; -import fs from "node:fs"; -import type { Logger } from "../../clients/logger"; +import type { Logger } from "~clients/logger"; + import type { NameEntry } from "./parse-names"; -export interface NameRow { +export type NameRow = { id: number; decade: string; gender: "boy" | "girl"; rank: number; name: string; count: number; -} +}; -export interface DecadeData { +export type DecadeData = { decade: string; boys: NameEntry[]; girls: NameEntry[]; -} +}; -export interface ConsolidatedData { +export type ConsolidatedData = { decades: DecadeData[]; -} - +}; + +/** + * Manages an in-memory SQLite database for Finnish names data. + * Provides methods to create the schema, insert data, and query the database. + * The database schema includes a 'names' table with columns for decade, + * gender, rank, name, and count. + */ export class NameDatabase { private db: DatabaseSync; private logger: Logger; + /** + * Creates a new NameDatabase instance with an in-memory SQLite database. + * @param logger - Logger instance for debug output + */ constructor(logger: Logger) { this.logger = logger; this.logger.debug("Initializing in-memory SQLite database"); @@ -35,6 +46,9 @@ export class NameDatabase { this.logger.debug("Database schema created"); } + /** + * Creates the database schema with the names table and indexes. + */ private createSchema(): void { this.db.exec(` CREATE TABLE names ( @@ -52,6 +66,12 @@ export class NameDatabase { `); } + /** + * Inserts name entries for a specific decade and gender. + * @param decade - The decade identifier (e.g., "1980") + * @param gender - The gender category + * @param entries - Array of name entries to insert + */ insertNames( decade: string, gender: "boy" | "girl", @@ -77,11 +97,20 @@ export class NameDatabase { ); } + /** + * Retrieves all name records for a specific decade. 
+ * @param decade - The decade to query + * @returns Array of name rows for the decade + */ getByDecade(decade: string): NameRow[] { const stmt = this.db.prepare("SELECT * FROM names WHERE decade = ?"); return stmt.all(decade) as unknown as NameRow[]; } + /** + * Retrieves all data organized by decade with separate boy/girl arrays. + * @returns Consolidated data structure with all decades + */ getAll(): ConsolidatedData { const decades = this.db .prepare("SELECT DISTINCT decade FROM names ORDER BY decade DESC") @@ -103,6 +132,10 @@ export class NameDatabase { return { decades: result }; } + /** + * Returns the total number of records in the database. + * @returns Total record count + */ getTotalCount(): number { const result = this.db .prepare("SELECT COUNT(*) as count FROM names") @@ -110,6 +143,10 @@ export class NameDatabase { return result.count; } + /** + * Loads data from a consolidated data structure into the database. + * @param data - The consolidated data to load + */ loadFromConsolidatedData(data: ConsolidatedData): void { for (const decadeData of data.decades) { this.insertNames(decadeData.decade, "boy", decadeData.boys); @@ -118,14 +155,41 @@ export class NameDatabase { this.logger.debug(`Loaded ${this.getTotalCount()} records from JSON`); } - query(sql: string, params: SQLInputValue[] = []): T[] { + /** + * Executes a SQL query and returns all matching rows. + * @param sql - The SQL query string + * @param params - Query parameters + * @param mapRow - Optional row mapping function + * @returns Array of query results + */ + query( + sql: string, + params: SQLInputValue[] = [], + mapRow?: (row: unknown) => T + ): T[] { const stmt = this.db.prepare(sql); - return stmt.all(...params) as T[]; + const rows = stmt.all(...params) as unknown[]; + return mapRow ? rows.map(mapRow) : (rows as T[]); } - queryOne(sql: string, params: SQLInputValue[] = []): T | undefined { + /** + * Executes a SQL query and returns the first matching row. 
+ * @param sql - The SQL query string + * @param params - Query parameters + * @param mapRow - Optional row mapping function + * @returns The first result or undefined if no match + */ + queryOne( + sql: string, + params: SQLInputValue[] = [], + mapRow?: (row: unknown) => T + ): T | undefined { const stmt = this.db.prepare(sql); - return stmt.get(...params) as T | undefined; + const row = stmt.get(...params) as unknown; + if (row === undefined) { + return undefined; + } + return mapRow ? mapRow(row) : (row as T); } close(): void { @@ -134,17 +198,26 @@ export class NameDatabase { } } -export interface AggregatedNameRow { +export type AggregatedNameRow = { id: number; name: string; count: number; gender: "male" | "female"; -} +}; +/** + * Manages an in-memory SQLite database for aggregated Finnish names data. + * Stores total name counts across all time (not broken down by decade). + * Used for looking up overall name popularity. + */ export class AggregatedNameDatabase { private db: DatabaseSync; private logger: Logger; + /** + * Creates a new AggregatedNameDatabase instance with an in-memory SQLite database. + * @param logger - Logger instance for debug output + */ constructor(logger: Logger) { this.logger = logger; this.logger.debug("Initializing aggregated names SQLite database"); @@ -153,6 +226,9 @@ export class AggregatedNameDatabase { this.logger.debug("Aggregated database schema created"); } + /** + * Creates the database schema for aggregated names. + */ private createSchema(): void { this.db.exec(` CREATE TABLE names ( @@ -167,6 +243,12 @@ export class AggregatedNameDatabase { `); } + /** + * Loads name data from a CSV file. + * Expects CSV with name and count columns, handles thousand separators. 
+ * @param filePath - Path to the CSV file + * @param gender - Gender to assign to all loaded names + */ loadFromCsv(filePath: string, gender: "male" | "female"): void { const content = fs.readFileSync(filePath, "utf-8"); const lines = content.trim().split("\n"); @@ -183,13 +265,19 @@ export class AggregatedNameDatabase { try { for (const line of dataLines) { const [name, countStr] = line.split(","); - if (!name || !countStr) continue; + if (!name || !countStr) { + continue; + } // Parse count with thousand separators like "43.276" or "43,276" const normalizedCount = countStr.replace(/[^\d]/g, ""); - if (!normalizedCount) continue; + if (!normalizedCount) { + continue; + } const count = Number.parseInt(normalizedCount, 10); - if (Number.isNaN(count)) continue; + if (Number.isNaN(count)) { + continue; + } insert.run(name.trim(), count, gender); } @@ -201,6 +289,10 @@ export class AggregatedNameDatabase { this.logger.debug(`Loaded ${gender} names from ${filePath}`); } + /** + * Returns the total number of records in the database. + * @returns Total record count + */ getTotalCount(): number { const result = this.db .prepare("SELECT COUNT(*) as count FROM names") @@ -208,14 +300,41 @@ export class AggregatedNameDatabase { return result.count; } - query(sql: string, params: SQLInputValue[] = []): T[] { + /** + * Executes a SQL query and returns all matching rows. + * @param sql - The SQL query string + * @param params - Query parameters + * @param mapRow - Optional row mapping function + * @returns Array of query results + */ + query( + sql: string, + params: SQLInputValue[] = [], + mapRow?: (row: unknown) => T + ): T[] { const stmt = this.db.prepare(sql); - return stmt.all(...params) as T[]; + const rows = stmt.all(...params) as unknown[]; + return mapRow ? rows.map(mapRow) : (rows as T[]); } - queryOne(sql: string, params: SQLInputValue[] = []): T | undefined { + /** + * Executes a SQL query and returns the first matching row. 
+ * @param sql - The SQL query string + * @param params - Query parameters + * @param mapRow - Optional row mapping function + * @returns The first result or undefined if no match + */ + queryOne( + sql: string, + params: SQLInputValue[] = [], + mapRow?: (row: unknown) => T + ): T | undefined { const stmt = this.db.prepare(sql); - return stmt.get(...params) as T | undefined; + const row = stmt.get(...params) as unknown; + if (row === undefined) { + return undefined; + } + return mapRow ? mapRow(row) : (row as T); } close(): void { diff --git a/src/cli/name-explorer/parse-names.ts b/src/cli/name-explorer/clients/parse-names.ts similarity index 76% rename from src/cli/name-explorer/parse-names.ts rename to src/cli/name-explorer/clients/parse-names.ts index cb37c18..d711be6 100644 --- a/src/cli/name-explorer/parse-names.ts +++ b/src/cli/name-explorer/clients/parse-names.ts @@ -1,30 +1,21 @@ import { JSDOM } from "jsdom"; -export interface NameEntry { +export type NameEntry = { rank: number; name: string; count: number; -} +}; -export interface ParsedNames { +export type ParsedNames = { decade: string; boys: NameEntry[]; girls: NameEntry[]; -} +}; -export function parseNamesHtml(html: string, decade: string): ParsedNames { - const dom = new JSDOM(html); - const tables = dom.window.document.querySelectorAll("table"); - - // First table is boys (Miehet), second is girls (Naiset) - const boys = parseTable(tables[0]); - const girls = parseTable(tables[1]); - - return { decade, boys, girls }; -} - -function parseTable(table: Element | undefined): NameEntry[] { - if (!table) return []; +const parseTable = (table: Element | undefined): NameEntry[] => { + if (!table) { + return []; + } const rows = table.querySelectorAll("tbody tr"); return Array.from(rows).map((row) => { @@ -34,4 +25,15 @@ function parseTable(table: Element | undefined): NameEntry[] { const count = parseInt(cells[2]?.textContent.replace(/\s/g, "") ?? 
"0", 10); return { rank, name, count }; }); -} +}; + +export const parseNamesHtml = (html: string, decade: string): ParsedNames => { + const dom = new JSDOM(html); + const tables = dom.window.document.querySelectorAll("table"); + + // First table is boys (Miehet), second is girls (Naiset) + const boys = parseTable(tables[0]); + const girls = parseTable(tables[1]); + + return { decade, boys, girls }; +}; diff --git a/src/cli/name-explorer/pipeline.ts b/src/cli/name-explorer/clients/pipeline.ts similarity index 95% rename from src/cli/name-explorer/pipeline.ts rename to src/cli/name-explorer/clients/pipeline.ts index e09c6d0..5fc6c37 100644 --- a/src/cli/name-explorer/pipeline.ts +++ b/src/cli/name-explorer/clients/pipeline.ts @@ -1,42 +1,43 @@ import fs from "node:fs/promises"; import path from "node:path"; -import { Fetch } from "../../clients/fetch"; -import type { Logger } from "../../clients/logger"; +import { Fetch } from "~clients/fetch"; +import type { Logger } from "~clients/logger"; + +import { FETCH_DECADES } from "../constants"; import type { ConsolidatedData } from "./database"; import { AggregatedNameDatabase, NameDatabase } from "./database"; -import { FETCH_DECADES } from "./decades"; import type { ParsedNames } from "./parse-names"; import { parseNamesHtml } from "./parse-names"; -export interface NameSuggesterPipelineConfig { +export type NameSuggesterPipelineConfig = { logger: Logger; outputDir: string; refetch?: boolean; -} +}; export type { DecadeData, ConsolidatedData } from "./database"; -export interface FetchDecadePageResult { +export type FetchDecadePageResult = { html: string; markdown: string; parsedNames: ParsedNames; fromCache: boolean; -} +}; -export interface ProcessAllDecadesResult { +export type ProcessAllDecadesResult = { totalPages: number; cachedPages: number; fetchedPages: number; -} +}; -export interface SetupResult { +export type SetupResult = { outputPath: string; totalPages: number; cachedPages: number; fetchedPages: number; 
db: NameDatabase; aggregatedDb: AggregatedNameDatabase | null; -} +}; const BASE_URL = "https://nimipalvelu.dvv.fi/suosituimmat-etunimet"; const REQUEST_DELAY_MS = 500; diff --git a/src/cli/name-explorer/stats-generator.ts b/src/cli/name-explorer/clients/stats-generator.ts similarity index 80% rename from src/cli/name-explorer/stats-generator.ts rename to src/cli/name-explorer/clients/stats-generator.ts index 7ffee30..bfbbde0 100644 --- a/src/cli/name-explorer/stats-generator.ts +++ b/src/cli/name-explorer/clients/stats-generator.ts @@ -1,5 +1,4 @@ -import type { NameDatabase } from "./database"; -import { DECADES } from "./decades"; +import { DECADES, FIRST_DECADE, LAST_DECADE } from "../constants"; import type { AllStats, ChurnMetrics, @@ -15,14 +14,26 @@ import type { SuffixStats, TopName, UnisexName, -} from "./stats-types"; - -const FIRST_DECADE = DECADES[0] ?? "1889"; -const LAST_DECADE = DECADES[DECADES.length - 1] ?? "2020"; +} from "../types"; +import type { NameDatabase } from "./database"; +/** + * Generates statistical analysis from Finnish names data stored in a NameDatabase. + * Computes various metrics including decade stats, top names, name dynamics, + * rank changes, churn metrics, and phonetic analysis. + */ export class StatsGenerator { + /** + * Creates a new StatsGenerator instance. + * @param db - The NameDatabase instance to query for statistics + */ constructor(private db: NameDatabase) {} + /** + * Computes comprehensive statistics for each decade and gender combination. + * Includes birth totals, name counts, top-N concentration, diversity indices, and entropy. + * @returns Array of decade/gender statistics + */ computeDecadeStats(): DecadeGenderStats[] { const results: DecadeGenderStats[] = []; @@ -36,7 +47,9 @@ export class StatsGenerator { const total = totalRow?.total ?? 0; const nameCount = totalRow?.cnt ?? 
0; - if (total === 0) continue; + if (total === 0) { + continue; + } // Top-N concentration const topConcentration = { @@ -71,6 +84,14 @@ export class StatsGenerator { return results; } + /** + * Calculates the share of births for the top N names. + * @param decade - The decade to query + * @param gender - The gender category + * @param n - Number of top names to include + * @param total - Total births for normalization + * @returns Share as a decimal (0-1) + */ private getTopNShare( decade: string, gender: string, @@ -84,6 +105,14 @@ export class StatsGenerator { return total > 0 ? (row?.topSum ?? 0) / total : 0; } + /** + * Calculates how many names are needed to reach a given percentage of births. + * @param decade - The decade to query + * @param gender - The gender category + * @param pct - Target percentage as decimal (e.g., 0.5 for 50%) + * @param total - Total births for the decade/gender + * @returns Number of names needed to reach the percentage + */ private getNamesToReachPct( decade: string, gender: string, @@ -106,6 +135,13 @@ export class StatsGenerator { return rows.length; } + /** + * Computes diversity indices for name distribution. + * @param decade - The decade to query + * @param gender - The gender category + * @param total - Total births for normalization + * @returns Object containing HHI, effective names count, and Shannon entropy + */ private computeDiversityIndices( decade: string, gender: string, @@ -134,6 +170,11 @@ export class StatsGenerator { }; } + /** + * Retrieves the top-ranked names for each decade and gender. + * @param limit - Maximum number of names per decade/gender (default: 10) + * @returns Array of top names with rank, count, and share + */ computeTopNames(limit = 10): TopName[] { const results: TopName[] = []; @@ -170,6 +211,11 @@ export class StatsGenerator { return results; } + /** + * Analyzes the lifecycle dynamics of each name across decades. + * Computes peak decade, longevity, average rank, and rank stability. 
+ * @returns Array of name dynamics with timing and consistency metrics + */ computeNameDynamics(): NameDynamics[] { const rows = this.db.query<{ name: string; @@ -242,13 +288,19 @@ export class StatsGenerator { }); } + /** + * Identifies names with the largest rank changes between consecutive decades. + * @returns Object containing top 20 climbers and top 20 fallers + */ computeRankChanges(): { climbers: RankChange[]; fallers: RankChange[] } { const changes: RankChange[] = []; for (let i = 1; i < DECADES.length; i++) { const fromDecade = DECADES[i - 1]; const toDecade = DECADES[i]; - if (!fromDecade || !toDecade) continue; + if (!fromDecade || !toDecade) { + continue; + } const rows = this.db.query<{ name: string; @@ -289,13 +341,19 @@ export class StatsGenerator { return { climbers, fallers }; } + /** + * Finds names that newly appeared in each decade (not present in the previous decade). + * @returns Array of new entries with their debut decade and initial rank + */ computeNewEntries(): NewEntry[] { const results: NewEntry[] = []; for (let i = 1; i < DECADES.length; i++) { const prevDecade = DECADES[i - 1]; const currDecade = DECADES[i]; - if (!prevDecade || !currDecade) continue; + if (!prevDecade || !currDecade) { + continue; + } const rows = this.db.query<{ name: string; @@ -327,6 +385,10 @@ export class StatsGenerator { return results; } + /** + * Identifies names that returned to the rankings after one or more decades of absence. 
+ * @returns Array of comebacks sorted by gap length (longest gaps first) + */ computeComebacks(): Comeback[] { const results: Comeback[] = []; @@ -359,12 +421,16 @@ export class StatsGenerator { const parts = key.split("|"); const name = parts[0]; const gender = parts[1]; - if (!name || !gender) continue; + if (!name || !gender) { + continue; + } for (let i = 1; i < decadeList.length; i++) { const prevEntry = decadeList[i - 1]; const currEntry = decadeList[i]; - if (!prevEntry || !currEntry) continue; + if (!prevEntry || !currEntry) { + continue; + } const prevIdx = DECADES.indexOf(prevEntry.decade); const currIdx = DECADES.indexOf(currEntry.decade); @@ -386,13 +452,20 @@ export class StatsGenerator { return results.sort((a, b) => b.gapDecades - a.gapDecades); } + /** + * Computes name churn metrics between consecutive decades. + * Measures how much the name pool changes over time. + * @returns Array of churn metrics including new/exited names and Jaccard similarity + */ computeChurnMetrics(): ChurnMetrics[] { const results: ChurnMetrics[] = []; for (let i = 1; i < DECADES.length; i++) { const fromDecade = DECADES[i - 1]; const toDecade = DECADES[i]; - if (!fromDecade || !toDecade) continue; + if (!fromDecade || !toDecade) { + continue; + } for (const gender of ["boy", "girl"] as const) { // Get name sets @@ -444,6 +517,10 @@ export class StatsGenerator { return results; } + /** + * Finds names used for both boys and girls in the same decade. + * @returns Array of unisex names with rankings and counts for each gender + */ computeUnisexNames(): UnisexName[] { return this.db.query(` SELECT @@ -460,6 +537,10 @@ export class StatsGenerator { `); } + /** + * Identifies names that have remained popular across 10 or more decades. 
+ * @returns Array of evergreen names sorted by longevity and average rank + */ computeEvergreenNames(): EvergreenName[] { return this.db.query(` SELECT @@ -475,6 +556,10 @@ export class StatsGenerator { `); } + /** + * Analyzes the distribution of first letters across names. + * @returns Array of letter statistics with counts and shares per decade/gender + */ computeLetterStats(): LetterStats[] { const results: LetterStats[] = []; @@ -520,6 +605,10 @@ export class StatsGenerator { return results; } + /** + * Analyzes the distribution of name endings (suffixes like -nen, -us, -ja). + * @returns Array of suffix statistics with counts and shares per decade/gender + */ computeSuffixStats(): SuffixStats[] { const results: SuffixStats[] = []; @@ -575,6 +664,10 @@ export class StatsGenerator { return results; } + /** + * Computes name length statistics (average, min, max) per decade and gender. + * @returns Array of name length statistics + */ computeNameLengthStats(): NameLengthStats[] { return this.db.query(` SELECT @@ -589,6 +682,10 @@ export class StatsGenerator { `); } + /** + * Analyzes the usage of Finnish special characters (ä, ö) in names. + * @returns Array of special character statistics with shares per decade/gender + */ computeSpecialCharStats(): SpecialCharStats[] { const rows = this.db.query<{ decade: string; @@ -615,6 +712,10 @@ export class StatsGenerator { })); } + /** + * Computes all available statistics in a single call. 
+ * @returns Comprehensive statistics object containing all metrics + */ computeAll(): AllStats { const decadeStats = this.computeDecadeStats(); const topNames = this.computeTopNames(10); diff --git a/src/cli/name-explorer/stats-page-generator.ts b/src/cli/name-explorer/clients/stats-page-generator.ts similarity index 99% rename from src/cli/name-explorer/stats-page-generator.ts rename to src/cli/name-explorer/clients/stats-page-generator.ts index 4ce5c39..7ae2c53 100644 --- a/src/cli/name-explorer/stats-page-generator.ts +++ b/src/cli/name-explorer/clients/stats-page-generator.ts @@ -1,9 +1,10 @@ -import type { Logger } from "../../clients/logger"; -import type { AllStats, LetterStats, TopName, UnisexName } from "./stats-types"; +import type { Logger } from "~clients/logger"; -export interface StatsPageGeneratorConfig { +import type { AllStats, LetterStats, TopName, UnisexName } from "../types"; + +export type StatsPageGeneratorConfig = { logger: Logger; -} +}; export class StatsPageGenerator { private logger: Logger; @@ -786,7 +787,9 @@ details > div { padding: 1rem; } ${decades .map((decade) => { const names = unisexByDecade.get(decade) ?? []; - if (names.length === 0) return ""; + if (names.length === 0) { + return ""; + } return `
diff --git a/src/cli/name-explorer/decades.ts b/src/cli/name-explorer/constants.ts similarity index 67% rename from src/cli/name-explorer/decades.ts rename to src/cli/name-explorer/constants.ts index 518ac63..5307486 100644 --- a/src/cli/name-explorer/decades.ts +++ b/src/cli/name-explorer/constants.ts @@ -16,3 +16,6 @@ export const DECADES: string[] = [ ]; export const FETCH_DECADES: string[] = DECADES.slice().reverse(); + +export const FIRST_DECADE = DECADES[0] ?? "1889"; +export const LAST_DECADE = DECADES[DECADES.length - 1] ?? "2020"; diff --git a/src/cli/name-explorer/main.ts b/src/cli/name-explorer/main.ts index fd0c2c0..8d95b94 100644 --- a/src/cli/name-explorer/main.ts +++ b/src/cli/name-explorer/main.ts @@ -2,17 +2,22 @@ // pnpm run:name-explorer --mode ai import "dotenv/config"; + import { writeFile } from "fs/promises"; -import { z } from "zod"; import { Agent, MemorySession, Runner } from "@openai/agents"; -import { Logger } from "../../clients/logger"; -import { NameSuggesterPipeline } from "./pipeline"; -import { StatsGenerator } from "./stats-generator"; -import { StatsPageGenerator } from "./stats-page-generator"; -import { createFetchNameTool } from "./fetch-name-tool"; -import { createAggregatedSqlQueryTool, createSqlQueryTool } from "./sql-tool"; -import { parseArgs } from "../../utils/parse-args"; -import { QuestionHandler } from "../../utils/question-handler"; +import { Logger } from "~clients/logger"; +import { parseArgs } from "~utils/parse-args"; +import { QuestionHandler } from "~utils/question-handler"; +import { z } from "zod"; + +import { NameSuggesterPipeline } from "./clients/pipeline"; +import { StatsGenerator } from "./clients/stats-generator"; +import { StatsPageGenerator } from "./clients/stats-page-generator"; +import { createFetchNameTool } from "./tools/fetch-name-tool"; +import { + createAggregatedSqlQueryTool, + createSqlQueryTool, +} from "./tools/sql-tool"; import { NameSuggesterOutputSchema, NameSuggesterOutputTypeSchema, 
@@ -38,18 +43,8 @@ const pipeline = new NameSuggesterPipeline({ const { db, aggregatedDb } = await pipeline.setup(); -// --- Run selected mode --- -if (mode === "stats") { - await runStatsMode(); -} else { - await runAiMode(); -} - -db.close(); -aggregatedDb?.close(); - // --- Stats Mode: Generate HTML statistics page --- -async function runStatsMode() { +const runStatsMode = async () => { logger.info("Computing statistics..."); const statsGenerator = new StatsGenerator(db); const stats = statsGenerator.computeAll(); @@ -61,10 +56,10 @@ async function runStatsMode() { const outputPath = "tmp/name-explorer/statistics.html"; await writeFile(outputPath, html, "utf-8"); logger.info(`Statistics page written to ${outputPath}`); -} +}; // --- AI Mode: Interactive Q&A with SQL agent --- -async function runAiMode() { +const runAiMode = async () => { logger.info("Starting AI mode..."); const tools = [ @@ -109,7 +104,9 @@ When answering, do not include any questions. Do not include markdown or extra k runner.on("agent_tool_start", (_context, _agent, tool, details) => { const toolCall = details.toolCall as Record; const callId = toolCall.id as string; - if (toolsInProgress.has(callId)) return; + if (toolsInProgress.has(callId)) { + return; + } toolsInProgress.add(callId); const args = String(toolCall.arguments); @@ -160,4 +157,14 @@ When answering, do not include any questions. 
Do not include markdown or extra k logger.answer(output.content); break; } +}; + +// --- Run selected mode --- +if (mode === "stats") { + await runStatsMode(); +} else { + await runAiMode(); } + +db.close(); +aggregatedDb?.close(); diff --git a/src/cli/name-explorer/fetch-name-tool.ts b/src/cli/name-explorer/tools/fetch-name-tool.ts similarity index 88% rename from src/cli/name-explorer/fetch-name-tool.ts rename to src/cli/name-explorer/tools/fetch-name-tool.ts index 62b4b31..7420de8 100644 --- a/src/cli/name-explorer/fetch-name-tool.ts +++ b/src/cli/name-explorer/tools/fetch-name-tool.ts @@ -1,14 +1,14 @@ import fs from "node:fs/promises"; import path from "node:path"; import { tool } from "@openai/agents"; +import { resolveAndValidateUrl } from "~tools/utils/url-safety"; import { JSDOM } from "jsdom"; import { z } from "zod"; -import { resolveAndValidateUrl } from "../../tools/utils/url-safety"; /** * Statistics for a single decade row */ -export interface NameStatRow { +export type NameStatRow = { decade: string; men: number | null; women: number | null; @@ -16,12 +16,12 @@ export interface NameStatRow { menUnder5?: boolean; womenUnder5?: boolean; totalUnder5?: boolean; -} +}; /** * Complete result from fetching name statistics */ -export interface NameStatResult { +export type NameStatResult = { name: string; rows: NameStatRow[]; totals: { @@ -33,20 +33,20 @@ export interface NameStatResult { totalUnder5?: boolean; }; fetchedAt: string; -} +}; /** * Error result when fetch fails */ -export interface NameStatError { +export type NameStatError = { error: string; name: string; -} +}; -interface ParsedValue { +type ParsedValue = { value: number | null; isUnder5: boolean; -} +}; const DVV_BASE_URL = "https://nimipalvelu.dvv.fi/etunimihaku"; const USER_AGENT = "cli-agent-sandbox/1.0"; @@ -59,7 +59,7 @@ const DEFAULT_TIMEOUT_MS = 15000; * - "alle X" (under X, privacy-protected) -> returns null with flag * - "0" -> 0 */ -function parseTableValue(rawValue: string): 
ParsedValue { +const parseTableValue = (rawValue: string): ParsedValue => { const trimmed = rawValue.trim(); // Handle "alle X" (Finnish for "under X") - privacy protection for small counts @@ -90,12 +90,12 @@ function parseTableValue(rawValue: string): ParsedValue { } return { value: parsed, isUnder5: false }; -} +}; /** * Parse a single table row into a NameStatRow */ -function parseTableRow(cells: Element[]): NameStatRow | null { +const parseTableRow = (cells: Element[]): NameStatRow | null => { const [decadeCell, menCell, womenCell, totalCell] = cells; if (!decadeCell || !menCell || !womenCell || !totalCell) { return null; @@ -113,20 +113,26 @@ function parseTableRow(cells: Element[]): NameStatRow | null { total: totalParsed.value, }; - if (menParsed.isUnder5) row.menUnder5 = true; - if (womenParsed.isUnder5) row.womenUnder5 = true; - if (totalParsed.isUnder5) row.totalUnder5 = true; + if (menParsed.isUnder5) { + row.menUnder5 = true; + } + if (womenParsed.isUnder5) { + row.womenUnder5 = true; + } + if (totalParsed.isUnder5) { + row.totalUnder5 = true; + } return row; -} +}; /** * Extract name statistics from the DVV HTML page */ -function extractNameStatistics( +const extractNameStatistics = ( html: string, name: string -): NameStatResult | NameStatError { +): NameStatResult | NameStatError => { const dom = new JSDOM(html); const document = dom.window.document; @@ -199,9 +205,15 @@ function extractNameStatistics( total: totalParsed.value, }; - if (menParsed.isUnder5) totals.menUnder5 = true; - if (womenParsed.isUnder5) totals.womenUnder5 = true; - if (totalParsed.isUnder5) totals.totalUnder5 = true; + if (menParsed.isUnder5) { + totals.menUnder5 = true; + } + if (womenParsed.isUnder5) { + totals.womenUnder5 = true; + } + if (totalParsed.isUnder5) { + totals.totalUnder5 = true; + } } } @@ -211,27 +223,27 @@ function extractNameStatistics( totals, fetchedAt: new Date().toISOString(), }; -} +}; -async function fileExists(filePath: string): Promise { +const 
fileExists = async (filePath: string): Promise => { try { await fs.access(filePath); return true; } catch { return false; } -} +}; -export interface FetchNameToolOptions { +export type FetchNameToolOptions = { cacheDir: string; refetch?: boolean; maxRequests?: number; -} +}; /** * Create a tool for fetching individual name statistics from DVV */ -export function createFetchNameTool(options: FetchNameToolOptions) { +export const createFetchNameTool = (options: FetchNameToolOptions) => { const { cacheDir, refetch = false, maxRequests = 3 } = options; let requestCount = 0; @@ -290,7 +302,7 @@ For aggregate statistics across top 100 names per decade, use the SQL database t const validation = await resolveAndValidateUrl(url); if (!validation.valid) { return JSON.stringify({ - error: `URL validation failed: ${validation.error}`, + error: `URL validation failed: ${validation.error ?? "Unknown error"}`, name: normalizedName, }); } @@ -298,10 +310,9 @@ For aggregate statistics across top 100 names per decade, use the SQL database t // Fetch the page try { const controller = new AbortController(); - const timeoutId = setTimeout( - () => controller.abort(), - DEFAULT_TIMEOUT_MS - ); + const timeoutId = setTimeout(() => { + controller.abort(); + }, DEFAULT_TIMEOUT_MS); const response = await fetch(url, { method: "GET", @@ -352,4 +363,4 @@ For aggregate statistics across top 100 names per decade, use the SQL database t } }, }); -} +}; diff --git a/src/cli/name-explorer/sql-tool.ts b/src/cli/name-explorer/tools/sql-tool.ts similarity index 90% rename from src/cli/name-explorer/sql-tool.ts rename to src/cli/name-explorer/tools/sql-tool.ts index d3e2d27..ac465a2 100644 --- a/src/cli/name-explorer/sql-tool.ts +++ b/src/cli/name-explorer/tools/sql-tool.ts @@ -1,6 +1,7 @@ import { tool } from "@openai/agents"; import { z } from "zod"; -import type { AggregatedNameDatabase, NameDatabase } from "./database"; + +import type { AggregatedNameDatabase, NameDatabase } from 
"../clients/database"; const DANGEROUS_KEYWORDS = [ "DROP", @@ -14,10 +15,12 @@ const DANGEROUS_KEYWORDS = [ "EXECUTE", ]; -function validateReadOnlyQuery(sql: string): { +const validateReadOnlyQuery = ( + sql: string +): { valid: boolean; error?: string; -} { +} => { const trimmedSql = sql.trim(); // Must start with SELECT @@ -39,9 +42,9 @@ function validateReadOnlyQuery(sql: string): { } return { valid: true }; -} +}; -export function createSqlQueryTool(db: NameDatabase) { +export const createSqlQueryTool = (db: NameDatabase) => { return tool({ name: "query_names_database", description: `Execute a read-only SQL query against the Finnish names database (decade-based data). @@ -68,9 +71,9 @@ Example queries: } }, }); -} +}; -export function createAggregatedSqlQueryTool(db: AggregatedNameDatabase) { +export const createAggregatedSqlQueryTool = (db: AggregatedNameDatabase) => { return tool({ name: "query_aggregated_names", description: `Execute a read-only SQL query against the aggregated Finnish names database (total counts across all time). 
@@ -98,4 +101,4 @@ Example queries: } }, }); -} +}; diff --git a/src/cli/name-explorer/types.ts b/src/cli/name-explorer/types/ai-output.ts similarity index 100% rename from src/cli/name-explorer/types.ts rename to src/cli/name-explorer/types/ai-output.ts diff --git a/src/cli/name-explorer/types/index.ts b/src/cli/name-explorer/types/index.ts new file mode 100644 index 0000000..e89da03 --- /dev/null +++ b/src/cli/name-explorer/types/index.ts @@ -0,0 +1,2 @@ +export * from "./ai-output"; +export type * from "./stats"; diff --git a/src/cli/name-explorer/stats-types.ts b/src/cli/name-explorer/types/stats.ts similarity index 84% rename from src/cli/name-explorer/stats-types.ts rename to src/cli/name-explorer/types/stats.ts index e713fac..73f394e 100644 --- a/src/cli/name-explorer/stats-types.ts +++ b/src/cli/name-explorer/types/stats.ts @@ -1,6 +1,6 @@ // TypeScript interfaces for name statistics -export interface DecadeGenderStats { +export type DecadeGenderStats = { decade: string; gender: "boy" | "girl"; totalBirths: number; @@ -17,18 +17,18 @@ export interface DecadeGenderStats { hhi: number; effectiveNames: number; entropy: number; -} +}; -export interface TopName { +export type TopName = { decade: string; gender: "boy" | "girl"; rank: number; name: string; count: number; share: number; -} +}; -export interface NameDynamics { +export type NameDynamics = { name: string; gender: "boy" | "girl"; peakDecade: string; @@ -39,9 +39,9 @@ export interface NameDynamics { longevity: number; avgRank: number; rankStddev: number; -} +}; -export interface RankChange { +export type RankChange = { name: string; gender: "boy" | "girl"; fromDecade: string; @@ -49,26 +49,26 @@ export interface RankChange { fromRank: number; toRank: number; change: number; -} +}; -export interface NewEntry { +export type NewEntry = { name: string; gender: "boy" | "girl"; decade: string; rank: number; count: number; -} +}; -export interface Comeback { +export type Comeback = { name: string; gender: 
"boy" | "girl"; comebackDecade: string; previousDecade: string; gapDecades: number; comebackRank: number; -} +}; -export interface ChurnMetrics { +export type ChurnMetrics = { fromDecade: string; toDecade: string; gender: "boy" | "girl"; @@ -76,52 +76,52 @@ export interface ChurnMetrics { newNames: number; exitedNames: number; jaccardSimilarity: number; -} +}; -export interface UnisexName { +export type UnisexName = { name: string; decade: string; boyRank: number; girlRank: number; boyCount: number; girlCount: number; -} +}; -export interface EvergreenName { +export type EvergreenName = { name: string; gender: "boy" | "girl"; decadesPresent: number; avgRank: number; totalCount: number; -} +}; -export interface LetterStats { +export type LetterStats = { decade: string; gender: "boy" | "girl"; letter: string; nameCount: number; totalBirths: number; share: number; -} +}; -export interface SuffixStats { +export type SuffixStats = { decade: string; gender: "boy" | "girl"; suffix: string; nameCount: number; totalBirths: number; share: number; -} +}; -export interface NameLengthStats { +export type NameLengthStats = { decade: string; gender: "boy" | "girl"; avgLength: number; minLength: number; maxLength: number; -} +}; -export interface SpecialCharStats { +export type SpecialCharStats = { decade: string; gender: "boy" | "girl"; namesWithUmlautA: number; @@ -129,9 +129,9 @@ export interface SpecialCharStats { totalNames: number; umlautAShare: number; umlautOShare: number; -} +}; -export interface AllStats { +export type AllStats = { generatedAt: string; dataSource: string; decadeRange: { first: string; last: string }; @@ -152,4 +152,4 @@ export interface AllStats { suffixStats: SuffixStats[]; nameLengthStats: NameLengthStats[]; specialCharStats: SpecialCharStats[]; -} +}; diff --git a/src/clients/publication-pipeline.ts b/src/cli/scrape-publications/clients/publication-pipeline.ts similarity index 85% rename from src/clients/publication-pipeline.ts rename to 
src/cli/scrape-publications/clients/publication-pipeline.ts index 755bacc..be8959f 100644 --- a/src/clients/publication-pipeline.ts +++ b/src/cli/scrape-publications/clients/publication-pipeline.ts @@ -1,56 +1,57 @@ +import crypto from "node:crypto"; import fs from "node:fs/promises"; import path from "node:path"; -import crypto from "node:crypto"; -import slug from "slug"; +import { Fetch } from "~clients/fetch"; +import type { Logger } from "~clients/logger"; +import { PlaywrightScraper } from "~clients/playwright-scraper"; import { NodeHtmlMarkdown } from "node-html-markdown"; +import slug from "slug"; import type { z } from "zod"; -import { Fetch } from "./fetch"; -import { PlaywrightScraper } from "./playwright-scraper"; -import { PublicationScraper } from "./publication-scraper"; -import { ReviewPageGenerator } from "./review-page-generator"; -import type { Logger } from "./logger"; + import type { - PublicationLink, LinkCandidate, - SelectorResult, Publication, + PublicationLink, + SelectorResult, } from "../types/index"; +import { PublicationScraper } from "./publication-scraper"; +import { ReviewPageGenerator } from "./review-page-generator"; export type FetchSource = "playwright" | "basic-fetch"; -export interface PublicationPipelineConfig { +export type PublicationPipelineConfig = { logger: Logger; outputDir: string; refetch?: boolean; -} +}; -export interface FetchSourceResult { +export type FetchSourceResult = { markdown: string; html: string; fromCache: { markdown: boolean; html: boolean }; source: FetchSource; -} +}; -export interface DiscoverLinksResult { +export type DiscoverLinksResult = { allLinks: string[]; filteredLinks: string[]; linkCandidates: z.infer[]; source: FetchSource; usedFallback: boolean; -} +}; -export interface IdentifyAndExtractResult { +export type IdentifyAndExtractResult = { selectors: z.infer; publications: z.infer[]; -} +}; -export interface FetchPublicationsResult { +export type FetchPublicationsResult = { fetchedCount: 
number; skippedCount: number; markdownCount: number; -} +}; -export interface ExtractContentResult { +export type ExtractContentResult = { publications: z.infer[]; report: { total: number; @@ -58,10 +59,15 @@ export interface ExtractContentResult { failed: number; results: { success: boolean; filename: string; error?: string }[]; }; -} +}; const MAX_TITLE_SLUG_LENGTH = 80; +/** + * Orchestrates the full publication scraping pipeline. + * Handles fetching source content, discovering links, extracting metadata, + * fetching individual publications, and generating review pages. + */ export class PublicationPipeline { private logger: Logger; private outputDir: string; @@ -72,6 +78,10 @@ export class PublicationPipeline { private reviewGenerator: ReviewPageGenerator; private htmlToMarkdown: NodeHtmlMarkdown; + /** + * Creates a new PublicationPipeline instance. + * @param config - Configuration with logger, output directory, and refetch flag + */ constructor(config: PublicationPipelineConfig) { this.logger = config.logger; this.outputDir = config.outputDir; @@ -83,6 +93,11 @@ export class PublicationPipeline { this.htmlToMarkdown = new NodeHtmlMarkdown(); } + /** + * Checks if a file exists at the given path. + * @param filePath - Path to check + * @returns True if file exists, false otherwise + */ private async fileExists(filePath: string): Promise { try { await fs.access(filePath); @@ -92,6 +107,11 @@ export class PublicationPipeline { } } + /** + * Converts a title to a URL-safe slug. + * @param title - The title to convert + * @returns Slugified title, truncated to MAX_TITLE_SLUG_LENGTH + */ private titleToSlug(title: string): string { const baseSlug = slug(title, { lower: true }); const trimmedSlug = baseSlug.replace(/^-+|-+$/g, ""); @@ -109,10 +129,21 @@ export class PublicationPipeline { return shortened || "untitled-publication"; } + /** + * Generates a short hash from a URL for disambiguation. 
+ * @param url - The URL to hash + * @returns First 8 characters of SHA-256 hash + */ private urlToShortHash(url: string): string { return crypto.createHash("sha256").update(url).digest("hex").slice(0, 8); } + /** + * Fetches source content from a URL using Playwright or basic HTTP. + * Caches results to avoid refetching unless explicitly requested. + * @param options - Target URL and optional source override + * @returns Fetched content with cache status and source used + */ async fetchSourceContent({ targetUrl, forceSource, @@ -171,6 +202,12 @@ export class PublicationPipeline { return { markdown, html, fromCache, source }; } + /** + * Discovers and filters links from HTML content. + * Falls back to basic fetch if Playwright finds no candidates. + * @param options - HTML content, target URL, and optional filter substring + * @returns Discovered links, filtered links, and link candidates + */ async discoverLinks({ html, targetUrl, @@ -281,6 +318,11 @@ export class PublicationPipeline { }; } + /** + * Identifies CSS selectors and extracts publication metadata. + * @param options - Link candidates to analyze + * @returns Identified selectors and extracted publications + */ async identifyAndExtractMetadata({ linkCandidates, }: { @@ -318,6 +360,12 @@ export class PublicationPipeline { return { selectors, publications }; } + /** + * Fetches HTML pages for each publication and converts to markdown. + * Skips already-cached pages unless refetch is enabled. + * @param options - Publications to fetch + * @returns Fetch statistics (fetched, skipped, markdown counts) + */ async fetchPublicationPages({ publications, }: { @@ -412,6 +460,12 @@ export class PublicationPipeline { return { fetchedCount, skippedCount, markdownCount }; } + /** + * Extracts main content from fetched publication HTML files. + * Uses AI to identify content selectors and converts to markdown. 
+ * @param options - Publications to extract content from + * @returns Extracted publications with content and extraction report + */ async extractPublicationContent({ publications, }: { @@ -549,6 +603,11 @@ export class PublicationPipeline { return { publications: publicationsWithContent, report }; } + /** + * Generates an HTML review page for extracted publications. + * @param options - Publications and source URL + * @returns Path to the generated review.html file + */ async generateReviewPage({ publications, targetUrl, @@ -567,6 +626,9 @@ export class PublicationPipeline { return reviewPath; } + /** + * Closes all resources including the Playwright browser. + */ async close(): Promise { await this.playwrightScraper.close(); } diff --git a/src/clients/publication-scraper.ts b/src/cli/scrape-publications/clients/publication-scraper.ts similarity index 79% rename from src/clients/publication-scraper.ts rename to src/cli/scrape-publications/clients/publication-scraper.ts index bd2937d..d2bf715 100644 --- a/src/clients/publication-scraper.ts +++ b/src/cli/scrape-publications/clients/publication-scraper.ts @@ -1,30 +1,39 @@ +import { Agent, run } from "@openai/agents"; +import type { Logger } from "~clients/logger"; import { JSDOM } from "jsdom"; import { NodeHtmlMarkdown } from "node-html-markdown"; -import { Agent, run } from "@openai/agents"; import type { z } from "zod"; + import { + ContentSelectorResult, PublicationLink, SelectorResult, - ContentSelectorResult, } from "../types/index"; import type { LinkCandidate } from "../types/index"; -import type { Logger } from "./logger"; type SelectorAgent = Agent; type ContentSelectorAgent = Agent; -export interface PublicationScraperConfig { +export type PublicationScraperConfig = { logger: Logger; selectorAgent?: SelectorAgent; contentSelectorAgent?: ContentSelectorAgent; -} +}; +/** + * Scrapes publication data from HTML pages using AI-powered CSS selector identification. 
+ * Extracts titles, dates, and content from publication listing pages. + */ export class PublicationScraper { private logger: Logger; private selectorAgent: SelectorAgent; private contentSelectorAgent: ContentSelectorAgent; private htmlToMarkdown: NodeHtmlMarkdown; + /** + * Creates a new PublicationScraper instance. + * @param config - Configuration with logger and optional custom agents + */ constructor(config: PublicationScraperConfig) { this.logger = config.logger; this.selectorAgent = config.selectorAgent ?? this.createSelectorAgent(); @@ -33,6 +42,10 @@ export class PublicationScraper { this.htmlToMarkdown = new NodeHtmlMarkdown(); } + /** + * Creates the default AI agent for identifying CSS selectors in publication listings. + * @returns Configured selector agent + */ private createSelectorAgent(): SelectorAgent { return new Agent({ name: "SelectorAnalyzer", @@ -63,6 +76,10 @@ Do not include any explanation or markdown - only the JSON object.`, }); } + /** + * Creates the default AI agent for identifying main content selectors. + * @returns Configured content selector agent + */ private createContentSelectorAgent(): ContentSelectorAgent { return new Agent({ name: "ContentSelectorAnalyzer", @@ -90,6 +107,11 @@ IMPORTANT: Respond with ONLY a valid JSON object: }); } + /** + * Parses various date formats to ISO format (YYYY-MM-DD). + * @param rawDate - Raw date string in various formats + * @returns ISO date string or undefined if parsing fails + */ private parseToIsoDate(rawDate: string): string | undefined { // Already ISO format if (/^\d{4}-\d{2}-\d{2}$/.test(rawDate)) { @@ -119,6 +141,11 @@ IMPORTANT: Respond with ONLY a valid JSON object: return undefined; } + /** + * Finds the parent container element for an anchor (article card, list item, etc.). 
+ * @param anchor - The anchor element to find a container for + * @returns The container element or the anchor's parent as fallback + */ private findParentContainer(anchor: Element): Element { const containerTags = ["LI", "ARTICLE", "DIV", "SECTION", "TR", "DD"]; let container: Element | null = anchor.parentElement; @@ -138,7 +165,9 @@ IMPORTANT: Respond with ONLY a valid JSON object: private getStructureSignature(html: string): string { const dom = new JSDOM(html); const root = dom.window.document.body.firstElementChild; - if (!root) return "unknown"; + if (!root) { + return "unknown"; + } const tag = root.tagName.toLowerCase(); const hasImage = !!root.querySelector("img"); @@ -156,10 +185,18 @@ IMPORTANT: Respond with ONLY a valid JSON object: */ private scoreStructureSignature(signature: string): number { let score = 0; - if (signature.includes("h=true")) score += 10; // Has heading - strong signal - if (signature.includes("img=true")) score += 5; // Has image - if (signature.includes("date=true")) score += 5; // Has date - if (signature.startsWith("article:")) score += 5; // Semantic article tag + if (signature.includes("h=true")) { + score += 10; + } // Has heading - strong signal + if (signature.includes("img=true")) { + score += 5; + } // Has image + if (signature.includes("date=true")) { + score += 5; + } // Has date + if (signature.startsWith("article:")) { + score += 5; + } // Semantic article tag return score; } @@ -231,7 +268,9 @@ IMPORTANT: Respond with ONLY a valid JSON object: const anchors = doc.querySelectorAll("a[href]"); for (const anchor of anchors) { const href = anchor.getAttribute("href"); - if (!href) continue; + if (!href) { + continue; + } // Check if the href matches (could be relative or absolute) if ( @@ -263,39 +302,54 @@ IMPORTANT: Respond with ONLY a valid JSON object: if (targetAnchor) { const titleElement = targetAnchor.querySelector(selectors.titleSelector); let title = titleElement?.textContent.trim(); - if (title && 
title.length > 3) return this.cleanTitle(title); + if (title && title.length > 3) { + return this.cleanTitle(title); + } // Strategy 2: Anchor title attribute const anchorTitle = targetAnchor.getAttribute("title")?.trim(); - if (anchorTitle && anchorTitle.length > 3) + if (anchorTitle && anchorTitle.length > 3) { return this.cleanTitle(anchorTitle); + } // Strategy 3: Heading inside the anchor (h1-h6) const heading = targetAnchor.querySelector("h1, h2, h3, h4, h5, h6"); title = heading?.textContent.trim(); - if (title && title.length > 3) return this.cleanTitle(title); + if (title && title.length > 3) { + return this.cleanTitle(title); + } // Strategy 4: Direct anchor text title = targetAnchor.textContent.trim(); - if (title && title.length > 3) return this.cleanTitle(title); + if (title && title.length > 3) { + return this.cleanTitle(title); + } } // Fallback: Try document-level selectors if no target anchor found const titleElement = doc.querySelector(selectors.titleSelector); let title = titleElement?.textContent.trim(); - if (title && title.length > 3) return this.cleanTitle(title); + if (title && title.length > 3) { + return this.cleanTitle(title); + } const anchor = doc.querySelector("a[title]"); title = anchor?.getAttribute("title")?.trim(); - if (title && title.length > 3) return this.cleanTitle(title); + if (title && title.length > 3) { + return this.cleanTitle(title); + } const heading = doc.querySelector("a h1, a h2, a h3, a h4, a h5, a h6"); title = heading?.textContent.trim(); - if (title && title.length > 3) return this.cleanTitle(title); + if (title && title.length > 3) { + return this.cleanTitle(title); + } const mainAnchor = doc.querySelector("a[href]"); title = mainAnchor?.textContent.trim(); - if (title && title.length > 3) return this.cleanTitle(title); + if (title && title.length > 3) { + return this.cleanTitle(title); + } return null; } @@ -304,7 +358,9 @@ IMPORTANT: Respond with ONLY a valid JSON object: * Parses a date from an element, 
checking datetime attribute first, then text content. */ private parseDateFromElement(el: Element | null): string | undefined { - if (!el) return undefined; + if (!el) { + return undefined; + } const raw = el.getAttribute("datetime") ?? el.textContent.trim(); return raw ? this.parseToIsoDate(raw) : undefined; } @@ -330,40 +386,58 @@ IMPORTANT: Respond with ONLY a valid JSON object: if (selectors.dateSelector) { const dateEl = targetAnchor.querySelector(selectors.dateSelector); const date = this.parseDateFromElement(dateEl); - if (date) return date; + if (date) { + return date; + } } // Strategy 2: