diff --git a/.gitignore b/.gitignore index 880342f..d8133af 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,7 @@ agent/downloads/* tmp/.cache/.chrome-pid # Chrome PID -.chrome-pid \ No newline at end of file +.chrome-pid + +# Per-skill local Claude Code settings (contains user-specific permission approvals) +skills/*/.claude/settings.local.json diff --git a/skills/company-research/.gitignore b/skills/company-research/.gitignore new file mode 100644 index 0000000..d4fcb2d --- /dev/null +++ b/skills/company-research/.gitignore @@ -0,0 +1,2 @@ +profiles/*.json +!profiles/example.json diff --git a/skills/company-research/SKILL.md b/skills/company-research/SKILL.md new file mode 100644 index 0000000..84f7cfd --- /dev/null +++ b/skills/company-research/SKILL.md @@ -0,0 +1,214 @@ +--- +name: company-research +description: | + Company discovery and deep research skill. Researches a company's product and ICP, + discovers target companies to sell to using Browserbase Search API, deeply researches + each using a Plan→Research→Synthesize pattern, and scores ICP fit — compiled into + a scored research report and CSV. Supports depth modes (quick/deep/deeper) for + balancing scale vs intelligence. + Use when the user wants to: (1) find companies to sell to, (2) research potential + customers, (3) discover companies matching an ICP, (4) build a target company list, + (5) do market research on prospects. Triggers: "find companies to sell to", + "company research", "find prospects", "ICP research", "target companies", + "who should we sell to", "market research", "lead research", "prospect list". +license: MIT +compatibility: Requires bb CLI (@browserbasehq/cli) and BROWSERBASE_API_KEY env var +allowed-tools: Bash Agent +metadata: + author: browserbase + version: "1.1.0" +--- + +# Company Research + +Discover and deeply research companies to sell to. 
Uses Browserbase Search API for discovery and a Plan→Research→Synthesize pattern for deep enrichment — outputting a scored research report and CSV. + +**Required**: `BROWSERBASE_API_KEY` env var and `bb` CLI installed. + +**First-run setup**: On the first run you'll be prompted to approve `bb fetch`, `bb search`, `cat`, `mkdir`, `sed`, etc. Select **"Yes, and don't ask again for: bb fetch:\*"** (or equivalent) for each to auto-approve for the session. To permanently approve, add these to your `~/.claude/settings.json` under `permissions.allow`: +```json +"Bash(bb:*)", "Bash(bunx:*)", "Bash(bun:*)", "Bash(node:*)", +"Bash(cat:*)", "Bash(mkdir:*)", "Bash(sed:*)", "Bash(head:*)", "Bash(tr:*)", "Bash(rm:*)" +``` + +**Path rules**: Always use the full literal path in all Bash commands — NOT `~` or `$HOME` (both trigger "shell expansion syntax" approval prompts). Resolve the home directory once and use it everywhere. When constructing subagent prompts, replace `{SKILL_DIR}` with the full literal path. + +**Output directory**: All research output goes to `~/Desktop/{company_slug}_research_{YYYY-MM-DD}/`. This directory contains one `.md` file per researched company plus a final `.csv`. The user gets both the scored spreadsheet and the full research files on their Desktop. + +**CRITICAL — Tool restrictions (applies to main agent AND all subagents)**: +- All web searches: use `bb search`. NEVER use WebSearch. +- All page content extraction: use `node {SKILL_DIR}/scripts/extract_page.mjs "<url>"`. This script fetches via `bb fetch`, parses title + meta tags + visible body text, and automatically falls back to `bb browse` when the page is JS-rendered or over 1MB. NEVER hand-roll a `bb fetch | sed` pipeline — it silently strips meta tags and doesn't handle the JSON envelope. NEVER use WebFetch. +- All research output: subagents write **one markdown file per company** to `{OUTPUT_DIR}/{company-slug}.md` using bash heredoc. NEVER use the Write tool or `python3 -c`.
See `references/example-research.md` for the file format. +- Report + CSV compilation: use `node {SKILL_DIR}/scripts/compile_report.mjs {OUTPUT_DIR} --open` — generates HTML report and CSV in one step, opens overview in browser. +- URL deduplication: use `node {SKILL_DIR}/scripts/list_urls.mjs /tmp` after discovery. +- **Subagents must use ONLY the Bash tool. No other tools allowed.** +- **Main agent NEVER reads raw discovery JSON batch files.** Use `list_urls.mjs` for dedup. + +**CRITICAL — Anti-hallucination rules (applies to main agent AND all subagents)**: +- NEVER infer `product_description`, `industry`, or `target_audience` from a site's fonts, framework (Framer/Next.js/React), design system, or typography. These are cosmetic and say nothing about what the company sells. +- NEVER let the user's own ICP leak into a target's description. If you don't know what the target does, write `Unknown` — do not pattern-match them onto the ICP. +- `product_description` MUST quote or paraphrase a specific phrase from `extract_page.mjs` output (TITLE, META_DESCRIPTION, OG_DESCRIPTION, HEADINGS, or BODY). If none of those fields yield a recognizable product statement, write `Unknown — homepage content not accessible`. +- If `product_description` is `Unknown`, cap `icp_fit_score` at 3 and set `icp_fit_reasoning` to `Insufficient evidence — homepage returned no readable content`. + +**CRITICAL — Minimize permission prompts**: +- Subagents MUST batch ALL file writes into a SINGLE Bash call using chained heredocs. One Bash call = one permission prompt. +- Batch ALL searches and ALL fetches into single Bash calls using `&&` chaining. + +## Pipeline Overview + +Follow these 5 steps in order. Do not skip steps or reorder. + +1. **Company Research** — Deeply understand the user's company, product, and who they sell to +2. **Depth Mode Selection** — Choose research depth based on how many targets they want +3. **Discovery** — Find target companies using diverse search queries +4. 
**Deep Research & Scoring** — Research each company, score ICP fit +5. **Report & CSV** — Present findings, compile scored CSV + +--- + +## Step 0: Setup Output Directory + +Before starting, create the output directory on the user's Desktop: + +```bash +OUTPUT_DIR=~/Desktop/{company_slug}_research_{YYYY-MM-DD} +mkdir -p "$OUTPUT_DIR" +``` + +Replace `{company_slug}` with the user's company name (lowercase, hyphenated) and `{YYYY-MM-DD}` with today's date. Pass `{OUTPUT_DIR}` (as a full literal path, not with `~`) to all subagent prompts so they write research files there. + +Also clean up discovery batch files from prior runs: +```bash +rm -f /tmp/company_discovery_batch_*.json +``` + +## Step 1: Deep Company Research + +This is the most important step. The quality of everything downstream depends on deeply understanding the user's company. + +1. Ask the user for their company name or URL + +2. **Check for an existing profile**: + - List files in `{SKILL_DIR}/profiles/` (ignore `example.json`) + - If a matching profile exists → load it, present to user: "I have your profile from {researched_at}. Still accurate?" If yes → skip to Step 2. + - If no profile exists → proceed with deep research below. + +3. **Run a full deep research on the user's company** using the Plan→Research→Synthesize pattern. + See `references/research-patterns.md` for sub-question templates and research methodology. + + **Key research steps:** + - Search: `bb search "{company name}" --num-results 10` + - Fetch homepage: `node {SKILL_DIR}/scripts/extract_page.mjs "{company website}"` + - **Discover site pages via sitemap** (do NOT hardcode paths like `/about` or `/customers`): + 1. `bb fetch --allow-redirects "{company website}/sitemap.xml"` — sitemap is small, raw `bb fetch` is fine + 2. Scan for URLs with keywords: `customer`, `case-stud`, `pricing`, `about`, `use-case`, `industry`, `solution` + 3. Optionally also fetch `/llms.txt` for page descriptions + 4. 
Pick 3-5 most relevant URLs and extract with `extract_page.mjs` (NOT raw `bb fetch`) + - Search for external context and competitors + - Accumulate findings with confidence levels + + **Synthesize into a profile**: + Company, Product, Existing Customers, Competitors, Use Cases. + Do NOT include ICP or sub-verticals — those are per-run decisions. + +4. Present the profile to the user for confirmation. Do not proceed until confirmed. + +5. **Save the confirmed profile** to `{SKILL_DIR}/profiles/{company-slug}.json` + +6. **Ask clarifying questions** using `AskUserQuestion` with checkboxes: + - "Which segments are you targeting?" with options derived from the company research + - "Company stage?" — Startups, Mid-market, Enterprise, All + - "How many companies / depth?" — Quick (~100), Deep (~50), Deeper (~25) + - This is the ONLY user interaction. After this, execute silently until results are ready. + +## Step 2: Depth Mode Selection + +| Mode | Research per company | Best for | +|------|---------------------|----------| +| `quick` | Homepage + 1-2 searches | ~100 companies, broad scan | +| `deep` | 2-3 sub-questions, 5-8 tool calls | ~50 companies, solid research | +| `deeper` | 4-5 sub-questions, 10-15 tool calls | ~25 companies, full intelligence | + +## Step 3: Discovery + +**Formula**: `ceil(requested_companies / 35)` search queries needed. Over-discover by ~2-3x because filtering typically drops 50-70%. + +Generate search queries with these patterns: +- Industry + company stage + geography ("fintech startups series A Bay Area") +- Technology stack + use case ("companies using Selenium for web scraping") +- Competitor adjacency ("alternatives to {known company in ICP}") +- Buyer persona + pain point ("engineering teams struggling with browser automation") + +**Process**: +1. Launch ALL discovery subagents at once (up to ~6 per message). 
Each runs its queries in a SINGLE Bash call: + ```bash + bb search "{query}" --num-results 25 --output /tmp/company_discovery_batch_{N}.json + ``` +2. After all waves complete, deduplicate: `node {SKILL_DIR}/scripts/list_urls.mjs /tmp` +3. **Filter the URL list** — remove: + - Blog posts, news articles (globenewswire.com, techcrunch.com, etc.) + - Directories/aggregators (tracxn.com, crunchbase.com, g2.com) + - The user's own competitors and existing customers (from profile) + Keep only company homepages. + +See `references/workflow.md` for subagent prompt templates and wave management. + +## Step 4: Deep Research & Scoring + +Launch subagents to research companies in parallel. See `references/workflow.md` for the enrichment subagent prompt template. See `references/research-patterns.md` for the full research methodology. + +**Process**: +1. Split filtered URLs into groups per subagent (quick: ~10, deep: ~5, deeper: ~2-3) +2. Launch ALL enrichment subagents at once (up to ~6 per message) +3. Each subagent uses ONLY Bash — for each company: + + **Phase A — Plan** (skip in quick mode): + Decompose into 2-5 sub-questions based on ICP and enrichment fields. + + **Phase B — Research Loop**: + Search and fetch pages, extract findings. Respect step budget (quick: 2-3, deep: 5-8, deeper: 10-15). + + **Phase C — Synthesize**: + Score ICP fit 1-10 with evidence. Fill enrichment fields from findings. + +4. Subagents write ALL markdown files in a SINGLE Bash call using chained heredocs to `{OUTPUT_DIR}/` +5. After ALL subagents complete, proceed to Step 5 + +**Critical**: Include the confirmed ICP description verbatim in every subagent prompt. Pass the full literal `{OUTPUT_DIR}` path to every subagent. + +## Step 5: Report & CSV + +1. 
**Generate HTML report + CSV** (opens overview in browser automatically): + ```bash + node {SKILL_DIR}/scripts/compile_report.mjs {OUTPUT_DIR} --open + ``` + This generates: + - `{OUTPUT_DIR}/index.html` — overview page with scored table (opens in browser) + - `{OUTPUT_DIR}/companies/*.html` — individual company pages (linked from overview) + - `{OUTPUT_DIR}/results.csv` — scored spreadsheet for import into sheets/CRM + +2. **Present a summary in chat** too: + +``` +## Company Research Complete + +- **Total companies researched**: {count} +- **Depth mode**: {mode} +- **Score distribution**: + - Strong fit (8-10): {count} + - Partial fit (5-7): {count} + - Weak fit (1-4): {count} +- **Report opened in browser**: ~/Desktop/{company_slug}_research_{date}/index.html +``` + +3. Show the **top companies** sorted by ICP score in a table: + +``` +| Company | Score | Product | Industry | Fit Reasoning | +|---------|-------|---------|----------|---------------| +| Acme | 9 | AI inventory management | E-commerce SaaS | Series A, uses Selenium, expanding to EU | +``` + +4. For the top 3-5 companies, show a brief research summary — key findings, why they're a good fit, and what specific angle to approach them with. + +Offer to dig deeper into specific companies, adjust scoring criteria, or re-run discovery with different queries. 
diff --git a/skills/company-research/profiles/example.json b/skills/company-research/profiles/example.json new file mode 100644 index 0000000..ae469f5 --- /dev/null +++ b/skills/company-research/profiles/example.json @@ -0,0 +1,9 @@ +{ + "company": "", + "website": "", + "product": "", + "existing_customers": [], + "competitors": [], + "use_cases": [], + "researched_at": "" +} diff --git a/skills/company-research/references/example-research.md b/skills/company-research/references/example-research.md new file mode 100644 index 0000000..aa10f1c --- /dev/null +++ b/skills/company-research/references/example-research.md @@ -0,0 +1,66 @@ +# Example Company Research File + +Each research subagent writes one markdown file per company to `{OUTPUT_DIR}/{company-slug}.md`, where `{OUTPUT_DIR}` is the per-run Desktop directory set up by the main agent in Step 0 (e.g., `~/Desktop/acme_research_2026-04-23/`). The YAML frontmatter contains structured fields for report + CSV compilation. The body contains human-readable research. + +## Template + +```markdown +--- +company_name: Acme Inc +website: https://acme.com +product_description: AI-powered inventory management for e-commerce brands +industry: E-commerce / SaaS +target_audience: Mid-market e-commerce brands +key_features: demand forecasting | automated reordering | multi-warehouse sync +icp_fit_score: 8 +icp_fit_reasoning: Series A e-commerce SaaS, uses Selenium for scraping, expanding to EU — strong fit +employee_estimate: 50-100 +funding_info: Series A, $12M +headquarters: San Francisco, CA +--- + +## Product +AI-powered inventory management for e-commerce brands. Helps DTC brands +automate reordering and sync across multiple warehouses. 
+ +## Research Findings +- **[high]** Checkout optimization for Shopify stores, serving mid-market DTC brands with $5M-$50M revenue (source: acme.com/about) +- **[high]** Series A, $12M raised in Q3 2025 from Sequoia (source: TechCrunch) +- **[medium]** Recently hired 3 data engineers, expanding platform team (source: LinkedIn job posts) +- **[medium]** Uses Selenium for web scraping in their data pipeline (source: careers page) +``` + +## Field Rules + +- **YAML frontmatter**: All structured fields go here. These are extracted for CSV compilation. +- **`key_features`**: Pipe-separated (`|`) list in YAML, not a JSON array. +- **`icp_fit_score`**: Integer 1-10. +- **`icp_fit_reasoning`**: One line, references specific findings. +- **Body sections**: `## Product`, `## Research Findings`. +- **Findings format**: `- **[confidence]** fact (source: url or description)` +- **Filename**: `{OUTPUT_DIR}/{company-slug}.md` where slug is lowercase, hyphenated (e.g., `acme-inc.md`). +- **Deduplication**: One file per company. If a subagent encounters a company that already has a file, overwrite with richer data. + +## Writing via Bash Heredoc + +Subagents write these files using bash heredoc to avoid security prompts. Use the full literal `{OUTPUT_DIR}` path — no `~` or `$HOME`: + +```bash +cat << 'COMPANY_MD' > {OUTPUT_DIR}/acme-inc.md +--- +company_name: Acme Inc +website: https://acme.com +... +--- + +## Product +... + +## Research Findings +... +COMPANY_MD +``` + +Use `'COMPANY_MD'` (quoted) as the delimiter to prevent shell variable expansion. + +**IMPORTANT**: Write ALL company files in a SINGLE Bash call using chained heredocs to minimize permission prompts. 
diff --git a/skills/company-research/references/report-template.html b/skills/company-research/references/report-template.html new file mode 100644 index 0000000..97abeb7 --- /dev/null +++ b/skills/company-research/references/report-template.html @@ -0,0 +1,139 @@ + + + + + +Company Research — {{COMPANY_NAME}} + + + + + + +
+
+
+

{{TITLE}}

+
{{META}}
+
+ + Powered by Browserbase + + +
+ +
+
Companies
{{TOTAL}}
+
Strong Fit (8-10)
{{HIGH_COUNT}}
+
Partial Fit (5-7)
{{MEDIUM_COUNT}}
+
Weak Fit (1-4)
{{LOW_COUNT}}
+
+ +
+
+ Score Distribution + {{HIGH_PCT}}% strong fit +
+
+
+
+
+
+
+ Strong (8-10) + Partial (5-7) + Weak (1-4) +
+
+ + + + + + + + + + + + + {{TABLE_ROWS}} + +
ScoreCompanyProductIndustryFit Reasoning
+
+ + + + + diff --git a/skills/company-research/references/research-patterns.md b/skills/company-research/references/research-patterns.md new file mode 100644 index 0000000..faaee97 --- /dev/null +++ b/skills/company-research/references/research-patterns.md @@ -0,0 +1,187 @@ +# Company Research — Deep Research Patterns + +## Overview + +This reference defines two research contexts: +1. **Self-Research** (Step 1) — Deep research on the user's own company to build a strong ICP foundation +2. **Target Research** (Step 6) — Research each discovered company using Plan→Research→Synthesize + +Both use the same 3-phase pattern but with different sub-questions and goals. + +## Self-Research (User's Company) + +This is the most important research in the pipeline. Every downstream decision depends on it. + +### Sub-Questions +- "What does {company} sell and what specific problem does it solve?" +- "Who are {company}'s existing customers? What industries, company sizes, and use cases?" +- "Who are {company}'s competitors and what differentiates them?" +- "What pricing model does {company} use and who is the typical buyer persona?" +- "What use cases and pain points does {company}'s marketing emphasize?" + +### Page Discovery +Discover site pages dynamically — do NOT hardcode paths like `/about` or `/customers`: +1. Fetch `bb fetch --allow-redirects "{company website}/sitemap.xml"` — primary source, has ALL pages +2. Scan sitemap URLs for keywords: `customer`, `case-stud`, `pricing`, `about`, `use-case`, `blog`, `docs`, `industry`, `solution` +3. Optionally fetch `bb fetch --allow-redirects "{company website}/llms.txt"` for page descriptions +4. Pick the 3-5 most relevant URLs from the sitemap and fetch those +5. Sitemap is the source of truth. llms.txt is bonus context but often incomplete. 
+ +### External Research +- Search: `"{company} customers use cases reviews"` +- Search: `"{company} alternatives competitors vs"` +- Fetch 1-2 of the most informative third-party results (G2, blog posts, comparisons) + +### Synthesis Output +From all findings, produce a company profile: +- **Company**: name +- **Product**: what they sell, how it works, key capabilities (2-3 sentences, specific) +- **Existing Customers**: named customers or customer types found +- **Competitors**: who they compete with, key differentiators +- **Use Cases**: broad list of use cases the product serves (NOT tied to one vertical) + +Do NOT include ICP, pitch angle, or sub-verticals in the profile. Those are per-run targeting decisions made in Step 2 after the profile is confirmed. The profile is a general-purpose company fact sheet that works regardless of which vertical you target next. + +### Why This Matters +A thin profile produces generic search queries, weak lead scoring, and cookie-cutter emails. A rich profile with specific customers, competitors, and use cases produces targeted queries, accurate scoring, and emails that reference real pain points. + +--- + +## Target Company Research (Step 4) + +### Sub-Question Templates + +Generate sub-questions from these categories based on the ICP and enrichment fields requested. Not every category applies to every company — pick the most relevant. + +### Priority 1 (Always ask) +- **Product/Market**: "What does {company} sell and who are their customers?" +- **ICP Fit**: "How does {company}'s product/market relate to {sender's ICP description}?" + +### Priority 2 (Ask in deep/deeper) +- **Tech Stack**: "What technologies, frameworks, or infrastructure does {company} use?" +- **Growth Signals**: "Has {company} raised funding, launched products, or expanded recently?" +- **Pain Points**: "What challenges might {company} face that {sender's product} addresses?"
+ +### Priority 3 (Ask in deeper only) +- **Decision Makers**: "Who leads engineering, product, or growth at {company}?" +- **Competitive Landscape**: "Who are {company}'s competitors and how are they differentiated?" +- **Customers/Case Studies**: "Who are {company}'s notable customers and what results do they highlight?" + +### Search Query Patterns + +For each sub-question, generate 2-3 search query variations: + +``` +# Product/Market +"{company name} what they do" +"{company name} product features customers" + +# Tech Stack +"{company name} tech stack engineering blog" +"{company name} careers software engineer" (job posts reveal stack) + +# Growth Signals +"{company name} funding round 2025 2026" +"{company name} launch announcement" +"{company name} hiring" + +# Pain Points +"{company name} challenges {relevant domain}" +"{company name} {problem sender solves}" + +# Decision Makers +"{company name} VP engineering CTO LinkedIn" +"{company name} head of growth product" +``` + +## Finding Format + +Each finding is a self-contained factual statement tied to a source: + +```json +{ + "subQuestion": "What does Acme sell and who are their customers?", + "fact": "Acme provides checkout optimization for Shopify stores, serving mid-market DTC brands with $5M-$50M revenue", + "sourceUrl": "https://acme.com/about", + "sourceTitle": "About Acme - Checkout Optimization", + "confidence": "high" +} +``` + +**Confidence levels**: +- `high`: Directly stated on the company's own website or official press +- `medium`: Inferred from job postings, third-party articles, or indirect signals +- `low`: Speculative based on industry/category, or from outdated sources + +## Research Loop Rules + +1. **Process sub-questions by priority** — Priority 1 first, then 2, then 3 +2. **3-5 findings per sub-question, then move on** — Don't exhaust a topic +3. **Use parallel tool calls** — Search multiple queries simultaneously when possible +4. 
**Rephrase, don't retry** — If a search returns poor results, try different keywords +5. **Fetch selectively** — Don't fetch every URL from search results. Pick the 1-2 most relevant based on title and URL +6. **Stop at step limit** — Respect the depth mode's step budget per company +7. **Homepage first** — Always fetch the company's homepage before branching to other pages +8. **Deduplicate findings** — Don't record the same fact twice from different sources + +## Depth Mode Behavior + +### Quick Mode (~100 companies) +- **Skip Phase A** — No sub-question decomposition +- **Phase B**: Fetch the company homepage. Run 1-2 supplementary searches if homepage data is thin. +- **Phase C**: Extract available data, score ICP from what's available +- **Budget**: 2-3 total tool calls per company +- **Trade-off**: Fast and cheap, but research may be less detailed + +### Deep Mode (~50 companies) +- **Phase A**: Decompose into 2-3 sub-questions (Priority 1 + selected Priority 2) +- **Phase B**: For each sub-question, run 2-3 searches + fetch 1-2 URLs. Target 3-5 findings per sub-question. +- **Phase C**: Synthesize from all findings. ICP reasoning references specific evidence and uses the most specific/compelling findings. +- **Budget**: 5-8 total tool calls per company +- **Trade-off**: Good balance of depth and scale + +### Deeper Mode (~25 companies) +- **Phase A**: Decompose into 4-5 sub-questions (Priority 1 + 2 + selected Priority 3) +- **Phase B**: Research exhaustively. Fetch multiple pages per company (homepage, about, blog, careers, product pages). Target 3-5 findings per sub-question. +- **Phase C**: Synthesize with cited evidence. ICP reasoning is detailed and references multiple specific signals.
+- **Budget**: 10-15 total tool calls per company +- **Trade-off**: High quality intelligence, but slow and expensive + +## Synthesis Instructions + +After the research loop completes for a company, synthesize findings into the output record: + +### ICP Scoring +Score 1-10 using ALL accumulated findings as evidence: +- **8-10**: Strong match. Multiple high-confidence findings confirm right industry, company stage, and clear pain point alignment. The pitch angle directly addresses a visible need supported by evidence. +- **5-7**: Partial match. Some findings suggest relevance but key signals are missing or low-confidence. Adjacent industry or unclear pain point. +- **1-4**: Weak match. Findings indicate wrong segment, too large/small, or no apparent connection to sender's product. + +Write `icp_fit_reasoning` referencing specific findings: "Series A fintech (from Crunchbase), uses Selenium for scraping (from job posting), expanding to EU market (from blog) — strong fit for browser infrastructure." + +### Email Personalization +Use the **richest, most specific** findings for email context: +- Opening: Use the most concrete finding (a specific product feature, a recent launch, a job posting) +- Bridge: Connect a finding about their challenges/stack to the sender's pitch angle +- If only low-confidence findings exist, keep the email shorter and more general — don't fabricate specificity + +### Enrichment Fields +Map findings to enrichment fields: +- `product_description` → from Product/Market findings +- `industry` → inferred from Product/Market +- `employee_estimate` → from LinkedIn search or careers page findings +- `funding_info` → from Growth Signals findings +- `headquarters` → from company homepage or about page +- `target_audience` → from Product/Market findings +- `key_features` → from product page findings + +If a field has no supporting findings, leave it empty rather than guessing. + +### Anti-Hallucination Rules + +Apply these at synthesis time. 
They exist because the failure mode — especially on Framer/Next.js landing pages with little server-rendered copy — is for the subagent to pattern-match visual cues onto the sender's ICP and fabricate a plausible-sounding description: + +1. **Typography is not a product.** Never infer `product_description`, `industry`, or `target_audience` from fonts, design system, framework choice (Framer, Next.js, React), or site polish. "Framer-built" and "uses Geist Mono" are observations about tooling, not signals of what the company sells. +2. **No ICP leakage.** If the homepage is thin and external search turns up nothing, do NOT default the target's description toward the sender's ICP. Manufacturing AI ≠ browser automation just because both use AI. +3. **Quote, don't paraphrase from memory.** `product_description` must quote or closely paraphrase a specific phrase from `extract_page.mjs` output (TITLE / META_DESCRIPTION / OG_DESCRIPTION / HEADINGS / BODY) or from an external search result. If no such phrase exists, write `Unknown — homepage content not accessible`. +4. **Cap scores on thin evidence.** If `product_description` is `Unknown`, set `icp_fit_score` ≤ 3 and `icp_fit_reasoning: Insufficient evidence — homepage returned no readable content`. Do not justify a higher score on inferred signals alone. diff --git a/skills/company-research/references/workflow.md b/skills/company-research/references/workflow.md new file mode 100644 index 0000000..a013d35 --- /dev/null +++ b/skills/company-research/references/workflow.md @@ -0,0 +1,256 @@ +# Company Research — Workflow Reference + +## Discovery Batch JSON Schema + +File: `/tmp/company_discovery_batch_{N}.json` + +`bb search --output` writes a JSON object (NOT a flat array): + +```json +{ + "requestId": "abc123", + "query": "AI data extraction startups", + "results": [ + { "url": "https://example.com", "title": "Example Corp", "author": null, "publishedDate": null }, + ... 
+ ] +} +``` + +The `list_urls.mjs` script handles both formats (flat array and `{ results: [...] }`). + +## Company Research Markdown Format + +File: `{OUTPUT_DIR}/{company-slug}.md` + +Where `{OUTPUT_DIR}` is the per-run directory on the user's Desktop (e.g., `~/Desktop/acme_research_2026-04-23/`). The main agent sets this up in Step 0 and passes the full literal path to every subagent. + +Each research subagent writes one markdown file per company. See `references/example-research.md` for the full template. + +**YAML frontmatter fields** (used for report + CSV compilation): +- `company_name` (required) +- `website` (required) +- `product_description` +- `industry` +- `target_audience` +- `key_features` (pipe-separated: `feature1 | feature2 | feature3`) +- `icp_fit_score` (integer 1-10, required) +- `icp_fit_reasoning` +- `employee_estimate` +- `funding_info` +- `headquarters` + +**Body sections**: +- `## Product` — what they do +- `## Research Findings` — evidence with confidence levels and sources + +**CRITICAL**: Use consistent field names across all files. The `compile_report.mjs` script reads these fields. + +## Extracting Page Content + +Use `extract_page.mjs` for all homepage/product-page content extraction. It fetches via `bb fetch`, parses title + meta + visible body text, and falls back to `bb browse` automatically when the page is JS-rendered or too large for fetch: + +```bash +node {SKILL_DIR}/scripts/extract_page.mjs "https://example.com" --max-chars 3000 +``` + +Output is a structured block: +``` +URL: https://example.com +FETCH_OK: true|false +FALLBACK_TO_BROWSE: true|false +TITLE: ... +META_DESCRIPTION: ... +OG_TITLE: ... +OG_DESCRIPTION: ... +HEADINGS: h1/h2/h3 joined by " | " +BODY_CHARS: N +BODY: + +``` + +**Why not a raw `bb fetch | sed` pipeline?** `bb fetch` returns a JSON envelope with the HTML embedded as an escaped string — a naive sed pipeline strips `<>` from the JSON wrapper too and destroys the content. 
It also strips `<meta>` tags, which on Framer/Next.js SPAs are often the only readable content. `extract_page.mjs` handles both correctly. + +**When to use raw `bb fetch`**: Only for small structured files where you want the JSON envelope intact — e.g. `sitemap.xml`, `robots.txt`, `llms.txt`. For any HTML page you'd feed to a model, use `extract_page.mjs`. + +## Verifying content is real (not hallucinated) + +Before writing `product_description`, `industry`, or `target_audience` into a company file, confirm the claim is grounded in `extract_page.mjs` output. Quote or closely paraphrase from TITLE, META_DESCRIPTION, OG_DESCRIPTION, HEADINGS, or BODY. + +If `extract_page.mjs` returns `FETCH_OK: false` AND `FALLBACK_TO_BROWSE: false` (or BODY_CHARS < 50), the homepage is inaccessible. Do not fabricate. Write: +- `product_description: Unknown — homepage content not accessible` +- `icp_fit_score: 3` (or lower) +- `icp_fit_reasoning: Insufficient evidence — homepage returned no readable content` + +A classic failure mode this prevents: a Framer/Next.js landing page with no server-rendered copy, where the subagent pattern-matches visual cues ("design-forward", "Geist Mono", "Framer-built") onto the user's own ICP. Typography is not a product. + +## Discovery Subagent Prompt Template + +``` +You are a company discovery subagent. Run search queries and save results. + +TOOL RULES — CRITICAL, FOLLOW EXACTLY: +1. You may ONLY use the Bash tool. No exceptions. +2. Run ALL searches in a SINGLE Bash call using && chaining. +3. BANNED TOOLS: WebFetch, WebSearch, Write, Read, Glob, Grep — ALL BANNED. + If you use ANY banned tool, the entire run fails. Use ONLY Bash. +4. NEVER use ~ or $HOME in paths — use full literal paths.
+ +TASK: +Run ALL of the following searches in ONE Bash command: + +bb search "{query1}" --num-results 25 --output /tmp/company_discovery_batch_{N1}.json && \ +bb search "{query2}" --num-results 25 --output /tmp/company_discovery_batch_{N2}.json && \ +bb search "{query3}" --num-results 25 --output /tmp/company_discovery_batch_{N3}.json && \ +echo "Discovery complete" + +After the command completes, report back ONLY the count of results found per batch. +Do NOT analyze, summarize, or return the actual results. +``` + +## Research Subagent Prompt Template + +``` +You are a company research subagent. For each company URL, research the company and score ICP fit. + +CONTEXT: +- User's company: {user_company} +- User's product: {user_product} +- ICP description: {icp_description} +- Depth mode: {depth_mode} +- Output directory: {OUTPUT_DIR} ← write research files HERE, as a full literal path + +URLS TO PROCESS: +{url_list} + +TOOL RULES — CRITICAL, FOLLOW EXACTLY: +1. You may ONLY use the Bash tool. No exceptions. +2. All searches: Bash → bb search "..." --num-results 10 +3. All homepage/product-page content extraction: + Bash → node {SKILL_DIR}/scripts/extract_page.mjs "URL" --max-chars 3000 + This returns structured TITLE / META_DESCRIPTION / OG_DESCRIPTION / HEADINGS / BODY and auto-falls back to bb browse for JS-rendered or >1MB pages. + DO NOT hand-roll a `bb fetch | sed` pipeline — it silently strips meta tags and doesn't parse the JSON envelope. Use `bb fetch` raw only for sitemap.xml, robots.txt, llms.txt. +4. BATCH all file writes: Write ALL markdown files in a SINGLE Bash call using chained heredocs (one permission prompt, not one per file). +5. BANNED TOOLS: WebFetch, WebSearch, Write, Read, Glob, Grep — ALL BANNED. + If you use ANY banned tool, the entire run fails. Use ONLY Bash. +6. NEVER use ~ or $HOME in paths — use full literal paths. 
+ +ANTI-HALLUCINATION RULES — CRITICAL: +- NEVER infer product_description, industry, or target_audience from fonts, framework (Framer/Next.js/React), design system, or visual style. Typography is not a product. +- NEVER let the sender's ICP leak into a target's description. If you don't know what the target does, write "Unknown" — do not pattern-match them onto the ICP. +- product_description MUST quote or closely paraphrase a phrase from extract_page.mjs output. If none of TITLE/META/OG/HEADINGS/BODY yield a recognizable product statement, write "Unknown — homepage content not accessible" and cap icp_fit_score at 3. + +RESEARCH PATTERN (per company): + +Phase A — Plan (skip in quick mode): +Decompose what you need to know into sub-questions based on ICP and enrichment fields. + +Phase B — Research Loop: +For each sub-question (or just the homepage in quick mode): +1. Run bb search with relevant query +2. Pick 1-2 most relevant URLs from results +3. Extract page content: node {SKILL_DIR}/scripts/extract_page.mjs "URL" --max-chars 3000 + (auto-handles the JSON envelope, meta tags, and the bb browse fallback) +4. Smart page discovery: use `bb fetch --allow-redirects` on /sitemap.xml or /llms.txt to find relevant URLs — these are small XML/text files where the raw JSON envelope is fine. For the actual HTML pages you discover, use extract_page.mjs. +5. Extract findings: factual statements with source, confidence level +6. Accumulate findings, move to next sub-question +7. Respect step budget: quick=2-3 calls, deep=5-8, deeper=10-15 + +Phase C — Synthesize: +From accumulated findings: +1. Score ICP fit 1-10 (see rubric below) +2. Fill enrichment fields from findings +3. Reference specific findings in icp_fit_reasoning + +ICP SCORING RUBRIC: +- 8-10: Strong match. Multiple high-confidence findings confirm fit. +- 5-7: Partial match. Some findings suggest relevance but key signals missing. +- 1-4: Weak match. Wrong segment or no apparent connection. 
+ +OUTPUT — write ALL company files in a SINGLE Bash call using chained heredocs directly to {OUTPUT_DIR}: + +cat << 'COMPANY_MD' > {OUTPUT_DIR}/{slug1}.md +--- +company_name: {name} +website: {url} +product_description: {description} +industry: {industry} +target_audience: {audience} +key_features: {feature1} | {feature2} | {feature3} +icp_fit_score: {score} +icp_fit_reasoning: {reasoning} +employee_estimate: {estimate} +funding_info: {funding} +headquarters: {location} +--- + +## Product +{product description paragraph} + +## Research Findings +- **[{confidence}]** {finding} (source: {url}) +COMPANY_MD +cat << 'COMPANY_MD' > {OUTPUT_DIR}/{slug2}.md +--- +... +--- +... +COMPANY_MD + +Use 'COMPANY_MD' (quoted) as the heredoc delimiter to prevent shell variable expansion. + +Report back ONLY: "Batch {batch_id}: {succeeded}/{total} researched, {findings_count} total findings." +Do NOT return raw data to the main conversation. +``` + +## Wave Management + +### Key Principle: Maximize Parallelism, Minimize Prompts +Launch as many subagents as possible in a single message (up to ~6 Agent tool calls per message). Each subagent MUST batch all its Bash operations to minimize permission prompts. + +### Discovery Phase +- Launch up to 6 discovery subagents in a single message +- Each subagent runs ALL its queries in a SINGLE Bash call using `&&` chaining +- After all waves complete, run `node {SKILL_DIR}/scripts/list_urls.mjs /tmp` +- **Filter URLs**: Remove blog posts, news articles, directories, competitors, and existing customers. Keep only company homepages. 
+ +### Research Phase +- Companies per subagent varies by depth: + - `quick`: ~10 companies per subagent + - `deep`: ~5 companies per subagent + - `deeper`: ~2-3 companies per subagent +- Each subagent writes ALL its markdown files in a SINGLE Bash call (chained heredocs) directly to `{OUTPUT_DIR}` + +### Sizing Formula +``` +search_queries = ceil(requested_companies / 35) +discovery_subagents = search_queries +expected_urls = search_queries * 20 + +quick: research_subagents = ceil(expected_urls / 10) +deep: research_subagents = ceil(expected_urls / 5) +deeper: research_subagents = ceil(expected_urls / 3) +``` + +### Error Handling +- If a subagent fails, log the error and continue with remaining batches +- If >50% of subagents fail in a wave, pause and inform the user +- `extract_page.mjs` already handles the bb fetch → bb browse fallback internally. If it still returns FETCH_OK: false with empty BODY, skip the company and mark product_description as Unknown (do not guess). + +## Report + CSV Compilation + +After all research subagents complete, compile the HTML report and CSV in one command: + +```bash +node {SKILL_DIR}/scripts/compile_report.mjs {OUTPUT_DIR} --open +``` + +The script: +- Reads all `.md` files in `{OUTPUT_DIR}` +- Parses YAML frontmatter + body sections +- Deduplicates by normalized company name (keeps highest ICP score) +- Generates `{OUTPUT_DIR}/index.html` — scored overview page +- Generates `{OUTPUT_DIR}/companies/{slug}.html` — one page per company +- Generates `{OUTPUT_DIR}/results.csv` — spreadsheet for sheets/CRM +- Opens `index.html` in the default browser (`--open` flag) +- Prints a JSON summary to stderr diff --git a/skills/company-research/scripts/compile_report.mjs b/skills/company-research/scripts/compile_report.mjs new file mode 100644 index 0000000..2759c69 --- /dev/null +++ b/skills/company-research/scripts/compile_report.mjs @@ -0,0 +1,356 @@ +#!/usr/bin/env node + +// Compiles per-company markdown research files into an HTML 
report + CSV.
// Reads the report template, fills in placeholders, generates index.html
// with a scored overview table linking to individual company pages.
//
// Usage: node compile_report.mjs <output_dir> [--template <path>]
// Example: node compile_report.mjs ~/Desktop/acme_research_2026-04-09

import { readdirSync, readFileSync, writeFileSync, existsSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

const args = process.argv.slice(2);

if (args.includes('--help') || args.includes('-h') || args.length === 0) {
  console.error(`Usage: node compile_report.mjs <output_dir> [--template <path>]

Reads all .md files from <output_dir>, generates:
  - index.html — overview page with scored table
  - companies/<slug>.html — individual company research pages
  - results.csv — scored spreadsheet

Options:
  --template <path>  Path to report-template.html (default: auto-detect)
  --open             Open index.html in browser after generation
  --help, -h         Show this help message

Examples:
  node compile_report.mjs ~/Desktop/acme_research_2026-04-09
  node compile_report.mjs ~/Desktop/research --open`);
  process.exit(args.includes('--help') || args.includes('-h') ? 0 : 1);
}

const dir = args[0];
const shouldOpen = args.includes('--open');
const templateIdx = args.indexOf('--template');
let templatePath = templateIdx !== -1 ? args[templateIdx + 1] : null;

// Auto-detect template
if (!templatePath) {
  const candidates = [
    join(__dirname, '..', 'references', 'report-template.html'),
    join(__dirname, 'report-template.html'),
  ];
  templatePath = candidates.find(p => existsSync(p));
  if (!templatePath) {
    console.error('Error: Could not find report-template.html. 
Use --template to specify path.');
    process.exit(1);
  }
}

const template = readFileSync(templatePath, 'utf-8');

// Read and parse markdown files
let files;
try {
  files = readdirSync(dir).filter(f => f.endsWith('.md')).sort();
} catch (err) {
  console.error(`Error reading directory ${dir}: ${err.message}`);
  process.exit(1);
}

if (files.length === 0) {
  console.error(`No .md files found in ${dir}`);
  process.exit(1);
}

function parseFrontmatter(content) {
  const fmMatch = content.match(/^---\n([\s\S]*?)\n---/);
  if (!fmMatch) return null;
  const fields = {};
  for (const line of fmMatch[1].split('\n')) {
    const idx = line.indexOf(':');
    if (idx > 0) {
      const key = line.slice(0, idx).trim();
      const val = line.slice(idx + 1).trim().replace(/^["']|["']$/g, '');
      if (key && val) fields[key] = val;
    }
  }
  return fields;
}

function parseBody(content) {
  const bodyMatch = content.match(/^---\n[\s\S]*?\n---\n([\s\S]*)/);
  return bodyMatch ? bodyMatch[1].trim() : '';
}

function escapeHtml(str) {
  return (str || '').replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;');
}

function scoreClass(score) {
  const s = parseInt(score) || 0;
  if (s >= 8) return 'high';
  if (s >= 5) return 'medium';
  return 'low';
}

// NOTE(review): the inline HTML emitted below was reconstructed after an
// extraction pass stripped all literal tags — verify element/attribute
// choices (<p>, <h2>, <h3>, <ul>, <strong>) against the shipped script.
function mdToHtml(md) {
  const lines = md.split('\n');
  const out = [];
  let inList = false;
  let paraLines = [];

  function flushPara() {
    if (paraLines.length > 0) {
      let text = escapeHtml(paraLines.join(' ').trim());
      text = text.replace(/\*\*\[(\w+)\]\*\*/g, '<strong>[$1]</strong>');
      text = text.replace(/\*\*([^*]+)\*\*/g, '<strong>$1</strong>');
      if (text) out.push(`<p>${text}</p>`);
      paraLines = [];
    }
  }

  function closeList() {
    if (inList) { out.push('</ul>'); inList = false; }
  }

  for (const line of lines) {
    const trimmed = line.trim();

    if (!trimmed) {
      flushPara();
      closeList();
      continue;
    }

    // Headings
    if (trimmed.startsWith('## ')) {
      flushPara(); closeList();
      out.push(`<h2>${escapeHtml(trimmed.slice(3))}</h2>`);
      continue;
    }
    if (trimmed.startsWith('### ')) {
      flushPara(); closeList();
      out.push(`<h3>${escapeHtml(trimmed.slice(4))}</h3>`);
      continue;
    }

    // List items
    if (trimmed.startsWith('- ')) {
      flushPara();
      if (!inList) { out.push('<ul>'); inList = true; }
      let text = escapeHtml(trimmed.slice(2));
      text = text.replace(/\*\*\[(\w+)\]\*\*/g, '<strong>[$1]</strong>');
      text = text.replace(/\*\*([^*]+)\*\*/g, '<strong>$1</strong>');
      out.push(`
  • ${text}
  • `); + continue; + } + + // Regular text — accumulate into paragraph + closeList(); + paraLines.push(trimmed); + } + + flushPara(); + closeList(); + return out.join('\n'); +} + +// Parse all companies +const companies = []; +for (const file of files) { + const content = readFileSync(join(dir, file), 'utf-8'); + const fields = parseFrontmatter(content); + if (!fields) continue; + const body = parseBody(content); + const slug = file.replace('.md', ''); + companies.push({ ...fields, body, slug, file }); +} + +// Sort by ICP score descending +companies.sort((a, b) => (parseInt(b.icp_fit_score) || 0) - (parseInt(a.icp_fit_score) || 0)); + +// Deduplicate +const seen = new Map(); +for (const c of companies) { + const name = (c.company_name || '').toLowerCase().replace(/[,\s]+(inc|llc|ltd|corp|co)\.?$/i, '').trim(); + if (!seen.has(name)) seen.set(name, c); +} +const deduped = [...seen.values()]; + +// Stats +const scores = deduped.map(c => parseInt(c.icp_fit_score) || 0); +const high = scores.filter(s => s >= 8).length; +const medium = scores.filter(s => s >= 5 && s < 8).length; +const low = scores.filter(s => s < 5).length; +const total = deduped.length; +const highPct = total > 0 ? Math.round((high / total) * 100) : 0; +const mediumPct = total > 0 ? Math.round((medium / total) * 100) : 0; +const lowPct = total > 0 ? 100 - highPct - mediumPct : 0; + +// Derive title from directory name +const dirName = dir.split('/').pop(); +const title = dirName.replace(/_/g, ' ').replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase()); + +// Generate table rows +const tableRows = deduped.map(c => { + const sc = scoreClass(c.icp_fit_score); + const hasDetail = c.body && c.body.length > 50; + const nameHtml = hasDetail + ? `${escapeHtml(c.company_name)}` + : escapeHtml(c.company_name); + const websiteHtml = c.website + ? `
    ${escapeHtml(c.website.replace(/^https?:\/\/(www\.)?/, ''))}` + : ''; + return ` + ${escapeHtml(c.icp_fit_score || '—')} + ${nameHtml}${websiteHtml} + ${escapeHtml(c.product_description || '')} + ${escapeHtml(c.industry || '')} + ${escapeHtml(c.icp_fit_reasoning || '')} + `; +}).join('\n'); + +// Fill index template +const escapedTitle = escapeHtml(title); +let indexHtml = template + .replace(/\{\{TITLE\}\}/g, `Company Research — ${escapedTitle}`) + .replace(/\{\{COMPANY_NAME\}\}/g, escapedTitle) + .replace(/\{\{META\}\}/g, `${deduped.length} companies researched · ${new Date().toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' })}`) + .replace(/\{\{TOTAL\}\}/g, String(total)) + .replace(/\{\{HIGH_COUNT\}\}/g, String(high)) + .replace(/\{\{MEDIUM_COUNT\}\}/g, String(medium)) + .replace(/\{\{LOW_COUNT\}\}/g, String(low)) + .replace(/\{\{HIGH_PCT\}\}/g, String(highPct)) + .replace(/\{\{MEDIUM_PCT\}\}/g, String(mediumPct)) + .replace(/\{\{LOW_PCT\}\}/g, String(lowPct)) + .replace(/\{\{TABLE_ROWS\}\}/g, () => tableRows); + +writeFileSync(join(dir, 'index.html'), indexHtml); + +// Generate individual company pages +const { mkdirSync } = await import('fs'); +try { mkdirSync(join(dir, 'companies'), { recursive: true }); } catch {} + +for (const c of deduped) { + if (!c.body || c.body.length < 50) continue; + const sc = scoreClass(c.icp_fit_score); + const bodyHtml = mdToHtml(c.body); + + const companyHtml = ` + + + + +${escapeHtml(c.company_name)} — Research + + + + +
    + ← Back to overview +
    +

    ${escapeHtml(c.company_name)}

    +
    + ICP Score: ${escapeHtml(c.icp_fit_score || '—')} + ${c.website ? `${escapeHtml(c.website)}` : ''} +
    +
    +
    + ${c.product_description ? `
    Product
    ${escapeHtml(c.product_description)}
    ` : ''} + ${c.industry ? `
    Industry
    ${escapeHtml(c.industry)}
    ` : ''} + ${c.target_audience ? `
    Target Audience
    ${escapeHtml(c.target_audience)}
    ` : ''} + ${c.key_features ? `
    Key Features
    ${escapeHtml(c.key_features)}
    ` : ''} + ${c.employee_estimate ? `
    Employees
    ${escapeHtml(c.employee_estimate)}
    ` : ''} + ${c.funding_info ? `
    Funding
    ${escapeHtml(c.funding_info)}
    ` : ''} + ${c.headquarters ? `
    HQ
    ${escapeHtml(c.headquarters)}
    ` : ''} + ${c.icp_fit_reasoning ? `
    Fit Reasoning
    ${escapeHtml(c.icp_fit_reasoning)}
    ` : ''} +
    +
    + ${bodyHtml} +
    +
    + + +`; + + writeFileSync(join(dir, 'companies', `${c.slug}.html`), companyHtml); +} + +// Generate CSV +const priority = [ + 'company_name', 'website', 'product_description', 'icp_fit_score', + 'icp_fit_reasoning', 'industry', 'target_audience', 'key_features', + 'employee_estimate', 'funding_info', 'headquarters' +]; +const allCols = [...new Set(deduped.flatMap(r => Object.keys(r)).filter(k => k !== 'body' && k !== 'slug' && k !== 'file'))]; +const cols = [...priority.filter(c => allCols.includes(c)), ...allCols.filter(c => !priority.includes(c)).sort()]; + +function csvEscape(v) { + if (!v) return ''; + if (v.includes(',') || v.includes('"') || v.includes('\n')) return '"' + v.replace(/"/g, '""') + '"'; + return v; +} + +const csvLines = [cols.join(',')]; +for (const row of deduped) { + csvLines.push(cols.map(c => csvEscape(row[c] || '')).join(',')); +} +writeFileSync(join(dir, 'results.csv'), csvLines.join('\n') + '\n'); + +// Summary +console.error(JSON.stringify({ + total: deduped.length, + high_fit: high, + medium_fit: medium, + low_fit: low, + files_generated: { + index: join(dir, 'index.html'), + company_pages: deduped.filter(c => c.body && c.body.length > 50).length, + csv: join(dir, 'results.csv') + } +}, null, 2)); + +console.log(join(dir, 'index.html')); + +// Open in browser if requested +if (shouldOpen) { + const { execSync } = await import('child_process'); + try { execSync(`open "${join(dir, 'index.html')}"`); } catch {} +} diff --git a/skills/company-research/scripts/extract_page.mjs b/skills/company-research/scripts/extract_page.mjs new file mode 100755 index 0000000..ad17997 --- /dev/null +++ b/skills/company-research/scripts/extract_page.mjs @@ -0,0 +1,168 @@ +#!/usr/bin/env node +// Extract structured page content for company research. +// Fetches via `bb fetch` (raw HTML to a temp file), pulls title + meta tags +// + visible body text, and auto-falls back to `bb browse` when content is thin. 
+
//
// Usage: node extract_page.mjs <url> [--max-chars N]
// Output (stdout): structured block consumable by a research subagent.

import { execFileSync } from "node:child_process";
import { mkdtempSync, readFileSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";

const THIN_CONTENT_THRESHOLD = 200; // body chars under this → JS-rendered, fall back

function parseArgs(argv) {
  const args = { url: null, maxChars: 3000 };
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === "--max-chars") args.maxChars = parseInt(argv[++i], 10);
    else if (!args.url) args.url = a;
  }
  if (!args.url) {
    console.error("Usage: extract_page.mjs <url> [--max-chars N]");
    process.exit(2);
  }
  return args;
}

function bbFetch(url, outFile) {
  execFileSync("bb", ["fetch", "--allow-redirects", url, "--output", outFile], {
    stdio: ["ignore", "ignore", "ignore"],
  });
}

function bbBrowseMarkdown(url) {
  try {
    execFileSync("bb", ["browse", "--headless", "open", url], {
      stdio: ["ignore", "ignore", "ignore"],
      timeout: 90000,
    });
    const out = execFileSync("bb", ["browse", "--headless", "get", "markdown"], {
      encoding: "utf8",
      timeout: 90000,
      maxBuffer: 50 * 1024 * 1024,
    });
    // bb browse prints banners (e.g. "Update available...") before the JSON blob.
    // Find the first '{' and try to JSON.parse from there.
    const start = out.indexOf("{");
    if (start < 0) return "";
    try {
      const parsed = JSON.parse(out.slice(start));
      if (parsed && typeof parsed.markdown === "string") return parsed.markdown;
    } catch {
      // Fallback: extract "markdown": "..." with a lenient regex that handles
      // escaped quotes and newlines.
const m = out.slice(start).match(/"markdown"\s*:\s*"((?:\\.|[^"\\])*)"/s);
      if (m) {
        try { return JSON.parse(`"${m[1]}"`); } catch { return m[1]; }
      }
    }
    return "";
  } catch (err) {
    return "";
  }
}

// NOTE(review): the regex bodies in extractMeta/extractTitle below were
// destroyed by an HTML-stripping extraction pass and have been reconstructed
// from the fields main() reads (description, og:title, og:description) —
// verify against the shipped script.
function extractMeta(html, name, attr = "name") {
  const re = new RegExp(
    `<meta[^>]*${attr}=["']${name}["'][^>]*content=["']([^"']*)["']`,
    "i"
  );
  const m = html.match(re);
  return m ? m[1].trim() : "";
}

function extractTitle(html) {
  const m = html.match(/<title[^>]*>([^<]*)<\/title>/i);
  return m ? m[1].trim() : "";
}

function extractVisibleText(html, maxChars) {
  // Multi-line aware script/style removal.
  let s = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, " ")
    .replace(/<noscript[^>]*>[\s\S]*?<\/noscript>/gi, " ")
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<[^>]+>/g, " ")
    .replace(/&nbsp;/g, " ")
    .replace(/&amp;/g, "&")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&#[0-9]+;/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  return s.slice(0, maxChars);
}

function extractHeadings(html, limit = 10) {
  const re = /<h[1-3][^>]*>([\s\S]*?)<\/h[1-3]>/gi;
  const out = [];
  let m;
  while ((m = re.exec(html)) && out.length < limit) {
    const text = m[1].replace(/<[^>]+>/g, "").replace(/\s+/g, " ").trim();
    if (text) out.push(text);
  }
  return out;
}

function main() {
  const { url, maxChars } = parseArgs(process.argv.slice(2));
  const dir = mkdtempSync(join(tmpdir(), "extract_page_"));
  const htmlFile = join(dir, "page.html");

  let html = "";
  let fetchOk = false;
  try {
    bbFetch(url, htmlFile);
    html = readFileSync(htmlFile, "utf8");
    fetchOk = true;
  } catch (err) {
    console.error(`[extract_page] bb fetch failed: ${err.message}`);
  }

  const title = extractTitle(html);
  const metaDesc = extractMeta(html, "description");
  const ogTitle = extractMeta(html, "og:title", "property");
  const ogDesc = extractMeta(html, "og:description", "property");
  const headings = extractHeadings(html);
  let body = extractVisibleText(html, maxChars);

  // Thin content → JS-rendered SPA → fall back to bb browse.
let fallbackUsed = false;
  if (body.length < THIN_CONTENT_THRESHOLD) {
    const md = bbBrowseMarkdown(url);
    if (md && md.length > body.length) {
      body = md.replace(/\s+/g, " ").slice(0, maxChars);
      fallbackUsed = true;
    }
  }

  rmSync(dir, { recursive: true, force: true });

  // Structured output for subagent to read.
  const lines = [
    `URL: ${url}`,
    `FETCH_OK: ${fetchOk}`,
    `FALLBACK_TO_BROWSE: ${fallbackUsed}`,
    `TITLE: ${title}`,
    `META_DESCRIPTION: ${metaDesc}`,
    `OG_TITLE: ${ogTitle}`,
    `OG_DESCRIPTION: ${ogDesc}`,
    `HEADINGS: ${headings.join(" | ")}`,
    `BODY_CHARS: ${body.length}`,
    `BODY:`,
    body,
  ];
  process.stdout.write(lines.join("\n") + "\n");
}

main();
diff --git a/skills/company-research/scripts/list_urls.mjs b/skills/company-research/scripts/list_urls.mjs
new file mode 100755
index 0000000..33f6f81
--- /dev/null
+++ b/skills/company-research/scripts/list_urls.mjs
@@ -0,0 +1,85 @@
#!/usr/bin/env node

// Deduplicates discovery URLs from bb search JSON output files.
// Usage: node list_urls.mjs /tmp [--prefix company]
// Reads all {prefix}_discovery_batch_*.json files, deduplicates by domain,
// outputs one URL per line to stdout, stats to stderr.

import { readdirSync, readFileSync } from 'fs';
import { join } from 'path';

const args = process.argv.slice(2);

if (args.includes('--help') || args.includes('-h') || args.length === 0) {
  console.error(`Usage: node list_urls.mjs <dir> [--prefix <prefix>]

Reads all <prefix>_discovery_batch_*.json files from <dir>,
deduplicates URLs by domain, and outputs one URL per line to stdout.

Options:
  --prefix <prefix>  Batch file prefix (default: "company")
  --help, -h         Show this help message

Examples:
  node list_urls.mjs /tmp
  node list_urls.mjs /tmp --prefix company`);
  process.exit(args.includes('--help') || args.includes('-h') ? 0 : 1);
}

const dir = args[0];
const prefixIdx = args.indexOf('--prefix');
const prefix = prefixIdx !== -1 && args[prefixIdx + 1] ?
args[prefixIdx + 1] : 'company'; + +const pattern = new RegExp(`^${prefix}_discovery_batch_.*\\.json$`); + +let files; +try { + files = readdirSync(dir) + .filter(f => pattern.test(f)) + .sort(); +} catch (err) { + console.error(`Error reading directory ${dir}: ${err.message}`); + process.exit(1); +} + +if (files.length === 0) { + console.error(`No ${prefix}_discovery_batch_*.json files found in ${dir}`); + process.exit(1); +} + +const seenDomains = new Set(); +const urls = []; +let totalResults = 0; + +for (const file of files) { + try { + const data = JSON.parse(readFileSync(join(dir, file), 'utf-8')); + const results = Array.isArray(data) ? data : (data.results || []); + totalResults += results.length; + + for (const result of results) { + const url = result.url; + if (!url) continue; + + try { + const hostname = new URL(url).hostname.replace(/^www\./, ''); + if (!seenDomains.has(hostname)) { + seenDomains.add(hostname); + urls.push(url); + } + } catch { + // Skip invalid URLs + } + } + } catch (err) { + console.error(`Warning: Failed to parse ${file}: ${err.message}`); + } +} + +// Output deduplicated URLs to stdout +for (const url of urls) { + console.log(url); +} + +// Stats to stderr +console.error(`\n${files.length} files, ${totalResults} total results, ${urls.length} unique domains`); diff --git a/skills/company-research/scripts/package.json b/skills/company-research/scripts/package.json new file mode 100644 index 0000000..61a2c1b --- /dev/null +++ b/skills/company-research/scripts/package.json @@ -0,0 +1,6 @@ +{ + "name": "company-research-scripts", + "version": "0.1.0", + "private": true, + "type": "module" +}