From 9d15b7a6f6906a18b0009826402822c3d5853065 Mon Sep 17 00:00:00 2001 From: Jay Sahnan Date: Thu, 23 Apr 2026 10:40:17 +0100 Subject: [PATCH 1/4] =?UTF-8?q?Add=20company-research=20skill=20=E2=80=94?= =?UTF-8?q?=20deep=20ICP=20research=20with=20HTML=20report?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 5 +- skills/company-research/SKILL.md | 208 ++++++++++ .../profiles/browserbase.json | 9 + skills/company-research/profiles/example.json | 9 + .../references/example-research.md | 66 ++++ .../references/report-template.html | 139 +++++++ .../references/research-patterns.md | 178 +++++++++ .../company-research/references/workflow.md | 228 +++++++++++ .../scripts/compile_report.mjs | 354 ++++++++++++++++++ skills/company-research/scripts/list_urls.mjs | 85 +++++ skills/company-research/scripts/package.json | 6 + 11 files changed, 1286 insertions(+), 1 deletion(-) create mode 100644 skills/company-research/SKILL.md create mode 100644 skills/company-research/profiles/browserbase.json create mode 100644 skills/company-research/profiles/example.json create mode 100644 skills/company-research/references/example-research.md create mode 100644 skills/company-research/references/report-template.html create mode 100644 skills/company-research/references/research-patterns.md create mode 100644 skills/company-research/references/workflow.md create mode 100644 skills/company-research/scripts/compile_report.mjs create mode 100755 skills/company-research/scripts/list_urls.mjs create mode 100644 skills/company-research/scripts/package.json diff --git a/.gitignore b/.gitignore index 880342f..d8133af 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,7 @@ agent/downloads/* tmp/.cache/.chrome-pid # Chrome PID -.chrome-pid \ No newline at end of file +.chrome-pid + +# Per-skill local Claude Code settings (contains user-specific permission approvals) 
+skills/*/.claude/settings.local.json diff --git a/skills/company-research/SKILL.md b/skills/company-research/SKILL.md new file mode 100644 index 0000000..6f032f2 --- /dev/null +++ b/skills/company-research/SKILL.md @@ -0,0 +1,208 @@ +--- +name: company-research +description: | + Company discovery and deep research skill. Researches a company's product and ICP, + discovers target companies to sell to using Browserbase Search API, deeply researches + each using a Plan→Research→Synthesize pattern, and scores ICP fit — compiled into + a scored research report and CSV. Supports depth modes (quick/deep/deeper) for + balancing scale vs intelligence. + Use when the user wants to: (1) find companies to sell to, (2) research potential + customers, (3) discover companies matching an ICP, (4) build a target company list, + (5) do market research on prospects. Triggers: "find companies to sell to", + "company research", "find prospects", "ICP research", "target companies", + "who should we sell to", "market research", "lead research", "prospect list". +license: MIT +compatibility: Requires bb CLI (@browserbasehq/cli) and BROWSERBASE_API_KEY env var +allowed-tools: Bash Agent +metadata: + author: browserbase + version: "1.1.0" +--- + +# Company Research + +Discover and deeply research companies to sell to. Uses Browserbase Search API for discovery and a Plan→Research→Synthesize pattern for deep enrichment — outputting a scored research report and CSV. + +**Required**: `BROWSERBASE_API_KEY` env var and `bb` CLI installed. + +**First-run setup**: On the first run you'll be prompted to approve `bb fetch`, `bb search`, `cat`, `mkdir`, `sed`, etc. Select **"Yes, and don't ask again for: bb fetch:\*"** (or equivalent) for each to auto-approve for the session. 
To permanently approve, add these to your `~/.claude/settings.json` under `permissions.allow`: +```json +"Bash(bb:*)", "Bash(bunx:*)", "Bash(bun:*)", "Bash(node:*)", +"Bash(cat:*)", "Bash(mkdir:*)", "Bash(sed:*)", "Bash(head:*)", "Bash(tr:*)", "Bash(rm:*)" +``` + +**Path rules**: Always use the full literal path in all Bash commands — NOT `~` or `$HOME` (both trigger "shell expansion syntax" approval prompts). Resolve the home directory once and use it everywhere. When constructing subagent prompts, replace `{SKILL_DIR}` with the full literal path. + +**Output directory**: All research output goes to `~/Desktop/{company_slug}_research_{YYYY-MM-DD}/`. This directory contains one `.md` file per researched company plus a final `.csv`. The user gets both the scored spreadsheet and the full research files on their Desktop. + +**CRITICAL — Tool restrictions (applies to main agent AND all subagents)**: +- All web searches: use `bb search`. NEVER use WebSearch. +- All page fetches: use `bb fetch --allow-redirects`. NEVER use WebFetch. `bb fetch` returns raw HTML — to extract text, pipe through: `sed 's/]*>.*<\/script>//g; s/]*>.*<\/style>//g; s/<[^>]*>//g' | tr -s ' \n'`. Has a 1MB response limit — for large or JS-heavy pages, use `bb browse` instead. +- All research output: subagents write **one markdown file per company** to `{OUTPUT_DIR}/{company-slug}.md` using bash heredoc. NEVER use the Write tool or `python3 -c`. See `references/example-research.md` for the file format. +- Report + CSV compilation: use `node {SKILL_DIR}/scripts/compile_report.mjs {OUTPUT_DIR} --open` — generates HTML report and CSV in one step, opens overview in browser. +- URL deduplication: use `node {SKILL_DIR}/scripts/list_urls.mjs /tmp` after discovery. +- **Subagents must use ONLY the Bash tool. No other tools allowed.** +- **Main agent NEVER reads raw discovery JSON batch files.** Use `list_urls.mjs` for dedup. 
+ +**CRITICAL — Minimize permission prompts**: +- Subagents MUST batch ALL file writes into a SINGLE Bash call using chained heredocs. One Bash call = one permission prompt. +- Batch ALL searches and ALL fetches into single Bash calls using `&&` chaining. + +## Pipeline Overview + +Follow these 5 steps in order. Do not skip steps or reorder. + +1. **Company Research** — Deeply understand the user's company, product, and who they sell to +2. **Depth Mode Selection** — Choose research depth based on how many targets they want +3. **Discovery** — Find target companies using diverse search queries +4. **Deep Research & Scoring** — Research each company, score ICP fit +5. **Report & CSV** — Present findings, compile scored CSV + +--- + +## Step 0: Setup Output Directory + +Before starting, create the output directory on the user's Desktop: + +```bash +OUTPUT_DIR=~/Desktop/{company_slug}_research_{YYYY-MM-DD} +mkdir -p "$OUTPUT_DIR" +``` + +Replace `{company_slug}` with the user's company name (lowercase, hyphenated) and `{YYYY-MM-DD}` with today's date. Pass `{OUTPUT_DIR}` (as a full literal path, not with `~`) to all subagent prompts so they write research files there. + +Also clean up discovery batch files from prior runs: +```bash +rm -f /tmp/company_discovery_batch_*.json +``` + +## Step 1: Deep Company Research + +This is the most important step. The quality of everything downstream depends on deeply understanding the user's company. + +1. Ask the user for their company name or URL + +2. **Check for an existing profile**: + - List files in `{SKILL_DIR}/profiles/` (ignore `example.json`) + - If a matching profile exists → load it, present to user: "I have your profile from {researched_at}. Still accurate?" If yes → skip to Step 2. + - If no profile exists → proceed with deep research below. + +3. **Run a full deep research on the user's company** using the Plan→Research→Synthesize pattern. 
+ See `references/research-patterns.md` for sub-question templates and research methodology. + + **Key research steps:** + - Search: `bb search "{company name}" --num-results 10` + - Fetch homepage: `bb fetch --allow-redirects "{company website}"` + - **Discover site pages via sitemap** (do NOT hardcode paths like `/about` or `/customers`): + 1. `bb fetch --allow-redirects "{company website}/sitemap.xml"` — primary source + 2. Scan for URLs with keywords: `customer`, `case-stud`, `pricing`, `about`, `use-case`, `industry`, `solution` + 3. Optionally also fetch `/llms.txt` for page descriptions + 4. Pick 3-5 most relevant URLs and fetch those + - Search for external context and competitors + - Accumulate findings with confidence levels + + **Synthesize into a profile**: + Company, Product, Existing Customers, Competitors, Use Cases. + Do NOT include ICP or sub-verticals — those are per-run decisions. + +4. Present the profile to the user for confirmation. Do not proceed until confirmed. + +5. **Save the confirmed profile** to `{SKILL_DIR}/profiles/{company-slug}.json` + +6. **Ask clarifying questions** using `AskUserQuestion` with checkboxes: + - "Which segments are you targeting?" with options derived from the company research + - "Company stage?" — Startups, Mid-market, Enterprise, All + - "How many companies / depth?" — Quick (~100), Deep (~50), Deeper (~25) + - This is the ONLY user interaction. After this, execute silently until results are ready. + +## Step 2: Depth Mode Selection + +| Mode | Research per company | Best for | +|------|---------------------|----------| +| `quick` | Homepage + 1-2 searches | ~100 companies, broad scan | +| `deep` | 2-3 sub-questions, 5-8 tool calls | ~50 companies, solid research | +| `deeper` | 4-5 sub-questions, 10-15 tool calls | ~25 companies, full intelligence | + +## Step 3: Discovery + +**Formula**: `ceil(requested_companies / 35)` search queries needed. Over-discover by ~2-3x because filtering typically drops 50-70%. 
+ +Generate search queries with these patterns: +- Industry + company stage + geography ("fintech startups series A Bay Area") +- Technology stack + use case ("companies using Selenium for web scraping") +- Competitor adjacency ("alternatives to {known company in ICP}") +- Buyer persona + pain point ("engineering teams struggling with browser automation") + +**Process**: +1. Launch ALL discovery subagents at once (up to ~6 per message). Each runs its queries in a SINGLE Bash call: + ```bash + bb search "{query}" --num-results 25 --output /tmp/company_discovery_batch_{N}.json + ``` +2. After all waves complete, deduplicate: `node {SKILL_DIR}/scripts/list_urls.mjs /tmp` +3. **Filter the URL list** — remove: + - Blog posts, news articles (globenewswire.com, techcrunch.com, etc.) + - Directories/aggregators (tracxn.com, crunchbase.com, g2.com) + - The user's own competitors and existing customers (from profile) + Keep only company homepages. + +See `references/workflow.md` for subagent prompt templates and wave management. + +## Step 4: Deep Research & Scoring + +Launch subagents to research companies in parallel. See `references/workflow.md` for the enrichment subagent prompt template. See `references/research-patterns.md` for the full research methodology. + +**Process**: +1. Split filtered URLs into groups per subagent (quick: ~10, deep: ~5, deeper: ~2-3) +2. Launch ALL enrichment subagents at once (up to ~6 per message) +3. Each subagent uses ONLY Bash — for each company: + + **Phase A — Plan** (skip in quick mode): + Decompose into 2-5 sub-questions based on ICP and enrichment fields. + + **Phase B — Research Loop**: + Search and fetch pages, extract findings. Respect step budget (quick: 2-3, deep: 5-8, deeper: 10-15). + + **Phase C — Synthesize**: + Score ICP fit 1-10 with evidence. Fill enrichment fields from findings. + +4. Subagents write ALL markdown files in a SINGLE Bash call using chained heredocs to `{OUTPUT_DIR}/` +5. 
After ALL subagents complete, proceed to Step 5 + +**Critical**: Include the confirmed ICP description verbatim in every subagent prompt. Pass the full literal `{OUTPUT_DIR}` path to every subagent. + +## Step 5: Report & CSV + +1. **Generate HTML report + CSV** (opens overview in browser automatically): + ```bash + node {SKILL_DIR}/scripts/compile_report.mjs {OUTPUT_DIR} --open + ``` + This generates: + - `{OUTPUT_DIR}/index.html` — overview page with scored table (opens in browser) + - `{OUTPUT_DIR}/companies/*.html` — individual company pages (linked from overview) + - `{OUTPUT_DIR}/results.csv` — scored spreadsheet for import into sheets/CRM + +2. **Present a summary in chat** too: + +``` +## Company Research Complete + +- **Total companies researched**: {count} +- **Depth mode**: {mode} +- **Score distribution**: + - Strong fit (8-10): {count} + - Partial fit (5-7): {count} + - Weak fit (1-4): {count} +- **Report opened in browser**: ~/Desktop/{company_slug}_research_{date}/index.html +``` + +3. Show the **top companies** sorted by ICP score in a table: + +``` +| Company | Score | Product | Industry | Fit Reasoning | +|---------|-------|---------|----------|---------------| +| Acme | 9 | AI inventory management | E-commerce SaaS | Series A, uses Selenium, expanding to EU | +``` + +4. For the top 3-5 companies, show a brief research summary — key findings, why they're a good fit, and what specific angle to approach them with. + +Offer to dig deeper into specific companies, adjust scoring criteria, or re-run discovery with different queries. diff --git a/skills/company-research/profiles/browserbase.json b/skills/company-research/profiles/browserbase.json new file mode 100644 index 0000000..12105a7 --- /dev/null +++ b/skills/company-research/profiles/browserbase.json @@ -0,0 +1,9 @@ +{ + "company": "Browserbase", + "website": "https://www.browserbase.com", + "product": "Cloud browser infrastructure for AI agents and web automation. 
Run Playwright, Puppeteer, and Selenium at scale with stealth mode, CAPTCHA solving, session persistence, residential proxies, and debugging tools. Products include Browserbase (headless infra), Stagehand (browser automation SDK), and Director (workflow builder). Also offers MCP browser tool and Computer Use Agent support.", + "existing_customers": ["Firecrawl", "Ramp", "Exa", "Reducto", "Cerebras", "Cartesia", "Extend", "Polymarket"], + "competitors": ["Browserless", "Apify", "Scrapfly", "Surfsky", "BrowserTree", "Hyperbrowser", "Anchor Browser"], + "use_cases": ["AI agent browser access", "web scraping and data extraction", "automated testing", "form filling", "document downloading", "price monitoring", "lead research", "computer use agents"], + "researched_at": "2026-03-18" +} diff --git a/skills/company-research/profiles/example.json b/skills/company-research/profiles/example.json new file mode 100644 index 0000000..ae469f5 --- /dev/null +++ b/skills/company-research/profiles/example.json @@ -0,0 +1,9 @@ +{ + "company": "", + "website": "", + "product": "", + "existing_customers": [], + "competitors": [], + "use_cases": [], + "researched_at": "" +} diff --git a/skills/company-research/references/example-research.md b/skills/company-research/references/example-research.md new file mode 100644 index 0000000..31e47f9 --- /dev/null +++ b/skills/company-research/references/example-research.md @@ -0,0 +1,66 @@ +# Example Company Research File + +Each research subagent writes one markdown file per company to `{OUTPUT_DIR}/{company-slug}.md`, where `{OUTPUT_DIR}` is the per-run Desktop directory set up by the main agent in Step 0 (e.g., `/Users/jay/Desktop/browserbase_research_2026-04-23/`). The YAML frontmatter contains structured fields for report + CSV compilation. The body contains human-readable research. 
+ +## Template + +```markdown +--- +company_name: Acme Inc +website: https://acme.com +product_description: AI-powered inventory management for e-commerce brands +industry: E-commerce / SaaS +target_audience: Mid-market e-commerce brands +key_features: demand forecasting | automated reordering | multi-warehouse sync +icp_fit_score: 8 +icp_fit_reasoning: Series A e-commerce SaaS, uses Selenium for scraping, expanding to EU — strong fit +employee_estimate: 50-100 +funding_info: Series A, $12M +headquarters: San Francisco, CA +--- + +## Product +AI-powered inventory management for e-commerce brands. Helps DTC brands +automate reordering and sync across multiple warehouses. + +## Research Findings +- **[high]** Checkout optimization for Shopify stores, serving mid-market DTC brands with $5M-$50M revenue (source: acme.com/about) +- **[high]** Series A, $12M raised in Q3 2025 from Sequoia (source: TechCrunch) +- **[medium]** Recently hired 3 data engineers, expanding platform team (source: LinkedIn job posts) +- **[medium]** Uses Selenium for web scraping in their data pipeline (source: careers page) +``` + +## Field Rules + +- **YAML frontmatter**: All structured fields go here. These are extracted for CSV compilation. +- **`key_features`**: Pipe-separated (`|`) list in YAML, not a JSON array. +- **`icp_fit_score`**: Integer 1-10. +- **`icp_fit_reasoning`**: One line, references specific findings. +- **Body sections**: `## Product`, `## Research Findings`. +- **Findings format**: `- **[confidence]** fact (source: url or description)` +- **Filename**: `{OUTPUT_DIR}/{company-slug}.md` where slug is lowercase, hyphenated (e.g., `acme-inc.md`). +- **Deduplication**: One file per company. If a subagent encounters a company that already has a file, overwrite with richer data. + +## Writing via Bash Heredoc + +Subagents write these files using bash heredoc to avoid security prompts. 
Use the full literal `{OUTPUT_DIR}` path — no `~` or `$HOME`: + +```bash +cat << 'COMPANY_MD' > {OUTPUT_DIR}/acme-inc.md +--- +company_name: Acme Inc +website: https://acme.com +... +--- + +## Product +... + +## Research Findings +... +COMPANY_MD +``` + +Use `'COMPANY_MD'` (quoted) as the delimiter to prevent shell variable expansion. + +**IMPORTANT**: Write ALL company files in a SINGLE Bash call using chained heredocs to minimize permission prompts. diff --git a/skills/company-research/references/report-template.html b/skills/company-research/references/report-template.html new file mode 100644 index 0000000..97abeb7 --- /dev/null +++ b/skills/company-research/references/report-template.html @@ -0,0 +1,139 @@ + + + + + +Company Research — {{COMPANY_NAME}} + + + + + + +
+
+
+

{{TITLE}}

+
{{META}}
+
+ + Powered by Browserbase + + +
+ +
+
Companies
{{TOTAL}}
+
Strong Fit (8-10)
{{HIGH_COUNT}}
+
Partial Fit (5-7)
{{MEDIUM_COUNT}}
+
Weak Fit (1-4)
{{LOW_COUNT}}
+
+ +
+
+ Score Distribution + {{HIGH_PCT}}% strong fit +
+
+
+
+
+
+
+ Strong (8-10) + Partial (5-7) + Weak (1-4) +
+
+ + + + + + + + + + + + + {{TABLE_ROWS}} + +
ScoreCompanyProductIndustryFit Reasoning
+
+ + + + + diff --git a/skills/company-research/references/research-patterns.md b/skills/company-research/references/research-patterns.md new file mode 100644 index 0000000..b050dc2 --- /dev/null +++ b/skills/company-research/references/research-patterns.md @@ -0,0 +1,178 @@ +# Company Research — Deep Research Patterns + +## Overview + +This reference defines two research contexts: +1. **Self-Research** (Step 1) — Deep research on the user's own company to build a strong ICP foundation +2. **Target Research** (Step 6) — Research each discovered company using Plan→Research→Synthesize + +Both use the same 3-phase pattern but with different sub-questions and goals. + +## Self-Research (User's Company) + +This is the most important research in the pipeline. Every downstream decision depends on it. + +### Sub-Questions +- "What does {company} sell and what specific problem does it solve?" +- "Who are {company}'s existing customers? What industries, company sizes, and use cases?" +- "Who are {company}'s competitors and what differentiates them?" +- "What pricing model does {company} use and who is the typical buyer persona?" +- "What use cases and pain points does {company}'s marketing emphasize?" + +### Page Discovery +Discover site pages dynamically — do NOT hardcode paths like `/about` or `/customers`: +1. Fetch `bb fetch --allow-redirects "{company website}/sitemap.xml"` — primary source, has ALL pages +2. Scan sitemap URLs for keywords: `customer`, `case-stud`, `pricing`, `about`, `use-case`, `blog`, `docs`, `industry`, `solution` +3. Optionally fetch `bb fetch --allow-redirects "{company website}/llms.txt"` for page descriptions +4. Pick the 3-5 most relevant URLs from the sitemap and fetch those +5. Sitemap is the source of truth. llms.txt is bonus context but often incomplete. 
+
+### External Research
+- Search: `"{company} customers use cases reviews"`
+- Search: `"{company} alternatives competitors vs"`
+- Fetch 1-2 of the most informative third-party results (G2, blog posts, comparisons)
+
+### Synthesis Output
+From all findings, produce a company profile:
+- **Company**: name
+- **Product**: what they sell, how it works, key capabilities (2-3 sentences, specific)
+- **Existing Customers**: named customers or customer types found
+- **Competitors**: who they compete with, key differentiators
+- **Use Cases**: broad list of use cases the product serves (NOT tied to one vertical)
+
+Do NOT include ICP, pitch angle, or sub-verticals in the profile. Those are per-run targeting decisions made in Step 1's clarifying questions after the profile is confirmed. The profile is a general-purpose company fact sheet that works regardless of which vertical you target next.
+
+### Why This Matters
+A thin profile produces generic search queries, weak lead scoring, and cookie-cutter outreach. A rich profile with specific customers, competitors, and use cases produces targeted queries, accurate scoring, and approach angles that reference real pain points.
+
+---
+
+## Target Company Research (Step 4)
+
+### Sub-Question Templates
+
+Generate sub-questions from these categories based on the ICP and enrichment fields requested. Not every category applies to every company — pick the most relevant.
+
+### Priority 1 (Always ask)
+- **Product/Market**: "What does {company} sell and who are their customers?"
+- **ICP Fit**: "How does {company}'s product/market relate to {sender's ICP description}?"
+
+### Priority 2 (Ask in deep/deeper)
+- **Tech Stack**: "What technologies, frameworks, or infrastructure does {company} use?"
+- **Growth Signals**: "Has {company} raised funding, launched products, or expanded recently?"
+- **Pain Points**: "What challenges might {company} face that {sender's product} addresses?" 
+ +### Priority 3 (Ask in deeper only) +- **Decision Makers**: "Who leads engineering, product, or growth at {company}?" +- **Competitive Landscape**: "Who are {company}'s competitors and how are they differentiated?" +- **Customers/Case Studies**: "Who are {company}'s notable customers and what results do they highlight?" + +### Search Query Patterns + +For each sub-question, generate 2-3 search query variations: + +``` +# Product/Market +"{company name} what they do" +"{company name} product features customers" + +# Tech Stack +"{company name} tech stack engineering blog" +"{company name} careers software engineer" (job posts reveal stack) + +# Growth Signals +"{company name} funding round 2025 2026" +"{company name} launch announcement" +"{company name} hiring" + +# Pain Points +"{company name} challenges {relevant domain}" +"{company name} {problem sender solves}" + +# Decision Makers +"{company name} VP engineering CTO LinkedIn" +"{company name} head of growth product" +``` + +## Finding Format + +Each finding is a self-contained factual statement tied to a source: + +```json +{ + "subQuestion": "What does Acme sell and who are their customers?", + "fact": "Acme provides checkout optimization for Shopify stores, serving mid-market DTC brands with $5M-$50M revenue", + "sourceUrl": "https://acme.com/about", + "sourceTitle": "About Acme - Checkout Optimization", + "confidence": "high" +} +``` + +**Confidence levels**: +- `high`: Directly stated on the company's own website or official press +- `medium`: Inferred from job postings, third-party articles, or indirect signals +- `low`: Speculative based on industry/category, or from outdated sources + +## Research Loop Rules + +1. **Process sub-questions by priority** — Priority 1 first, then 2, then 3 +2. **3-5 findings per sub-question, then move on** — Don't exhaust a topic +3. **Use parallel tool calls** — Search multiple queries simultaneously when possible +4. 
**Rephrase, don't retry** — If a search returns poor results, try different keywords
+5. **Fetch selectively** — Don't fetch every URL from search results. Pick the 1-2 most relevant based on title and URL
+6. **Stop at step limit** — Respect the depth mode's step budget per company
+7. **Homepage first** — Always fetch the company's homepage before branching to other pages
+8. **Deduplicate findings** — Don't record the same fact twice from different sources
+
+## Depth Mode Behavior
+
+### Quick Mode (100+ companies)
+- **Skip Phase A** — No sub-question decomposition
+- **Phase B**: Fetch the company homepage. Run 1-2 supplementary searches if homepage data is thin.
+- **Phase C**: Extract available data, score ICP, write fit reasoning from what's available
+- **Budget**: 2-3 total tool calls per company
+- **Trade-off**: Fast and cheap, but fit reasoning may be less specific
+
+### Deep Mode (25-50 companies)
+- **Phase A**: Decompose into 2-3 sub-questions (Priority 1 + selected Priority 2)
+- **Phase B**: For each sub-question, run 2-3 searches + fetch 1-2 URLs. Target 3-5 findings per sub-question.
+- **Phase C**: Synthesize from all findings. ICP reasoning references specific evidence and cites the most specific/compelling finding.
+- **Budget**: 5-8 total tool calls per company
+- **Trade-off**: Good balance of depth and scale
+
+### Deeper Mode (10-25 companies)
+- **Phase A**: Decompose into 4-5 sub-questions (Priority 1 + 2 + selected Priority 3)
+- **Phase B**: Research exhaustively. Fetch multiple pages per company (homepage, about, blog, careers, product pages). Target 3-5 findings per sub-question.
+- **Phase C**: Synthesize with cited evidence. ICP reasoning is detailed and references multiple specific signals. 
+
+- **Budget**: 10-15 total tool calls per company
+- **Trade-off**: High quality intelligence, but slow and expensive
+
+## Synthesis Instructions
+
+After the research loop completes for a company, synthesize findings into the output record:
+
+### ICP Scoring
+Score 1-10 using ALL accumulated findings as evidence:
+- **8-10**: Strong match. Multiple high-confidence findings confirm right industry, company stage, and clear pain point alignment. The pitch angle directly addresses a visible need supported by evidence.
+- **5-7**: Partial match. Some findings suggest relevance but key signals are missing or low-confidence. Adjacent industry or unclear pain point.
+- **1-4**: Weak match. Findings indicate wrong segment, too large/small, or no apparent connection to sender's product.
+
+Write `icp_fit_reasoning` referencing specific findings: "Series A fintech (from Crunchbase), uses Selenium for scraping (from job posting), expanding to EU market (from blog) — strong fit for browser infrastructure."
+
+### Approach Angle
+Use the **richest, most specific** findings when suggesting how to approach each company (surfaced in the Step 5 summaries):
+- Opening: Use the most concrete finding (a specific product feature, a recent launch, a job posting)
+- Bridge: Connect a finding about their challenges/stack to the sender's pitch angle
+- If only low-confidence findings exist, keep the suggested angle shorter and more general — don't fabricate specificity
+
+### Enrichment Fields
+Map findings to enrichment fields:
+- `product_description` → from Product/Market findings
+- `industry` → inferred from Product/Market
+- `employee_estimate` → from LinkedIn search or careers page findings
+- `funding_info` → from Growth Signals findings
+- `headquarters` → from company homepage or about page
+- `target_audience` → from Product/Market findings
+- `key_features` → from product page findings
+
+If a field has no supporting findings, leave it empty rather than guessing. 
diff --git a/skills/company-research/references/workflow.md b/skills/company-research/references/workflow.md new file mode 100644 index 0000000..c016d7a --- /dev/null +++ b/skills/company-research/references/workflow.md @@ -0,0 +1,228 @@ +# Company Research — Workflow Reference + +## Discovery Batch JSON Schema + +File: `/tmp/company_discovery_batch_{N}.json` + +`bb search --output` writes a JSON object (NOT a flat array): + +```json +{ + "requestId": "abc123", + "query": "AI data extraction startups", + "results": [ + { "url": "https://example.com", "title": "Example Corp", "author": null, "publishedDate": null }, + ... + ] +} +``` + +The `list_urls.mjs` script handles both formats (flat array and `{ results: [...] }`). + +## Company Research Markdown Format + +File: `{OUTPUT_DIR}/{company-slug}.md` + +Where `{OUTPUT_DIR}` is the per-run directory on the user's Desktop (e.g., `/Users/jay/Desktop/browserbase_research_2026-04-23/`). The main agent sets this up in Step 0 and passes the full literal path to every subagent. + +Each research subagent writes one markdown file per company. See `references/example-research.md` for the full template. + +**YAML frontmatter fields** (used for report + CSV compilation): +- `company_name` (required) +- `website` (required) +- `product_description` +- `industry` +- `target_audience` +- `key_features` (pipe-separated: `feature1 | feature2 | feature3`) +- `icp_fit_score` (integer 1-10, required) +- `icp_fit_reasoning` +- `employee_estimate` +- `funding_info` +- `headquarters` + +**Body sections**: +- `## Product` — what they do +- `## Research Findings` — evidence with confidence levels and sources + +**CRITICAL**: Use consistent field names across all files. The `compile_report.mjs` script reads these fields. + +## Extracting Text from HTML + +`bb fetch --allow-redirects` returns raw HTML. 
To extract readable text in a subagent Bash call, use: + +```bash +# Fetch and extract text in one pipeline +bb fetch --allow-redirects "https://example.com" | sed 's/]*>.*<\/script>//g; s/]*>.*<\/style>//g; s/<[^>]*>//g; s/&/\&/g; s/<//g; s/ / /g; s/&#[0-9]*;//g' | tr -s ' \n' | head -c 3000 +``` + +Or save to file first and then extract: +```bash +bb fetch --allow-redirects "https://example.com" --output /tmp/fetch_example.html && sed 's/]*>.*<\/script>//g; s/]*>.*<\/style>//g; s/<[^>]*>//g' /tmp/fetch_example.html | tr -s ' \n' | head -c 3000 +``` + +Limit to ~3000 chars per page to keep subagent context manageable. + +## Discovery Subagent Prompt Template + +``` +You are a company discovery subagent. Run search queries and save results. + +TOOL RULES — CRITICAL, FOLLOW EXACTLY: +1. You may ONLY use the Bash tool. No exceptions. +2. Run ALL searches in a SINGLE Bash call using && chaining. +3. BANNED TOOLS: WebFetch, WebSearch, Write, Read, Glob, Grep — ALL BANNED. + If you use ANY banned tool, the entire run fails. Use ONLY Bash. +4. NEVER use ~ or $HOME in paths — use full literal paths. + +TASK: +Run ALL of the following searches in ONE Bash command: + +bb search "{query1}" --num-results 25 --output /tmp/company_discovery_batch_{N1}.json && \ +bb search "{query2}" --num-results 25 --output /tmp/company_discovery_batch_{N2}.json && \ +bb search "{query3}" --num-results 25 --output /tmp/company_discovery_batch_{N3}.json && \ +echo "Discovery complete" + +After the command completes, report back ONLY the count of results found per batch. +Do NOT analyze, summarize, or return the actual results. +``` + +## Research Subagent Prompt Template + +``` +You are a company research subagent. For each company URL, research the company and score ICP fit. 
+ +CONTEXT: +- User's company: {user_company} +- User's product: {user_product} +- ICP description: {icp_description} +- Depth mode: {depth_mode} +- Output directory: {OUTPUT_DIR} ← write research files HERE, as a full literal path + +URLS TO PROCESS: +{url_list} + +TOOL RULES — CRITICAL, FOLLOW EXACTLY: +1. You may ONLY use the Bash tool. No exceptions. +2. All searches: Bash → bb search "..." --num-results 10 +3. All page fetches: Bash → bb fetch --allow-redirects "..." + bb fetch returns RAW HTML. To extract text, pipe through: + sed 's/]*>.*<\/script>//g; s/]*>.*<\/style>//g; s/<[^>]*>//g' | tr -s ' \n' | head -c 3000 + If a page returns thin content or "enable JavaScript", use bb browse instead. +4. BATCH all file writes: Write ALL markdown files in a SINGLE Bash call using chained heredocs (one permission prompt, not one per file). +5. BANNED TOOLS: WebFetch, WebSearch, Write, Read, Glob, Grep — ALL BANNED. + If you use ANY banned tool, the entire run fails. Use ONLY Bash. +6. NEVER use ~ or $HOME in paths — use full literal paths. + +RESEARCH PATTERN (per company): + +Phase A — Plan (skip in quick mode): +Decompose what you need to know into sub-questions based on ICP and enrichment fields. + +Phase B — Research Loop: +For each sub-question (or just the homepage in quick mode): +1. Run bb search with relevant query +2. Pick 1-2 most relevant URLs from results +3. Run bb fetch --allow-redirects on selected URLs, pipe through sed to extract text +4. Smart page discovery: try /llms.txt or /sitemap.xml to find relevant pages — don't guess paths +5. Extract findings: factual statements with source, confidence level +6. Accumulate findings, move to next sub-question +7. Respect step budget: quick=2-3 calls, deep=5-8, deeper=10-15 + +Phase C — Synthesize: +From accumulated findings: +1. Score ICP fit 1-10 (see rubric below) +2. Fill enrichment fields from findings +3. Reference specific findings in icp_fit_reasoning + +ICP SCORING RUBRIC: +- 8-10: Strong match. 
Multiple high-confidence findings confirm fit. +- 5-7: Partial match. Some findings suggest relevance but key signals missing. +- 1-4: Weak match. Wrong segment or no apparent connection. + +OUTPUT — write ALL company files in a SINGLE Bash call using chained heredocs directly to {OUTPUT_DIR}: + +cat << 'COMPANY_MD' > {OUTPUT_DIR}/{slug1}.md +--- +company_name: {name} +website: {url} +product_description: {description} +industry: {industry} +target_audience: {audience} +key_features: {feature1} | {feature2} | {feature3} +icp_fit_score: {score} +icp_fit_reasoning: {reasoning} +employee_estimate: {estimate} +funding_info: {funding} +headquarters: {location} +--- + +## Product +{product description paragraph} + +## Research Findings +- **[{confidence}]** {finding} (source: {url}) +COMPANY_MD +cat << 'COMPANY_MD' > {OUTPUT_DIR}/{slug2}.md +--- +... +--- +... +COMPANY_MD + +Use 'COMPANY_MD' (quoted) as the heredoc delimiter to prevent shell variable expansion. + +Report back ONLY: "Batch {batch_id}: {succeeded}/{total} researched, {findings_count} total findings." +Do NOT return raw data to the main conversation. +``` + +## Wave Management + +### Key Principle: Maximize Parallelism, Minimize Prompts +Launch as many subagents as possible in a single message (up to ~6 Agent tool calls per message). Each subagent MUST batch all its Bash operations to minimize permission prompts. + +### Discovery Phase +- Launch up to 6 discovery subagents in a single message +- Each subagent runs ALL its queries in a SINGLE Bash call using `&&` chaining +- After all waves complete, run `node {SKILL_DIR}/scripts/list_urls.mjs /tmp` +- **Filter URLs**: Remove blog posts, news articles, directories, competitors, and existing customers. Keep only company homepages. 
+ +### Research Phase +- Companies per subagent varies by depth: + - `quick`: ~10 companies per subagent + - `deep`: ~5 companies per subagent + - `deeper`: ~2-3 companies per subagent +- Each subagent writes ALL its markdown files in a SINGLE Bash call (chained heredocs) directly to `{OUTPUT_DIR}` + +### Sizing Formula +``` +search_queries = ceil(requested_companies / 35) +discovery_subagents = search_queries +expected_urls = search_queries * 20 + +quick: research_subagents = ceil(expected_urls / 10) +deep: research_subagents = ceil(expected_urls / 5) +deeper: research_subagents = ceil(expected_urls / 3) +``` + +### Error Handling +- If a subagent fails, log the error and continue with remaining batches +- If >50% of subagents fail in a wave, pause and inform the user +- If `bb fetch --allow-redirects` fails, try `bb browse` as fallback or skip + +## Report + CSV Compilation + +After all research subagents complete, compile the HTML report and CSV in one command: + +```bash +node {SKILL_DIR}/scripts/compile_report.mjs {OUTPUT_DIR} --open +``` + +The script: +- Reads all `.md` files in `{OUTPUT_DIR}` +- Parses YAML frontmatter + body sections +- Deduplicates by normalized company name (keeps highest ICP score) +- Generates `{OUTPUT_DIR}/index.html` — scored overview page +- Generates `{OUTPUT_DIR}/companies/{slug}.html` — one page per company +- Generates `{OUTPUT_DIR}/results.csv` — spreadsheet for sheets/CRM +- Opens `index.html` in the default browser (`--open` flag) +- Prints a JSON summary to stderr diff --git a/skills/company-research/scripts/compile_report.mjs b/skills/company-research/scripts/compile_report.mjs new file mode 100644 index 0000000..3463c96 --- /dev/null +++ b/skills/company-research/scripts/compile_report.mjs @@ -0,0 +1,354 @@ +#!/usr/bin/env node + +// Compiles per-company markdown research files into an HTML report + CSV. 
+// Reads the report template, fills in placeholders, generates index.html +// with a scored overview table linking to individual company pages. +// +// Usage: node compile_report.mjs [--template ] +// Example: node compile_report.mjs ~/Desktop/asprey_research_2026-04-09 + +import { readdirSync, readFileSync, writeFileSync, existsSync } from 'fs'; +import { join, dirname } from 'path'; +import { fileURLToPath } from 'url'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +const args = process.argv.slice(2); + +if (args.includes('--help') || args.includes('-h') || args.length === 0) { + console.error(`Usage: node compile_report.mjs [--template ] + +Reads all .md files from , generates: + - index.html — overview page with scored table + - companies/.html — individual company research pages + - results.csv — scored spreadsheet + +Options: + --template Path to report-template.html (default: auto-detect) + --open Open index.html in browser after generation + --help, -h Show this help message + +Examples: + node compile_report.mjs ~/Desktop/asprey_research_2026-04-09 + node compile_report.mjs ~/Desktop/research --open`); + process.exit(args.includes('--help') || args.includes('-h') ? 0 : 1); +} + +const dir = args[0]; +const shouldOpen = args.includes('--open'); +const templateIdx = args.indexOf('--template'); +let templatePath = templateIdx !== -1 ? args[templateIdx + 1] : null; + +// Auto-detect template +if (!templatePath) { + const candidates = [ + join(__dirname, '..', 'references', 'report-template.html'), + join(__dirname, 'report-template.html'), + ]; + templatePath = candidates.find(p => existsSync(p)); + if (!templatePath) { + console.error('Error: Could not find report-template.html. 
Use --template to specify path.'); + process.exit(1); + } +} + +const template = readFileSync(templatePath, 'utf-8'); + +// Read and parse markdown files +let files; +try { + files = readdirSync(dir).filter(f => f.endsWith('.md')).sort(); +} catch (err) { + console.error(`Error reading directory ${dir}: ${err.message}`); + process.exit(1); +} + +if (files.length === 0) { + console.error(`No .md files found in ${dir}`); + process.exit(1); +} + +function parseFrontmatter(content) { + const fmMatch = content.match(/^---\n([\s\S]*?)\n---/); + if (!fmMatch) return null; + const fields = {}; + for (const line of fmMatch[1].split('\n')) { + const idx = line.indexOf(':'); + if (idx > 0) { + const key = line.slice(0, idx).trim(); + const val = line.slice(idx + 1).trim().replace(/^["']|["']$/g, ''); + if (key && val) fields[key] = val; + } + } + return fields; +} + +function parseBody(content) { + const bodyMatch = content.match(/^---\n[\s\S]*?\n---\n([\s\S]*)/); + return bodyMatch ? bodyMatch[1].trim() : ''; +} + +function escapeHtml(str) { + return (str || '').replace(/&/g, '&').replace(//g, '>').replace(/"/g, '"'); +} + +function scoreClass(score) { + const s = parseInt(score) || 0; + if (s >= 8) return 'high'; + if (s >= 5) return 'medium'; + return 'low'; +} + +function mdToHtml(md) { + const lines = md.split('\n'); + const out = []; + let inList = false; + let paraLines = []; + + function flushPara() { + if (paraLines.length > 0) { + let text = paraLines.join(' ').trim(); + text = text.replace(/\*\*\[(\w+)\]\*\*/g, '[$1]'); + text = text.replace(/\*\*([^*]+)\*\*/g, '$1'); + if (text) out.push(`
<p>${text}</p>`);
+      paraLines = [];
+    }
+  }
+
+  function closeList() {
+    if (inList) { out.push('</ul>'); inList = false; }
+  }
+
+  for (const line of lines) {
+    const trimmed = line.trim();
+
+    if (!trimmed) {
+      flushPara();
+      closeList();
+      continue;
+    }
+
+    // Headings
+    if (trimmed.startsWith('## ')) {
+      flushPara(); closeList();
+      out.push(`
<h2>${escapeHtml(trimmed.slice(3))}</h2>`);
+      continue;
+    }
+    if (trimmed.startsWith('### ')) {
+      flushPara(); closeList();
+      out.push(`<h3>${escapeHtml(trimmed.slice(4))}</h3>`);
+      continue;
+    }
+
+    // List items
+    if (trimmed.startsWith('- ')) {
+      flushPara();
+      if (!inList) { out.push('
    '); inList = true; } + let text = trimmed.slice(2); + text = text.replace(/\*\*\[(\w+)\]\*\*/g, '[$1]'); + text = text.replace(/\*\*([^*]+)\*\*/g, '$1'); + out.push(`
  • ${text}
  • `); + continue; + } + + // Regular text — accumulate into paragraph + closeList(); + paraLines.push(trimmed); + } + + flushPara(); + closeList(); + return out.join('\n'); +} + +// Parse all companies +const companies = []; +for (const file of files) { + const content = readFileSync(join(dir, file), 'utf-8'); + const fields = parseFrontmatter(content); + if (!fields) continue; + const body = parseBody(content); + const slug = file.replace('.md', ''); + companies.push({ ...fields, body, slug, file }); +} + +// Sort by ICP score descending +companies.sort((a, b) => (parseInt(b.icp_fit_score) || 0) - (parseInt(a.icp_fit_score) || 0)); + +// Deduplicate +const seen = new Map(); +for (const c of companies) { + const name = (c.company_name || '').toLowerCase().replace(/\s*(inc|llc|ltd|corp|co)\s*\.?$/i, '').trim(); + if (!seen.has(name)) seen.set(name, c); +} +const deduped = [...seen.values()]; + +// Stats +const scores = deduped.map(c => parseInt(c.icp_fit_score) || 0); +const high = scores.filter(s => s >= 8).length; +const medium = scores.filter(s => s >= 5 && s < 8).length; +const low = scores.filter(s => s < 5).length; +const total = deduped.length; +const highPct = total > 0 ? Math.round((high / total) * 100) : 0; +const mediumPct = total > 0 ? Math.round((medium / total) * 100) : 0; +const lowPct = total > 0 ? 100 - highPct - mediumPct : 0; + +// Derive title from directory name +const dirName = dir.split('/').pop(); +const title = dirName.replace(/_/g, ' ').replace(/-/g, ' ').replace(/\b\w/g, c => c.toUpperCase()); + +// Generate table rows +const tableRows = deduped.map(c => { + const sc = scoreClass(c.icp_fit_score); + const hasDetail = c.body && c.body.length > 50; + const nameHtml = hasDetail + ? `${escapeHtml(c.company_name)}` + : escapeHtml(c.company_name); + const websiteHtml = c.website + ? `
    ${escapeHtml(c.website.replace(/^https?:\/\/(www\.)?/, ''))}` + : ''; + return ` + ${escapeHtml(c.icp_fit_score || '—')} + ${nameHtml}${websiteHtml} + ${escapeHtml(c.product_description || '')} + ${escapeHtml(c.industry || '')} + ${escapeHtml(c.icp_fit_reasoning || '')} + `; +}).join('\n'); + +// Fill index template +let indexHtml = template + .replace(/\{\{TITLE\}\}/g, `Company Research — ${escapeHtml(title)}`) + .replace('{{META}}', `${deduped.length} companies researched · ${new Date().toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' })}`) + .replace('{{TOTAL}}', String(total)) + .replace('{{HIGH_COUNT}}', String(high)) + .replace('{{MEDIUM_COUNT}}', String(medium)) + .replace('{{LOW_COUNT}}', String(low)) + .replace('{{HIGH_PCT}}', String(highPct)) + .replace('{{MEDIUM_PCT}}', String(mediumPct)) + .replace('{{LOW_PCT}}', String(lowPct)) + .replace('{{TABLE_ROWS}}', tableRows); + +writeFileSync(join(dir, 'index.html'), indexHtml); + +// Generate individual company pages +const { mkdirSync } = await import('fs'); +try { mkdirSync(join(dir, 'companies'), { recursive: true }); } catch {} + +for (const c of deduped) { + if (!c.body || c.body.length < 50) continue; + const sc = scoreClass(c.icp_fit_score); + const bodyHtml = mdToHtml(c.body); + + const companyHtml = ` + + + + +${escapeHtml(c.company_name)} — Research + + + + +
    + ← Back to overview +
    +

    ${escapeHtml(c.company_name)}

    +
    + ICP Score: ${escapeHtml(c.icp_fit_score || '—')} + ${c.website ? `${escapeHtml(c.website)}` : ''} +
    +
    +
    + ${c.product_description ? `
    Product
    ${escapeHtml(c.product_description)}
    ` : ''} + ${c.industry ? `
    Industry
    ${escapeHtml(c.industry)}
    ` : ''} + ${c.target_audience ? `
    Target Audience
    ${escapeHtml(c.target_audience)}
    ` : ''} + ${c.key_features ? `
    Key Features
    ${escapeHtml(c.key_features)}
    ` : ''} + ${c.employee_estimate ? `
    Employees
    ${escapeHtml(c.employee_estimate)}
    ` : ''} + ${c.funding_info ? `
    Funding
    ${escapeHtml(c.funding_info)}
    ` : ''} + ${c.headquarters ? `
    HQ
    ${escapeHtml(c.headquarters)}
    ` : ''} + ${c.icp_fit_reasoning ? `
    Fit Reasoning
    ${escapeHtml(c.icp_fit_reasoning)}
    ` : ''} +
    +
    + ${bodyHtml} +
    +
    + + +`; + + writeFileSync(join(dir, 'companies', `${c.slug}.html`), companyHtml); +} + +// Generate CSV +const priority = [ + 'company_name', 'website', 'product_description', 'icp_fit_score', + 'icp_fit_reasoning', 'industry', 'target_audience', 'key_features', + 'employee_estimate', 'funding_info', 'headquarters' +]; +const allCols = [...new Set(deduped.flatMap(r => Object.keys(r)).filter(k => k !== 'body' && k !== 'slug' && k !== 'file'))]; +const cols = [...priority.filter(c => allCols.includes(c)), ...allCols.filter(c => !priority.includes(c)).sort()]; + +function csvEscape(v) { + if (!v) return ''; + if (v.includes(',') || v.includes('"') || v.includes('\n')) return '"' + v.replace(/"/g, '""') + '"'; + return v; +} + +const csvLines = [cols.join(',')]; +for (const row of deduped) { + csvLines.push(cols.map(c => csvEscape(row[c] || '')).join(',')); +} +writeFileSync(join(dir, 'results.csv'), csvLines.join('\n') + '\n'); + +// Summary +console.error(JSON.stringify({ + total: deduped.length, + high_fit: high, + medium_fit: medium, + low_fit: low, + files_generated: { + index: join(dir, 'index.html'), + company_pages: deduped.filter(c => c.body && c.body.length > 50).length, + csv: join(dir, 'results.csv') + } +}, null, 2)); + +console.log(join(dir, 'index.html')); + +// Open in browser if requested +if (shouldOpen) { + const { execSync } = await import('child_process'); + try { execSync(`open "${join(dir, 'index.html')}"`); } catch {} +} diff --git a/skills/company-research/scripts/list_urls.mjs b/skills/company-research/scripts/list_urls.mjs new file mode 100755 index 0000000..33f6f81 --- /dev/null +++ b/skills/company-research/scripts/list_urls.mjs @@ -0,0 +1,85 @@ +#!/usr/bin/env node + +// Deduplicates discovery URLs from bb search JSON output files. +// Usage: node list_urls.mjs /tmp [--prefix company] +// Reads all {prefix}_discovery_batch_*.json files, deduplicates by domain, +// outputs one URL per line to stdout, stats to stderr. 
+ +import { readdirSync, readFileSync } from 'fs'; +import { join } from 'path'; + +const args = process.argv.slice(2); + +if (args.includes('--help') || args.includes('-h') || args.length === 0) { + console.error(`Usage: node list_urls.mjs [--prefix ] + +Reads all _discovery_batch_*.json files from , +deduplicates URLs by domain, and outputs one URL per line to stdout. + +Options: + --prefix Batch file prefix (default: "company") + --help, -h Show this help message + +Examples: + node list_urls.mjs /tmp + node list_urls.mjs /tmp --prefix company`); + process.exit(args.includes('--help') || args.includes('-h') ? 0 : 1); +} + +const dir = args[0]; +const prefixIdx = args.indexOf('--prefix'); +const prefix = prefixIdx !== -1 && args[prefixIdx + 1] ? args[prefixIdx + 1] : 'company'; + +const pattern = new RegExp(`^${prefix}_discovery_batch_.*\\.json$`); + +let files; +try { + files = readdirSync(dir) + .filter(f => pattern.test(f)) + .sort(); +} catch (err) { + console.error(`Error reading directory ${dir}: ${err.message}`); + process.exit(1); +} + +if (files.length === 0) { + console.error(`No ${prefix}_discovery_batch_*.json files found in ${dir}`); + process.exit(1); +} + +const seenDomains = new Set(); +const urls = []; +let totalResults = 0; + +for (const file of files) { + try { + const data = JSON.parse(readFileSync(join(dir, file), 'utf-8')); + const results = Array.isArray(data) ? 
data : (data.results || []); + totalResults += results.length; + + for (const result of results) { + const url = result.url; + if (!url) continue; + + try { + const hostname = new URL(url).hostname.replace(/^www\./, ''); + if (!seenDomains.has(hostname)) { + seenDomains.add(hostname); + urls.push(url); + } + } catch { + // Skip invalid URLs + } + } + } catch (err) { + console.error(`Warning: Failed to parse ${file}: ${err.message}`); + } +} + +// Output deduplicated URLs to stdout +for (const url of urls) { + console.log(url); +} + +// Stats to stderr +console.error(`\n${files.length} files, ${totalResults} total results, ${urls.length} unique domains`); diff --git a/skills/company-research/scripts/package.json b/skills/company-research/scripts/package.json new file mode 100644 index 0000000..61a2c1b --- /dev/null +++ b/skills/company-research/scripts/package.json @@ -0,0 +1,6 @@ +{ + "name": "company-research-scripts", + "version": "0.1.0", + "private": true, + "type": "module" +} From cc8d99339447f99b9a4d4786572c2118f09ad1e8 Mon Sep 17 00:00:00 2001 From: Jay Sahnan Date: Thu, 23 Apr 2026 12:24:47 +0100 Subject: [PATCH 2/4] removed profiles --- skills/company-research/.gitignore | 2 ++ .../profiles/browserbase.json | 9 ------ .../references/example-research.md | 2 +- .../company-research/references/workflow.md | 2 +- .../scripts/compile_report.mjs | 30 ++++++++++--------- 5 files changed, 20 insertions(+), 25 deletions(-) create mode 100644 skills/company-research/.gitignore delete mode 100644 skills/company-research/profiles/browserbase.json diff --git a/skills/company-research/.gitignore b/skills/company-research/.gitignore new file mode 100644 index 0000000..d4fcb2d --- /dev/null +++ b/skills/company-research/.gitignore @@ -0,0 +1,2 @@ +profiles/*.json +!profiles/example.json diff --git a/skills/company-research/profiles/browserbase.json b/skills/company-research/profiles/browserbase.json deleted file mode 100644 index 12105a7..0000000 --- 
a/skills/company-research/profiles/browserbase.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "company": "Browserbase", - "website": "https://www.browserbase.com", - "product": "Cloud browser infrastructure for AI agents and web automation. Run Playwright, Puppeteer, and Selenium at scale with stealth mode, CAPTCHA solving, session persistence, residential proxies, and debugging tools. Products include Browserbase (headless infra), Stagehand (browser automation SDK), and Director (workflow builder). Also offers MCP browser tool and Computer Use Agent support.", - "existing_customers": ["Firecrawl", "Ramp", "Exa", "Reducto", "Cerebras", "Cartesia", "Extend", "Polymarket"], - "competitors": ["Browserless", "Apify", "Scrapfly", "Surfsky", "BrowserTree", "Hyperbrowser", "Anchor Browser"], - "use_cases": ["AI agent browser access", "web scraping and data extraction", "automated testing", "form filling", "document downloading", "price monitoring", "lead research", "computer use agents"], - "researched_at": "2026-03-18" -} diff --git a/skills/company-research/references/example-research.md b/skills/company-research/references/example-research.md index 31e47f9..aa10f1c 100644 --- a/skills/company-research/references/example-research.md +++ b/skills/company-research/references/example-research.md @@ -1,6 +1,6 @@ # Example Company Research File -Each research subagent writes one markdown file per company to `{OUTPUT_DIR}/{company-slug}.md`, where `{OUTPUT_DIR}` is the per-run Desktop directory set up by the main agent in Step 0 (e.g., `/Users/jay/Desktop/browserbase_research_2026-04-23/`). The YAML frontmatter contains structured fields for report + CSV compilation. The body contains human-readable research. +Each research subagent writes one markdown file per company to `{OUTPUT_DIR}/{company-slug}.md`, where `{OUTPUT_DIR}` is the per-run Desktop directory set up by the main agent in Step 0 (e.g., `~/Desktop/acme_research_2026-04-23/`). 
The YAML frontmatter contains structured fields for report + CSV compilation. The body contains human-readable research. ## Template diff --git a/skills/company-research/references/workflow.md b/skills/company-research/references/workflow.md index c016d7a..c0936c7 100644 --- a/skills/company-research/references/workflow.md +++ b/skills/company-research/references/workflow.md @@ -23,7 +23,7 @@ The `list_urls.mjs` script handles both formats (flat array and `{ results: [... File: `{OUTPUT_DIR}/{company-slug}.md` -Where `{OUTPUT_DIR}` is the per-run directory on the user's Desktop (e.g., `/Users/jay/Desktop/browserbase_research_2026-04-23/`). The main agent sets this up in Step 0 and passes the full literal path to every subagent. +Where `{OUTPUT_DIR}` is the per-run directory on the user's Desktop (e.g., `~/Desktop/acme_research_2026-04-23/`). The main agent sets this up in Step 0 and passes the full literal path to every subagent. Each research subagent writes one markdown file per company. See `references/example-research.md` for the full template. diff --git a/skills/company-research/scripts/compile_report.mjs b/skills/company-research/scripts/compile_report.mjs index 3463c96..3b0a8ba 100644 --- a/skills/company-research/scripts/compile_report.mjs +++ b/skills/company-research/scripts/compile_report.mjs @@ -5,7 +5,7 @@ // with a scored overview table linking to individual company pages. 
// // Usage: node compile_report.mjs [--template ] -// Example: node compile_report.mjs ~/Desktop/asprey_research_2026-04-09 +// Example: node compile_report.mjs ~/Desktop/acme_research_2026-04-09 import { readdirSync, readFileSync, writeFileSync, existsSync } from 'fs'; import { join, dirname } from 'path'; @@ -30,7 +30,7 @@ Options: --help, -h Show this help message Examples: - node compile_report.mjs ~/Desktop/asprey_research_2026-04-09 + node compile_report.mjs ~/Desktop/acme_research_2026-04-09 node compile_report.mjs ~/Desktop/research --open`); process.exit(args.includes('--help') || args.includes('-h') ? 0 : 1); } @@ -108,7 +108,7 @@ function mdToHtml(md) { function flushPara() { if (paraLines.length > 0) { - let text = paraLines.join(' ').trim(); + let text = escapeHtml(paraLines.join(' ').trim()); text = text.replace(/\*\*\[(\w+)\]\*\*/g, '[$1]'); text = text.replace(/\*\*([^*]+)\*\*/g, '$1'); if (text) out.push(`

    ${text}

    `); @@ -145,7 +145,7 @@ function mdToHtml(md) { if (trimmed.startsWith('- ')) { flushPara(); if (!inList) { out.push('
      '); inList = true; } - let text = trimmed.slice(2); + let text = escapeHtml(trimmed.slice(2)); text = text.replace(/\*\*\[(\w+)\]\*\*/g, '[$1]'); text = text.replace(/\*\*([^*]+)\*\*/g, '$1'); out.push(`
    • ${text}
    • `); @@ -218,17 +218,19 @@ const tableRows = deduped.map(c => { }).join('\n'); // Fill index template +const escapedTitle = escapeHtml(title); let indexHtml = template - .replace(/\{\{TITLE\}\}/g, `Company Research — ${escapeHtml(title)}`) - .replace('{{META}}', `${deduped.length} companies researched · ${new Date().toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' })}`) - .replace('{{TOTAL}}', String(total)) - .replace('{{HIGH_COUNT}}', String(high)) - .replace('{{MEDIUM_COUNT}}', String(medium)) - .replace('{{LOW_COUNT}}', String(low)) - .replace('{{HIGH_PCT}}', String(highPct)) - .replace('{{MEDIUM_PCT}}', String(mediumPct)) - .replace('{{LOW_PCT}}', String(lowPct)) - .replace('{{TABLE_ROWS}}', tableRows); + .replace(/\{\{TITLE\}\}/g, `Company Research — ${escapedTitle}`) + .replace(/\{\{COMPANY_NAME\}\}/g, escapedTitle) + .replace(/\{\{META\}\}/g, `${deduped.length} companies researched · ${new Date().toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' })}`) + .replace(/\{\{TOTAL\}\}/g, String(total)) + .replace(/\{\{HIGH_COUNT\}\}/g, String(high)) + .replace(/\{\{MEDIUM_COUNT\}\}/g, String(medium)) + .replace(/\{\{LOW_COUNT\}\}/g, String(low)) + .replace(/\{\{HIGH_PCT\}\}/g, String(highPct)) + .replace(/\{\{MEDIUM_PCT\}\}/g, String(mediumPct)) + .replace(/\{\{LOW_PCT\}\}/g, String(lowPct)) + .replace(/\{\{TABLE_ROWS\}\}/g, tableRows); writeFileSync(join(dir, 'index.html'), indexHtml); From 498e544e163ef5c74cd43307a66eef0a02de6775 Mon Sep 17 00:00:00 2001 From: Jay Sahnan Date: Fri, 24 Apr 2026 13:10:38 +0100 Subject: [PATCH 3/4] reduce hallucinations --- skills/company-research/SKILL.md | 14 +- .../references/research-patterns.md | 9 + .../company-research/references/workflow.md | 51 ++++-- .../company-research/scripts/extract_page.mjs | 168 ++++++++++++++++++ 4 files changed, 226 insertions(+), 16 deletions(-) create mode 100755 skills/company-research/scripts/extract_page.mjs diff --git 
a/skills/company-research/SKILL.md b/skills/company-research/SKILL.md index 6f032f2..84f7cfd 100644 --- a/skills/company-research/SKILL.md +++ b/skills/company-research/SKILL.md @@ -37,13 +37,19 @@ Discover and deeply research companies to sell to. Uses Browserbase Search API f **CRITICAL — Tool restrictions (applies to main agent AND all subagents)**: - All web searches: use `bb search`. NEVER use WebSearch. -- All page fetches: use `bb fetch --allow-redirects`. NEVER use WebFetch. `bb fetch` returns raw HTML — to extract text, pipe through: `sed 's/]*>.*<\/script>//g; s/]*>.*<\/style>//g; s/<[^>]*>//g' | tr -s ' \n'`. Has a 1MB response limit — for large or JS-heavy pages, use `bb browse` instead. +- All page content extraction: use `node {SKILL_DIR}/scripts/extract_page.mjs ""`. This script fetches via `bb fetch`, parses title + meta tags + visible body text, and automatically falls back to `bb browse` when the page is JS-rendered or over 1MB. NEVER hand-roll a `bb fetch | sed` pipeline — it silently strips meta tags and doesn't handle the JSON envelope. NEVER use WebFetch. - All research output: subagents write **one markdown file per company** to `{OUTPUT_DIR}/{company-slug}.md` using bash heredoc. NEVER use the Write tool or `python3 -c`. See `references/example-research.md` for the file format. - Report + CSV compilation: use `node {SKILL_DIR}/scripts/compile_report.mjs {OUTPUT_DIR} --open` — generates HTML report and CSV in one step, opens overview in browser. - URL deduplication: use `node {SKILL_DIR}/scripts/list_urls.mjs /tmp` after discovery. - **Subagents must use ONLY the Bash tool. No other tools allowed.** - **Main agent NEVER reads raw discovery JSON batch files.** Use `list_urls.mjs` for dedup. +**CRITICAL — Anti-hallucination rules (applies to main agent AND all subagents)**: +- NEVER infer `product_description`, `industry`, or `target_audience` from a site's fonts, framework (Framer/Next.js/React), design system, or typography. 
These are cosmetic and say nothing about what the company sells. +- NEVER let the user's own ICP leak into a target's description. If you don't know what the target does, write `Unknown` — do not pattern-match them onto the ICP. +- `product_description` MUST quote or paraphrase a specific phrase from `extract_page.mjs` output (TITLE, META_DESCRIPTION, OG_DESCRIPTION, HEADINGS, or BODY). If none of those fields yield a recognizable product statement, write `Unknown — homepage content not accessible`. +- If `product_description` is `Unknown`, cap `icp_fit_score` at 3 and set `icp_fit_reasoning` to `Insufficient evidence — homepage returned no readable content`. + **CRITICAL — Minimize permission prompts**: - Subagents MUST batch ALL file writes into a SINGLE Bash call using chained heredocs. One Bash call = one permission prompt. - Batch ALL searches and ALL fetches into single Bash calls using `&&` chaining. @@ -92,12 +98,12 @@ This is the most important step. The quality of everything downstream depends on **Key research steps:** - Search: `bb search "{company name}" --num-results 10` - - Fetch homepage: `bb fetch --allow-redirects "{company website}"` + - Fetch homepage: `node {SKILL_DIR}/scripts/extract_page.mjs "{company website}"` - **Discover site pages via sitemap** (do NOT hardcode paths like `/about` or `/customers`): - 1. `bb fetch --allow-redirects "{company website}/sitemap.xml"` — primary source + 1. `bb fetch --allow-redirects "{company website}/sitemap.xml"` — sitemap is small, raw `bb fetch` is fine 2. Scan for URLs with keywords: `customer`, `case-stud`, `pricing`, `about`, `use-case`, `industry`, `solution` 3. Optionally also fetch `/llms.txt` for page descriptions - 4. Pick 3-5 most relevant URLs and fetch those + 4. 
Pick 3-5 most relevant URLs and extract with `extract_page.mjs` (NOT raw `bb fetch`) - Search for external context and competitors - Accumulate findings with confidence levels diff --git a/skills/company-research/references/research-patterns.md b/skills/company-research/references/research-patterns.md index b050dc2..faaee97 100644 --- a/skills/company-research/references/research-patterns.md +++ b/skills/company-research/references/research-patterns.md @@ -176,3 +176,12 @@ Map findings to enrichment fields: - `key_features` → from product page findings If a field has no supporting findings, leave it empty rather than guessing. + +### Anti-Hallucination Rules + +Apply these at synthesis time. They exist because the failure mode — especially on Framer/Next.js landing pages with little server-rendered copy — is for the subagent to pattern-match visual cues onto the sender's ICP and fabricate a plausible-sounding description: + +1. **Typography is not a product.** Never infer `product_description`, `industry`, or `target_audience` from fonts, design system, framework choice (Framer, Next.js, React), or site polish. "Framer-built" and "uses Geist Mono" are observations about tooling, not signals of what the company sells. +2. **No ICP leakage.** If the homepage is thin and external search turns up nothing, do NOT default the target's description toward the sender's ICP. Manufacturing AI ≠ browser automation just because both use AI. +3. **Quote, don't paraphrase from memory.** `product_description` must quote or closely paraphrase a specific phrase from `extract_page.mjs` output (TITLE / META_DESCRIPTION / OG_DESCRIPTION / HEADINGS / BODY) or from an external search result. If no such phrase exists, write `Unknown — homepage content not accessible`. +4. **Cap scores on thin evidence.** If `product_description` is `Unknown`, set `icp_fit_score` ≤ 3 and `icp_fit_reasoning: Insufficient evidence — homepage returned no readable content`. 
Do not justify a higher score on inferred signals alone. diff --git a/skills/company-research/references/workflow.md b/skills/company-research/references/workflow.md index c0936c7..e380594 100644 --- a/skills/company-research/references/workflow.md +++ b/skills/company-research/references/workflow.md @@ -46,21 +46,43 @@ Each research subagent writes one markdown file per company. See `references/exa **CRITICAL**: Use consistent field names across all files. The `compile_report.mjs` script reads these fields. -## Extracting Text from HTML +## Extracting Page Content -`bb fetch --allow-redirects` returns raw HTML. To extract readable text in a subagent Bash call, use: +Use `extract_page.mjs` for all homepage/product-page content extraction. It fetches via `bb fetch`, parses title + meta + visible body text, and falls back to `bb browse` automatically when the page is JS-rendered or too large for fetch: ```bash -# Fetch and extract text in one pipeline -bb fetch --allow-redirects "https://example.com" | sed 's/]*>.*<\/script>//g; s/]*>.*<\/style>//g; s/<[^>]*>//g; s/&/\&/g; s/<//g; s/ / /g; s/&#[0-9]*;//g' | tr -s ' \n' | head -c 3000 +node {SKILL_DIR}/scripts/extract_page.mjs "https://example.com" --max-chars 3000 ``` -Or save to file first and then extract: -```bash -bb fetch --allow-redirects "https://example.com" --output /tmp/fetch_example.html && sed 's/]*>.*<\/script>//g; s/]*>.*<\/style>//g; s/<[^>]*>//g' /tmp/fetch_example.html | tr -s ' \n' | head -c 3000 +Output is a structured block: +``` +URL: https://example.com +FETCH_OK: true|false +FALLBACK_TO_BROWSE: true|false +TITLE: ... +META_DESCRIPTION: ... +OG_TITLE: ... +OG_DESCRIPTION: ... +HEADINGS: h1/h2/h3 joined by " | " +BODY_CHARS: N +BODY: + ``` -Limit to ~3000 chars per page to keep subagent context manageable. 
+**Why not a raw `bb fetch | sed` pipeline?** `bb fetch` returns a JSON envelope with the HTML embedded as an escaped string — a naive sed pipeline strips `<>` from the JSON wrapper too and destroys the content. It also strips `` tags, which on Framer/Next.js SPAs are often the only readable content. `extract_page.mjs` handles both correctly. + +**When to use raw `bb fetch`**: Only for small structured files where you want the JSON envelope intact — e.g. `sitemap.xml`, `robots.txt`, `llms.txt`. For any HTML page you'd feed to a model, use `extract_page.mjs`. + +## Verifying content is real (not hallucinated) + +Before writing `product_description`, `industry`, or `target_audience` into a company file, confirm the claim is grounded in `extract_page.mjs` output. Quote or closely paraphrase from TITLE, META_DESCRIPTION, OG_DESCRIPTION, HEADINGS, or BODY. + +If `extract_page.mjs` returns `FETCH_OK: false` AND `FALLBACK_TO_BROWSE: false` (or BODY_CHARS < 50), the homepage is inaccessible. Do not fabricate. Write: +- `product_description: Unknown — homepage content not accessible` +- `icp_fit_score: 3` (or lower) +- `icp_fit_reasoning: Insufficient evidence — homepage returned no readable content` + +A classic failure mode this prevents: a Framer/Next.js landing page with no server-rendered copy, where the subagent pattern-matches visual cues ("design-forward", "Geist Mono", "Framer-built") onto the user's own ICP. Typography is not a product. ## Discovery Subagent Prompt Template @@ -104,15 +126,20 @@ URLS TO PROCESS: TOOL RULES — CRITICAL, FOLLOW EXACTLY: 1. You may ONLY use the Bash tool. No exceptions. 2. All searches: Bash → bb search "..." --num-results 10 -3. All page fetches: Bash → bb fetch --allow-redirects "..." - bb fetch returns RAW HTML. To extract text, pipe through: - sed 's/]*>.*<\/script>//g; s/]*>.*<\/style>//g; s/<[^>]*>//g' | tr -s ' \n' | head -c 3000 - If a page returns thin content or "enable JavaScript", use bb browse instead. +3. 
All homepage/product-page content extraction: + Bash → node {SKILL_DIR}/scripts/extract_page.mjs "URL" --max-chars 3000 + This returns structured TITLE / META_DESCRIPTION / OG_DESCRIPTION / HEADINGS / BODY and auto-falls back to bb browse for JS-rendered or >1MB pages. + DO NOT hand-roll a `bb fetch | sed` pipeline — it silently strips meta tags and doesn't parse the JSON envelope. Use `bb fetch` raw only for sitemap.xml, robots.txt, llms.txt. 4. BATCH all file writes: Write ALL markdown files in a SINGLE Bash call using chained heredocs (one permission prompt, not one per file). 5. BANNED TOOLS: WebFetch, WebSearch, Write, Read, Glob, Grep — ALL BANNED. If you use ANY banned tool, the entire run fails. Use ONLY Bash. 6. NEVER use ~ or $HOME in paths — use full literal paths. +ANTI-HALLUCINATION RULES — CRITICAL: +- NEVER infer product_description, industry, or target_audience from fonts, framework (Framer/Next.js/React), design system, or visual style. Typography is not a product. +- NEVER let the sender's ICP leak into a target's description. If you don't know what the target does, write "Unknown" — do not pattern-match them onto the ICP. +- product_description MUST quote or closely paraphrase a phrase from extract_page.mjs output. If none of TITLE/META/OG/HEADINGS/BODY yield a recognizable product statement, write "Unknown — homepage content not accessible" and cap icp_fit_score at 3. + RESEARCH PATTERN (per company): Phase A — Plan (skip in quick mode): diff --git a/skills/company-research/scripts/extract_page.mjs b/skills/company-research/scripts/extract_page.mjs new file mode 100755 index 0000000..ad17997 --- /dev/null +++ b/skills/company-research/scripts/extract_page.mjs @@ -0,0 +1,168 @@ +#!/usr/bin/env node +// Extract structured page content for company research. +// Fetches via `bb fetch` (raw HTML to a temp file), pulls title + meta tags +// + visible body text, and auto-falls back to `bb browse` when content is thin. 
+// +// Usage: node extract_page.mjs [--max-chars N] +// Output (stdout): structured block consumable by a research subagent. + +import { execFileSync } from "node:child_process"; +import { mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +const THIN_CONTENT_THRESHOLD = 200; // body chars under this → JS-rendered, fall back + +function parseArgs(argv) { + const args = { url: null, maxChars: 3000 }; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--max-chars") args.maxChars = parseInt(argv[++i], 10); + else if (!args.url) args.url = a; + } + if (!args.url) { + console.error("Usage: extract_page.mjs [--max-chars N]"); + process.exit(2); + } + return args; +} + +function bbFetch(url, outFile) { + execFileSync("bb", ["fetch", "--allow-redirects", url, "--output", outFile], { + stdio: ["ignore", "ignore", "ignore"], + }); +} + +function bbBrowseMarkdown(url) { + try { + execFileSync("bb", ["browse", "--headless", "open", url], { + stdio: ["ignore", "ignore", "ignore"], + timeout: 90000, + }); + const out = execFileSync("bb", ["browse", "--headless", "get", "markdown"], { + encoding: "utf8", + timeout: 90000, + maxBuffer: 50 * 1024 * 1024, + }); + // bb browse prints banners (e.g. "Update available...") before the JSON blob. + // Find the first '{' and try to JSON.parse from there. + const start = out.indexOf("{"); + if (start < 0) return ""; + try { + const parsed = JSON.parse(out.slice(start)); + if (parsed && typeof parsed.markdown === "string") return parsed.markdown; + } catch { + // Fallback: extract "markdown": "..." with a lenient regex that handles + // escaped quotes and newlines. 
+ const m = out.slice(start).match(/"markdown"\s*:\s*"((?:\\.|[^"\\])*)"/s); + if (m) { + try { return JSON.parse(`"${m[1]}"`); } catch { return m[1]; } + } + } + return ""; + } catch (err) { + return ""; + } +} + +function extractMeta(html, name, attr = "name") { + const re = new RegExp( + `]*>([^<]*)<\/title>/i); + return m ? m[1].trim() : ""; +} + +function extractVisibleText(html, maxChars) { + // Multi-line aware script/style removal. + let s = html + .replace(/]*>[\s\S]*?<\/script>/gi, " ") + .replace(/]*>[\s\S]*?<\/style>/gi, " ") + .replace(/]*>[\s\S]*?<\/noscript>/gi, " ") + .replace(//g, " ") + .replace(/<[^>]+>/g, " ") + .replace(/ /g, " ") + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/&#[0-9]+;/g, " ") + .replace(/\s+/g, " ") + .trim(); + return s.slice(0, maxChars); +} + +function extractHeadings(html, limit = 10) { + const re = /]*>([\s\S]*?)<\/h[1-3]>/gi; + const out = []; + let m; + while ((m = re.exec(html)) && out.length < limit) { + const text = m[1].replace(/<[^>]+>/g, "").replace(/\s+/g, " ").trim(); + if (text) out.push(text); + } + return out; +} + +function main() { + const { url, maxChars } = parseArgs(process.argv.slice(2)); + const dir = mkdtempSync(join(tmpdir(), "extract_page_")); + const htmlFile = join(dir, "page.html"); + + let html = ""; + let fetchOk = false; + try { + bbFetch(url, htmlFile); + html = readFileSync(htmlFile, "utf8"); + fetchOk = true; + } catch (err) { + console.error(`[extract_page] bb fetch failed: ${err.message}`); + } + + const title = extractTitle(html); + const metaDesc = extractMeta(html, "description"); + const ogTitle = extractMeta(html, "og:title", "property"); + const ogDesc = extractMeta(html, "og:description", "property"); + const headings = extractHeadings(html); + let body = extractVisibleText(html, maxChars); + + // Thin content → JS-rendered SPA → fall back to bb browse. 
+ let fallbackUsed = false; + if (body.length < THIN_CONTENT_THRESHOLD) { + const md = bbBrowseMarkdown(url); + if (md && md.length > body.length) { + body = md.replace(/\s+/g, " ").slice(0, maxChars); + fallbackUsed = true; + } + } + + rmSync(dir, { recursive: true, force: true }); + + // Structured output for subagent to read. + const lines = [ + `URL: ${url}`, + `FETCH_OK: ${fetchOk}`, + `FALLBACK_TO_BROWSE: ${fallbackUsed}`, + `TITLE: ${title}`, + `META_DESCRIPTION: ${metaDesc}`, + `OG_TITLE: ${ogTitle}`, + `OG_DESCRIPTION: ${ogDesc}`, + `HEADINGS: ${headings.join(" | ")}`, + `BODY_CHARS: ${body.length}`, + `BODY:`, + body, + ]; + process.stdout.write(lines.join("\n") + "\n"); +} + +main(); From a342256dec4b76797f3e641a8afdfa896c6f9420 Mon Sep 17 00:00:00 2001 From: Jay Sahnan Date: Fri, 24 Apr 2026 13:28:14 +0100 Subject: [PATCH 4/4] bugbot fix --- skills/company-research/references/workflow.md | 7 ++++--- skills/company-research/scripts/compile_report.mjs | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/skills/company-research/references/workflow.md b/skills/company-research/references/workflow.md index e380594..a013d35 100644 --- a/skills/company-research/references/workflow.md +++ b/skills/company-research/references/workflow.md @@ -149,8 +149,9 @@ Phase B — Research Loop: For each sub-question (or just the homepage in quick mode): 1. Run bb search with relevant query 2. Pick 1-2 most relevant URLs from results -3. Run bb fetch --allow-redirects on selected URLs, pipe through sed to extract text -4. Smart page discovery: try /llms.txt or /sitemap.xml to find relevant pages — don't guess paths +3. Extract page content: node {SKILL_DIR}/scripts/extract_page.mjs "URL" --max-chars 3000 + (auto-handles the JSON envelope, meta tags, and the bb browse fallback) +4. Smart page discovery: use `bb fetch --allow-redirects` on /sitemap.xml or /llms.txt to find relevant URLs — these are small XML/text files where the raw JSON envelope is fine. 
For the actual HTML pages you discover, use extract_page.mjs. 5. Extract findings: factual statements with source, confidence level 6. Accumulate findings, move to next sub-question 7. Respect step budget: quick=2-3 calls, deep=5-8, deeper=10-15 @@ -234,7 +235,7 @@ deeper: research_subagents = ceil(expected_urls / 3) ### Error Handling - If a subagent fails, log the error and continue with remaining batches - If >50% of subagents fail in a wave, pause and inform the user -- If `bb fetch --allow-redirects` fails, try `bb browse` as fallback or skip +- `extract_page.mjs` already handles the bb fetch → bb browse fallback internally. If it still returns FETCH_OK: false with empty BODY, skip the company and mark product_description as Unknown (do not guess). ## Report + CSV Compilation diff --git a/skills/company-research/scripts/compile_report.mjs b/skills/company-research/scripts/compile_report.mjs index 3b0a8ba..2759c69 100644 --- a/skills/company-research/scripts/compile_report.mjs +++ b/skills/company-research/scripts/compile_report.mjs @@ -179,7 +179,7 @@ companies.sort((a, b) => (parseInt(b.icp_fit_score) || 0) - (parseInt(a.icp_fit_ // Deduplicate const seen = new Map(); for (const c of companies) { - const name = (c.company_name || '').toLowerCase().replace(/\s*(inc|llc|ltd|corp|co)\s*\.?$/i, '').trim(); + const name = (c.company_name || '').toLowerCase().replace(/[,\s]+(inc|llc|ltd|corp|co)\.?$/i, '').trim(); if (!seen.has(name)) seen.set(name, c); } const deduped = [...seen.values()]; @@ -230,7 +230,7 @@ let indexHtml = template .replace(/\{\{HIGH_PCT\}\}/g, String(highPct)) .replace(/\{\{MEDIUM_PCT\}\}/g, String(mediumPct)) .replace(/\{\{LOW_PCT\}\}/g, String(lowPct)) - .replace(/\{\{TABLE_ROWS\}\}/g, tableRows); + .replace(/\{\{TABLE_ROWS\}\}/g, () => tableRows); writeFileSync(join(dir, 'index.html'), indexHtml);