diff --git a/scripts/ingest.ts b/scripts/ingest.ts index 1d65da83df..808e9eb00f 100644 --- a/scripts/ingest.ts +++ b/scripts/ingest.ts @@ -34,7 +34,7 @@ function logIngest(sourceId: string, type: string, title: string): void { } // ─── Constants ───────────────────────────────────────────────────────── -const KB_ROOT = join(__dirname, "..", "knowledge-base"); +import { KB_ROOT } from "./lib/kb-root"; const CONTENT_THRESHOLD = 3000; // words // ─── CLI arg parsing ─────────────────────────────────────────────────── @@ -393,6 +393,29 @@ async function main(): Promise { break; } + case "paste": { + const { ingestPaste } = await import("./lib/ingest/paste.js"); + + try { + const result = await ingestPaste(url, KB_ROOT); + console.log(`SUCCESS: Source created as ${result.sourceId}`); + console.log(); + logIngest(result.sourceId, "paste", result.title); + meta("Type", "paste"); + meta("Title", result.title); + meta("Word count", result.wordCount.toLocaleString()); + if (domains.length > 0) { + meta("Existing domains", domains.join(", ")); + } + meta("Source file", result.sourcePath); + console.log(`Raw copy saved to ${result.rawPath}.`); + } catch (err: unknown) { + console.error(err instanceof Error ? 
err.message : String(err)); + process.exit(1); + } + break; + } + case "image": { const { ingestImage } = await import("./lib/ingest/image.js"); const result = await ingestImage(url, KB_ROOT); diff --git a/scripts/lib/ingest/audio.ts b/scripts/lib/ingest/audio.ts index 76a6f3f9d0..b74fffde5c 100644 --- a/scripts/lib/ingest/audio.ts +++ b/scripts/lib/ingest/audio.ts @@ -82,11 +82,29 @@ function transcribe(filePath: string): WhisperOutput { const whisper = resolveWhisper(); const outputDir = join(tmpdir(), `whisper-${Date.now()}`); - execFileSync( - whisper, - [filePath, "--model", "base", "--language", "en", "--output_format", "json", "--output_dir", outputDir], - { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], maxBuffer: 50 * 1024 * 1024 }, - ); + // Configurable + auto-detecting. The previous hardcoded + // `--model base --language en` confabulated on non-English audio (forcing + // English on the weakest model produced fluent hallucination + repetition + // loops). Defaults: a stronger model and per-file language auto-detection + // (omit --language). Override via env, e.g. ZUHN_WHISPER_TASK=translate to + // emit English from any source language. + const model = process.env.ZUHN_WHISPER_MODEL ?? "small"; + const task = process.env.ZUHN_WHISPER_TASK ?? 
"transcribe"; + const args = [ + filePath, + "--model", model, + "--task", task, + "--output_format", "json", + "--output_dir", outputDir, + ]; + const language = process.env.ZUHN_WHISPER_LANGUAGE; + if (language) args.push("--language", language); // else Whisper auto-detects + + execFileSync(whisper, args, { + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + maxBuffer: 50 * 1024 * 1024, + }); // Whisper writes {basename_without_ext}.json in output_dir const base = basename(filePath); diff --git a/scripts/lib/ingest/detect.test.ts b/scripts/lib/ingest/detect.test.ts index 82cb28240f..693ccb5a11 100644 --- a/scripts/lib/ingest/detect.test.ts +++ b/scripts/lib/ingest/detect.test.ts @@ -51,8 +51,13 @@ describe("detectType", () => { it("detects local PDF paths", () => { expect(detectType("/path/to/document.pdf")).toBe("pdf"); }); - it("throws for unsupported local file types", () => { - expect(() => detectType("/path/to/file.txt")).toThrow(/Unsupported local file type/); + it("detects local text/markdown as paste", () => { + expect(detectType("/path/to/notes.txt")).toBe("paste"); + expect(detectType("/tmp/gtm-atlas.md")).toBe("paste"); + expect(detectType("/path/to/essay.markdown")).toBe("paste"); + }); + it("throws for genuinely unsupported local file types", () => { + expect(() => detectType("/path/to/archive.zip")).toThrow(/Unsupported local file type/); }); }); diff --git a/scripts/lib/ingest/detect.ts b/scripts/lib/ingest/detect.ts index 3e48a8f8d2..2883c27f44 100644 --- a/scripts/lib/ingest/detect.ts +++ b/scripts/lib/ingest/detect.ts @@ -1,9 +1,13 @@ -export type ContentType = "youtube" | "reddit" | "blog" | "pdf" | "audio" | "image"; +export type ContentType = "youtube" | "reddit" | "blog" | "pdf" | "audio" | "image" | "paste"; const TRACKING_PARAMS = ["utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "ref", "si", "source"]; const AUDIO_EXTENSIONS = [".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".webm"]; const IMAGE_EXTENSIONS = 
[".jpg", ".jpeg", ".png", ".heic", ".webp", ".gif", ".svg", ".tiff"]; +// Local plain-text / markdown is a sanctioned "paste" source: high-signal +// content captured by hand (essays behind JS sites, transcripts, paywalled +// pieces) that has no scrapeable URL. Handled by lib/ingest/paste.ts. +const TEXT_EXTENSIONS = [".txt", ".md", ".markdown", ".text"]; export function normalizeUrl(urlString: string): string { const url = new URL(urlString); @@ -24,6 +28,7 @@ export function detectType(input: string): ContentType { if (AUDIO_EXTENSIONS.some((ext) => lower.endsWith(ext))) return "audio"; if (IMAGE_EXTENSIONS.some((ext) => lower.endsWith(ext))) return "image"; if (lower.endsWith(".pdf")) return "pdf"; + if (TEXT_EXTENSIONS.some((ext) => lower.endsWith(ext))) return "paste"; throw new Error(`Unsupported local file type: ${input}`); } diff --git a/scripts/lib/ingest/paste.test.ts b/scripts/lib/ingest/paste.test.ts new file mode 100644 index 0000000000..0268e3342f --- /dev/null +++ b/scripts/lib/ingest/paste.test.ts @@ -0,0 +1,77 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import { mkdtemp, rm, writeFile, readFile, stat } from "node:fs/promises"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import matter from "gray-matter"; +import { ingestPaste } from "./paste"; + +describe("ingestPaste", () => { + let tempDir: string; + let kbRoot: string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), "paste-test-")); + kbRoot = join(tempDir, "knowledge-base"); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it("copies the verbatim original to sources/paste/raw/", async () => { + const f = join(tempDir, "notes.md"); + await writeFile(f, "# Hello\n\nbody text here"); + const result = await ingestPaste(f, kbRoot); + const rawStat = await stat(result.rawPath); + expect(rawStat.isFile()).toBe(true); + expect(result.rawPath).toContain("sources/paste/raw/"); 
+ expect(result.rawPath).toMatch(/\.txt$/); + }); + + it("creates source .md with valid paste frontmatter", async () => { + const f = join(tempDir, "notes.md"); + await writeFile(f, "# My Title\n\nsome content with five words"); + const result = await ingestPaste(f, kbRoot); + const parsed = matter(await readFile(result.sourcePath, "utf-8")); + expect(parsed.data.id).toMatch(/^SRC-\d{6}-[A-F0-9]{4}$/); + expect(parsed.data.type).toBe("paste"); + expect(parsed.data.title).toBe("My Title"); + expect(parsed.data.date_ingested).toMatch(/^\d{4}-\d{2}-\d{2}$/); + expect(parsed.data.insight_count).toBe(0); + expect(typeof parsed.data.word_count).toBe("number"); + // No url field for a local paste (gray-matter cannot serialize undefined) + expect(parsed.data.url).toBeUndefined(); + }); + + it("preserves the full text as the source body", async () => { + const f = join(tempDir, "doc.txt"); + const body = "# Doc\n\nthe quick brown fox jumps over the lazy dog"; + await writeFile(f, body); + const result = await ingestPaste(f, kbRoot); + const parsed = matter(await readFile(result.sourcePath, "utf-8")); + expect(parsed.content).toContain("the quick brown fox"); + }); + + it("derives title from the first H1 when present", async () => { + const f = join(tempDir, "whatever-filename.md"); + await writeFile(f, "# Real Document Title\n\ncontent"); + const result = await ingestPaste(f, kbRoot); + expect(result.title).toBe("Real Document Title"); + }); + + it("falls back to a humanized filename when there is no H1", async () => { + const f = join(tempDir, "gtm-atlas-notes.txt"); + await writeFile(f, "no heading here, just prose content"); + const result = await ingestPaste(f, kbRoot); + expect(result.title).toBe("gtm atlas notes"); + }); + + it("returns sourceId matching the SRC- pattern", async () => { + const f = join(tempDir, "x.md"); + await writeFile(f, "# X\n\nbody"); + const result = await ingestPaste(f, kbRoot); + expect(result.sourceId).toMatch(/^SRC-\d{6}-[A-F0-9]{4}$/); + 
// scripts/lib/ingest/paste.ts — definitions.
// Names this module relies on: join, basename, extname (node:path); mkdir,
// copyFile, writeFile, readFile (node:fs/promises); matter (gray-matter);
// generateSourceId (../generate-id); slugify (./slug).

/** Outcome of ingesting one local text/markdown file as a "paste" source. */
export interface PasteResult {
  /** Identifier returned by `generateSourceId` for this source. */
  sourceId: string;
  /** Path of the written source `.md` (front matter + full text). */
  sourcePath: string;
  /** Path of the verbatim copy of the input file. */
  rawPath: string;
  /** Title derived from the document (first H1) or from the file name. */
  title: string;
  /** Number of whitespace-separated tokens in the trimmed text. */
  wordCount: number;
}

/** Any single line terminator, matching what a multiline regex treats as one. */
const LINE_BREAK = /\r\n|[\n\r\u2028\u2029]/;
/** Opening/closing marker of a fenced code block (up to 3 spaces of indent). */
const FENCE = /^ {0,3}(`{3,}|~{3,})/;
/** ATX level-1 heading on a single line: `#`, whitespace, then the text. */
const H1 = /^#\s+(.+?)\s*$/;
/** Closing delimiter of a YAML front-matter block. */
const FRONT_MATTER_END = /^(?:---|\.\.\.)\s*$/;

/**
 * Count whitespace-separated tokens.
 *
 * @param text - Text to measure.
 * @returns Number of runs of non-whitespace characters; 0 for empty or blank text.
 */
function wordCount(text: string): number {
  return text.match(/\S+/g)?.length ?? 0;
}

/**
 * Derive a title: the first markdown H1 if present, otherwise a humanized
 * file name, otherwise "untitled paste". The title is the document's own,
 * not something invented at extraction time.
 *
 * The text is scanned line by line so that a heading can never span lines
 * (an empty `#` must not adopt the following paragraph as its title), and
 * `#` lines that are not headings are ignored: shell/YAML comments inside
 * fenced code blocks or inside a leading front-matter block.
 *
 * @param text - Document text (expected to be already trimmed).
 * @param filePath - Path of the input file; only its base name is used.
 * @returns A non-empty title.
 */
function deriveTitle(text: string, filePath: string): string {
  const lines = text.split(LINE_BREAK);

  // Skip a leading front-matter block, but only when it is actually closed;
  // a lone leading `---` is just a thematic break.
  let start = 0;
  if (lines[0]?.trim() === "---") {
    const close = lines.findIndex((line, i) => i > 0 && FRONT_MATTER_END.test(line));
    if (close > 0) start = close + 1;
  }

  let openFence: string | undefined;
  for (const line of lines.slice(start)) {
    const marker = FENCE.exec(line)?.[1];
    if (marker !== undefined) {
      if (openFence === undefined) {
        openFence = marker;
      } else if (marker[0] === openFence[0] && marker.length >= openFence.length) {
        // Same fence character and at least as long: this closes the block.
        openFence = undefined;
      }
      continue;
    }
    if (openFence !== undefined) continue; // inside a code block

    const heading = H1.exec(line)?.[1]?.trim();
    if (heading) return heading;
  }

  const base = basename(filePath);
  const stem = base.slice(0, base.length - extname(base).length);
  const humanized = stem.replace(/[-_]+/g, " ").trim();
  return humanized.length > 0 ? humanized : "untitled paste";
}

/**
 * Ingest a local plain-text / markdown file as a "paste" source.
 *
 * Copies the verbatim original to `sources/paste/raw/<slug>.txt` and writes
 * `sources/paste/<slug>.md`, whose body is the trimmed text and whose
 * gray-matter front matter holds id, type, title, date_ingested,
 * insight_count and word_count. No `url` field is written.
 *
 * NOTE(review): files are named after the title slug only, so two pastes
 * with the same title overwrite each other — confirm whether that is the
 * intended re-ingest behavior.
 *
 * @param input - Path of the local text/markdown file.
 * @param kbRoot - Knowledge-base root; defaults to `../../../knowledge-base`
 *   relative to this module.
 * @returns Identifiers and paths of what was written.
 * @throws Error when the file cannot be read or contains only whitespace.
 */
export async function ingestPaste(
  input: string,
  kbRoot?: string,
): Promise<PasteResult> {
  // Read and validate before touching the knowledge base, so a bad input
  // path or an empty file leaves no directories behind.
  const text = (await readFile(input, "utf-8")).trim();
  if (text.length === 0) {
    throw new Error(`Paste source is empty: ${input}`);
  }

  const root = kbRoot ?? join(__dirname, "../../..", "knowledge-base");
  const sourceDir = join(root, "sources", "paste");
  const rawDir = join(sourceDir, "raw");
  // Recursive mkdir of the nested raw/ directory also creates sourceDir.
  await mkdir(rawDir, { recursive: true });

  const title = deriveTitle(text, input);
  // Presumably slugify can return "" for a title with no slug-safe
  // characters (verify against ./slug); never emit bare ".txt" / ".md".
  const slug = slugify(title) || "untitled-paste";

  const rawPath = join(rawDir, `${slug}.txt`);
  await copyFile(input, rawPath);

  const sourceId = generateSourceId(slug, input);
  const today = new Date().toISOString().slice(0, 10); // YYYY-MM-DD (UTC)
  const wc = wordCount(text);

  const sourceData: Record<string, unknown> = {
    id: sourceId,
    type: "paste",
    title,
    date_ingested: today,
    insight_count: 0,
    word_count: wc,
  };

  const sourcePath = join(sourceDir, `${slug}.md`);
  await writeFile(sourcePath, matter.stringify(text, sourceData), "utf-8");

  return { sourceId, sourcePath, rawPath, title, wordCount: wc };
}