Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion scripts/ingest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ function logIngest(sourceId: string, type: string, title: string): void {
}

// ─── Constants ─────────────────────────────────────────────────────────
const KB_ROOT = join(__dirname, "..", "knowledge-base");
import { KB_ROOT } from "./lib/kb-root";
const CONTENT_THRESHOLD = 3000; // words

// ─── CLI arg parsing ───────────────────────────────────────────────────
Expand Down Expand Up @@ -393,6 +393,29 @@ async function main(): Promise<void> {
break;
}

case "paste": {
const { ingestPaste } = await import("./lib/ingest/paste.js");

try {
const result = await ingestPaste(url, KB_ROOT);
console.log(`SUCCESS: Source created as ${result.sourceId}`);
console.log();
logIngest(result.sourceId, "paste", result.title);
meta("Type", "paste");
meta("Title", result.title);
meta("Word count", result.wordCount.toLocaleString());
if (domains.length > 0) {
meta("Existing domains", domains.join(", "));
}
meta("Source file", result.sourcePath);
console.log(`Raw copy saved to ${result.rawPath}.`);
} catch (err: unknown) {
console.error(err instanceof Error ? err.message : String(err));
process.exit(1);
}
break;
}

case "image": {
const { ingestImage } = await import("./lib/ingest/image.js");
const result = await ingestImage(url, KB_ROOT);
Expand Down
28 changes: 23 additions & 5 deletions scripts/lib/ingest/audio.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,29 @@ function transcribe(filePath: string): WhisperOutput {
const whisper = resolveWhisper();
const outputDir = join(tmpdir(), `whisper-${Date.now()}`);

execFileSync(
whisper,
[filePath, "--model", "base", "--language", "en", "--output_format", "json", "--output_dir", outputDir],
{ encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], maxBuffer: 50 * 1024 * 1024 },
);
// Model, task, and language are configurable via environment variables.
// The previous hardcoded `--model base --language en` confabulated on
// non-English audio: forcing English on one of the smallest models produced
// fluent hallucination and repetition loops. Defaults are now a stronger
// model ("small") and per-file language auto-detection (--language is
// omitted). Override via env, e.g. ZUHN_WHISPER_TASK=translate to emit
// English from any source language.
const model = process.env.ZUHN_WHISPER_MODEL ?? "small";
const task = process.env.ZUHN_WHISPER_TASK ?? "transcribe";
const args = [
filePath,
"--model", model,
"--task", task,
"--output_format", "json",
"--output_dir", outputDir,
];
const language = process.env.ZUHN_WHISPER_LANGUAGE;
if (language) args.push("--language", language); // else Whisper auto-detects

execFileSync(whisper, args, {
encoding: "utf-8",
stdio: ["pipe", "pipe", "pipe"],
maxBuffer: 50 * 1024 * 1024,
});

// Whisper writes {basename_without_ext}.json in output_dir
const base = basename(filePath);
Expand Down
9 changes: 7 additions & 2 deletions scripts/lib/ingest/detect.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,13 @@ describe("detectType", () => {
it("detects local PDF paths", () => {
expect(detectType("/path/to/document.pdf")).toBe("pdf");
});
it("throws for unsupported local file types", () => {
expect(() => detectType("/path/to/file.txt")).toThrow(/Unsupported local file type/);
it("detects local text/markdown as paste", () => {
expect(detectType("/path/to/notes.txt")).toBe("paste");
expect(detectType("/tmp/gtm-atlas.md")).toBe("paste");
expect(detectType("/path/to/essay.markdown")).toBe("paste");
});
it("throws for genuinely unsupported local file types", () => {
expect(() => detectType("/path/to/archive.zip")).toThrow(/Unsupported local file type/);
});
});

Expand Down
7 changes: 6 additions & 1 deletion scripts/lib/ingest/detect.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
export type ContentType = "youtube" | "reddit" | "blog" | "pdf" | "audio" | "image";
export type ContentType = "youtube" | "reddit" | "blog" | "pdf" | "audio" | "image" | "paste";

const TRACKING_PARAMS = ["utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", "ref", "si", "source"];

const AUDIO_EXTENSIONS = [".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".webm"];
const IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".heic", ".webp", ".gif", ".svg", ".tiff"];
// Local plain-text / markdown is a sanctioned "paste" source: high-signal
// content captured by hand (essays behind JS sites, transcripts, paywalled
// pieces) that has no scrapeable URL. Handled by lib/ingest/paste.ts.
const TEXT_EXTENSIONS = [".txt", ".md", ".markdown", ".text"];

export function normalizeUrl(urlString: string): string {
const url = new URL(urlString);
Expand All @@ -24,6 +28,7 @@ export function detectType(input: string): ContentType {
if (AUDIO_EXTENSIONS.some((ext) => lower.endsWith(ext))) return "audio";
if (IMAGE_EXTENSIONS.some((ext) => lower.endsWith(ext))) return "image";
if (lower.endsWith(".pdf")) return "pdf";
if (TEXT_EXTENSIONS.some((ext) => lower.endsWith(ext))) return "paste";
throw new Error(`Unsupported local file type: ${input}`);
}

Expand Down
77 changes: 77 additions & 0 deletions scripts/lib/ingest/paste.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import { describe, it, expect, beforeEach, afterEach } from "vitest";
import { mkdtemp, rm, writeFile, readFile, stat } from "node:fs/promises";
import { join } from "node:path";
import { tmpdir } from "node:os";
import matter from "gray-matter";
import { ingestPaste } from "./paste";

// End-to-end tests for ingestPaste. Each test writes a small input file into
// a fresh temp directory, ingests it into a throwaway knowledge-base root
// under that directory, and inspects the files ingestPaste reports writing.
describe("ingestPaste", () => {
  let tempDir: string;
  let kbRoot: string;

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), "paste-test-"));
    // Not created here: ingestPaste is expected to create the tree itself.
    kbRoot = join(tempDir, "knowledge-base");
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it("copies the verbatim original to sources/paste/raw/", async () => {
    const f = join(tempDir, "notes.md");
    // Trailing newline on purpose: the raw copy must be the untouched file,
    // not the trimmed text that becomes the source body.
    const original = "# Hello\n\nbody text here\n";
    await writeFile(f, original);
    const result = await ingestPaste(f, kbRoot);
    const rawStat = await stat(result.rawPath);
    expect(rawStat.isFile()).toBe(true);
    // Build the expected fragment with join() so the separator matches
    // whatever the implementation's join() produced on this platform.
    expect(result.rawPath).toContain(join("sources", "paste", "raw"));
    expect(result.rawPath).toMatch(/\.txt$/);
    expect(await readFile(result.rawPath, "utf-8")).toBe(original);
  });

  it("creates source .md with valid paste frontmatter", async () => {
    const f = join(tempDir, "notes.md");
    await writeFile(f, "# My Title\n\nsome content with five words");
    const result = await ingestPaste(f, kbRoot);
    const parsed = matter(await readFile(result.sourcePath, "utf-8"));
    expect(parsed.data.id).toMatch(/^SRC-\d{6}-[A-F0-9]{4}$/);
    expect(parsed.data.type).toBe("paste");
    expect(parsed.data.title).toBe("My Title");
    expect(parsed.data.date_ingested).toMatch(/^\d{4}-\d{2}-\d{2}$/);
    expect(parsed.data.insight_count).toBe(0);
    expect(typeof parsed.data.word_count).toBe("number");
    // No url field for a local paste (gray-matter cannot serialize undefined)
    expect(parsed.data.url).toBeUndefined();
  });

  it("preserves the full text as the source body", async () => {
    const f = join(tempDir, "doc.txt");
    const body = "# Doc\n\nthe quick brown fox jumps over the lazy dog";
    await writeFile(f, body);
    const result = await ingestPaste(f, kbRoot);
    const parsed = matter(await readFile(result.sourcePath, "utf-8"));
    expect(parsed.content).toContain("the quick brown fox");
  });

  it("derives title from the first H1 when present", async () => {
    const f = join(tempDir, "whatever-filename.md");
    await writeFile(f, "# Real Document Title\n\ncontent");
    const result = await ingestPaste(f, kbRoot);
    expect(result.title).toBe("Real Document Title");
  });

  it("falls back to a humanized filename when there is no H1", async () => {
    const f = join(tempDir, "gtm-atlas-notes.txt");
    await writeFile(f, "no heading here, just prose content");
    const result = await ingestPaste(f, kbRoot);
    expect(result.title).toBe("gtm atlas notes");
  });

  it("returns sourceId matching the SRC- pattern", async () => {
    const f = join(tempDir, "x.md");
    await writeFile(f, "# X\n\nbody");
    const result = await ingestPaste(f, kbRoot);
    expect(result.sourceId).toMatch(/^SRC-\d{6}-[A-F0-9]{4}$/);
    expect(result.sourcePath).toContain(join("sources", "paste"));
    expect(result.sourcePath).toMatch(/\.md$/);
  });
});
78 changes: 78 additions & 0 deletions scripts/lib/ingest/paste.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import { join, basename, extname } from "node:path";
import { mkdir, copyFile, writeFile, readFile } from "node:fs/promises";
import matter from "gray-matter";
import { generateSourceId } from "../generate-id";
import { slugify } from "./slug";

/** Summary of what `ingestPaste` produced for one ingested file. */
export interface PasteResult {
/** Identifier returned by `generateSourceId`; also written as frontmatter `id`. */
sourceId: string;
/** Path of the generated source `.md` file (frontmatter plus body text). */
sourcePath: string;
/** Path the input file was copied to under the `raw` directory. */
rawPath: string;
/** Title derived from the document's first H1 or, failing that, its file name. */
title: string;
/** Number of whitespace-delimited words in the trimmed input text. */
wordCount: number;
}

/**
 * Count the words in `text`, where a word is any maximal run of
 * non-whitespace characters.
 *
 * @param text - Text to measure.
 * @returns The number of words; 0 for empty or whitespace-only input.
 */
function wordCount(text: string): number {
  const words = text.match(/\S+/g);
  return words === null ? 0 : words.length;
}

/**
 * Derive a title for a pasted document.
 *
 * Uses the text of the first non-empty markdown H1 (`# Heading`) when one
 * exists; otherwise falls back to the file name with its extension removed
 * and runs of `-` / `_` turned into spaces. Keeps provenance honest — the
 * title is the document's own, not something invented at extraction time.
 *
 * @param text - Document contents to scan for an H1.
 * @param filePath - Path of the source file; used only for the fallback.
 * @returns The heading text, the humanized file stem, or `"untitled paste"`
 *   when neither yields any characters.
 */
function deriveTitle(text: string, filePath: string): string {
  // The gap after `#` and the trailing padding are limited to spaces/tabs,
  // and the heading text must begin with a non-whitespace character. With
  // `\s` in those positions the match could run across a line break, so a
  // bare `#` line would promote the following line of body text to the title.
  const heading = /^#[ \t]+(\S.*?)[ \t]*$/m.exec(text)?.[1]?.trim();
  if (heading) return heading;

  const base = basename(filePath);
  const stem = base.slice(0, base.length - extname(base).length);
  const humanized = stem.replace(/[-_]+/g, " ").trim();
  return humanized.length > 0 ? humanized : "untitled paste";
}

/**
 * Ingest a local plain-text / markdown file as a "paste" source.
 *
 * Steps, in order:
 *  1. Ensure `<root>/sources/paste/raw` and `<root>/sources/paste` exist.
 *  2. Read the input as UTF-8, trim it, and derive a title and slug from it.
 *  3. Copy the untouched input file to `raw/<slug>.txt`.
 *  4. Write `<slug>.md` containing gray-matter frontmatter (`id`, `type`,
 *     `title`, `date_ingested`, `insight_count`, `word_count`) followed by
 *     the trimmed text.
 *
 * No `url` key is written: paste sources have no scrapeable origin, and
 * gray-matter cannot serialize `undefined`.
 *
 * Both output paths depend only on the title slug, so ingesting two files
 * whose titles slugify identically targets the same pair of paths.
 *
 * @param input - Path of the local file to ingest.
 * @param kbRoot - Knowledge-base root; defaults to `knowledge-base` three
 *   directories above this module.
 * @returns Identifier, output paths, title, and word count for the new source.
 */
export async function ingestPaste(
  input: string,
  kbRoot?: string,
): Promise<PasteResult> {
  const root = kbRoot ?? join(__dirname, "../../..", "knowledge-base");
  const sourceDir = join(root, "sources", "paste");
  const rawDir = join(sourceDir, "raw");

  await mkdir(rawDir, { recursive: true });
  await mkdir(sourceDir, { recursive: true });

  const fileContents = await readFile(input, "utf-8");
  const text = fileContents.trim();
  const title = deriveTitle(text, input);
  const slug = slugify(title);

  // Preserve the original bytes before writing anything derived from them.
  const rawPath = join(rawDir, `${slug}.txt`);
  await copyFile(input, rawPath);

  const sourceId = generateSourceId(slug, input);
  const ingestedOn = new Date().toISOString().slice(0, 10); // YYYY-MM-DD (UTC)
  const words = wordCount(text);

  const frontmatter: Record<string, string | number> = {
    id: sourceId,
    type: "paste",
    title,
    date_ingested: ingestedOn,
    insight_count: 0,
    word_count: words,
  };

  const sourcePath = join(sourceDir, `${slug}.md`);
  await writeFile(sourcePath, matter.stringify(text, frontmatter), "utf-8");

  return { sourceId, sourcePath, rawPath, title, wordCount: words };
}
Loading