From 9a4aadda2fad795a32b5a177c1f6c201f74ccbc9 Mon Sep 17 00:00:00 2001 From: Aiden Bai Date: Sat, 4 Apr 2026 21:24:02 -0700 Subject: [PATCH 1/3] feat: add video-transcript prototype for generating tests from screen recordings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds apps/video-transcript — a standalone CLI that extracts structured interaction transcripts from screen recordings using Gemini 2.5 Flash via AI SDK. Designed to compound with git diff context so the test agent gets both "what changed" and "how the feature works." Pipeline: video → ffmpeg idle-time cutting → Gemini transcript extraction Updates the video-transcript spec to use AI SDK (@ai-sdk/google) instead of @google/genai, matching the existing codebase dependency surface. --- .specs/video-transcript.md | 2 +- apps/video-transcript/package.json | 42 ++++ .../video-transcript/src/activity-analyzer.ts | 209 ++++++++++++++++++ apps/video-transcript/src/constants.ts | 14 ++ .../src/extract-transcript.ts | 36 +++ apps/video-transcript/src/index.ts | 128 +++++++++++ .../video-transcript/src/transcript-prompt.ts | 38 ++++ apps/video-transcript/src/types.ts | 7 + apps/video-transcript/tsconfig.json | 9 + pnpm-lock.yaml | 83 +++++++ 10 files changed, 567 insertions(+), 1 deletion(-) create mode 100644 apps/video-transcript/package.json create mode 100644 apps/video-transcript/src/activity-analyzer.ts create mode 100644 apps/video-transcript/src/constants.ts create mode 100644 apps/video-transcript/src/extract-transcript.ts create mode 100644 apps/video-transcript/src/index.ts create mode 100644 apps/video-transcript/src/transcript-prompt.ts create mode 100644 apps/video-transcript/src/types.ts create mode 100644 apps/video-transcript/tsconfig.json diff --git a/.specs/video-transcript.md b/.specs/video-transcript.md index 7db25f972..eeacff58d 100644 --- a/.specs/video-transcript.md +++ b/.specs/video-transcript.md @@ -81,7 +81,7 @@ const { text } = await generateText({ messages: [{ role: "user", content: [ - { type: "file", data: readFileSync(videoPath), mimeType: "video/mp4" }, + { type: "file", data: readFileSync(videoPath), mediaType: "video/mp4" }, { type: "text", text: transcriptPrompt }, ], }], diff --git a/apps/video-transcript/package.json b/apps/video-transcript/package.json new file mode 100644 index 000000000..7082286be --- /dev/null +++ b/apps/video-transcript/package.json @@ -0,0 +1,42 @@ +{ + "name": "@expect/video-transcript", + "version": "0.0.1", + "private": true, + "description": "Extract structured interaction transcripts from screen recordings", + "type": "module", + "bin": { + "video-transcript": "./dist/index.js" + }, + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "scripts": { + "build": "vp pack", + "dev": "vp pack --watch", + "lint": "vp lint && tsc --noEmit", + "format": "vp fmt", + "format:check": "vp fmt --check", + "check": "vp check", + "test": "vp test", + "typecheck": "tsgo --noEmit" + }, + "dependencies": { + "@ai-sdk/google": "^3.0.53", + "ai": "^6.0.146", + "commander": "^13.1.0", + "effect": "4.0.0-beta.35", + "picocolors": "^1.1.1" + }, + "devDependencies": { + "@types/node": "^22.15.0", + "typescript": "^5.7.0" + }, + "engines": { + "node": ">=18" + } +} diff --git a/apps/video-transcript/src/activity-analyzer.ts b/apps/video-transcript/src/activity-analyzer.ts new file mode 100644 index 000000000..3b742ff61 --- /dev/null +++ b/apps/video-transcript/src/activity-analyzer.ts @@ -0,0 +1,209 @@ +import { execFile } from "node:child_process"; +import { readFileSync, mkdtempSync, rmSync, readdirSync } from "node:fs"; +import { tmpdir } from "node:os"; +import path from "node:path"; +import { + FRAME_DIFF_IDLE_THRESHOLD, + FRAMES_PER_SECOND, + IDLE_CUT_THRESHOLD_SECONDS, + MIN_ACTIVE_SEGMENT_SECONDS, + SCENE_CHANGE_THRESHOLD, +} from "./constants"; +import type { ActivitySegment, ActivityTimeline } from "./types"; + +const execFileAsync = ( + command: string, + args: readonly string[], +): Promise<{ stdout: string; stderr: string }> => + new Promise((resolve, reject) => { + execFile(command, args, { maxBuffer: 10 * 1024 * 1024 }, (error, stdout, stderr) => { + if (error) reject(error); + else resolve({ stdout, stderr }); + }); + }); + +export const checkFfmpegAvailable = async (): Promise => { + try { + await execFileAsync("ffmpeg", ["-version"]); + return true; + } catch { + return false; + } +}; + +const extractFrames = async (videoPath: string, outputDir: string): Promise => { + await execFileAsync("ffmpeg", [ + "-i", + videoPath, + "-vf", + `fps=${FRAMES_PER_SECOND}`, + "-vsync", + "vfr", + "-f", + "rawvideo", + "-pix_fmt", + "gray", + "-s", + "320x180", + path.join(outputDir, "frame_%05d.raw"), + ]); + + const files = readdirSync(outputDir).filter((file) => file.startsWith("frame_")); + return files.length; +}; + +const computeFrameDiff = (frameA: Buffer, frameB: Buffer): number => { + const length = Math.min(frameA.length, frameB.length); + if (length === 0) return 0; + + let totalDiff = 0; + for (let index = 0; index < length; index++) { + totalDiff += Math.abs(frameA[index]! - frameB[index]!) / 255; + } + + return totalDiff / length; +}; + +const classifySegments = (diffs: readonly number[]): ActivityTimeline => { + const rawClassification: Array<"active" | "idle" | "scene_change"> = []; + + for (const diff of diffs) { + if (diff >= SCENE_CHANGE_THRESHOLD) { + rawClassification.push("scene_change"); + } else if (diff > FRAME_DIFF_IDLE_THRESHOLD) { + rawClassification.push("active"); + } else { + rawClassification.push("idle"); + } + } + + const segments: ActivitySegment[] = []; + let currentType = rawClassification[0]; + let segmentStart = 0; + + if (!currentType) return []; + + for (let index = 1; index <= rawClassification.length; index++) { + const nextType = rawClassification[index]; + if (nextType !== currentType || index === rawClassification.length) { + segments.push({ + type: currentType, + startSeconds: segmentStart, + endSeconds: index, + }); + if (nextType) { + currentType = nextType; + segmentStart = index; + } + } + } + + return mergeShortSegments(segments); +}; + +const mergeShortSegments = (segments: readonly ActivitySegment[]): ActivityTimeline => { + const merged: ActivitySegment[] = []; + + for (const segment of segments) { + const duration = segment.endSeconds - segment.startSeconds; + + if (segment.type === "active" && duration < MIN_ACTIVE_SEGMENT_SECONDS) { + const previous = merged[merged.length - 1]; + if (previous && previous.type === "idle") { + merged[merged.length - 1] = { ...previous, endSeconds: segment.endSeconds }; + } else { + merged.push(segment); + } + continue; + } + + if (segment.type === "idle" && duration <= IDLE_CUT_THRESHOLD_SECONDS) { + const previous = merged[merged.length - 1]; + if (previous && previous.type === "active") { + merged[merged.length - 1] = { ...previous, endSeconds: segment.endSeconds }; + continue; + } + } + + merged.push(segment); + } + + return merged; +}; + +export const analyzeActivity = async (videoPath: string): Promise => { + const framesDir = mkdtempSync(path.join(tmpdir(), "expect-frames-")); + + try { + const frameCount = await extractFrames(videoPath, framesDir); + if (frameCount < 2) return [{ type: "active", startSeconds: 0, endSeconds: frameCount }]; + + const frameSize = 320 * 180; + const diffs: number[] = []; + + for (let index = 1; index < frameCount; index++) { + const prevPath = path.join(framesDir, `frame_${String(index).padStart(5, "0")}.raw`); + const currPath = path.join(framesDir, `frame_${String(index + 1).padStart(5, "0")}.raw`); + + const prevFrame = readFileSync(prevPath); + const currFrame = readFileSync(currPath); + diffs.push(computeFrameDiff(prevFrame, currFrame)); + } + + return classifySegments(diffs); + } finally { + rmSync(framesDir, { recursive: true, force: true }); + } +}; + +export const buildTrimmedVideo = async ( + videoPath: string, + timeline: ActivityTimeline, +): Promise => { + const activeSegments = timeline.filter( + (segment) => segment.type === "active" || segment.type === "scene_change", + ); + + if (activeSegments.length === 0) return videoPath; + + const outputDir = mkdtempSync(path.join(tmpdir(), "expect-trimmed-")); + const outputPath = path.join(outputDir, "trimmed.mp4"); + + const filterParts = activeSegments.map( + (segment) => `between(t,${segment.startSeconds},${segment.endSeconds})`, + ); + const selectFilter = filterParts.join("+"); + + await execFileAsync("ffmpeg", [ + "-i", + videoPath, + "-vf", + `select='${selectFilter}',setpts=N/FRAME_RATE/TB`, + "-af", + `aselect='${selectFilter}',asetpts=N/SR/TB`, + "-y", + outputPath, + ]); + + return outputPath; +}; + +export const formatTimeline = (timeline: ActivityTimeline): string => { + const formatTime = (seconds: number): string => { + const minutes = Math.floor(seconds / 60); + const secs = seconds % 60; + return `${String(minutes).padStart(2, "0")}:${String(secs).padStart(2, "0")}`; + }; + + const lines = timeline.map((segment) => { + const label = + segment.type === "scene_change" + ? "scene change (likely navigation)" + : segment.type === "idle" + ? "idle" + : "active"; + return `- [${formatTime(segment.startSeconds)}–${formatTime(segment.endSeconds)}] ${label}`; + }); + + return lines.join("\n"); +}; diff --git a/apps/video-transcript/src/constants.ts b/apps/video-transcript/src/constants.ts new file mode 100644 index 000000000..816487dd2 --- /dev/null +++ b/apps/video-transcript/src/constants.ts @@ -0,0 +1,14 @@ +export const FRAME_DIFF_IDLE_THRESHOLD = 0.005; +export const IDLE_CUT_THRESHOLD_SECONDS = 3; +export const SCENE_CHANGE_THRESHOLD = 0.15; +export const MIN_ACTIVE_SEGMENT_SECONDS = 1; +export const FRAMES_PER_SECOND = 1; + +export const SUPPORTED_VIDEO_EXTENSIONS = [".mp4", ".webm", ".mov", ".avi", ".mkv"] as const; +export const SUPPORTED_MIME_TYPES: Record = { + ".mp4": "video/mp4", + ".webm": "video/webm", + ".mov": "video/quicktime", + ".avi": "video/x-msvideo", + ".mkv": "video/x-matroska", +}; diff --git a/apps/video-transcript/src/extract-transcript.ts b/apps/video-transcript/src/extract-transcript.ts new file mode 100644 index 000000000..d6e305639 --- /dev/null +++ b/apps/video-transcript/src/extract-transcript.ts @@ -0,0 +1,36 @@ +import { readFileSync } from "node:fs"; +import path from "node:path"; +import { google } from "@ai-sdk/google"; +import { generateText } from "ai"; +import { SUPPORTED_MIME_TYPES } from "./constants"; +import { buildTranscriptPrompt } from "./transcript-prompt"; +import type { ActivityTimeline } from "./types"; + +const getMimeType = (videoPath: string): string => { + const extension = path.extname(videoPath).toLowerCase(); + return SUPPORTED_MIME_TYPES[extension] ?? "video/mp4"; +}; + +export const extractTranscript = async ( + videoPath: string, + timeline: ActivityTimeline | undefined, +): Promise => { + const mimeType = getMimeType(videoPath); + const videoData = readFileSync(videoPath); + const prompt = buildTranscriptPrompt(timeline); + + const { text } = await generateText({ + model: google("gemini-2.5-flash"), + messages: [ + { + role: "user", + content: [ + { type: "file", data: videoData, mediaType: mimeType }, + { type: "text", text: prompt }, + ], + }, + ], + }); + + return text; +}; diff --git a/apps/video-transcript/src/index.ts b/apps/video-transcript/src/index.ts new file mode 100644 index 000000000..6e2a0d359 --- /dev/null +++ b/apps/video-transcript/src/index.ts @@ -0,0 +1,128 @@ +#!/usr/bin/env node +import { existsSync, writeFileSync } from "node:fs"; +import path from "node:path"; +import { Command } from "commander"; +import pc from "picocolors"; +import { SUPPORTED_VIDEO_EXTENSIONS } from "./constants"; +import { + analyzeActivity, + buildTrimmedVideo, + checkFfmpegAvailable, + formatTimeline, +} from "./activity-analyzer"; +import { extractTranscript } from "./extract-transcript"; + +const program = new Command() + .name("video-transcript") + .description("Extract structured interaction transcripts from screen recordings") + .version("0.0.1") + .argument("