diff --git a/.gitignore b/.gitignore index c71647f..a2c5d6c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ output .vscode dist .turbo +.DS_Store \ No newline at end of file diff --git a/bot_async_transcription_hybrid_diarization/README.md b/bot_async_transcription_hybrid_diarization/README.md index 6c75452..4784b65 100644 --- a/bot_async_transcription_hybrid_diarization/README.md +++ b/bot_async_transcription_hybrid_diarization/README.md @@ -6,17 +6,25 @@ This example demonstrates how to get accurate speaker attribution in your transc Standard transcription diarization has a tradeoff: +- **Speaker-timeline diarization** (from Recall.ai) uses active speaker events emitted by the meeting platform to know who is in the meeting (using the participant display names), but can't distinguish multiple people speaking from the same participant tile (e.g. calling from the same device/room). - **Machine diarization** (from providers like Deepgram) distinguishes different voices, but only gives you anonymous labels like "Speaker 0" and "Speaker 1". -- **Speaker timeline diarization** (from Recall.ai) uses participant speaker change events from the meeting platform to determine who is speaking but is unable to distinguish participants if they're speaking from the same participant tile (e.g. calling from the same device/room). -**Hybrid diarization combines both approaches.** It uses machine diarization to detect distinct voices, then maps them to real participant names when there's a clear 1-to-1 match. When multiple people share a device, it falls back to anonymous speaker labels. +**Hybrid diarization combines both approaches.** It uses machine diarization to detect distinct voices per participant, then maps them to real participant names when there's a clear 1-to-1 match. When multiple people share a device (i.e. a participant has more than one anonymous speaker label), it falls back to anonymous speaker labels. 
## How It Works The server listens for webhook events from Recall.ai: 1. When `recording.done` is received, it triggers async transcript creation via Recall's API -2. When `transcript.done` is received, it downloads both the transcript and speaker timeline data, then merges them using the hybrid diarization algorithm +2. When `transcript.done` is received, it downloads both the transcript and the participants list, then merges them using the hybrid diarization algorithm + +### Hybrid Diarization Algorithm + +Each transcript part has a participant name in the format `{participant_id}-{anonymous_label}` (e.g. `200-0` means participant ID 200, anonymous label 0). The algorithm: + +1. Builds a map of `participant_id → Set<anonymous_label>` from all transcript parts +2. If a participant has **exactly one** anonymous label, we can confidently attribute all their segments to a single speaker — so we replace the anonymous label with the real participant name and metadata +3. If a participant has **multiple** anonymous labels (e.g. 
`200-0` and `200-1`), multiple people are sharing that device, so we leave those segments with their anonymous labels ## Prerequisites @@ -103,5 +111,7 @@ Replace `RECALL_REGION`, `RECALL_API_KEY`, and `YOUR_MEETING_URL` with your own After the call ends and the transcript is processed, you'll find the output files in the `output/` folder, organized by recording ID: -- `transcript.json` — The transcript data with hybrid diarization applied -- `readable.txt` — A human-readable version of the transcript +- `participants.json` — The list of participants in the meeting +- `transcript.json` — The raw transcript parts (before hybrid diarization) +- `hybrid_diarization_transcript.json` — The transcript with hybrid diarization applied +- `hybrid_diarization_transcript.txt` — A human-readable version of the hybrid diarized transcript diff --git a/bot_async_transcription_hybrid_diarization/src/bot_async_transcription_hybrid_diarization.ts b/bot_async_transcription_hybrid_diarization/src/bot_async_transcription_hybrid_diarization.ts index 7234e39..88d4d2d 100644 --- a/bot_async_transcription_hybrid_diarization/src/bot_async_transcription_hybrid_diarization.ts +++ b/bot_async_transcription_hybrid_diarization/src/bot_async_transcription_hybrid_diarization.ts @@ -4,8 +4,8 @@ import { z } from "zod"; import { env } from "./config/env"; import { convert_to_hybrid_diarized_transcript_parts } from "./convert_to_hybrid_diarized_transcript_parts"; import { convert_to_readable_transcript } from "./convert_to_readable_transcript"; +import { ParticipantPartSchema } from "./schemas/ParticipantPartSchema"; import { RecordingArtifactSchema } from "./schemas/RecordingArtifactSchema"; -import { SpeakerTimelinePartSchema } from "./schemas/SpeakerTimelinePartSchema"; import { TranscriptArtifactEventSchema, type TranscriptArtifactEventType } from "./schemas/TranscriptArtifactEventSchema"; import { TranscriptArtifactSchema } from "./schemas/TranscriptArtifactSchema"; import { TranscriptPartSchema 
} from "./schemas/TranscriptPartSchema"; @@ -24,6 +24,7 @@ export async function create_async_transcript(args: { recording_id: string }) { }, body: JSON.stringify({ provider: { deepgram_async: { diarize: true } }, + diarization: { use_separate_streams_when_available: true }, }), }); if (!response.ok) throw new Error(await response.text()); @@ -42,8 +43,8 @@ export async function bot_async_transcription(args: { msg: TranscriptArtifactEve if (!recording.media_shortcuts?.transcript?.data?.download_url) { throw new Error("Transcript download URL is null"); } - if (!recording.media_shortcuts.participant_events?.data?.speaker_timeline_download_url) { - throw new Error("Speaker timeline download URL is null"); + if (!recording.media_shortcuts.participant_events?.data?.participants_download_url) { + throw new Error("Participants download URL is null"); } // Retrieve and format transcript data. @@ -51,38 +52,35 @@ export async function bot_async_transcription(args: { msg: TranscriptArtifactEve download_url: recording.media_shortcuts.transcript.data.download_url, }); console.log(`Retrieved ${transcript_parts.length} transcript parts`); - const speaker_timeline_data = await retrieve_speaker_timeline_parts({ - download_url: recording.media_shortcuts.participant_events.data.speaker_timeline_download_url, + const participants = await retrieve_participants({ + download_url: recording.media_shortcuts.participant_events.data.participants_download_url, }); - console.log(`Retrieved ${speaker_timeline_data.length} speaker timeline parts`); + console.log(`Retrieved ${participants.length} participants`); const hybrid_transcript_parts = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); console.log(`Formatted ${hybrid_transcript_parts.length} hybrid transcript parts`); const readable_hybrid_transcript_parts = convert_to_readable_transcript({ transcript_parts: hybrid_transcript_parts }); console.log(`Formatted 
${readable_hybrid_transcript_parts.length} readable hybrid transcript parts`); - // Write the hybrid transcript parts data to a file. - const output_path_events = path.join( - process.cwd(), - `output/recording-${msg.data.recording.id}/transcript.json`, - ); - if (!fs.existsSync(output_path_events)) { - fs.mkdirSync(path.dirname(output_path_events), { recursive: true }); - fs.writeFileSync(output_path_events, "[]", { flag: "w+" }); - } - fs.writeFileSync(output_path_events, JSON.stringify(hybrid_transcript_parts, null, 2), { flag: "w+" }); - - // Write the readable hybrid transcript to a file. - const output_path_readable = path.join( - process.cwd(), - `output/recording-${msg.data.recording.id}/readable.txt`, - ); - if (!fs.existsSync(output_path_readable)) { - fs.mkdirSync(path.dirname(output_path_readable), { recursive: true }); - fs.writeFileSync(output_path_readable, "", { flag: "w+" }); - } + const output_dir = path.join(process.cwd(), `output/recording-${msg.data.recording.id}`); + fs.mkdirSync(output_dir, { recursive: true }); + + // Write the participants list to a file. + const output_path_participants = path.join(output_dir, "participants.json"); + fs.writeFileSync(output_path_participants, JSON.stringify(participants, null, 2), { flag: "w+" }); + + // Write the raw transcript parts to a file. + const output_path_transcript = path.join(output_dir, "transcript.json"); + fs.writeFileSync(output_path_transcript, JSON.stringify(transcript_parts, null, 2), { flag: "w+" }); + + // Write the hybrid diarized transcript parts to a file. + const output_path_hybrid = path.join(output_dir, "hybrid_diarization_transcript.json"); + fs.writeFileSync(output_path_hybrid, JSON.stringify(hybrid_transcript_parts, null, 2), { flag: "w+" }); + + // Write the readable hybrid diarized transcript to a file. 
+ const output_path_readable = path.join(output_dir, "hybrid_diarization_transcript.txt"); fs.writeFileSync(output_path_readable, readable_hybrid_transcript_parts.map((t) => t ? `${t.speaker}: ${t.paragraph}` : "").join("\n"), { flag: "w+" }); // Return the transcript parts and readable transcript. @@ -121,13 +119,13 @@ async function retrieve_transcript_parts(args: { download_url: string }) { } /** - * Retrieve the speaker timeline data from the participant events artifact's `download_url`. + * Retrieve the participants list from the participant events artifact's `participants_download_url`. */ -async function retrieve_speaker_timeline_parts(args: { download_url: string }) { +async function retrieve_participants(args: { download_url: string }) { const { download_url } = z.object({ download_url: z.string() }).parse(args); const response = await fetch(download_url); if (!response.ok) throw new Error(await response.text()); - return SpeakerTimelinePartSchema.array().parse(await response.json()); + return ParticipantPartSchema.array().parse(await response.json()); } diff --git a/bot_async_transcription_hybrid_diarization/src/convert_to_hybrid_diarized_transcript_parts.test.ts b/bot_async_transcription_hybrid_diarization/src/convert_to_hybrid_diarized_transcript_parts.test.ts index fac9f97..c1095b9 100644 --- a/bot_async_transcription_hybrid_diarization/src/convert_to_hybrid_diarized_transcript_parts.test.ts +++ b/bot_async_transcription_hybrid_diarization/src/convert_to_hybrid_diarized_transcript_parts.test.ts @@ -1,19 +1,17 @@ import { describe, it, expect } from "vitest"; import { convert_to_hybrid_diarized_transcript_parts } from "./convert_to_hybrid_diarized_transcript_parts"; -import type { SpeakerTimelinePartType } from "./schemas/SpeakerTimelinePartSchema"; +import type { ParticipantPartType } from "./schemas/ParticipantPartSchema"; import type { TranscriptPartType } from "./schemas/TranscriptPartSchema"; -// Helper to create a transcript segment function 
create_transcript(opts: { speakerName: string | null; - speakerId?: number | null; startTime: number; endTime: number; text?: string; }): TranscriptPartType { return { participant: { - id: opts.speakerId ?? null, + id: null, name: opts.speakerName, is_host: null, platform: null, @@ -30,43 +28,38 @@ function create_transcript(opts: { }; } -// Helper to create a speaker timeline event -function create_speaker_event(opts: { - participantId: number | null; - participantName: string | null; - startTime: number; - endTime: number | null; -}): SpeakerTimelinePartType { +function create_participant(opts: { + id: number; + name: string | null; + is_host?: boolean | null; + platform?: string | null; + extra_data?: unknown; + email?: string | null; +}): ParticipantPartType { return { - participant: { - id: opts.participantId, - name: opts.participantName, - is_host: null, - platform: null, - extra_data: null, - email: null, - }, - start_timestamp: { relative: opts.startTime, absolute: null }, - end_timestamp: opts.endTime !== null - ? { relative: opts.endTime, absolute: null } - : null, + id: opts.id, + name: opts.name, + is_host: opts.is_host ?? null, + platform: opts.platform ?? null, + extra_data: opts.extra_data ?? null, + email: opts.email ?? 
null, }; } describe("convert_to_hybrid_diarized_transcript_parts", () => { - describe("Happy Path - Single Speaker Per Participant", () => { - it("should map anonymous speaker to real participant when only one speaker exists", () => { + describe("Happy Path - Single Anonymous Label Per Participant", () => { + it("should map transcript parts to real participant when only one anonymous label exists", () => { const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), - create_transcript({ speakerName: "Speaker A", startTime: 6, endTime: 10 }), + create_transcript({ speakerName: "100-0", startTime: 1, endTime: 5 }), + create_transcript({ speakerName: "100-0", startTime: 6, endTime: 10 }), ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 15 }), + const participants: ParticipantPartType[] = [ + create_participant({ id: 100, name: "John" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); expect(result).toHaveLength(2); @@ -76,141 +69,73 @@ describe("convert_to_hybrid_diarized_transcript_parts", () => { expect(result[1].participant.name).toBe("John"); }); - it("should map multiple participants correctly when each has a unique anonymous speaker", () => { + it("should map multiple participants correctly when each has a single anonymous label", () => { const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), - create_transcript({ speakerName: "Speaker B", startTime: 16, endTime: 20 }), + create_transcript({ speakerName: "100-0", startTime: 1, endTime: 5 }), + create_transcript({ speakerName: "200-0", startTime: 6, endTime: 10 }), ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: 
"John", startTime: 0, endTime: 10 }), - create_speaker_event({ participantId: 200, participantName: "Mary", startTime: 15, endTime: 25 }), + const participants: ParticipantPartType[] = [ + create_participant({ id: 100, name: "John" }), + create_participant({ id: 200, name: "Mary" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); expect(result).toHaveLength(2); expect(result[0].participant.name).toBe("John"); + expect(result[0].participant.id).toBe(100); expect(result[1].participant.name).toBe("Mary"); + expect(result[1].participant.id).toBe(200); }); }); - describe("Multiple Speakers Per Participant - No Mapping", () => { - it("should NOT map when participant has multiple anonymous speakers in same segment", () => { + describe("Multiple Anonymous Labels Per Participant - No Mapping", () => { + it("should NOT map when participant has multiple anonymous labels", () => { const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), - create_transcript({ speakerName: "Speaker B", startTime: 6, endTime: 10 }), + create_transcript({ speakerName: "100-0", startTime: 1, endTime: 5 }), + create_transcript({ speakerName: "100-1", startTime: 6, endTime: 10 }), ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 15 }), + const participants: ParticipantPartType[] = [ + create_participant({ id: 100, name: "John" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); - // Should remain unchanged - no mapping applied - expect(result[0].participant.name).toBe("Speaker A"); + expect(result[0].participant.name).toBe("100-0"); expect(result[0].participant.id).toBeNull(); - expect(result[1].participant.name).toBe("Speaker B"); + 
expect(result[1].participant.name).toBe("100-1"); expect(result[1].participant.id).toBeNull(); }); - - it("should NOT map when participant has multiple speakers across different timeline segments", () => { - // This is the key edge case: single speaker first, then multiple, then single again - const transcript_parts: TranscriptPartType[] = [ - // Segment 1: Only Speaker A - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), - // Segment 2: Both Speaker A and Speaker B - create_transcript({ speakerName: "Speaker A", startTime: 11, endTime: 13 }), - create_transcript({ speakerName: "Speaker B", startTime: 14, endTime: 18 }), - // Segment 3: Only Speaker A again - create_transcript({ speakerName: "Speaker A", startTime: 21, endTime: 25 }), - ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), - create_speaker_event({ participantId: 100, participantName: "John", startTime: 10, endTime: 20 }), - create_speaker_event({ participantId: 100, participantName: "John", startTime: 20, endTime: 30 }), - ]; - - const result = convert_to_hybrid_diarized_transcript_parts({ - transcript_parts, - speaker_timeline_data, - }); - - // Because John had both A and B at some point, NONE should be mapped - expect(result[0].participant.name).toBe("Speaker A"); - expect(result[0].participant.id).toBeNull(); - expect(result[1].participant.name).toBe("Speaker A"); - expect(result[2].participant.name).toBe("Speaker B"); - expect(result[3].participant.name).toBe("Speaker A"); - }); - - it("should NOT map when different speakers use same device in separate timeline segments", () => { - // Scenario: Two people call in from the same device (John) at different times - // Speaker A speaks during John's first segment - // Speaker B speaks during John's second segment (different person, same device) - // Mary has only Speaker C, so she should be mapped - const 
transcript_parts: TranscriptPartType[] = [ - // John segment 1: Speaker A - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), - // Mary segment: Speaker C - create_transcript({ speakerName: "Speaker C", startTime: 11, endTime: 14 }), - // John segment 2: Speaker B (different person on same device) - create_transcript({ speakerName: "Speaker B", startTime: 16, endTime: 20 }), - ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), - create_speaker_event({ participantId: 200, participantName: "Mary", startTime: 10, endTime: 15 }), - create_speaker_event({ participantId: 100, participantName: "John", startTime: 15, endTime: 25 }), - ]; - - const result = convert_to_hybrid_diarized_transcript_parts({ - transcript_parts, - speaker_timeline_data, - }); - - // John had both A and B across segments - stays anonymous - expect(result[0].participant.name).toBe("Speaker A"); - expect(result[0].participant.id).toBeNull(); - // Mary had only C - gets mapped - expect(result[1].participant.name).toBe("Mary"); - expect(result[1].participant.id).toBe(200); - // John's second segment also stays anonymous - expect(result[2].participant.name).toBe("Speaker B"); - expect(result[2].participant.id).toBeNull(); - }); }); describe("Mixed Participants - Some Mapped, Some Not", () => { - it("should map participant with single speaker but not participant with multiple speakers", () => { + it("should map participant with single label but not participant with multiple labels", () => { const transcript_parts: TranscriptPartType[] = [ - // John's segments - has multiple speakers (A and B) - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), - create_transcript({ speakerName: "Speaker B", startTime: 6, endTime: 9 }), - // Mary's segments - has single speaker (C) - create_transcript({ speakerName: "Speaker C", startTime: 16, endTime: 20 }), - 
create_transcript({ speakerName: "Speaker C", startTime: 21, endTime: 25 }), + create_transcript({ speakerName: "100-0", startTime: 1, endTime: 5 }), + create_transcript({ speakerName: "100-1", startTime: 6, endTime: 9 }), + create_transcript({ speakerName: "200-0", startTime: 16, endTime: 20 }), + create_transcript({ speakerName: "200-0", startTime: 21, endTime: 25 }), ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), - create_speaker_event({ participantId: 200, participantName: "Mary", startTime: 15, endTime: 30 }), + const participants: ParticipantPartType[] = [ + create_participant({ id: 100, name: "John" }), + create_participant({ id: 200, name: "Mary" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); - // John's segments stay anonymous - expect(result[0].participant.name).toBe("Speaker A"); + expect(result[0].participant.name).toBe("100-0"); expect(result[0].participant.id).toBeNull(); - expect(result[1].participant.name).toBe("Speaker B"); + expect(result[1].participant.name).toBe("100-1"); expect(result[1].participant.id).toBeNull(); - // Mary's segments get mapped expect(result[2].participant.name).toBe("Mary"); expect(result[2].participant.id).toBe(200); expect(result[3].participant.name).toBe("Mary"); @@ -222,189 +147,118 @@ describe("convert_to_hybrid_diarized_transcript_parts", () => { it("should return empty array when transcript_parts is empty", () => { const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts: [], - speaker_timeline_data: [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), + participants: [ + create_participant({ id: 100, name: "John" }), ], }); expect(result).toHaveLength(0); }); - it("should return unchanged transcripts when speaker_timeline_data is empty", () => { + 
it("should return unchanged transcripts when participants is empty", () => { const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), + create_transcript({ speakerName: "100-0", startTime: 1, endTime: 5 }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data: [], + participants: [], }); expect(result).toHaveLength(1); - expect(result[0].participant.name).toBe("Speaker A"); + expect(result[0].participant.name).toBe("100-0"); expect(result[0].participant.id).toBeNull(); }); - it("should skip speaker events with null participant id", () => { + it("should leave transcript unchanged when name is null", () => { const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), + create_transcript({ speakerName: null, startTime: 1, endTime: 5 }), ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: null, participantName: "John", startTime: 0, endTime: 10 }), + const participants: ParticipantPartType[] = [ + create_participant({ id: 100, name: "John" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); - // No mapping should occur - expect(result[0].participant.name).toBe("Speaker A"); + expect(result[0].participant.name).toBeNull(); expect(result[0].participant.id).toBeNull(); }); - it("should skip speaker events with null participant name", () => { + it("should leave transcript unchanged when name does not match expected format", () => { const transcript_parts: TranscriptPartType[] = [ create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: null, startTime: 0, endTime: 10 }), + const participants: ParticipantPartType[] = [ 
+ create_participant({ id: 100, name: "John" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); expect(result[0].participant.name).toBe("Speaker A"); expect(result[0].participant.id).toBeNull(); }); - it("should not add to speaker set when transcript has null participant name", () => { + it("should leave transcript unchanged when participant_id has no match in participants list", () => { const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: null, startTime: 1, endTime: 5 }), + create_transcript({ speakerName: "999-0", startTime: 1, endTime: 5 }), ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), + const participants: ParticipantPartType[] = [ + create_participant({ id: 100, name: "John" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); - // Should remain unchanged - expect(result[0].participant.name).toBeNull(); + expect(result[0].participant.name).toBe("999-0"); expect(result[0].participant.id).toBeNull(); }); - }); - - describe("Edge Cases - Timing and Boundaries", () => { - it("should only include transcript segments fully contained within speaker event", () => { - const transcript_parts: TranscriptPartType[] = [ - // Fully contained - should be included - create_transcript({ speakerName: "Speaker A", startTime: 2, endTime: 8 }), - // Starts before speaker event - should NOT be included - create_transcript({ speakerName: "Speaker B", startTime: -1, endTime: 5 }), - // Ends at or after speaker event end - should NOT be included - create_transcript({ speakerName: "Speaker C", startTime: 5, endTime: 10 }), - ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 
}), - ]; - - const result = convert_to_hybrid_diarized_transcript_parts({ - transcript_parts, - speaker_timeline_data, - }); - - // Only Speaker A should be mapped to John (only one that was fully contained) - expect(result[0].participant.name).toBe("John"); - expect(result[0].participant.id).toBe(100); - // Others remain unchanged - expect(result[1].participant.name).toBe("Speaker B"); - expect(result[2].participant.name).toBe("Speaker C"); - }); - it("should handle speaker event with null end_timestamp (extends to infinity)", () => { + it("should handle transcript with empty words array gracefully", () => { const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 100, endTime: 200 }), + { + participant: { + id: null, + name: "100-0", + is_host: null, + platform: null, + extra_data: null, + email: null, + }, + words: [], + }, ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 50, endTime: null }), + const participants: ParticipantPartType[] = [ + create_participant({ id: 100, name: "John" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); + expect(result).toHaveLength(1); expect(result[0].participant.name).toBe("John"); expect(result[0].participant.id).toBe(100); }); - - it("should handle transcript segment with start exactly at speaker event start", () => { - const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 0, endTime: 5 }), - ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), - ]; - - const result = convert_to_hybrid_diarized_transcript_parts({ - transcript_parts, - speaker_timeline_data, - }); - - expect(result[0].participant.name).toBe("John"); - }); - - it("should NOT 
include transcript that ends exactly at speaker event end", () => { - const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 5, endTime: 10 }), - ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), - ]; - - const result = convert_to_hybrid_diarized_transcript_parts({ - transcript_parts, - speaker_timeline_data, - }); - - // Condition is speaker_event_end > end, so 10 > 10 is false - not included - expect(result[0].participant.name).toBe("Speaker A"); - }); - }); - - describe("Edge Cases - Transcript Not Matching Any Speaker Event", () => { - it("should leave transcript unchanged when it doesn't fall within any speaker event", () => { - const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 50, endTime: 60 }), - ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), - ]; - - const result = convert_to_hybrid_diarized_transcript_parts({ - transcript_parts, - speaker_timeline_data, - }); - - expect(result[0].participant.name).toBe("Speaker A"); - expect(result[0].participant.id).toBeNull(); - }); }); describe("Data Preservation", () => { - it("should preserve other transcript fields when mapping participant", () => { + it("should replace participant fields with real participant data when mapping", () => { const transcript_parts: TranscriptPartType[] = [ { participant: { id: null, - name: "Speaker A", - is_host: true, - platform: "desktop", + name: "100-0", + is_host: null, + platform: "mobile_app", extra_data: { custom: "data" }, email: "original@example.com", }, @@ -417,23 +271,28 @@ describe("convert_to_hybrid_diarized_transcript_parts", () => { ], }, ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ 
participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), + const participants: ParticipantPartType[] = [ + create_participant({ + id: 100, + name: "John", + is_host: true, + platform: "desktop", + extra_data: { zoom: { guest: false } }, + email: "john@example.com", + }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); - // Participant fields should be updated expect(result[0].participant.id).toBe(100); expect(result[0].participant.name).toBe("John"); - // Other participant fields should be preserved expect(result[0].participant.is_host).toBe(true); expect(result[0].participant.platform).toBe("desktop"); - expect(result[0].participant.extra_data).toEqual({ custom: "data" }); - expect(result[0].participant.email).toBe("original@example.com"); + expect(result[0].participant.extra_data).toEqual({ zoom: { guest: false } }); + expect(result[0].participant.email).toBe("john@example.com"); // Words should be preserved expect(result[0].words[0].text).toBe("Hello world"); expect(result[0].words[0].start_timestamp?.absolute).toBe("2025-01-01T00:00:01Z"); @@ -441,233 +300,39 @@ describe("convert_to_hybrid_diarized_transcript_parts", () => { }); describe("Word Order Preservation", () => { - it("should preserve hybrid diarization behavior and keep words in order in a multi-speaker conversation", () => { + it("should preserve word order in a multi-speaker conversation", () => { const transcript_parts: TranscriptPartType[] = [ - { - participant: { - id: null, - name: "0", - is_host: null, - platform: null, - extra_data: null, - email: null, - }, - words: [ - { - text: "how", - start_timestamp: { relative: 1, absolute: null }, - end_timestamp: { relative: 2, absolute: null }, - }, - { - text: "is", - start_timestamp: { relative: 2, absolute: null }, - end_timestamp: { relative: 3, absolute: null }, - }, - { - text: "it", - start_timestamp: { relative: 3, absolute: null }, - 
end_timestamp: { relative: 4, absolute: null }, - }, - { - text: "going", - start_timestamp: { relative: 4, absolute: null }, - end_timestamp: { relative: 5, absolute: null }, - }, - { - text: "today", - start_timestamp: { relative: 5, absolute: null }, - end_timestamp: { relative: 6, absolute: null }, - }, - ], - }, - { - participant: { - id: null, - name: "1", - is_host: null, - platform: null, - extra_data: null, - email: null, - }, - words: [ - { - text: "it", - start_timestamp: { relative: 10, absolute: null }, - end_timestamp: { relative: 11, absolute: null }, - }, - { - text: "is", - start_timestamp: { relative: 11, absolute: null }, - end_timestamp: { relative: 12, absolute: null }, - }, - { - text: "good", - start_timestamp: { relative: 12, absolute: null }, - end_timestamp: { relative: 13, absolute: null }, - }, - ], - }, - { - participant: { - id: null, - name: "2", - is_host: null, - platform: null, - extra_data: null, - email: null, - }, - words: [ - { - text: "Actually", - start_timestamp: { relative: 14, absolute: null }, - end_timestamp: { relative: 15, absolute: null }, - }, - { - text: "it", - start_timestamp: { relative: 15, absolute: null }, - end_timestamp: { relative: 16, absolute: null }, - }, - { - text: "is", - start_timestamp: { relative: 16, absolute: null }, - end_timestamp: { relative: 17, absolute: null }, - }, - { - text: "great", - start_timestamp: { relative: 17, absolute: null }, - end_timestamp: { relative: 18, absolute: null }, - }, - ], - }, - { - participant: { - id: null, - name: "0", - is_host: null, - platform: null, - extra_data: null, - email: null, - }, - words: [ - { - text: "Oh", - start_timestamp: { relative: 22, absolute: null }, - end_timestamp: { relative: 23, absolute: null }, - }, - { - text: "that's", - start_timestamp: { relative: 23, absolute: null }, - end_timestamp: { relative: 24, absolute: null }, - }, - { - text: "great", - start_timestamp: { relative: 24, absolute: null }, - end_timestamp: { relative: 25, 
absolute: null }, - }, - { - text: "to", - start_timestamp: { relative: 25, absolute: null }, - end_timestamp: { relative: 26, absolute: null }, - }, - { - text: "hear", - start_timestamp: { relative: 26, absolute: null }, - end_timestamp: { relative: 27, absolute: null }, - }, - { - text: "then!", - start_timestamp: { relative: 27, absolute: null }, - end_timestamp: { relative: 28, absolute: null }, - }, - ], - }, + create_transcript({ speakerName: "100-0", startTime: 1, endTime: 6, text: "how is it going today" }), + create_transcript({ speakerName: "200-0", startTime: 10, endTime: 13, text: "it is good" }), + create_transcript({ speakerName: "200-1", startTime: 14, endTime: 18, text: "Actually it is great" }), + create_transcript({ speakerName: "100-0", startTime: 22, endTime: 28, text: "Oh that's great to hear then!" }), ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "Max", startTime: 0, endTime: 8 }), - create_speaker_event({ participantId: 200, participantName: "Anon", startTime: 8, endTime: 20 }), - create_speaker_event({ participantId: 100, participantName: "Max", startTime: 20, endTime: 35 }), + const participants: ParticipantPartType[] = [ + create_participant({ id: 100, name: "Max" }), + create_participant({ id: 200, name: "Anon" }), ]; const result = convert_to_hybrid_diarized_transcript_parts({ transcript_parts, - speaker_timeline_data, + participants, }); - // Hybrid behavior: - // - Max maps to participant id=100 in both of his segments - // - Shared Device has two anonymous speakers (0 and 1), so neither should be mapped + // Max (100) has only label "0" → mapped expect(result[0].participant.name).toBe("Max"); expect(result[0].participant.id).toBe(100); - expect(result[1].participant.name).toBe("1"); + // Anon (200) has labels "0" and "1" → NOT mapped + expect(result[1].participant.name).toBe("200-0"); expect(result[1].participant.id).toBeNull(); - 
expect(result[2].participant.name).toBe("2"); + expect(result[2].participant.name).toBe("200-1"); expect(result[2].participant.id).toBeNull(); + // Max again expect(result[3].participant.name).toBe("Max"); expect(result[3].participant.id).toBe(100); - // Word order should remain chronological within each utterance. - expect(result[0].words.map((word) => word.text).join(" ")).toBe("how is it going today"); - expect(result[1].words.map((word) => word.text).join(" ")).toBe("it is good"); - expect(result[2].words.map((word) => word.text).join(" ")).toBe("Actually it is great"); - expect(result[3].words.map((word) => word.text).join(" ")).toBe("Oh that's great to hear then!"); - }); - }); - - describe("Edge Cases - Same Anonymous Speaker for Multiple Participants", () => { - it("should overwrite mapping when same anonymous speaker appears for different participants", () => { - // This is a potential issue: if machine diarization assigns same label to different participants - const transcript_parts: TranscriptPartType[] = [ - create_transcript({ speakerName: "Speaker A", startTime: 1, endTime: 5 }), - create_transcript({ speakerName: "Speaker A", startTime: 16, endTime: 20 }), - ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), - create_speaker_event({ participantId: 200, participantName: "Mary", startTime: 15, endTime: 25 }), - ]; - - const result = convert_to_hybrid_diarized_transcript_parts({ - transcript_parts, - speaker_timeline_data, - }); - - // Current behavior: last one wins (Mary overwrites John) - // Both get mapped to Mary - expect(result[0].participant.name).toBe("Mary"); - expect(result[1].participant.name).toBe("Mary"); - }); - }); - - describe("Edge Cases - Empty Words Array", () => { - it("should handle transcript with empty words array gracefully", () => { - const transcript_parts: TranscriptPartType[] = [ - { - participant: { - id: null, - 
name: "Speaker A", - is_host: null, - platform: null, - extra_data: null, - email: null, - }, - words: [], - }, - ]; - const speaker_timeline_data: SpeakerTimelinePartType[] = [ - create_speaker_event({ participantId: 100, participantName: "John", startTime: 0, endTime: 10 }), - ]; - - // Should not throw - the optional chaining on words[0]?.start_timestamp should handle this - const result = convert_to_hybrid_diarized_transcript_parts({ - transcript_parts, - speaker_timeline_data, - }); - - expect(result).toHaveLength(1); - // With empty words, start defaults to NEGATIVE_INFINITY and end to POSITIVE_INFINITY - // So it won't be contained within the speaker event (start must be >= speaker_event_start) - // Actually NEGATIVE_INFINITY >= 0 is false, so it won't match - expect(result[0].participant.name).toBe("Speaker A"); + expect(result[0].words[0].text).toBe("how is it going today"); + expect(result[1].words[0].text).toBe("it is good"); + expect(result[2].words[0].text).toBe("Actually it is great"); + expect(result[3].words[0].text).toBe("Oh that's great to hear then!"); }); }); }); - diff --git a/bot_async_transcription_hybrid_diarization/src/convert_to_hybrid_diarized_transcript_parts.ts b/bot_async_transcription_hybrid_diarization/src/convert_to_hybrid_diarized_transcript_parts.ts index 1e83d2c..74b477f 100644 --- a/bot_async_transcription_hybrid_diarization/src/convert_to_hybrid_diarized_transcript_parts.ts +++ b/bot_async_transcription_hybrid_diarization/src/convert_to_hybrid_diarized_transcript_parts.ts @@ -1,106 +1,89 @@ import { z } from "zod"; -import { SpeakerTimelinePartSchema, type SpeakerTimelinePartType } from "./schemas/SpeakerTimelinePartSchema"; +import { ParticipantPartSchema, type ParticipantPartType } from "./schemas/ParticipantPartSchema"; import { TranscriptPartSchema, type TranscriptPartType } from "./schemas/TranscriptPartSchema"; /** * Format the transcript data with hybrid diarization. 
- * This will use use machine diarization to get anonymous speaker labels for each participant in the transcript, - * and will diarize them using speaker-timeline diarization if there's only one machine-diarized participant speaking for that participant. - * - * The end result is a transcript which uses speaker-timeline diarization for each participant unless there are multiple people speaking from the same device. + * + * Transcript part names follow the format `{participant_id}-{anonymous_label}`. + * This builds a map of participant_id → Set<anonymous_label> and only replaces + * participant info when a given participant_id has exactly one anonymous label, + * meaning we can confidently attribute those segments to a single speaker. */ export function convert_to_hybrid_diarized_transcript_parts( args: { transcript_parts: TranscriptPartType[], - speaker_timeline_data: SpeakerTimelinePartType[], + participants: ParticipantPartType[], }, ): TranscriptPartType[] { - const { transcript_parts, speaker_timeline_data } = z.object({ + const { transcript_parts, participants } = z.object({ transcript_parts: TranscriptPartSchema.array(), - speaker_timeline_data: SpeakerTimelinePartSchema.array(), + participants: ParticipantPartSchema.array(), }).parse(args); - // eslint-disable-next-line @typescript-eslint/naming-convention - const ParticipantMappingSchema = z.object({ id: z.number().nullable(), name: z.string().nullable() }); - // eslint-disable-next-line @typescript-eslint/naming-convention - type ParticipantMappingType = z.infer<typeof ParticipantMappingSchema>; + const participants_by_id = new Map( + participants.map((p) => [p.id, p]), + ); - // Collect all anonymous speakers per participant across all their timeline segments. - // This ensures we know the total number of unique speakers for each participant before making mapping decisions. 
- const participant_to_anon = new Map<string, Set<string>>(); - for (const speaker_change_event of speaker_timeline_data) { - if (!speaker_change_event.participant.id || !speaker_change_event.participant.name) { - continue; - } - - // Get the bounds of the current speaker event - const speaker_event_start = speaker_change_event.start_timestamp.relative; - const speaker_event_end = speaker_change_event.end_timestamp?.relative ?? Number.POSITIVE_INFINITY; + // Build a map of participant_id → Set<anonymous_label> from transcript part names. + // Name format: "{participant_id}-{anonymous_label}" (e.g. "200-0") + const participant_id_to_anon_labels = new Map<number, Set<string>>(); + for (const part of transcript_parts) { + if (!part.participant.name) continue; - // Get the transcript segments that are within the current speaker event - const transcript_segments = transcript_parts.filter((transcript) => { - const start = transcript.words.find( - (word) => word.start_timestamp?.relative !== undefined && word.start_timestamp.relative < speaker_event_end, - )?.start_timestamp?.relative ?? Number.NEGATIVE_INFINITY; - const end = [...transcript.words].reverse().find( - (word) => word.end_timestamp?.relative !== undefined && word.end_timestamp.relative < speaker_event_end, - )?.end_timestamp?.relative ?? Number.POSITIVE_INFINITY; - return speaker_event_start <= start && speaker_event_end > end; - }); + const match = part.participant.name.match(/^(\d+)-(.+)$/); + if (!match) continue; - // Add the participant to the mapping if it's not already present - const participant_key = JSON.stringify(ParticipantMappingSchema.parse({ - id: speaker_change_event.participant.id ?? null, - name: speaker_change_event.participant.name ?? 
null, - })); - if (!participant_to_anon.has(participant_key)) { - participant_to_anon.set(participant_key, new Set()); - } + const participant_id = parseInt(match[1], 10); + const anon_label = match[2]; - // Add all anonymous speakers from this segment - for (const segment of transcript_segments) { - const participants = participant_to_anon.get(participant_key); - if (participants && segment.participant.name) { - participants.add(segment.participant.name); - } + if (!participant_id_to_anon_labels.has(participant_id)) { + participant_id_to_anon_labels.set(participant_id, new Set()); } + participant_id_to_anon_labels.get(participant_id)!.add(anon_label); } - // Derive mappings: only create mapping for participants with exactly 1 speaker across ALL their segments. - // If a participant ever had multiple speakers, none of them should be mapped. - const anon_to_participant = new Map(); - for (const [participant_raw, anon] of participant_to_anon.entries()) { - const result = ParticipantMappingSchema.safeParse(JSON.parse(participant_raw)); - if (!result.success) { - console.log(`Failed to parse participant: ${participant_raw} - ${result.error.message}`); - continue; - } - const { data: participant } = result; - - if (anon.size === 1) { - const anon_key = anon.values().next().value!; - anon_to_participant.set(anon_key, participant); - } else if (anon.size > 1) { - console.log(`Participant "${participant.name}" (id: ${participant.id}) has ${anon.size} speakers: ${JSON.stringify(Array.from(anon))} - not mapping`); + // Log the mapping for debugging + for (const [participant_id, anon_labels] of participant_id_to_anon_labels) { + const participant = participants_by_id.get(participant_id); + const label_list = JSON.stringify(Array.from(anon_labels)); + if (anon_labels.size === 1) { + console.log(`Participant "${participant?.name}" (id: ${participant_id}) has 1 anonymous label: ${label_list} - will map`); } else { - console.log(`Expected participant to have at least 1 speaker, 
but has ${anon.size}`); + console.log(`Participant "${participant?.name}" (id: ${participant_id}) has ${anon_labels.size} anonymous labels: ${label_list} - skipping`); } } - console.log(`Participant mapping: ${JSON.stringify(Object.fromEntries(anon_to_participant))}`); + // Only map participants that have exactly one anonymous label. + const mappable_participant_ids = new Set( + [...participant_id_to_anon_labels.entries()] + .filter(([, labels]) => labels.size === 1) + .map(([id]) => id), + ); - // Replace the participant data with the mapped participant data. const hybrid_transcript_parts = transcript_parts.map((transcript) => { - const participant_data = anon_to_participant.get(transcript.participant.name ?? ""); - if (!participant_data) { - return transcript; - } + if (!transcript.participant.name) return transcript; + + const match = transcript.participant.name.match(/^(\d+)-(.+)$/); + if (!match) return transcript; + + const participant_id = parseInt(match[1], 10); + if (!mappable_participant_ids.has(participant_id)) return transcript; + + const participant = participants_by_id.get(participant_id); + if (!participant) return transcript; + return { ...transcript, participant: { ...transcript.participant, - ...participant_data, + id: participant.id, + name: participant.name, + is_host: participant.is_host, + platform: participant.platform, + extra_data: participant.extra_data, + email: participant.email, }, }; }); diff --git a/bot_async_transcription_hybrid_diarization/src/schemas/ParticipantPartSchema.ts b/bot_async_transcription_hybrid_diarization/src/schemas/ParticipantPartSchema.ts new file mode 100644 index 0000000..7b3d427 --- /dev/null +++ b/bot_async_transcription_hybrid_diarization/src/schemas/ParticipantPartSchema.ts @@ -0,0 +1,15 @@ +import { z } from "zod"; + +/** + * Schema for a single participant from the participants list. 
+ */ +export const ParticipantPartSchema = z.object({ + id: z.number().nullable(), // Recall.ai assigned participant id (e.g. 100, 200, 300) + name: z.string().nullable(), // Display name from meeting + is_host: z.boolean().nullable(), // True if the participant is the host + platform: z.string().nullable(), // Meeting platform constant. values: 'desktop', 'dial-in', 'unknown' + extra_data: z.any().nullable(), // Extra data about the participant from the meeting platform + email: z.string().nullish(), // Email address of the participant if using Recall's calendar integration (nullish so payloads that omit the key still parse) +}); + +export type ParticipantPartType = z.infer<typeof ParticipantPartSchema>; diff --git a/bot_async_transcription_hybrid_diarization/src/schemas/SpeakerTimelinePartSchema.ts b/bot_async_transcription_hybrid_diarization/src/schemas/SpeakerTimelinePartSchema.ts deleted file mode 100644 index ba8814e..0000000 --- a/bot_async_transcription_hybrid_diarization/src/schemas/SpeakerTimelinePartSchema.ts +++ /dev/null @@ -1,25 +0,0 @@ -import { z } from "zod"; - -/** - * Schema for the speaker timeline data. - */ -export const SpeakerTimelinePartSchema = z.object({ - participant: z.object({ - id: z.number().nullable(), // Recall.ai assigned participant id (e.g. 100, 200, 300) - name: z.string().nullable(), // Display name from meeting - is_host: z.boolean().nullable(), // True if the participant is the host - platform: z.string().nullable(), // Meeting platform constant. values: 'desktop', 'dial-in', 'unknown' - extra_data: z.any().nullable(), // Extra data about the participant from the meeting platform - email: z.string().nullish(), // Email address of the participant if using Recall's calendar integration - }), - start_timestamp: z.object({ - absolute: z.string().nullable(), // ISO 8601 absolute timestamp (e.g. 2025-01-01 00:00:00) - relative: z.number(), // Timestamp in seconds from the start of the recording - }), - end_timestamp: z.object({ - absolute: z.string().nullish(), // ISO 8601 absolute timestamp (e.g. 
2025-01-01 00:00:00) - relative: z.number().nullable(), // Timestamp in seconds from the start of the recording - }).nullable(), -}); - -export type SpeakerTimelinePartType = z.infer<typeof SpeakerTimelinePartSchema>; \ No newline at end of file diff --git a/bot_async_transcription_hybrid_diarization/src/schemas/TranscriptPartSchema.ts b/bot_async_transcription_hybrid_diarization/src/schemas/TranscriptPartSchema.ts index d38929e..f7c2036 100644 --- a/bot_async_transcription_hybrid_diarization/src/schemas/TranscriptPartSchema.ts +++ b/bot_async_transcription_hybrid_diarization/src/schemas/TranscriptPartSchema.ts @@ -1,17 +1,11 @@ import { z } from "zod"; +import { ParticipantPartSchema } from "./ParticipantPartSchema"; /** * Schema for a single transcript part. */ export const TranscriptPartSchema = z.object({ - participant: z.object({ - id: z.number().nullable(), // Recall.ai assigned participant id (e.g. 100, 200, 300) - name: z.string().nullable(), // Display name from meeting - is_host: z.boolean().nullable(), // True if the participant is the host - platform: z.string().nullable(), // Meeting platform constant. 
values: 'desktop', 'dial-in', 'unknown' - extra_data: z.any().nullable(), // Extra data about the participant from the meeting platform - email: z.string().nullish(), // Email address of the participant if using Recall's calendar integration - }), + participant: ParticipantPartSchema, words: z.object({ text: z.string(), start_timestamp: z.object({ diff --git a/package-lock.json b/package-lock.json index 1ccdd00..ab01496 100644 --- a/package-lock.json +++ b/package-lock.json @@ -32007,6 +32007,22 @@ "funding": { "url": "https://github.com/sponsors/colinhacks" } + }, + "workspace_delete_bots_dsdk_recordings": { + "version": "1.0.0", + "extraneous": true, + "license": "MIT", + "dependencies": { + "dotenv": "^17.2.3", + "mri": "^1.2.0", + "zod": "^4.1.13" + }, + "devDependencies": { + "@types/mri": "^1.1.4", + "@types/node": "^24.10.1", + "ts-node": "^10.9.2", + "typescript": "^5.9.3" + } } } }