Skip to content

Commit 79c057e

Browse files
committed
test(zoom): cover iterative sanitization in transcript parser
1 parent 9c2e5ff commit 79c057e

2 files changed

Lines changed: 90 additions & 1 deletion

File tree

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/**
2+
* @vitest-environment node
3+
*/
4+
import { describe, expect, it } from 'vitest'
5+
import { parseVtt } from '@/connectors/zoom/zoom'
6+
7+
const HEADER = 'WEBVTT\n\n'
8+
9+
describe('parseVtt', () => {
10+
it.concurrent('returns empty string for input with no cues', () => {
11+
expect(parseVtt(HEADER)).toBe('')
12+
})
13+
14+
it.concurrent('extracts plain spoken text from a single cue', () => {
15+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nHello world\n`
16+
expect(parseVtt(vtt)).toBe('Hello world')
17+
})
18+
19+
it.concurrent('preserves WebVTT voice tags as "Speaker: text"', () => {
20+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v Alice>hello there</v>\n`
21+
expect(parseVtt(vtt)).toBe('Alice: hello there')
22+
})
23+
24+
it.concurrent('preserves voice tags with class suffix', () => {
25+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v.host Bob>welcome</v>\n`
26+
expect(parseVtt(vtt)).toBe('Bob: welcome')
27+
})
28+
29+
it.concurrent('strips inline formatting tags but keeps text', () => {
30+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<b>bold</b> and <i>italic</i>\n`
31+
expect(parseVtt(vtt)).toBe('bold and italic')
32+
})
33+
34+
it.concurrent('strips karaoke timestamp tags', () => {
35+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello <00:00:01.000>world\n`
36+
expect(parseVtt(vtt)).toBe('hello world')
37+
})
38+
39+
it.concurrent('strips class spans', () => {
40+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<c.loud>SHOUT</c>\n`
41+
expect(parseVtt(vtt)).toBe('SHOUT')
42+
})
43+
44+
it.concurrent('skips cue identifier lines before timing', () => {
45+
const vtt = `${HEADER}cue-1\n00:00:00.000 --> 00:00:02.000\nhello\n`
46+
expect(parseVtt(vtt)).toBe('hello')
47+
})
48+
49+
it.concurrent('joins multiple cues with newlines', () => {
50+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nfirst\n\n00:00:02.000 --> 00:00:04.000\nsecond\n`
51+
expect(parseVtt(vtt)).toBe('first\nsecond')
52+
})
53+
54+
it.concurrent('collapses repeated whitespace within a cue', () => {
55+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello    world\n`
56+
expect(parseVtt(vtt)).toBe('hello world')
57+
})
58+
59+
it.concurrent('iteratively strips overlapping tags that reconstruct after one pass', () => {
60+
const crafted = '<<b>b>injected</<b>b>'
61+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
62+
const result = parseVtt(vtt)
63+
expect(result).not.toMatch(/<\/?[^>]+>/)
64+
expect(result).toContain('injected')
65+
})
66+
67+
it.concurrent('iteratively strips nested script-like tag fragments', () => {
68+
const crafted = '<scr<script>ipt>alert(1)</scr</script>ipt>'
69+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
70+
const result = parseVtt(vtt)
71+
expect(result).not.toMatch(/<\/?[^>]+>/)
72+
expect(result.toLowerCase()).not.toContain('script')
73+
})
74+
75+
it.concurrent('sanitizes crafted speaker names that embed tag fragments', () => {
76+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v <b>Evil</b>>payload</v>\n`
77+
const result = parseVtt(vtt)
78+
expect(result).not.toMatch(/<\/?[^>]+>/)
79+
})
80+
81+
it.concurrent('terminates on adversarial deeply-nested input', () => {
82+
const crafted = `${'<'.repeat(50)}b${'>'.repeat(50)}text${'<'.repeat(50)}/b${'>'.repeat(50)}`
83+
const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
84+
const result = parseVtt(vtt)
85+
expect(result).not.toMatch(/<\/?[^>]+>/)
86+
})
87+
})

apps/sim/connectors/zoom/zoom.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,10 @@ function findTranscriptFile(files?: ZoomRecordingFile[]): ZoomRecordingFile | un
120120
* Extracts spoken text from a Zoom WebVTT transcript, stripping cue identifiers,
121121
* timestamps, and inline markup. Handles both Zoom's `Speaker: text` convention
122122
* and standard WebVTT `<v Speaker>text</v>` voice tags.
123+
*
124+
* Exported for unit tests; not part of the connector's public surface.
123125
*/
124-
function parseVtt(vtt: string): string {
126+
export function parseVtt(vtt: string): string {
125127
const lines = vtt.split(/\r?\n/)
126128
const segments: string[] = []
127129
let i = 0

0 commit comments

Comments
 (0)