From 55a963939eb0337622e79978f70d5d2db3dae44b Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Tue, 30 Dec 2025 21:57:24 +0300 Subject: [PATCH 1/6] Add Apple Keyword Recognition results --- BENCHMARKS.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index fc64256..8774bec 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -537,11 +537,11 @@ -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | -| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | -| earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | +| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 28.42 | 17 | 14.69 | 16.89 | +| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | +| earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - |

@@ -564,11 +564,11 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | -| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | -| earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | +| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 | +| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | - | +| earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.99 | - |

@@ -591,11 +591,11 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | -| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | -| earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | +| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 | +| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.45 | - | +| earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.4 | - |

@@ -620,11 +620,11 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | -| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | -| earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | +| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.41 | 0.56 | +| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.62 | - | +| earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.58 | - |

From 23d36be6d7f8aaa47d1d67fb3e41d6a5d9cac3f1 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Tue, 30 Dec 2025 22:00:28 +0300 Subject: [PATCH 2/6] Fix table format bug --- BENCHMARKS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 8774bec..118c855 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -538,7 +538,7 @@ | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 28.42 | 17 | 14.69 | 16.89 | | earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | | earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - | @@ -565,7 +565,7 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 | | earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | - | | earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.99 | - | @@ -592,7 +592,7 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 | | earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.45 | - | | earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.4 | - | @@ -621,7 +621,7 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.41 | 0.56 | | earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.62 | - | | earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.58 | - | From 794a8e7b19c11896b5a93461c57bc061ad39b29d Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Tue, 30 Dec 2025 22:03:51 +0300 Subject: [PATCH 3/6] fix WER values --- BENCHMARKS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 118c855..18bc823 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -539,7 +539,7 @@ | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 28.42 | 17 | 14.69 | 16.89 | +| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 28.42 | 14.69 | 28.42 | 17 | | earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | | earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - | From e2480e8133b2def6e24b5a887542a11a39f16d17 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Tue, 30 Dec 2025 22:12:20 +0300 Subject: [PATCH 4/6] update benchmarks keyword recognition descriptions --- BENCHMARKS.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 18bc823..93ed4a5 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -505,6 +505,19 @@ - **Code Reference:** [openbench/pipeline/transcription/transcription_whisperkitpro.py](https://github.com/argmaxinc/OpenBench/blob/main/src/openbench/pipeline/transcription/transcription_whisperkitpro.py) - **Hardware**: M2 Ultra Mac Studio +### Apple +- **Latest Run:** `2025-12-30` +- **Model Version:** `SFSpeechRecognizer` +- **Configuration:** Open-source Apple SFSpeechTranscriber implementation with custom vocabulary. See [SFSpeechRecognizer](https://developer.apple.com/documentation/speech/sfspeechrecognizer) for more details. +- **Code Reference:** [openbench/pipeline/transcription/apple_speech_analyzer.py](https://github.com/argmaxinc/OpenBench/blob/main/src/openbench/pipeline/transcription/apple_speech_analyzer.py) + +### Apple +- **Latest Run:** `2025-12-30` +- **Model Version:** `SpeechAnalyzer` +- **Configuration:** Open-source Apple SpeechAnalyzer implementation. See [SpeechAnalyzer](https://developer.apple.com/documentation/speech/speechanalyzer) for more details. +- **Code Reference:** [openbench/pipeline/transcription/apple_speech_analyzer.py](https://github.com/argmaxinc/OpenBench/blob/main/src/openbench/pipeline/transcription/apple_speech_analyzer.py) +- **Note:** SpeechAnalyzer doesn't support custom vocabulary. +
@@ -537,7 +550,7 @@ -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 28.42 | 14.69 | 28.42 | 17 | | earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | @@ -564,7 +577,7 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 | | earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | - | @@ -591,7 +604,7 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 | | earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.45 | - | @@ -620,7 +633,7 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechTranscriber
(Old API) | Apple
(SpeechAnalyzer)| +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.41 | 0.56 | | earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.62 | - | From b0f7f6ff893dd6f1e8102eb1d02fd9eeab95d092 Mon Sep 17 00:00:00 2001 From: atila Date: Tue, 30 Dec 2025 14:50:36 -0800 Subject: [PATCH 5/6] Update system description --- BENCHMARKS.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 93ed4a5..23ed46c 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -508,15 +508,14 @@ ### Apple - **Latest Run:** `2025-12-30` - **Model Version:** `SFSpeechRecognizer` -- **Configuration:** Open-source Apple SFSpeechTranscriber implementation with custom vocabulary. See [SFSpeechRecognizer](https://developer.apple.com/documentation/speech/sfspeechrecognizer) for more details. +- **Configuration:** [SFSpeechRecognizer](https://developer.apple.com/documentation/speech/sfspeechrecognizer) with the Custom Vocabulary feature enabled via the `analysisContext` argument. - **Code Reference:** [openbench/pipeline/transcription/apple_speech_analyzer.py](https://github.com/argmaxinc/OpenBench/blob/main/src/openbench/pipeline/transcription/apple_speech_analyzer.py) ### Apple - **Latest Run:** `2025-12-30` - **Model Version:** `SpeechAnalyzer` -- **Configuration:** Open-source Apple SpeechAnalyzer implementation. See [SpeechAnalyzer](https://developer.apple.com/documentation/speech/speechanalyzer) for more details. +- **Configuration:** [SpeechAnalyzer](https://developer.apple.com/documentation/speech/speechanalyzer). According to Apple's documentation, this system does not support Custom Vocabulary, so it is benchmarked only on the "no keywords" baseline task. - **Code Reference:** [openbench/pipeline/transcription/apple_speech_analyzer.py](https://github.com/argmaxinc/OpenBench/blob/main/src/openbench/pipeline/transcription/apple_speech_analyzer.py) -- **Note:** SpeechAnalyzer doesn't support custom vocabulary. @@ -550,9 +549,9 @@ -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 28.42 | 14.69 | 28.42 | 17 | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|---------------------------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 28.42 | 14.69 | 28.42 | 17 | | earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | | earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - | @@ -577,9 +576,9 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 | +| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 | | earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | - | | earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.99 | - | @@ -604,9 +603,9 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 | +| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 | | earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.45 | - | | earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.4 | - | From 5afd16b8d8418b705b3a82d84acc485a8993abd5 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Wed, 31 Dec 2025 13:25:40 +0300 Subject: [PATCH 6/6] fix typo --- BENCHMARKS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 23ed46c..91801c0 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -551,7 +551,7 @@ | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|---------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 28.42 | 14.69 | 28.42 | 17 | +| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | 28.42 | 17 | | earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | | earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - |