From a2e4ad7497f8c1de4fb6c03f730283256f571965 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 25 May 2025 23:24:54 +0800 Subject: [PATCH 1/2] =?UTF-8?q?1/=20=E4=BF=AE=E5=A4=8D=E7=94=9F=E6=88=90?= =?UTF-8?q?=E5=8D=95=E8=AF=8D=E7=BA=A7=E5=B8=A6=E6=97=B6=E9=97=B4=E6=88=B3?= =?UTF-8?q?=E7=9A=84=E5=AD=97=E5=B9=95=E7=9A=84=E9=97=AE=E9=A2=98=202/=20?= =?UTF-8?q?=E8=BF=94=E5=9B=9E=E4=B8=ADtext=E4=B8=BA=E7=A9=BA=E7=9A=84?= =?UTF-8?q?=E6=83=85=E5=86=B5=E7=9A=84=E5=A4=84=E7=90=86=203/=20=E5=88=A0?= =?UTF-8?q?=E9=99=A4=E4=B8=8D=E5=BF=85=E8=A6=81=E7=9A=84json=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E7=94=9F=E6=88=90=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/asr_backend/elevenlabs_asr.py | 76 +++++++++++++----------------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/core/asr_backend/elevenlabs_asr.py b/core/asr_backend/elevenlabs_asr.py index 5a4c5dda..59fc789c 100644 --- a/core/asr_backend/elevenlabs_asr.py +++ b/core/asr_backend/elevenlabs_asr.py @@ -30,40 +30,34 @@ # ---------------------------- SPLIT_GAP = 1 -def elev2whisper(elev_json, word_level_timestamp = False): +def elev2whisper(elev_json, word_level_timestamp = True): + segments = [] + current_segment = None words = elev_json.get("words", []) if not words: return {"segments": []} - - segments, seg = [], { - "text": "", # accumulated text - "start": words[0]["start"], # seg start time - "end": words[0]["end"], # seg end time (updates) - "speaker_id": words[0]["speaker_id"], - "words": [] # optional per‑word info - } - - for prev, nxt in zip(words, words[1:] + [None]): # pairwise with sentinel - seg["text"] += prev["text"] - seg["end"] = prev["end"] - if word_level_timestamp: - seg["words"].append({"text": prev["text"], "start": prev["start"], "end": prev["end"]}) - # decide whether to break the segment - if nxt is None or (nxt["start"] - prev["end"] > SPLIT_GAP) or (nxt["speaker_id"] != seg["speaker_id"]): - seg["text"] = seg["text"].strip() - if not word_level_timestamp: - seg.pop("words") - segments.append(seg) - if nxt is not None: # seed next segment - seg = { - "text": "", - "start": nxt["start"], - "end": nxt["end"], - "speaker_id": nxt["speaker_id"], - "words": [] - } + + for word in elev_json.get('words', []): + if word['text']== ' ': + continue + # Process timestamps + start = word['start'] + end = word['end'] + text = word['text'] + + # Update or create segment + if not current_segment: + current_segment = {'words': []} + + # Add word to current segment + current_segment['words'].append({'word': text, 'start': start, 'end': end}) + + if current_segment: + segments.append(current_segment) + return {"segments": segments} + def transcribe_audio_elevenlabs(raw_audio_path, vocal_audio_path, start = None, end = None): rprint(f"[cyan]🎤 Processing audio transcription, file path: {vocal_audio_path}[/cyan]") LOG_FILE = f"output/log/elevenlabs_transcribe_{start}_{end}.json" @@ -99,7 +93,7 @@ def transcribe_audio_elevenlabs(raw_audio_path, vocal_audio_path, start = None, "timestamps_granularity": "word", "language_code": load_key("whisper.language"), "diarize": True, - "num_speakers": None, + "num_speakers": 1, "tag_audio_events": False } @@ -114,20 +108,12 @@ def transcribe_audio_elevenlabs(raw_audio_path, vocal_audio_path, start = None, # save detected language detected_language = iso_639_2_to_1.get(result["language_code"], result["language_code"]) update_key("whisper.detected_language", detected_language) - - # Adjust timestamps for all words by adding the start time - if start is not None and 'words' in result: - for word in result['words']: - if 'start' in word: - word['start'] += start - if 'end' in word: - word['end'] += start rprint(f"[green]✓ Transcription completed in {time.time() - start_time:.2f} seconds[/green]") parsed_result = elev2whisper(result) - os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True) - with open(LOG_FILE, "w", encoding="utf-8") as f: - json.dump(parsed_result, f, indent=4, ensure_ascii=False) + # os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True) + # with open(LOG_FILE, "w", encoding="utf-8") as f: + # json.dump(parsed_result, f, indent=4, ensure_ascii=False) return parsed_result finally: # Clean up the temporary file @@ -135,9 +121,11 @@ def transcribe_audio_elevenlabs(raw_audio_path, vocal_audio_path, start = None, os.remove(temp_filepath) if __name__ == "__main__": - file_path = input("Enter local audio file path (mp3 format): ") - language = input("Enter language code for transcription (en or zh or other...): ") - result = transcribe_audio_elevenlabs(file_path, language_code=language) + # file_path = input("Enter local audio file path (mp3 format): ") + # language = input("Enter language code for transcription (en or zh or other...): ") + file_path='/Users/gw/workspace/source/vl3/output/audio/raw.mp3' + language='en' + result = transcribe_audio_elevenlabs(file_path, file_path) print(result) # Save result to file From 25398bf64f94b67f75dfa551710ed0cfd1fb090e Mon Sep 17 00:00:00 2001 From: Happy <6496966+taiyishengshui@users.noreply.github.com> Date: Wed, 28 May 2025 11:47:27 +0800 Subject: [PATCH 2/2] Update elevenlabs_asr.py --- core/asr_backend/elevenlabs_asr.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core/asr_backend/elevenlabs_asr.py b/core/asr_backend/elevenlabs_asr.py index 59fc789c..54c1a50c 100644 --- a/core/asr_backend/elevenlabs_asr.py +++ b/core/asr_backend/elevenlabs_asr.py @@ -121,10 +121,8 @@ def transcribe_audio_elevenlabs(raw_audio_path, vocal_audio_path, start = None, os.remove(temp_filepath) if __name__ == "__main__": - # file_path = input("Enter local audio file path (mp3 format): ") - # language = input("Enter language code for transcription (en or zh or other...): ") - file_path='/Users/gw/workspace/source/vl3/output/audio/raw.mp3' - language='en' + file_path = input("Enter local audio file path (mp3 format): ") + language = input("Enter language code for transcription (en or zh or other...): ") result = transcribe_audio_elevenlabs(file_path, file_path) print(result)