-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDigraph Extraction.py
More file actions
63 lines (49 loc) · 2.35 KB
/
Digraph Extraction.py
File metadata and controls
63 lines (49 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Requires the third-party packages SpeechRecognition and pydub
# (e.g. pip install SpeechRecognition pydub)
# Import required libraries
import os
import re
import tempfile

import speech_recognition as sr
from pydub import AudioSegment
# Function to extract words with digraphs from audio
def extract_words_with_digraphs_from_audio(audio_file_path, target_digraphs):
    """Transcribe an audio file and return its words that contain a digraph.

    The audio is loaded with pydub, converted to WAV, transcribed with the
    Google Web Speech API using the ``sw-TZ`` language code, and the
    transcript is handed to ``extract_words_with_digraphs``.

    Args:
        audio_file_path: Path of the audio file to load.
        target_digraphs: Digraph strings to look for in the transcript.

    Returns:
        The list returned by ``extract_words_with_digraphs`` for the
        transcript, or an empty list when the speech could not be understood
        or the API request failed (a message is printed in both cases).
    """
    recognizer = sr.Recognizer()
    audio = AudioSegment.from_file(audio_file_path)

    # Convert inside a private temporary directory instead of writing a fixed
    # "temp.wav" into the working directory: nothing is left behind and
    # concurrent runs cannot overwrite each other's file.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "temp.wav")
        # export() returns the output file handle; close it so the directory
        # can be removed cleanly on every platform.
        audio.export(wav_path, format="wav").close()
        with sr.AudioFile(wav_path) as source:
            # record() reads the audio into memory, so the WAV file is no
            # longer needed once this block exits.
            audio_data = recognizer.record(source)

    # Recognize speech using Google Web Speech API
    try:
        recognized_text = recognizer.recognize_google(audio_data, language="sw-TZ")
    except sr.UnknownValueError:
        print("Google Web Speech API could not understand audio")
        return []
    except sr.RequestError as e:
        print(f"Could not request results from Google Web Speech API; {e}")
        return []
    return extract_words_with_digraphs(recognized_text, target_digraphs)
# Function to extract words with digraphs from text
def extract_words_with_digraphs(text, target_digraphs):
    """Return the words of *text* that contain any of the target digraphs.

    Words are the whitespace-separated tokens of *text*; they are returned
    unchanged (original casing, attached punctuation included) and in their
    original order. Matching ignores case, so ``"Chakula"`` matches the
    digraph ``"ch"``.

    Args:
        text: The text to search.
        target_digraphs: Iterable of digraph strings. Empty strings are
            ignored because they would match every word.

    Returns:
        A list of the matching words; empty when nothing matches.
    """
    # Normalise once, up front: lowercasing makes the comparison
    # case-insensitive, and materialising into a tuple means a one-shot
    # iterable is not exhausted after the first word.
    digraphs = tuple(digraph.lower() for digraph in target_digraphs if digraph)

    words_with_digraphs = []
    for word in text.split():
        lowered = word.lower()
        if any(digraph in lowered for digraph in digraphs):
            words_with_digraphs.append(word)
    return words_with_digraphs
# Function to extract words with digraphs from a document
def extract_words_with_digraphs_from_document(document_path, target_digraphs):
    """Read a UTF-8 text document and return its words that contain a digraph.

    Args:
        document_path: Path of the text file to read.
        target_digraphs: Digraph strings to look for in the document.

    Returns:
        Whatever ``extract_words_with_digraphs`` returns for the file's
        full contents.
    """
    with open(document_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    matches = extract_words_with_digraphs(contents, target_digraphs)
    return matches
# Example usage
# Inputs for the example run; kept at module level so they remain available
# under the same names as before.
audio_file_path = "example_audio.wav"
document_path = "CLN4-DEV SWAHILI DATA SETS.txt"
# Each digraph is listed once ("ng" previously appeared twice; the duplicate
# had no effect on the results).
target_digraphs = ["ch", "dh", "gh", "kh", "ng", "ny", "sh", "th"]

# Only run the audio transcription and the document scan when this file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    audio_digraphs = extract_words_with_digraphs_from_audio(audio_file_path, target_digraphs)
    print("Words containing the specified digraphs found in audio:", audio_digraphs)
    document_digraphs = extract_words_with_digraphs_from_document(document_path, target_digraphs)
    print("Words containing the specified digraphs found in document:", document_digraphs)