-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTranscriber.clj
More file actions
239 lines (198 loc) · 6.26 KB
/
Transcriber.clj
File metadata and controls
239 lines (198 loc) · 6.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
(import
'(edu.cmu.sphinx.decoder Decoder)
'(edu.cmu.sphinx.decoder ResultListener)
'(edu.cmu.sphinx.decoder.pruner SimplePruner)
'(edu.cmu.sphinx.decoder.scorer ThreadedAcousticScorer)
'(edu.cmu.sphinx.decoder.search PartitionActiveListFactory)
'(edu.cmu.sphinx.decoder.search SimpleBreadthFirstSearchManager)
'(edu.cmu.sphinx.frontend DataBlocker)
'(edu.cmu.sphinx.frontend FrontEnd)
'(edu.cmu.sphinx.frontend.endpoint NonSpeechDataFilter)
'(edu.cmu.sphinx.frontend.endpoint SpeechClassifier)
'(edu.cmu.sphinx.frontend.endpoint SpeechMarker)
'(edu.cmu.sphinx.frontend.feature DeltasFeatureExtractor)
'(edu.cmu.sphinx.frontend.feature LiveCMN)
'(edu.cmu.sphinx.frontend.filter Preemphasizer)
'(edu.cmu.sphinx.frontend.frequencywarp MelFrequencyFilterBank)
'(edu.cmu.sphinx.frontend.transform DiscreteCosineTransform)
'(edu.cmu.sphinx.frontend.transform DiscreteFourierTransform)
'(edu.cmu.sphinx.frontend.util AudioFileDataSource)
'(edu.cmu.sphinx.frontend.window RaisedCosineWindower)
'(edu.cmu.sphinx.instrumentation BestPathAccuracyTracker)
'(edu.cmu.sphinx.instrumentation MemoryTracker)
'(edu.cmu.sphinx.instrumentation SpeedTracker)
'(edu.cmu.sphinx.jsgf JSGFGrammar)
'(edu.cmu.sphinx.linguist.acoustic UnitManager)
'(edu.cmu.sphinx.linguist.acoustic.tiedstate Sphinx3Loader)
'(edu.cmu.sphinx.linguist.acoustic.tiedstate TiedStateAcousticModel)
'(edu.cmu.sphinx.linguist.dictionary FastDictionary)
'(edu.cmu.sphinx.linguist.flat FlatLinguist)
'(edu.cmu.sphinx.recognizer Recognizer)
'(edu.cmu.sphinx.util LogMath)
'(java.util.logging Logger)
'(java.util.logging Level)
'(java.net URL)
)
;; Path prefix prepended to every "file:" URL this script builds
;; (audio file, acoustic model, dictionary and grammar locations).
(def root "build/ai/components")

;; Common setup: raise the root java.util.logging logger to WARNING.
(-> (Logger/getLogger "")
    (.setLevel Level/WARNING))
;; Single LogMath instance shared by the model loader, grammar, linguist,
;; active-list factory and search manager defined later in this script.
;; NOTE(review): the two arguments are not named here; they look like
;; (logBase, useAddTable) -- confirm against the LogMath constructor.
(def logMath (LogMath. 1.0001 true))

;; Decoder tuning constants, consumed by the constructors further down.
(def absoluteBeamWidth -1)              ;; given to PartitionActiveListFactory
(def relativeBeamWidth 1E-80)           ;; given to PartitionActiveListFactory
(def wordInsertionProbability 1E-36)    ;; given to FlatLinguist
(def languageWeight 8.0)                ;; given to FlatLinguist
;; Audio input: a file-backed data source pointed at the demo WAV file.
(def audioURL
  (URL. (str "file:" root "/demo/files/10001-90210-01803.wav")))

;; NOTE(review): the constructor arguments (3200, nil) are positional and not
;; named in this file -- confirm their meaning against AudioFileDataSource.
(def audioSource (AudioFileDataSource. 3200 nil))

;; Attach the WAV file to the source; the third argument is passed as nil.
(.setAudioFile audioSource audioURL nil)
;; Front-end stages, part 1: data blocking plus the processors imported from
;; the edu.cmu.sphinx.frontend.endpoint package. The trailing comments give
;; the constructor parameter each positional value is bound to.
(def dataBlocker
  (DataBlocker. 10))                 ;; blockSizeMs

(def speechClassifier
  (SpeechClassifier. 10              ;; frameLengthMs
                     0.003           ;; adjustment
                     10              ;; threshold
                     0))             ;; minSignal

(def speechMarker
  (SpeechMarker. 200                 ;; startSpeechTime
                 500                 ;; endSilenceTime
                 100                 ;; speechLeader
                 50                  ;; speechLeaderFrames
                 100                 ;; speechTrailer
                 15.0))              ;; decay
;; Front-end stages, part 2: non-speech filtering, pre-emphasis, windowing,
;; Fourier transform and mel filter bank. Trailing comments name the
;; constructor parameter for each positional value.
(def nonSpeechDataFilter (NonSpeechDataFilter.))

;; The var name is spelled "premphasizer" and is referenced under that
;; spelling by the pipeline definition, so it is kept as-is.
(def premphasizer
  (Preemphasizer. 0.97))                   ;; preemphasisFactor

(def windower
  (RaisedCosineWindower. 0.46              ;; double alpha
                         25.625            ;; windowSizeInMs
                         10.0))            ;; windowShiftInMs

(def fft
  (DiscreteFourierTransform. -1            ;; numberFftPoints
                             false))       ;; invert

(def melFilterBank
  (MelFrequencyFilterBank. 130.0           ;; minFreq
                           6800.0          ;; maxFreq
                           40))            ;; numberFilters
;; Front-end stages, part 3: cosine transform, live cepstral mean
;; normalisation and delta feature extraction. Trailing comments name the
;; constructor parameter for each positional value.
(def dct
  (DiscreteCosineTransform. 40       ;; numberMelFilters
                            13))     ;; cepstrumSize

(def cmn
  (LiveCMN. 12.0                     ;; initialMean
            100                      ;; cmnWindow
            160))                    ;; cmnShiftWindow

(def featureExtraction
  (DeltasFeatureExtractor. 3))       ;; window
;; Processing stages, listed in the order they are handed to the FrontEnd:
;; the audio source first, the feature extractor last.
(def pipeline
  [audioSource dataBlocker
   speechClassifier speechMarker nonSpeechDataFilter
   premphasizer windower fft melFilterBank dct cmn
   featureExtraction])

;; Front end constructed from the stage vector above.
(def frontend (FrontEnd. pipeline))
;; Unit manager instance shared by the dictionary, the model loader, the
;; acoustic model and the linguist.
(def unitManager (UnitManager.))

;; Dictionary built from two files under the tidigits model directory.
;; NOTE(review): the arguments after the two URLs (empty list, false, "<sil>",
;; false, false) are positional and not named in this file -- confirm their
;; meaning against the FastDictionary constructor.
(def dictionary
  (let [dict-url      (URL. (str "file:" root "/models/acoustic/tidigits/dict/dictionary"))
        noisedict-url (URL. (str "file:" root "/models/acoustic/tidigits/noisedict"))]
    (FastDictionary. dict-url
                     noisedict-url
                     []
                     false
                     "<sil>"
                     false
                     false
                     unitManager)))
;; Loader pointed at the tidigits acoustic model directory, sharing the
;; script-wide logMath and unitManager.
;; NOTE(review): "mdef", the empty string, the three float values and the
;; trailing boolean are positional and not named in this file -- confirm
;; their meaning against the Sphinx3Loader constructor.
(def modelLoader
  (Sphinx3Loader. (URL. (str "file:" root "/models/acoustic/tidigits"))
                  "mdef"
                  ""
                  logMath
                  unitManager
                  (float 0.0)
                  (float 1e-7)
                  (float 0.0001)
                  true))

;; Acoustic model tying together the loader and the shared unit manager.
;; NOTE(review): meaning of the trailing `true` is not shown here -- confirm.
(def model
  (TiedStateAcousticModel. modelLoader unitManager true))
;; Linguistics, step 1: the JSGF grammar named "digits", resolved relative to
;; the transcriber demo source directory. Trailing comments give the
;; constructor parameter each positional value is bound to.
(def grammar
  (JSGFGrammar.
    (URL. (str "file:" root "/src/apps/edu/cmu/sphinx/demo/transcriber/")) ;; URL baseURL
    logMath        ;; LogMath logMath
    "digits"       ;; String grammarName
    false          ;; boolean showGrammar
    false          ;; boolean optimizeGrammar
    false          ;; boolean addSilenceWords
    false          ;; boolean addFillerWords
    dictionary))   ;; Dictionary dictionary
;; Linguistics, step 2: a FlatLinguist combining the acoustic model, the
;; grammar and the shared logMath/unitManager. Trailing comments give the
;; constructor parameter each positional value is bound to.
(def linguist
  (FlatLinguist.
    model                       ;; AcousticModel acousticModel
    logMath                     ;; LogMath logMath
    grammar                     ;; Grammar grammar
    unitManager                 ;; UnitManager unitManager
    wordInsertionProbability    ;; double wordInsertionProbability
    1.0                         ;; double silenceInsertionProbability
    1.0                         ;; double fillerInsertionProbability
    1.0                         ;; double unitInsertionProbability
    languageWeight              ;; float languageWeight
    false                       ;; boolean dumpGStates
    false                       ;; boolean showCompilationProgress
    false                       ;; boolean spreadWordProbabilitiesAcrossPronunciations
    false                       ;; boolean addOutOfGrammarBranch
    1.0                         ;; double outOfGrammarBranchProbability
    1.0                         ;; double phoneInsertionProbability
    nil))                       ;; AcousticModel phoneLoopAcousticModel
;; Acoustic scorer that reads its input from the front end.
;; NOTE(review): the arguments after `frontend` (nil, 10, true, 0 and the
;; thread priority) are positional and not named in this file -- confirm
;; their meaning against the ThreadedAcousticScorer constructor.
(def scorer
  (ThreadedAcousticScorer. frontend
                           nil
                           10
                           true
                           0
                           Thread/NORM_PRIORITY))

;; Pruner instance handed to the search manager.
(def pruner (SimplePruner.))

;; Active-list factory configured with the two beam widths and the shared
;; logMath.
(def activeListFactory
  (PartitionActiveListFactory. absoluteBeamWidth
                               relativeBeamWidth
                               logMath))
;; Search manager wired to the linguist, pruner, scorer and active-list
;; factory built earlier.
;; NOTE(review): the trailing (false 0.0 0 false) are positional and not
;; named in this file -- confirm against SimpleBreadthFirstSearchManager.
(def searchManager
  (SimpleBreadthFirstSearchManager. logMath
                                    linguist
                                    pruner
                                    scorer
                                    activeListFactory
                                    false
                                    0.0
                                    0
                                    false))

;; Decoder driving the search manager; it is given an empty listener vector.
;; NOTE(review): the two booleans and 100000 are positional and not named in
;; this file -- confirm against the Decoder constructor.
(def decoder
  (Decoder. searchManager
            false
            false
            []
            100000))

;; Recognizer wrapping the decoder; the second argument is passed as nil.
(def recognizer (Recognizer. decoder nil))
;; Allocate the resources the recognizer needs before the first decode.
(.allocate recognizer)

;; Decode repeatedly and print the best result (without fillers) of each
;; pass. The loop stops once .recognize returns nil, which happens after the
;; last utterance in the audio file has been decoded.
;; Single-shot variant kept for reference:
;;(print (.getBestResultNoFiller (.recognize recognizer)))
(loop [result (.recognize recognizer)]
  (when-not (nil? result)
    (println (.getBestResultNoFiller result))
    (recur (.recognize recognizer))))