jhj0517 committed on
Commit
20c2916
·
1 Parent(s): e92850a

migrate faster-whisper to 1.0.3

Browse files
app.py CHANGED
@@ -115,7 +115,6 @@ class App:
115
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
116
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
117
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
118
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
119
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
120
  with gr.Accordion("Diarization", open=False):
121
  cb_diarize = gr.Checkbox(label="Enable Diarization")
@@ -152,7 +151,6 @@ class App:
152
  min_speech_duration_ms=nb_min_speech_duration_ms,
153
  max_speech_duration_s=nb_max_speech_duration_s,
154
  min_silence_duration_ms=nb_min_silence_duration_ms,
155
- window_size_sample=nb_window_size_sample,
156
  speech_pad_ms=nb_speech_pad_ms,
157
  chunk_length_s=nb_chunk_length_s,
158
  batch_size=nb_batch_size,
@@ -203,7 +201,6 @@ class App:
203
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
204
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
205
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
206
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
207
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
208
  with gr.Accordion("Diarization", open=False):
209
  cb_diarize = gr.Checkbox(label="Enable Diarization")
@@ -241,7 +238,6 @@ class App:
241
  min_speech_duration_ms=nb_min_speech_duration_ms,
242
  max_speech_duration_s=nb_max_speech_duration_s,
243
  min_silence_duration_ms=nb_min_silence_duration_ms,
244
- window_size_sample=nb_window_size_sample,
245
  speech_pad_ms=nb_speech_pad_ms,
246
  chunk_length_s=nb_chunk_length_s,
247
  batch_size=nb_batch_size,
@@ -284,7 +280,6 @@ class App:
284
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
285
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
286
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
287
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
288
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
289
  with gr.Accordion("Diarization", open=False):
290
  cb_diarize = gr.Checkbox(label="Enable Diarization")
@@ -324,7 +319,6 @@ class App:
324
  min_speech_duration_ms=nb_min_speech_duration_ms,
325
  max_speech_duration_s=nb_max_speech_duration_s,
326
  min_silence_duration_ms=nb_min_silence_duration_ms,
327
- window_size_sample=nb_window_size_sample,
328
  speech_pad_ms=nb_speech_pad_ms,
329
  chunk_length_s=nb_chunk_length_s,
330
  batch_size=nb_batch_size,
 
115
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
116
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
117
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
 
118
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
119
  with gr.Accordion("Diarization", open=False):
120
  cb_diarize = gr.Checkbox(label="Enable Diarization")
 
151
  min_speech_duration_ms=nb_min_speech_duration_ms,
152
  max_speech_duration_s=nb_max_speech_duration_s,
153
  min_silence_duration_ms=nb_min_silence_duration_ms,
 
154
  speech_pad_ms=nb_speech_pad_ms,
155
  chunk_length_s=nb_chunk_length_s,
156
  batch_size=nb_batch_size,
 
201
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
202
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
203
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
 
204
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
205
  with gr.Accordion("Diarization", open=False):
206
  cb_diarize = gr.Checkbox(label="Enable Diarization")
 
238
  min_speech_duration_ms=nb_min_speech_duration_ms,
239
  max_speech_duration_s=nb_max_speech_duration_s,
240
  min_silence_duration_ms=nb_min_silence_duration_ms,
 
241
  speech_pad_ms=nb_speech_pad_ms,
242
  chunk_length_s=nb_chunk_length_s,
243
  batch_size=nb_batch_size,
 
280
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
281
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
282
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
 
283
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
284
  with gr.Accordion("Diarization", open=False):
285
  cb_diarize = gr.Checkbox(label="Enable Diarization")
 
319
  min_speech_duration_ms=nb_min_speech_duration_ms,
320
  max_speech_duration_s=nb_max_speech_duration_s,
321
  min_silence_duration_ms=nb_min_silence_duration_ms,
 
322
  speech_pad_ms=nb_speech_pad_ms,
323
  chunk_length_s=nb_chunk_length_s,
324
  batch_size=nb_batch_size,
modules/whisper/whisper_base.py CHANGED
@@ -91,7 +91,6 @@ class WhisperBase(ABC):
91
  min_speech_duration_ms=params.min_speech_duration_ms,
92
  max_speech_duration_s=params.max_speech_duration_s,
93
  min_silence_duration_ms=params.min_silence_duration_ms,
94
- window_size_samples=params.window_size_samples,
95
  speech_pad_ms=params.speech_pad_ms
96
  )
97
  self.vad.run(
 
91
  min_speech_duration_ms=params.min_speech_duration_ms,
92
  max_speech_duration_s=params.max_speech_duration_s,
93
  min_silence_duration_ms=params.min_silence_duration_ms,
 
94
  speech_pad_ms=params.speech_pad_ms
95
  )
96
  self.vad.run(
modules/whisper/whisper_parameter.py CHANGED
@@ -23,7 +23,6 @@ class WhisperParameters:
23
  min_speech_duration_ms: gr.Number
24
  max_speech_duration_s: gr.Number
25
  min_silence_duration_ms: gr.Number
26
- window_size_sample: gr.Number
27
  speech_pad_ms: gr.Number
28
  chunk_length_s: gr.Number
29
  batch_size: gr.Number
@@ -111,11 +110,6 @@ class WhisperParameters:
111
  This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
112
  before separating it
113
 
114
- window_size_samples: gr.Number
115
- This parameter is related with Silero VAD. Audio chunks of window_size_samples size are fed to the silero VAD model.
116
- WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
117
- Values other than these may affect model performance!!
118
-
119
  speech_pad_ms: gr.Number
120
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
121
 
@@ -178,13 +172,12 @@ class WhisperParameters:
178
  min_speech_duration_ms=args[15],
179
  max_speech_duration_s=args[16],
180
  min_silence_duration_ms=args[17],
181
- window_size_samples=args[18],
182
- speech_pad_ms=args[19],
183
- chunk_length_s=args[20],
184
- batch_size=args[21],
185
- is_diarize=args[22],
186
- hf_token=args[23],
187
- diarization_device=args[24]
188
  )
189
 
190
 
@@ -208,7 +201,6 @@ class WhisperValues:
208
  min_speech_duration_ms: int
209
  max_speech_duration_s: float
210
  min_silence_duration_ms: int
211
- window_size_samples: int
212
  speech_pad_ms: int
213
  chunk_length_s: int
214
  batch_size: int
@@ -217,4 +209,4 @@ class WhisperValues:
217
  diarization_device: str
218
  """
219
  A data class to use Whisper parameters.
220
- """
 
23
  min_speech_duration_ms: gr.Number
24
  max_speech_duration_s: gr.Number
25
  min_silence_duration_ms: gr.Number
 
26
  speech_pad_ms: gr.Number
27
  chunk_length_s: gr.Number
28
  batch_size: gr.Number
 
110
  This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
111
  before separating it
112
 
 
 
 
 
 
113
  speech_pad_ms: gr.Number
114
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
115
 
 
172
  min_speech_duration_ms=args[15],
173
  max_speech_duration_s=args[16],
174
  min_silence_duration_ms=args[17],
175
+ speech_pad_ms=args[18],
176
+ chunk_length_s=args[19],
177
+ batch_size=args[20],
178
+ is_diarize=args[21],
179
+ hf_token=args[22],
180
+ diarization_device=args[23]
 
181
  )
182
 
183
 
 
201
  min_speech_duration_ms: int
202
  max_speech_duration_s: float
203
  min_silence_duration_ms: int
 
204
  speech_pad_ms: int
205
  chunk_length_s: int
206
  batch_size: int
 
209
  diarization_device: str
210
  """
211
  A data class to use Whisper parameters.
212
+ """