Commit 6cba7cb (unverified) by jhj0517
Parents: 14d5cf9 ed10d53

Merge pull request #159 from jhj0517/feature/add-vad-parameter

app.py CHANGED
```diff
@@ -1,6 +1,7 @@
 import gradio as gr
 import os
 import argparse
+import webbrowser

 from modules.whisper_Inference import WhisperInference
 from modules.faster_whisper_inference import FasterWhisperInference
@@ -60,8 +61,15 @@ class App:
 cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
 with gr.Row():
 cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
-with gr.Accordion("Advanced_Parameters", open=False):
+with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
 cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+with gr.Accordion("Advanced_Parameters", open=False):
 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -93,7 +101,14 @@ class App:
 initial_prompt=tb_initial_prompt,
 temperature=sd_temperature,
 compression_ratio_threshold=nb_compression_ratio_threshold,
-vad_filter=cb_vad_filter)
+vad_filter=cb_vad_filter,
+threshold=sd_threshold,
+min_speech_duration_ms=nb_min_speech_duration_ms,
+max_speech_duration_s=nb_max_speech_duration_s,
+min_silence_duration_ms=nb_min_silence_duration_ms,
+window_size_sample=nb_window_size_sample,
+speech_pad_ms=nb_speech_pad_ms)
+
 btn_run.click(fn=self.whisper_inf.transcribe_file,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
@@ -120,8 +135,15 @@ class App:
 with gr.Row():
 cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
 interactive=True)
-with gr.Accordion("Advanced_Parameters", open=False):
+with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
 cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+with gr.Accordion("Advanced_Parameters", open=False):
 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -153,7 +175,13 @@ class App:
 initial_prompt=tb_initial_prompt,
 temperature=sd_temperature,
 compression_ratio_threshold=nb_compression_ratio_threshold,
-vad_filter=cb_vad_filter)
+vad_filter=cb_vad_filter,
+threshold=sd_threshold,
+min_speech_duration_ms=nb_min_speech_duration_ms,
+max_speech_duration_s=nb_max_speech_duration_s,
+min_silence_duration_ms=nb_min_silence_duration_ms,
+window_size_sample=nb_window_size_sample,
+speech_pad_ms=nb_speech_pad_ms)
 btn_run.click(fn=self.whisper_inf.transcribe_youtube,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
@@ -173,8 +201,15 @@ class App:
 dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
 with gr.Row():
 cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
-with gr.Accordion("Advanced_Parameters", open=False):
+with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
 cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
+nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
+nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
+nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
+nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+with gr.Accordion("Advanced_Parameters", open=False):
 nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
 nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -205,7 +240,13 @@ class App:
 initial_prompt=tb_initial_prompt,
 temperature=sd_temperature,
 compression_ratio_threshold=nb_compression_ratio_threshold,
-vad_filter=cb_vad_filter)
+vad_filter=cb_vad_filter,
+threshold=sd_threshold,
+min_speech_duration_ms=nb_min_speech_duration_ms,
+max_speech_duration_s=nb_max_speech_duration_s,
+min_silence_duration_ms=nb_min_silence_duration_ms,
+window_size_sample=nb_window_size_sample,
+speech_pad_ms=nb_speech_pad_ms)
 btn_run.click(fn=self.whisper_inf.transcribe_mic,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
@@ -284,6 +325,7 @@ class App:
 launch_args['server_port'] = self.args.server_port
 if self.args.username and self.args.password:
 launch_args['auth'] = (self.args.username, self.args.password)
+
 self.app.queue(api_open=False).launch(**launch_args)
```
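
The new VAD controls reach the backend through the repository's component/values dataclass pattern: the Gradio components are gathered into WhisperGradioComponents, flattened via to_list() into btn_run.click(inputs=params + whisper_params.to_list()), and rebuilt positionally on the other side. The snippet below is a minimal, self-contained sketch of that pattern, not the project's actual classes; VadComponents, VadValues, and the run handler are illustrative names.

```python
from dataclasses import dataclass, fields

import gradio as gr


@dataclass
class VadComponents:
    # Holds the Gradio *components* (the "before pre-processing" side).
    vad_filter: gr.Checkbox
    threshold: gr.Slider
    min_speech_duration_ms: gr.Number

    def to_list(self) -> list:
        # Field order defines the positional order in which Gradio passes values.
        return [getattr(self, f.name) for f in fields(self)]


@dataclass
class VadValues:
    # Holds the plain *values* (the "after pre-processing" side), in the same order.
    vad_filter: bool
    threshold: float
    min_speech_duration_ms: int


def run(*args) -> str:
    params = VadValues(*args)  # rebuild the values dataclass positionally
    return f"vad_filter={params.vad_filter}, threshold={params.threshold}"


with gr.Blocks() as demo:
    with gr.Accordion("VAD Options", open=False):
        comps = VadComponents(
            vad_filter=gr.Checkbox(label="Enable Silero VAD Filter", value=False),
            threshold=gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.5, label="Speech Threshold"),
            min_speech_duration_ms=gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250),
        )
    btn_run = gr.Button("Run")
    tb_out = gr.Textbox()
    btn_run.click(fn=run, inputs=comps.to_list(), outputs=tb_out)

# demo.launch()
```

Because the click inputs are produced from the dataclass field order, adding a parameter only requires declaring it in both dataclasses and wiring up the matching UI component, which is what this pull request does for the six Silero VAD options.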
modules/faster_whisper_inference.py CHANGED
```diff
@@ -6,6 +6,7 @@ from typing import BinaryIO, Union, Tuple, List
 from datetime import datetime

 import faster_whisper
+from faster_whisper.vad import VadOptions
 import ctranslate2
 import whisper
 import torch
@@ -260,6 +261,15 @@ class FasterWhisperInference(BaseInterface):
 language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
 params.lang = language_code_dict[params.lang]

+vad_options = VadOptions(
+threshold=params.threshold,
+min_speech_duration_ms=params.min_speech_duration_ms,
+max_speech_duration_s=params.max_speech_duration_s,
+min_silence_duration_ms=params.min_silence_duration_ms,
+window_size_samples=params.window_size_samples,
+speech_pad_ms=params.speech_pad_ms
+)
+
 segments, info = self.model.transcribe(
 audio=audio,
 language=params.lang,
@@ -272,6 +282,7 @@ class FasterWhisperInference(BaseInterface):
 temperature=params.temperature,
 compression_ratio_threshold=params.compression_ratio_threshold,
 vad_filter=params.vad_filter,
+vad_parameters=vad_options
 )
 progress(0, desc="Loading audio..")
```
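
For reference, the call this change ends up making to faster-whisper looks roughly like the standalone sketch below. It assumes a faster-whisper release whose VadOptions still accepts window_size_samples (later releases dropped that field); the model size, audio path, and output formatting are placeholders, not the module's actual code.

```python
from faster_whisper import WhisperModel
from faster_whisper.vad import VadOptions

# Placeholder model size; pick whatever your hardware supports.
model = WhisperModel("base")

# Same defaults the new Gradio controls expose.
vad_options = VadOptions(
    threshold=0.5,                 # speech probability above this counts as speech
    min_speech_duration_ms=250,    # drop speech chunks shorter than this
    max_speech_duration_s=9999,    # split chunks longer than this
    min_silence_duration_ms=2000,  # silence required before a chunk is closed
    window_size_samples=1024,      # samples per VAD window (512/1024/1536 at 16 kHz)
    speech_pad_ms=400,             # padding added to each side of a speech chunk
)

segments, info = model.transcribe(
    "audio.wav",                   # placeholder input file
    vad_filter=True,               # enable Silero VAD pre-filtering
    vad_parameters=vad_options,    # pass the tuned options through
)

for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")
```

Note that vad_parameters also accepts a plain dict with the same keys, but constructing VadOptions keeps the argument names checked.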
modules/whisper_parameter.py CHANGED
```diff
@@ -19,6 +19,12 @@ class WhisperGradioComponents:
 temperature: gr.Slider
 compression_ratio_threshold: gr.Number
 vad_filter: gr.Checkbox
+threshold: gr.Slider
+min_speech_duration_ms: gr.Number
+max_speech_duration_s: gr.Number
+min_silence_duration_ms: gr.Number
+window_size_sample: gr.Number
+speech_pad_ms: gr.Number
 """
 A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
 See more about Gradio pre-processing: https://www.gradio.app/docs/components
@@ -78,6 +84,33 @@ class WhisperGradioComponents:
 Enable the voice activity detection (VAD) to filter out parts of the audio
 without speech. This step is using the Silero VAD model
 https://github.com/snakers4/silero-vad.
+
+threshold: gr.Slider
+This parameter is related with Silero VAD. Speech threshold.
+Silero VAD outputs speech probabilities for each audio chunk,
+probabilities ABOVE this value are considered as SPEECH. It is better to tune this
+parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
+
+min_speech_duration_ms: gr.Number
+This parameter is related with Silero VAD. Final speech chunks shorter min_speech_duration_ms are thrown out.
+
+max_speech_duration_s: gr.Number
+This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
+than max_speech_duration_s will be split at the timestamp of the last silence that
+lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
+split aggressively just before max_speech_duration_s.
+
+min_silence_duration_ms: gr.Number
+This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
+before separating it
+
+window_size_samples: gr.Number
+This parameter is related with Silero VAD. Audio chunks of window_size_samples size are fed to the silero VAD model.
+WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
+Values other than these may affect model performance!!
+
+speech_pad_ms: gr.Number
+This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
 """

 def to_list(self) -> list:
@@ -108,6 +141,12 @@ class WhisperValues:
 temperature: float
 compression_ratio_threshold: float
 vad_filter: bool
+threshold: float
+min_speech_duration_ms: int
+max_speech_duration_s: float
+min_silence_duration_ms: int
+window_size_samples: int
+speech_pad_ms: int
 """
 A data class to use Whisper parameters. Use "after" Gradio pre-processing.
 See more about Gradio pre-processing: : https://www.gradio.app/docs/components
```
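
To make the new docstrings concrete, here is a toy illustration of the knobs they describe. This is not Silero's actual algorithm (faster-whisper runs the real model internally); it is a simplified pass over made-up per-window speech probabilities, with a hypothetical helper name, showing what threshold, min_speech_duration_ms, and speech_pad_ms each control (min_silence_duration_ms and max_speech_duration_s are omitted for brevity).

```python
SAMPLE_RATE = 16000
WINDOW_SIZE_SAMPLES = 1024  # ~64 ms per window at 16 kHz


def to_speech_chunks(probs, threshold=0.5, min_speech_duration_ms=250, speech_pad_ms=400):
    """Merge above-threshold windows into (start_ms, end_ms) chunks, drop short ones, pad the rest."""
    window_ms = WINDOW_SIZE_SAMPLES / SAMPLE_RATE * 1000
    chunks, start = [], None
    for i, p in enumerate(probs + [0.0]):  # trailing 0.0 closes an open chunk
        if p > threshold and start is None:
            start = i * window_ms
        elif p <= threshold and start is not None:
            end = i * window_ms
            if end - start >= min_speech_duration_ms:
                chunks.append((max(0.0, start - speech_pad_ms), end + speech_pad_ms))
            start = None
    return chunks


# Ten ~64 ms windows: a one-window blip (dropped as too short) and a five-window run (kept and padded).
probs = [0.1, 0.9, 0.2, 0.1, 0.8, 0.9, 0.95, 0.9, 0.85, 0.1]
print(to_speech_chunks(probs))  # -> [(0.0, 976.0)]
```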