jhj0517 committed on
Commit
edcb1e1
·
1 Parent(s): 14d5cf9

add Silero VAD Options

Browse files
Files changed (1) hide show
  1. app.py +46 -6
app.py CHANGED
@@ -60,8 +60,15 @@ class App:
60
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
61
  with gr.Row():
62
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
 
 
 
 
 
 
 
 
63
  with gr.Accordion("Advanced_Parameters", open=False):
64
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
65
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
66
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
67
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -93,7 +100,14 @@ class App:
93
  initial_prompt=tb_initial_prompt,
94
  temperature=sd_temperature,
95
  compression_ratio_threshold=nb_compression_ratio_threshold,
96
- vad_filter=cb_vad_filter)
 
 
 
 
 
 
 
97
  btn_run.click(fn=self.whisper_inf.transcribe_file,
98
  inputs=params + whisper_params.to_list(),
99
  outputs=[tb_indicator, files_subtitles])
@@ -120,8 +134,15 @@ class App:
120
  with gr.Row():
121
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
122
  interactive=True)
 
 
 
 
 
 
 
 
123
  with gr.Accordion("Advanced_Parameters", open=False):
124
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
125
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
126
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
127
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -153,7 +174,13 @@ class App:
153
  initial_prompt=tb_initial_prompt,
154
  temperature=sd_temperature,
155
  compression_ratio_threshold=nb_compression_ratio_threshold,
156
- vad_filter=cb_vad_filter)
 
 
 
 
 
 
157
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
158
  inputs=params + whisper_params.to_list(),
159
  outputs=[tb_indicator, files_subtitles])
@@ -173,8 +200,15 @@ class App:
173
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
174
  with gr.Row():
175
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
 
 
 
 
 
 
 
 
176
  with gr.Accordion("Advanced_Parameters", open=False):
177
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
178
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
179
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
180
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -205,7 +239,13 @@ class App:
205
  initial_prompt=tb_initial_prompt,
206
  temperature=sd_temperature,
207
  compression_ratio_threshold=nb_compression_ratio_threshold,
208
- vad_filter=cb_vad_filter)
 
 
 
 
 
 
209
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
210
  inputs=params + whisper_params.to_list(),
211
  outputs=[tb_indicator, files_subtitles])
 
60
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
61
  with gr.Row():
62
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
63
+ with gr.Accordion("Silero VAD Options", open=False):
64
+ cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
65
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
66
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
67
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
68
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
69
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
70
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
71
  with gr.Accordion("Advanced_Parameters", open=False):
 
72
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
73
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
74
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
100
  initial_prompt=tb_initial_prompt,
101
  temperature=sd_temperature,
102
  compression_ratio_threshold=nb_compression_ratio_threshold,
103
+ vad_filter=cb_vad_filter,
104
+ threshold=sd_threshold,
105
+ min_speech_duration_ms=nb_min_speech_duration_ms,
106
+ max_speech_duration_s=nb_max_speech_duration_s,
107
+ min_silence_duration_ms=nb_min_silence_duration_ms,
108
+ window_size_sample=nb_window_size_sample,
109
+ speech_pad_ms=nb_speech_pad_ms)
110
+
111
  btn_run.click(fn=self.whisper_inf.transcribe_file,
112
  inputs=params + whisper_params.to_list(),
113
  outputs=[tb_indicator, files_subtitles])
 
134
  with gr.Row():
135
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
136
  interactive=True)
137
+ with gr.Accordion("Silero VAD Options", open=False):
138
+ cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
139
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
140
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
141
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
142
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
143
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
144
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
145
  with gr.Accordion("Advanced_Parameters", open=False):
 
146
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
147
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
148
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
174
  initial_prompt=tb_initial_prompt,
175
  temperature=sd_temperature,
176
  compression_ratio_threshold=nb_compression_ratio_threshold,
177
+ vad_filter=cb_vad_filter,
178
+ threshold=sd_threshold,
179
+ min_speech_duration_ms=nb_min_speech_duration_ms,
180
+ max_speech_duration_s=nb_max_speech_duration_s,
181
+ min_silence_duration_ms=nb_min_silence_duration_ms,
182
+ window_size_sample=nb_window_size_sample,
183
+ speech_pad_ms=nb_speech_pad_ms)
184
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
185
  inputs=params + whisper_params.to_list(),
186
  outputs=[tb_indicator, files_subtitles])
 
200
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
201
  with gr.Row():
202
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
203
+ with gr.Accordion("Silero VAD Options", open=False):
204
+ cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
205
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
206
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
207
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
208
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
209
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
210
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
211
  with gr.Accordion("Advanced_Parameters", open=False):
 
212
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
213
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
214
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
239
  initial_prompt=tb_initial_prompt,
240
  temperature=sd_temperature,
241
  compression_ratio_threshold=nb_compression_ratio_threshold,
242
+ vad_filter=cb_vad_filter,
243
+ threshold=sd_threshold,
244
+ min_speech_duration_ms=nb_min_speech_duration_ms,
245
+ max_speech_duration_s=nb_max_speech_duration_s,
246
+ min_silence_duration_ms=nb_min_silence_duration_ms,
247
+ window_size_sample=nb_window_size_sample,
248
+ speech_pad_ms=nb_speech_pad_ms)
249
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
250
  inputs=params + whisper_params.to_list(),
251
  outputs=[tb_indicator, files_subtitles])