Spaces:
Running
Running
jhj0517
committed on
Commit
·
edcb1e1
1
Parent(s):
14d5cf9
add Silero VAD Options
Browse files
app.py
CHANGED
@@ -60,8 +60,15 @@ class App:
|
|
60 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
61 |
with gr.Row():
|
62 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
with gr.Accordion("Advanced_Parameters", open=False):
|
64 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
65 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
66 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
67 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -93,7 +100,14 @@ class App:
|
|
93 |
initial_prompt=tb_initial_prompt,
|
94 |
temperature=sd_temperature,
|
95 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
96 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
98 |
inputs=params + whisper_params.to_list(),
|
99 |
outputs=[tb_indicator, files_subtitles])
|
@@ -120,8 +134,15 @@ class App:
|
|
120 |
with gr.Row():
|
121 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
122 |
interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
with gr.Accordion("Advanced_Parameters", open=False):
|
124 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
125 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
126 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
127 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -153,7 +174,13 @@ class App:
|
|
153 |
initial_prompt=tb_initial_prompt,
|
154 |
temperature=sd_temperature,
|
155 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
156 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
158 |
inputs=params + whisper_params.to_list(),
|
159 |
outputs=[tb_indicator, files_subtitles])
|
@@ -173,8 +200,15 @@ class App:
|
|
173 |
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
174 |
with gr.Row():
|
175 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
with gr.Accordion("Advanced_Parameters", open=False):
|
177 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
178 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
179 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
180 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -205,7 +239,13 @@ class App:
|
|
205 |
initial_prompt=tb_initial_prompt,
|
206 |
temperature=sd_temperature,
|
207 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
208 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
210 |
inputs=params + whisper_params.to_list(),
|
211 |
outputs=[tb_indicator, files_subtitles])
|
|
|
60 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
61 |
with gr.Row():
|
62 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
63 |
+
with gr.Accordion("Silero VAD Options", open=False):
|
64 |
+
cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
|
65 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
66 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
67 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
68 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
69 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
70 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
71 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
72 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
73 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
74 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
100 |
initial_prompt=tb_initial_prompt,
|
101 |
temperature=sd_temperature,
|
102 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
103 |
+
vad_filter=cb_vad_filter,
|
104 |
+
threshold=sd_threshold,
|
105 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
106 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
107 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
108 |
+
window_size_sample=nb_window_size_sample,
|
109 |
+
speech_pad_ms=nb_speech_pad_ms)
|
110 |
+
|
111 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
112 |
inputs=params + whisper_params.to_list(),
|
113 |
outputs=[tb_indicator, files_subtitles])
|
|
|
134 |
with gr.Row():
|
135 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
136 |
interactive=True)
|
137 |
+
with gr.Accordion("Silero VAD Options", open=False):
|
138 |
+
cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
|
139 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
140 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
141 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
142 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
143 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
144 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
145 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
146 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
147 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
148 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
174 |
initial_prompt=tb_initial_prompt,
|
175 |
temperature=sd_temperature,
|
176 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
177 |
+
vad_filter=cb_vad_filter,
|
178 |
+
threshold=sd_threshold,
|
179 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
180 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
181 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
182 |
+
window_size_sample=nb_window_size_sample,
|
183 |
+
speech_pad_ms=nb_speech_pad_ms)
|
184 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
185 |
inputs=params + whisper_params.to_list(),
|
186 |
outputs=[tb_indicator, files_subtitles])
|
|
|
200 |
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
201 |
with gr.Row():
|
202 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
203 |
+
with gr.Accordion("Silero VAD Options", open=False):
|
204 |
+
cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
|
205 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
206 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
207 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
208 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
209 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
210 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
211 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
212 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
213 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
214 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
239 |
initial_prompt=tb_initial_prompt,
|
240 |
temperature=sd_temperature,
|
241 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
242 |
+
vad_filter=cb_vad_filter,
|
243 |
+
threshold=sd_threshold,
|
244 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
245 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
246 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
247 |
+
window_size_sample=nb_window_size_sample,
|
248 |
+
speech_pad_ms=nb_speech_pad_ms)
|
249 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
250 |
inputs=params + whisper_params.to_list(),
|
251 |
outputs=[tb_indicator, files_subtitles])
|