Spaces:
Running
Running
Merge pull request #159 from jhj0517/feature/add-vad-parameter
Browse files- app.py +48 -6
- modules/faster_whisper_inference.py +11 -0
- modules/whisper_parameter.py +39 -0
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import argparse
|
|
|
4 |
|
5 |
from modules.whisper_Inference import WhisperInference
|
6 |
from modules.faster_whisper_inference import FasterWhisperInference
|
@@ -60,8 +61,15 @@ class App:
|
|
60 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
61 |
with gr.Row():
|
62 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
63 |
-
with gr.Accordion("
|
64 |
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
66 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
67 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -93,7 +101,14 @@ class App:
|
|
93 |
initial_prompt=tb_initial_prompt,
|
94 |
temperature=sd_temperature,
|
95 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
96 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
98 |
inputs=params + whisper_params.to_list(),
|
99 |
outputs=[tb_indicator, files_subtitles])
|
@@ -120,8 +135,15 @@ class App:
|
|
120 |
with gr.Row():
|
121 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
122 |
interactive=True)
|
123 |
-
with gr.Accordion("
|
124 |
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
126 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
127 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -153,7 +175,13 @@ class App:
|
|
153 |
initial_prompt=tb_initial_prompt,
|
154 |
temperature=sd_temperature,
|
155 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
156 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
158 |
inputs=params + whisper_params.to_list(),
|
159 |
outputs=[tb_indicator, files_subtitles])
|
@@ -173,8 +201,15 @@ class App:
|
|
173 |
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
174 |
with gr.Row():
|
175 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
176 |
-
with gr.Accordion("
|
177 |
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
179 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
180 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -205,7 +240,13 @@ class App:
|
|
205 |
initial_prompt=tb_initial_prompt,
|
206 |
temperature=sd_temperature,
|
207 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
208 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
210 |
inputs=params + whisper_params.to_list(),
|
211 |
outputs=[tb_indicator, files_subtitles])
|
@@ -284,6 +325,7 @@ class App:
|
|
284 |
launch_args['server_port'] = self.args.server_port
|
285 |
if self.args.username and self.args.password:
|
286 |
launch_args['auth'] = (self.args.username, self.args.password)
|
|
|
287 |
self.app.queue(api_open=False).launch(**launch_args)
|
288 |
|
289 |
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import argparse
|
4 |
+
import webbrowser
|
5 |
|
6 |
from modules.whisper_Inference import WhisperInference
|
7 |
from modules.faster_whisper_inference import FasterWhisperInference
|
|
|
61 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
62 |
with gr.Row():
|
63 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
64 |
+
with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
|
65 |
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
66 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
67 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
68 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
69 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
70 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
71 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
72 |
+
with gr.Accordion("Advanced_Parameters", open=False):
|
73 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
74 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
75 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
101 |
initial_prompt=tb_initial_prompt,
|
102 |
temperature=sd_temperature,
|
103 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
104 |
+
vad_filter=cb_vad_filter,
|
105 |
+
threshold=sd_threshold,
|
106 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
107 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
108 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
109 |
+
window_size_sample=nb_window_size_sample,
|
110 |
+
speech_pad_ms=nb_speech_pad_ms)
|
111 |
+
|
112 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
113 |
inputs=params + whisper_params.to_list(),
|
114 |
outputs=[tb_indicator, files_subtitles])
|
|
|
135 |
with gr.Row():
|
136 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
137 |
interactive=True)
|
138 |
+
with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
|
139 |
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
140 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
141 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
142 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
143 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
144 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
145 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
146 |
+
with gr.Accordion("Advanced_Parameters", open=False):
|
147 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
148 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
149 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
175 |
initial_prompt=tb_initial_prompt,
|
176 |
temperature=sd_temperature,
|
177 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
178 |
+
vad_filter=cb_vad_filter,
|
179 |
+
threshold=sd_threshold,
|
180 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
181 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
182 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
183 |
+
window_size_sample=nb_window_size_sample,
|
184 |
+
speech_pad_ms=nb_speech_pad_ms)
|
185 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
186 |
inputs=params + whisper_params.to_list(),
|
187 |
outputs=[tb_indicator, files_subtitles])
|
|
|
201 |
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
202 |
with gr.Row():
|
203 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
204 |
+
with gr.Accordion("VAD Options", open=False, visible=not self.args.disable_faster_whisper):
|
205 |
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
206 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
207 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
208 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
209 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
210 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
211 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
212 |
+
with gr.Accordion("Advanced_Parameters", open=False):
|
213 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
214 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
215 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
240 |
initial_prompt=tb_initial_prompt,
|
241 |
temperature=sd_temperature,
|
242 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
243 |
+
vad_filter=cb_vad_filter,
|
244 |
+
threshold=sd_threshold,
|
245 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
246 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
247 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
248 |
+
window_size_sample=nb_window_size_sample,
|
249 |
+
speech_pad_ms=nb_speech_pad_ms)
|
250 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
251 |
inputs=params + whisper_params.to_list(),
|
252 |
outputs=[tb_indicator, files_subtitles])
|
|
|
325 |
launch_args['server_port'] = self.args.server_port
|
326 |
if self.args.username and self.args.password:
|
327 |
launch_args['auth'] = (self.args.username, self.args.password)
|
328 |
+
|
329 |
self.app.queue(api_open=False).launch(**launch_args)
|
330 |
|
331 |
|
modules/faster_whisper_inference.py
CHANGED
@@ -6,6 +6,7 @@ from typing import BinaryIO, Union, Tuple, List
|
|
6 |
from datetime import datetime
|
7 |
|
8 |
import faster_whisper
|
|
|
9 |
import ctranslate2
|
10 |
import whisper
|
11 |
import torch
|
@@ -260,6 +261,15 @@ class FasterWhisperInference(BaseInterface):
|
|
260 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
261 |
params.lang = language_code_dict[params.lang]
|
262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
segments, info = self.model.transcribe(
|
264 |
audio=audio,
|
265 |
language=params.lang,
|
@@ -272,6 +282,7 @@ class FasterWhisperInference(BaseInterface):
|
|
272 |
temperature=params.temperature,
|
273 |
compression_ratio_threshold=params.compression_ratio_threshold,
|
274 |
vad_filter=params.vad_filter,
|
|
|
275 |
)
|
276 |
progress(0, desc="Loading audio..")
|
277 |
|
|
|
6 |
from datetime import datetime
|
7 |
|
8 |
import faster_whisper
|
9 |
+
from faster_whisper.vad import VadOptions
|
10 |
import ctranslate2
|
11 |
import whisper
|
12 |
import torch
|
|
|
261 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
262 |
params.lang = language_code_dict[params.lang]
|
263 |
|
264 |
+
vad_options = VadOptions(
|
265 |
+
threshold=params.threshold,
|
266 |
+
min_speech_duration_ms=params.min_speech_duration_ms,
|
267 |
+
max_speech_duration_s=params.max_speech_duration_s,
|
268 |
+
min_silence_duration_ms=params.min_silence_duration_ms,
|
269 |
+
window_size_samples=params.window_size_samples,
|
270 |
+
speech_pad_ms=params.speech_pad_ms
|
271 |
+
)
|
272 |
+
|
273 |
segments, info = self.model.transcribe(
|
274 |
audio=audio,
|
275 |
language=params.lang,
|
|
|
282 |
temperature=params.temperature,
|
283 |
compression_ratio_threshold=params.compression_ratio_threshold,
|
284 |
vad_filter=params.vad_filter,
|
285 |
+
vad_parameters=vad_options
|
286 |
)
|
287 |
progress(0, desc="Loading audio..")
|
288 |
|
modules/whisper_parameter.py
CHANGED
@@ -19,6 +19,12 @@ class WhisperGradioComponents:
|
|
19 |
temperature: gr.Slider
|
20 |
compression_ratio_threshold: gr.Number
|
21 |
vad_filter: gr.Checkbox
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
"""
|
23 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
24 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
@@ -78,6 +84,33 @@ class WhisperGradioComponents:
|
|
78 |
Enable the voice activity detection (VAD) to filter out parts of the audio
|
79 |
without speech. This step is using the Silero VAD model
|
80 |
https://github.com/snakers4/silero-vad.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
"""
|
82 |
|
83 |
def to_list(self) -> list:
|
@@ -108,6 +141,12 @@ class WhisperValues:
|
|
108 |
temperature: float
|
109 |
compression_ratio_threshold: float
|
110 |
vad_filter: bool
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
"""
|
112 |
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
|
113 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
|
19 |
temperature: gr.Slider
|
20 |
compression_ratio_threshold: gr.Number
|
21 |
vad_filter: gr.Checkbox
|
22 |
+
threshold: gr.Slider
|
23 |
+
min_speech_duration_ms: gr.Number
|
24 |
+
max_speech_duration_s: gr.Number
|
25 |
+
min_silence_duration_ms: gr.Number
|
26 |
+
window_size_sample: gr.Number
|
27 |
+
speech_pad_ms: gr.Number
|
28 |
"""
|
29 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
30 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
|
84 |
Enable the voice activity detection (VAD) to filter out parts of the audio
|
85 |
without speech. This step is using the Silero VAD model
|
86 |
https://github.com/snakers4/silero-vad.
|
87 |
+
|
88 |
+
threshold: gr.Slider
|
89 |
+
This parameter is related to Silero VAD. Speech threshold.
|
90 |
+
Silero VAD outputs speech probabilities for each audio chunk,
|
91 |
+
probabilities ABOVE this value are considered SPEECH. It is better to tune this
|
92 |
+
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
|
93 |
+
|
94 |
+
min_speech_duration_ms: gr.Number
|
95 |
+
This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.
|
96 |
+
|
97 |
+
max_speech_duration_s: gr.Number
|
98 |
+
This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
|
99 |
+
than max_speech_duration_s will be split at the timestamp of the last silence that
|
100 |
+
lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
|
101 |
+
split aggressively just before max_speech_duration_s.
|
102 |
+
|
103 |
+
min_silence_duration_ms: gr.Number
|
104 |
+
This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
|
105 |
+
before separating it
|
106 |
+
|
107 |
+
window_size_sample: gr.Number
|
108 |
+
This parameter is related to Silero VAD. Audio chunks of window_size_samples size are fed to the Silero VAD model.
|
109 |
+
WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
|
110 |
+
Values other than these may affect model performance!!
|
111 |
+
|
112 |
+
speech_pad_ms: gr.Number
|
113 |
+
This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side
|
114 |
"""
|
115 |
|
116 |
def to_list(self) -> list:
|
|
|
141 |
temperature: float
|
142 |
compression_ratio_threshold: float
|
143 |
vad_filter: bool
|
144 |
+
threshold: float
|
145 |
+
min_speech_duration_ms: int
|
146 |
+
max_speech_duration_s: float
|
147 |
+
min_silence_duration_ms: int
|
148 |
+
window_size_samples: int
|
149 |
+
speech_pad_ms: int
|
150 |
"""
|
151 |
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
|
152 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|