jhj0517 committed on
Commit
899eb46
·
1 Parent(s): edcb1e1

add Silero VAD Options

Browse files
modules/faster_whisper_inference.py CHANGED
@@ -6,6 +6,7 @@ from typing import BinaryIO, Union, Tuple, List
6
  from datetime import datetime
7
 
8
  import faster_whisper
 
9
  import ctranslate2
10
  import whisper
11
  import torch
@@ -260,6 +261,15 @@ class FasterWhisperInference(BaseInterface):
260
  language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
261
  params.lang = language_code_dict[params.lang]
262
 
 
 
 
 
 
 
 
 
 
263
  segments, info = self.model.transcribe(
264
  audio=audio,
265
  language=params.lang,
@@ -272,6 +282,7 @@ class FasterWhisperInference(BaseInterface):
272
  temperature=params.temperature,
273
  compression_ratio_threshold=params.compression_ratio_threshold,
274
  vad_filter=params.vad_filter,
 
275
  )
276
  progress(0, desc="Loading audio..")
277
 
 
6
  from datetime import datetime
7
 
8
  import faster_whisper
9
+ from faster_whisper.vad import VadOptions
10
  import ctranslate2
11
  import whisper
12
  import torch
 
261
  language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
262
  params.lang = language_code_dict[params.lang]
263
 
264
+ vad_options = VadOptions(
265
+ threshold=params.threshold,
266
+ min_speech_duration_ms=params.min_speech_duration_ms,
267
+ max_speech_duration_s=params.max_speech_duration_s,
268
+ min_silence_duration_ms=params.min_silence_duration_ms,
269
+ window_size_samples=params.window_size_samples,
270
+ speech_pad_ms=params.speech_pad_ms
271
+ )
272
+
273
  segments, info = self.model.transcribe(
274
  audio=audio,
275
  language=params.lang,
 
282
  temperature=params.temperature,
283
  compression_ratio_threshold=params.compression_ratio_threshold,
284
  vad_filter=params.vad_filter,
285
+ vad_parameters=vad_options
286
  )
287
  progress(0, desc="Loading audio..")
288
 
modules/whisper_parameter.py CHANGED
@@ -19,6 +19,12 @@ class WhisperGradioComponents:
19
  temperature: gr.Slider
20
  compression_ratio_threshold: gr.Number
21
  vad_filter: gr.Checkbox
 
 
 
 
 
 
22
  """
23
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
24
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
@@ -78,6 +84,33 @@ class WhisperGradioComponents:
78
  Enable the voice activity detection (VAD) to filter out parts of the audio
79
  without speech. This step is using the Silero VAD model
80
  https://github.com/snakers4/silero-vad.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  """
82
 
83
  def to_list(self) -> list:
@@ -108,6 +141,12 @@ class WhisperValues:
108
  temperature: float
109
  compression_ratio_threshold: float
110
  vad_filter: bool
 
 
 
 
 
 
111
  """
112
  A data class to use Whisper parameters. Use "after" Gradio pre-processing.
113
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
 
19
  temperature: gr.Slider
20
  compression_ratio_threshold: gr.Number
21
  vad_filter: gr.Checkbox
22
+ threshold: gr.Slider
23
+ min_speech_duration_ms: gr.Number
24
+ max_speech_duration_s: gr.Number
25
+ min_silence_duration_ms: gr.Number
26
+ window_size_sample: gr.Number
27
+ speech_pad_ms: gr.Number
28
  """
29
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
30
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
 
84
  Enable the voice activity detection (VAD) to filter out parts of the audio
85
  without speech. This step is using the Silero VAD model
86
  https://github.com/snakers4/silero-vad.
87
+
88
+ threshold: gr.Slider
89
+ This parameter is related to Silero VAD. Speech threshold.
90
+ Silero VAD outputs speech probabilities for each audio chunk,
91
+ probabilities ABOVE this value are considered as SPEECH. It is better to tune this
92
+ parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
93
+
94
+ min_speech_duration_ms: gr.Number
95
+ This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.
96
+
97
+ max_speech_duration_s: gr.Number
98
+ This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
99
+ than max_speech_duration_s will be split at the timestamp of the last silence that
100
+ lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
101
+ split aggressively just before max_speech_duration_s.
102
+
103
+ min_silence_duration_ms: gr.Number
104
+ This parameter is related to Silero VAD. At the end of each speech chunk, wait for min_silence_duration_ms
105
+ before separating it.
106
+
107
+ window_size_samples: gr.Number
108
+ This parameter is related to Silero VAD. Audio chunks of window_size_samples size are fed to the Silero VAD model.
109
+ WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
110
+ Values other than these may affect model performance!!
111
+
112
+ speech_pad_ms: gr.Number
113
+ This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side.
114
  """
115
 
116
  def to_list(self) -> list:
 
141
  temperature: float
142
  compression_ratio_threshold: float
143
  vad_filter: bool
144
+ threshold: float
145
+ min_speech_duration_ms: int
146
+ max_speech_duration_s: float
147
+ min_silence_duration_ms: int
148
+ window_size_samples: int
149
+ speech_pad_ms: int
150
  """
151
  A data class to use Whisper parameters. Use "after" Gradio pre-processing.
152
  See more about Gradio pre-processing: https://www.gradio.app/docs/components