jhj0517 committed on
Commit
6a24751
·
unverified ·
2 Parent(s): d8c2ba0 c1f12f6

Merge pull request #198 from jhj0517/feature/upgrade-faster-whisper

Browse files
app.py CHANGED
@@ -115,7 +115,6 @@ class App:
115
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
116
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
117
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
118
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
119
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
120
  with gr.Accordion("Diarization", open=False):
121
  cb_diarize = gr.Checkbox(label="Enable Diarization")
@@ -152,7 +151,6 @@ class App:
152
  min_speech_duration_ms=nb_min_speech_duration_ms,
153
  max_speech_duration_s=nb_max_speech_duration_s,
154
  min_silence_duration_ms=nb_min_silence_duration_ms,
155
- window_size_sample=nb_window_size_sample,
156
  speech_pad_ms=nb_speech_pad_ms,
157
  chunk_length_s=nb_chunk_length_s,
158
  batch_size=nb_batch_size,
@@ -203,7 +201,6 @@ class App:
203
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
204
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
205
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
206
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
207
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
208
  with gr.Accordion("Diarization", open=False):
209
  cb_diarize = gr.Checkbox(label="Enable Diarization")
@@ -241,7 +238,6 @@ class App:
241
  min_speech_duration_ms=nb_min_speech_duration_ms,
242
  max_speech_duration_s=nb_max_speech_duration_s,
243
  min_silence_duration_ms=nb_min_silence_duration_ms,
244
- window_size_sample=nb_window_size_sample,
245
  speech_pad_ms=nb_speech_pad_ms,
246
  chunk_length_s=nb_chunk_length_s,
247
  batch_size=nb_batch_size,
@@ -284,7 +280,6 @@ class App:
284
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
285
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
286
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
287
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
288
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
289
  with gr.Accordion("Diarization", open=False):
290
  cb_diarize = gr.Checkbox(label="Enable Diarization")
@@ -324,7 +319,6 @@ class App:
324
  min_speech_duration_ms=nb_min_speech_duration_ms,
325
  max_speech_duration_s=nb_max_speech_duration_s,
326
  min_silence_duration_ms=nb_min_silence_duration_ms,
327
- window_size_sample=nb_window_size_sample,
328
  speech_pad_ms=nb_speech_pad_ms,
329
  chunk_length_s=nb_chunk_length_s,
330
  batch_size=nb_batch_size,
 
115
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
116
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
117
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
 
118
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
119
  with gr.Accordion("Diarization", open=False):
120
  cb_diarize = gr.Checkbox(label="Enable Diarization")
 
151
  min_speech_duration_ms=nb_min_speech_duration_ms,
152
  max_speech_duration_s=nb_max_speech_duration_s,
153
  min_silence_duration_ms=nb_min_silence_duration_ms,
 
154
  speech_pad_ms=nb_speech_pad_ms,
155
  chunk_length_s=nb_chunk_length_s,
156
  batch_size=nb_batch_size,
 
201
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
202
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
203
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
 
204
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
205
  with gr.Accordion("Diarization", open=False):
206
  cb_diarize = gr.Checkbox(label="Enable Diarization")
 
238
  min_speech_duration_ms=nb_min_speech_duration_ms,
239
  max_speech_duration_s=nb_max_speech_duration_s,
240
  min_silence_duration_ms=nb_min_silence_duration_ms,
 
241
  speech_pad_ms=nb_speech_pad_ms,
242
  chunk_length_s=nb_chunk_length_s,
243
  batch_size=nb_batch_size,
 
280
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
281
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
282
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
 
283
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
284
  with gr.Accordion("Diarization", open=False):
285
  cb_diarize = gr.Checkbox(label="Enable Diarization")
 
319
  min_speech_duration_ms=nb_min_speech_duration_ms,
320
  max_speech_duration_s=nb_max_speech_duration_s,
321
  min_silence_duration_ms=nb_min_silence_duration_ms,
 
322
  speech_pad_ms=nb_speech_pad_ms,
323
  chunk_length_s=nb_chunk_length_s,
324
  batch_size=nb_batch_size,
modules/vad/silero_vad.py CHANGED
@@ -1,4 +1,4 @@
1
- from faster_whisper.vad import VadOptions
2
  import numpy as np
3
  from typing import BinaryIO, Union, List, Optional
4
  import warnings
@@ -9,6 +9,8 @@ import gradio as gr
9
  class SileroVAD:
10
  def __init__(self):
11
  self.sampling_rate = 16000
 
 
12
 
13
  def run(self,
14
  audio: Union[str, BinaryIO, np.ndarray],
@@ -54,8 +56,8 @@ class SileroVAD:
54
 
55
  return audio
56
 
57
- @staticmethod
58
  def get_speech_timestamps(
 
59
  audio: np.ndarray,
60
  vad_options: Optional[VadOptions] = None,
61
  progress: gr.Progress = gr.Progress(),
@@ -72,6 +74,10 @@ class SileroVAD:
72
  Returns:
73
  List of dicts containing begin and end samples of each speech chunk.
74
  """
 
 
 
 
75
  if vad_options is None:
76
  vad_options = VadOptions(**kwargs)
77
 
@@ -79,15 +85,8 @@ class SileroVAD:
79
  min_speech_duration_ms = vad_options.min_speech_duration_ms
80
  max_speech_duration_s = vad_options.max_speech_duration_s
81
  min_silence_duration_ms = vad_options.min_silence_duration_ms
82
- window_size_samples = vad_options.window_size_samples
83
  speech_pad_ms = vad_options.speech_pad_ms
84
-
85
- if window_size_samples not in [512, 1024, 1536]:
86
- warnings.warn(
87
- "Unusual window_size_samples! Supported window_size_samples:\n"
88
- " - [512, 1024, 1536] for 16000 sampling_rate"
89
- )
90
-
91
  sampling_rate = 16000
92
  min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
93
  speech_pad_samples = sampling_rate * speech_pad_ms / 1000
@@ -101,8 +100,7 @@ class SileroVAD:
101
 
102
  audio_length_samples = len(audio)
103
 
104
- model = faster_whisper.vad.get_vad_model()
105
- state = model.get_initial_state(batch_size=1)
106
 
107
  speech_probs = []
108
  for current_start_sample in range(0, audio_length_samples, window_size_samples):
@@ -111,7 +109,7 @@ class SileroVAD:
111
  chunk = audio[current_start_sample: current_start_sample + window_size_samples]
112
  if len(chunk) < window_size_samples:
113
  chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
114
- speech_prob, state = model(chunk, state, sampling_rate)
115
  speech_probs.append(speech_prob)
116
 
117
  triggered = False
@@ -207,6 +205,9 @@ class SileroVAD:
207
 
208
  return speeches
209
 
 
 
 
210
  @staticmethod
211
  def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
212
  """Collects and concatenates audio chunks."""
 
1
+ from faster_whisper.vad import VadOptions, get_vad_model
2
  import numpy as np
3
  from typing import BinaryIO, Union, List, Optional
4
  import warnings
 
9
  class SileroVAD:
10
  def __init__(self):
11
  self.sampling_rate = 16000
12
+ self.window_size_samples = 512
13
+ self.model = None
14
 
15
  def run(self,
16
  audio: Union[str, BinaryIO, np.ndarray],
 
56
 
57
  return audio
58
 
 
59
  def get_speech_timestamps(
60
+ self,
61
  audio: np.ndarray,
62
  vad_options: Optional[VadOptions] = None,
63
  progress: gr.Progress = gr.Progress(),
 
74
  Returns:
75
  List of dicts containing begin and end samples of each speech chunk.
76
  """
77
+
78
+ if self.model is None:
79
+ self.update_model()
80
+
81
  if vad_options is None:
82
  vad_options = VadOptions(**kwargs)
83
 
 
85
  min_speech_duration_ms = vad_options.min_speech_duration_ms
86
  max_speech_duration_s = vad_options.max_speech_duration_s
87
  min_silence_duration_ms = vad_options.min_silence_duration_ms
88
+ window_size_samples = self.window_size_samples
89
  speech_pad_ms = vad_options.speech_pad_ms
 
 
 
 
 
 
 
90
  sampling_rate = 16000
91
  min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
92
  speech_pad_samples = sampling_rate * speech_pad_ms / 1000
 
100
 
101
  audio_length_samples = len(audio)
102
 
103
+ state, context = self.model.get_initial_states(batch_size=1)
 
104
 
105
  speech_probs = []
106
  for current_start_sample in range(0, audio_length_samples, window_size_samples):
 
109
  chunk = audio[current_start_sample: current_start_sample + window_size_samples]
110
  if len(chunk) < window_size_samples:
111
  chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
112
+ speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
113
  speech_probs.append(speech_prob)
114
 
115
  triggered = False
 
205
 
206
  return speeches
207
 
208
+ def update_model(self):
209
+ self.model = get_vad_model()
210
+
211
  @staticmethod
212
  def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
213
  """Collects and concatenates audio chunks."""
modules/whisper/whisper_base.py CHANGED
@@ -91,7 +91,6 @@ class WhisperBase(ABC):
91
  min_speech_duration_ms=params.min_speech_duration_ms,
92
  max_speech_duration_s=params.max_speech_duration_s,
93
  min_silence_duration_ms=params.min_silence_duration_ms,
94
- window_size_samples=params.window_size_samples,
95
  speech_pad_ms=params.speech_pad_ms
96
  )
97
  self.vad.run(
 
91
  min_speech_duration_ms=params.min_speech_duration_ms,
92
  max_speech_duration_s=params.max_speech_duration_s,
93
  min_silence_duration_ms=params.min_silence_duration_ms,
 
94
  speech_pad_ms=params.speech_pad_ms
95
  )
96
  self.vad.run(
modules/whisper/whisper_parameter.py CHANGED
@@ -23,7 +23,6 @@ class WhisperParameters:
23
  min_speech_duration_ms: gr.Number
24
  max_speech_duration_s: gr.Number
25
  min_silence_duration_ms: gr.Number
26
- window_size_sample: gr.Number
27
  speech_pad_ms: gr.Number
28
  chunk_length_s: gr.Number
29
  batch_size: gr.Number
@@ -111,11 +110,6 @@ class WhisperParameters:
111
  This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
112
  before separating it
113
 
114
- window_size_samples: gr.Number
115
- This parameter is related with Silero VAD. Audio chunks of window_size_samples size are fed to the silero VAD model.
116
- WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
117
- Values other than these may affect model performance!!
118
-
119
  speech_pad_ms: gr.Number
120
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
121
 
@@ -178,13 +172,12 @@ class WhisperParameters:
178
  min_speech_duration_ms=args[15],
179
  max_speech_duration_s=args[16],
180
  min_silence_duration_ms=args[17],
181
- window_size_samples=args[18],
182
- speech_pad_ms=args[19],
183
- chunk_length_s=args[20],
184
- batch_size=args[21],
185
- is_diarize=args[22],
186
- hf_token=args[23],
187
- diarization_device=args[24]
188
  )
189
 
190
 
@@ -208,7 +201,6 @@ class WhisperValues:
208
  min_speech_duration_ms: int
209
  max_speech_duration_s: float
210
  min_silence_duration_ms: int
211
- window_size_samples: int
212
  speech_pad_ms: int
213
  chunk_length_s: int
214
  batch_size: int
@@ -217,4 +209,4 @@ class WhisperValues:
217
  diarization_device: str
218
  """
219
  A data class to use Whisper parameters.
220
- """
 
23
  min_speech_duration_ms: gr.Number
24
  max_speech_duration_s: gr.Number
25
  min_silence_duration_ms: gr.Number
 
26
  speech_pad_ms: gr.Number
27
  chunk_length_s: gr.Number
28
  batch_size: gr.Number
 
110
  This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
111
  before separating it
112
 
 
 
 
 
 
113
  speech_pad_ms: gr.Number
114
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
115
 
 
172
  min_speech_duration_ms=args[15],
173
  max_speech_duration_s=args[16],
174
  min_silence_duration_ms=args[17],
175
+ speech_pad_ms=args[18],
176
+ chunk_length_s=args[19],
177
+ batch_size=args[20],
178
+ is_diarize=args[21],
179
+ hf_token=args[22],
180
+ diarization_device=args[23]
 
181
  )
182
 
183
 
 
201
  min_speech_duration_ms: int
202
  max_speech_duration_s: float
203
  min_silence_duration_ms: int
 
204
  speech_pad_ms: int
205
  chunk_length_s: int
206
  batch_size: int
 
209
  diarization_device: str
210
  """
211
  A data class to use Whisper parameters.
212
+ """
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
  --extra-index-url https://download.pytorch.org/whl/cu121
2
  torch
3
  git+https://github.com/jhj0517/jhj0517-whisper.git
4
- faster-whisper==1.0.2
5
  transformers
6
  gradio==4.29.0
7
  pytube
 
1
  --extra-index-url https://download.pytorch.org/whl/cu121
2
  torch
3
  git+https://github.com/jhj0517/jhj0517-whisper.git
4
+ faster-whisper==1.0.3
5
  transformers
6
  gradio==4.29.0
7
  pytube