Merge pull request #216 from jhj0517/feature/modularize-vad
- app.py +1 -1
- modules/vad/silero_vad.py +25 -4
- modules/whisper/faster_whisper_inference.py +0 -16
- modules/whisper/whisper_base.py +26 -0
app.py
CHANGED
@@ -137,7 +137,7 @@ class App:
                     nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
                     nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
 
-                with gr.Accordion("VAD", open=False
+                with gr.Accordion("VAD", open=False):
                     cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                     sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
                                              info="Lower it to be more sensitive to small sounds.")
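For context, this accordion only declares the UI controls; their values are passed into the transcription parameters downstream. A minimal, self-contained sketch of the same wiring (the echo_params handler, Run button, and output box are hypothetical, for illustration only):

import gradio as gr

def echo_params(vad_filter: bool, threshold: float) -> str:
    # Hypothetical handler: just report what the VAD controls currently hold.
    return f"vad_filter={vad_filter}, threshold={threshold}"

with gr.Blocks() as demo:
    with gr.Accordion("VAD", open=False):
        cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
        sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
                                 info="Lower it to be more sensitive to small sounds.")
    btn_run = gr.Button("Run")
    tb_out = gr.Textbox(label="Parameters")
    btn_run.click(echo_params, inputs=[cb_vad_filter, sd_threshold], outputs=tb_out)

# demo.launch()  # uncomment to try locally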
modules/vad/silero_vad.py
CHANGED
@@ -2,9 +2,10 @@
 
 from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
-from typing import BinaryIO, Union, List, Optional
+from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
 import faster_whisper
+from faster_whisper.transcribe import SpeechTimestampsMap, Segment
 import gradio as gr
 
 
@@ -17,7 +18,8 @@ class SileroVAD:
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
             vad_parameters: VadOptions,
-            progress: gr.Progress = gr.Progress()
+            progress: gr.Progress = gr.Progress()
+            ) -> Tuple[np.ndarray, List[dict]]:
         """
         Run VAD
 
@@ -32,8 +34,10 @@ class SileroVAD:
 
         Returns
         ----------
-
+        np.ndarray
             Pre-processed audio with VAD
+        List[dict]
+            Chunks of speeches to be used to restore the timestamps later
         """
 
         sampling_rate = self.sampling_rate
@@ -56,7 +60,7 @@
         audio = self.collect_chunks(audio, speech_chunks)
         duration_after_vad = audio.shape[0] / sampling_rate
 
-        return audio
+        return audio, speech_chunks
 
     def get_speech_timestamps(
         self,
@@ -241,3 +245,20 @@ class SileroVAD:
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
 
+    def restore_speech_timestamps(
+        self,
+        segments: List[dict],
+        speech_chunks: List[dict],
+        sampling_rate: Optional[int] = None,
+    ) -> List[dict]:
+        if sampling_rate is None:
+            sampling_rate = self.sampling_rate
+
+        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
+
+        for segment in segments:
+            segment["start"] = ts_map.get_original_time(segment["start"])
+            segment["end"] = ts_map.get_original_time(segment["end"])
+
+        return segments
+
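Because collect_chunks() concatenates only the detected speech regions, timestamps produced on the trimmed audio no longer match the original file; restore_speech_timestamps() fixes this by mapping each segment back through faster_whisper's SpeechTimestampsMap. As intuition, here is a simplified stand-alone sketch of that mapping (not faster_whisper's actual implementation):

def to_original_time(t: float, speech_chunks: list, sampling_rate: int) -> float:
    """Map a time (sec) in the VAD-trimmed audio back to the original recording."""
    elapsed = 0.0  # seconds of trimmed, speech-only audio consumed so far
    for chunk in speech_chunks:  # chunks are {"start": sample, "end": sample}
        chunk_duration = (chunk["end"] - chunk["start"]) / sampling_rate
        if t <= elapsed + chunk_duration:
            # t falls inside this chunk: offset by where the chunk began originally
            return chunk["start"] / sampling_rate + (t - elapsed)
        elapsed += chunk_duration
    return t  # past the last speech chunk; leave unchanged

# Speech at samples 16000-32000 (1 s - 2 s) of a 16 kHz file was kept, so
# t = 0.5 s in the trimmed audio maps back to t = 1.5 s in the original.
assert to_original_time(0.5, [{"start": 16000, "end": 32000}], 16000) == 1.5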
modules/whisper/faster_whisper_inference.py
CHANGED
@@ -71,20 +71,6 @@ class FasterWhisperInference(WhisperBase):
         if not params.hotwords:
             params.hotwords = None
 
-        vad_options = None
-        if params.vad_filter:
-            # Explicit value set for float('inf') from gr.Number()
-            if params.max_speech_duration_s >= 9999:
-                params.max_speech_duration_s = float('inf')
-
-            vad_options = VadOptions(
-                threshold=params.threshold,
-                min_speech_duration_ms=params.min_speech_duration_ms,
-                max_speech_duration_s=params.max_speech_duration_s,
-                min_silence_duration_ms=params.min_silence_duration_ms,
-                speech_pad_ms=params.speech_pad_ms
-            )
-
         params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
 
         segments, info = self.model.transcribe(
@@ -115,8 +101,6 @@
             language_detection_threshold=params.language_detection_threshold,
             language_detection_segments=params.language_detection_segments,
             prompt_reset_on_temperature=params.prompt_reset_on_temperature,
-            vad_filter=params.vad_filter,
-            vad_parameters=vad_options
         )
         progress(0, desc="Loading audio..")
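Net effect of this file's changes: faster-whisper's built-in VAD path (the vad_filter/vad_parameters arguments of model.transcribe) is no longer used. VAD now runs once in WhisperBase.run() before transcription (see the whisper_base.py diff below), so every inference backend shares the same Silero filtering instead of only faster-whisper.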
modules/whisper/whisper_base.py
CHANGED
@@ -91,12 +91,38 @@ class WhisperBase(ABC):
         language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
         params.lang = language_code_dict[params.lang]
 
+        speech_chunks = None
+        if params.vad_filter:
+            # Explicit value set for float('inf') from gr.Number()
+            if params.max_speech_duration_s >= 9999:
+                params.max_speech_duration_s = float('inf')
+
+            vad_options = VadOptions(
+                threshold=params.threshold,
+                min_speech_duration_ms=params.min_speech_duration_ms,
+                max_speech_duration_s=params.max_speech_duration_s,
+                min_silence_duration_ms=params.min_silence_duration_ms,
+                speech_pad_ms=params.speech_pad_ms
+            )
+
+            audio, speech_chunks = self.vad.run(
+                audio=audio,
+                vad_parameters=vad_options,
+                progress=progress
+            )
+
         result, elapsed_time = self.transcribe(
             audio,
             progress,
             *astuple(params)
         )
 
+        if params.vad_filter:
+            result = self.vad.restore_speech_timestamps(
+                segments=result,
+                speech_chunks=speech_chunks,
+            )
+
         if params.is_diarize:
             result, elapsed_time_diarization = self.diarizer.run(
                 audio=audio,
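Taken together, the pipeline order is now: trim silence with SileroVAD, transcribe the trimmed audio, then restore timestamps to file time. A rough usage sketch, assuming SileroVAD() constructs with defaults; transcribe_stub, the option values, and sample.wav are placeholders for illustration:

from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD

def transcribe_stub(audio):
    # Placeholder for self.transcribe(...): returns segment dicts whose
    # "start"/"end" times are measured on the VAD-trimmed audio.
    return [{"start": 0.0, "end": 1.0, "text": "..."}]

vad = SileroVAD()
vad_options = VadOptions(
    threshold=0.5,
    min_speech_duration_ms=250,
    max_speech_duration_s=float('inf'),
    min_silence_duration_ms=2000,
    speech_pad_ms=400,
)

# Trim non-speech, transcribe, then map segment times back to the original file.
audio, speech_chunks = vad.run(audio="sample.wav", vad_parameters=vad_options)
segments = transcribe_stub(audio)
segments = vad.restore_speech_timestamps(segments=segments, speech_chunks=speech_chunks)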