Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Files Community

jhj0517 committed on Sep 13, 2024

Commit

f5d473e

2 Parent(s): 545761a 0c8c544

Merge branch 'master' into feature/add-bgm-tab

Browse files

Files changed (10) hide show

README.md +2 -3
app.py +2 -1
modules/ui/htmls.py +1 -1
modules/uvr/music_separator.py +0 -1
modules/whisper/faster_whisper_inference.py +3 -1
modules/whisper/insanely_fast_whisper_inference.py +4 -2
modules/whisper/whisper_Inference.py +4 -2
modules/whisper/whisper_base.py +5 -1
modules/whisper/whisper_factory.py +12 -5
notebook/whisper-webui.ipynb +3 -2

README.md CHANGED Viewed

@@ -25,6 +25,7 @@ If you wish to try this on Colab, you can do it in [here](https://colab.research
   - Translate subtitle files using Facebook NLLB models
   - Translate subtitle files using DeepL API
 - Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
 - Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
    - To download the pyannote model, you need to have a Huggingface token and manually accept their terms in the pages below.
       1. https://huggingface.co/pyannote/speaker-diarization-3.1
@@ -109,8 +110,6 @@ This is Whisper's original VRAM usage table for models.
 - [x] Integrate with faster-whisper
 - [x] Integrate with insanely-fast-whisper
 - [x] Integrate with whisperX ( Only speaker diarization part )
-- [ ] Add background music separation pre-processing with [MVSEP-MDX23](https://github.com/ZFTurbo/MVSEP-MDX23-music-separation-model)
 - [ ] Add fast api script
 - [ ] Support real-time transcription for microphone

   - Translate subtitle files using Facebook NLLB models
   - Translate subtitle files using DeepL API
 - Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
+- Pre-processing audio input to separate BGM with [UVR](https://github.com/Anjok07/ultimatevocalremovergui), [UVR-api](https://github.com/NextAudioGen/ultimatevocalremover_api).
 - Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
    - To download the pyannote model, you need to have a Huggingface token and manually accept their terms in the pages below.
       1. https://huggingface.co/pyannote/speaker-diarization-3.1
 - [x] Integrate with faster-whisper
 - [x] Integrate with insanely-fast-whisper
 - [x] Integrate with whisperX ( Only speaker diarization part )
+- [x] Add background music separation pre-processing with [UVR](https://github.com/Anjok07/ultimatevocalremovergui)
 - [ ] Add fast api script
 - [ ] Support real-time transcription for microphone

app.py CHANGED Viewed

@@ -26,6 +26,7 @@ class App:
             whisper_model_dir=self.args.whisper_model_dir,
             faster_whisper_model_dir=self.args.faster_whisper_model_dir,
             insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
             output_dir=self.args.output_dir,
         )
         self.nllb_inf = NLLBInference(
@@ -269,7 +270,7 @@ class App:
                         files_subtitles = gr.Files(label="Downloadable output file", scale=3)
                         btn_openfolder = gr.Button('📂', scale=1)
-                    params = [mic_input, dd_file_format]
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.as_list(),

             whisper_model_dir=self.args.whisper_model_dir,
             faster_whisper_model_dir=self.args.faster_whisper_model_dir,
             insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
+            uvr_model_dir=self.args.uvr_model_dir,
             output_dir=self.args.output_dir,
         )
         self.nllb_inf = NLLBInference(
                         files_subtitles = gr.Files(label="Downloadable output file", scale=3)
                         btn_openfolder = gr.Button('📂', scale=1)
+                    params = [mic_input, dd_file_format, cb_timestamp]
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.as_list(),

modules/ui/htmls.py CHANGED Viewed

@@ -38,7 +38,7 @@ CSS = """
 """
 MARKDOWN = """
-### [Whisper Web-UI](https://github.com/jhj0517/Whsiper-WebUI)
 """

 """
 MARKDOWN = """
+### [Whisper-WebUI](https://github.com/jhj0517/Whsiper-WebUI)
 """

modules/uvr/music_separator.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# Credit to Team UVR : https://github.com/Anjok07/ultimatevocalremovergui
 from typing import Optional, Union
 import numpy as np
 import torchaudio

 from typing import Optional, Union
 import numpy as np
 import torchaudio

modules/whisper/faster_whisper_inference.py CHANGED Viewed

@@ -11,7 +11,7 @@ import whisper
 import gradio as gr
 from argparse import Namespace
-from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
@@ -20,11 +20,13 @@ class FasterWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             diarization_model_dir=diarization_model_dir,
             output_dir=output_dir
         )
         self.model_dir = model_dir

 import gradio as gr
 from argparse import Namespace
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir,
             output_dir=output_dir
         )
         self.model_dir = model_dir

modules/whisper/insanely_fast_whisper_inference.py CHANGED Viewed

@@ -11,7 +11,7 @@ import whisper
 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace
-from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
@@ -20,12 +20,14 @@ class InsanelyFastWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
         )
         self.model_dir = model_dir
         os.makedirs(self.model_dir, exist_ok=True)

 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace
+from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
     def __init__(self,
                  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
         self.model_dir = model_dir
         os.makedirs(self.model_dir, exist_ok=True)

modules/whisper/whisper_Inference.py CHANGED Viewed

@@ -7,7 +7,7 @@ import torch
 import os
 from argparse import Namespace
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_base import WhisperBase
 from modules.whisper.whisper_parameter import *
@@ -16,12 +16,14 @@ class WhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
         )
     def transcribe(self,

 import os
 from argparse import Namespace
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
 from modules.whisper.whisper_base import WhisperBase
 from modules.whisper.whisper_parameter import *
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
     def transcribe(self,

modules/whisper/whisper_base.py CHANGED Viewed

@@ -251,6 +251,7 @@ class WhisperBase(ABC):
     def transcribe_mic(self,
                        mic_audio: str,
                        file_format: str,
                        progress=gr.Progress(),
                        *whisper_params,
                        ) -> list:
@@ -263,6 +264,8 @@ class WhisperBase(ABC):
             Audio file path from gr.Microphone()
         file_format: str
             Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
@@ -280,6 +283,7 @@ class WhisperBase(ABC):
             transcribed_segments, time_for_task = self.run(
                 mic_audio,
                 progress,
                 *whisper_params,
             )
             progress(1, desc="Completed!")
@@ -287,7 +291,7 @@ class WhisperBase(ABC):
             subtitle, result_file_path = self.generate_and_write_file(
                 file_name="Mic",
                 transcribed_segments=transcribed_segments,
-                add_timestamp=True,
                 file_format=file_format,
                 output_dir=self.output_dir
             )

     def transcribe_mic(self,
                        mic_audio: str,
                        file_format: str,
+                       add_timestamp: bool,
                        progress=gr.Progress(),
                        *whisper_params,
                        ) -> list:
             Audio file path from gr.Microphone()
         file_format: str
             Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
+        add_timestamp: bool
+            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
             transcribed_segments, time_for_task = self.run(
                 mic_audio,
                 progress,
+                add_timestamp,
                 *whisper_params,
             )
             progress(1, desc="Completed!")
             subtitle, result_file_path = self.generate_and_write_file(
                 file_name="Mic",
                 transcribed_segments=transcribed_segments,
+                add_timestamp=add_timestamp,
                 file_format=file_format,
                 output_dir=self.output_dir
             )

modules/whisper/whisper_factory.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import Optional
 import os
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR)
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
@@ -17,6 +17,7 @@ class WhisperFactory:
         faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
         insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
         diarization_model_dir: str = DIARIZATION_MODELS_DIR,
         output_dir: str = OUTPUT_DIR,
     ) -> "WhisperBase":
         """
@@ -37,6 +38,8 @@ class WhisperFactory:
             Directory path for the Insanely Fast Whisper model.
         diarization_model_dir : str
             Directory path for the diarization model.
         output_dir : str
             Directory path where output files will be saved.
@@ -61,23 +64,27 @@ class WhisperFactory:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
             )
         elif whisper_type in whisper_typos:
             return WhisperInference(
                 model_dir=whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
             )
         elif whisper_type in insanely_fast_whisper_typos:
             return InsanelyFastWhisperInference(
                 model_dir=insanely_fast_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
             )
         else:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
             )

 import os
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
         faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
         insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
         diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+        uvr_model_dir: str = UVR_MODELS_DIR,
         output_dir: str = OUTPUT_DIR,
     ) -> "WhisperBase":
         """
             Directory path for the Insanely Fast Whisper model.
         diarization_model_dir : str
             Directory path for the diarization model.
+        uvr_model_dir : str
+            Directory path for the UVR model.
         output_dir : str
             Directory path where output files will be saved.
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in whisper_typos:
             return WhisperInference(
                 model_dir=whisper_model_dir,
                 output_dir=output_dir,
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in insanely_fast_whisper_typos:
             return InsanelyFastWhisperInference(
                 model_dir=insanely_fast_whisper_model_dir,
                 output_dir=output_dir,
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         else:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )

notebook/whisper-webui.ipynb CHANGED Viewed

@@ -58,7 +58,8 @@
         "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
         "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
         "!pip install tokenizers==0.19.1\n",
-        "!pip install pyannote.audio==3.3.1"
       ]
     },
     {
@@ -96,7 +97,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {
         "id": "PQroYRRZzQiN",
         "cellView": "form"

         "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
         "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
         "!pip install tokenizers==0.19.1\n",
+        "!pip install pyannote.audio==3.3.1\n",
+        "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
       ]
     },
     {
     },
     {
       "cell_type": "code",
+      "execution_count": 3,
       "metadata": {
         "id": "PQroYRRZzQiN",
         "cellView": "form"