Commit 0c8c544 (unverified), committed by jhj0517
2 parents: ea7397a 2109221

Merge pull request #267 from jhj0517/feature/bgm-separation
README.md CHANGED
@@ -25,6 +25,7 @@ If you wish to try this on Colab, you can do it in [here](https://colab.research
 - Translate subtitle files using Facebook NLLB models
 - Translate subtitle files using DeepL API
 - Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
+- Pre-processing audio input to separate BGM with [UVR](https://github.com/Anjok07/ultimatevocalremovergui), [UVR-api](https://github.com/NextAudioGen/ultimatevocalremover_api).
 - Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
   - To download the pyannote model, you need to have a Huggingface token and manually accept their terms in the pages below.
     1. https://huggingface.co/pyannote/speaker-diarization-3.1
@@ -109,8 +110,6 @@ This is Whisper's original VRAM usage table for models.
 - [x] Integrate with faster-whisper
 - [x] Integrate with insanely-fast-whisper
 - [x] Integrate with whisperX ( Only speaker diarization part )
-- [ ] Add background music separation pre-processing with [MVSEP-MDX23](https://github.com/ZFTurbo/MVSEP-MDX23-music-separation-model)
+- [x] Add background music separation pre-processing with [UVR](https://github.com/Anjok07/ultimatevocalremovergui)
 - [ ] Add fast api script
 - [ ] Support real-time transcription for microphone
-
-
app.py CHANGED
@@ -4,7 +4,8 @@ import gradio as gr
 import yaml
 
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
 from modules.utils.files_manager import load_yaml
 from modules.whisper.whisper_factory import WhisperFactory
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
@@ -25,10 +26,9 @@ class App:
             whisper_model_dir=self.args.whisper_model_dir,
             faster_whisper_model_dir=self.args.faster_whisper_model_dir,
             insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
+            uvr_model_dir=self.args.uvr_model_dir,
             output_dir=self.args.output_dir,
         )
-        print(f"Use \"{self.args.whisper_type}\" implementation")
-        print(f"Device \"{self.whisper_inf.device}\" is detected")
         self.nllb_inf = NLLBInference(
             model_dir=self.args.nllb_model_dir,
             output_dir=os.path.join(self.args.output_dir, "translations")
@@ -37,11 +37,14 @@ class App:
             output_dir=os.path.join(self.args.output_dir, "translations")
         )
         self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        print(f"Use \"{self.args.whisper_type}\" implementation")
+        print(f"Device \"{self.whisper_inf.device}\" is detected")
 
     def create_whisper_parameters(self):
         whisper_params = self.default_params["whisper"]
         vad_params = self.default_params["vad"]
         diarization_params = self.default_params["diarization"]
+        uvr_params = self.default_params["bgm_separation"]
 
         with gr.Row():
             dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
@@ -127,6 +130,16 @@ class App:
                                          precision=0)
                 nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
 
+            with gr.Accordion("BGM Separation", open=False):
+                cb_bgm_separation = gr.Checkbox(label="Enable BGM separation", value=uvr_params["is_separate_bgm"],
+                                                interactive=True)
+                dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
+                                            choices=self.whisper_inf.music_separator.available_devices)
+                dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                                                choices=self.whisper_inf.music_separator.available_models)
+                nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
+                cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
+
             with gr.Accordion("VAD", open=False):
                 cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
                                             interactive=True)
@@ -173,7 +186,9 @@ class App:
                 hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
                 language_detection_threshold=nb_language_detection_threshold,
                 language_detection_segments=nb_language_detection_segments,
-                prompt_reset_on_temperature=sld_prompt_reset_on_temperature
+                prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
+                uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
+                uvr_save_file=cb_uvr_save_file
             ),
             dd_file_format,
             cb_timestamp
@@ -383,6 +398,8 @@ parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MOD
                     help='Directory path of the diarization model')
 parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
                     help='Directory path of the Facebook NLLB model')
+parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
+                    help='Directory path of the UVR model')
 parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
 _args = parser.parse_args()
 
configs/default_parameters.yaml CHANGED
@@ -44,6 +44,12 @@ diarization:
   is_diarize: false
   hf_token: ""
 
+bgm_separation:
+  is_separate_bgm: false
+  model_size: "UVR-MDX-NET-Inst_HQ_4"
+  segment_size: 256
+  save_file: true
+
 translation:
   deepl:
     api_key: ""
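
For reference, a minimal sketch of how the new bgm_separation block is read back; app.py consumes it the same way via self.default_params["bgm_separation"]. The snippet is illustrative and not part of this commit.

# Minimal sketch (not part of this commit): read the new bgm_separation defaults.
from modules.utils.files_manager import load_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH

default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
uvr_params = default_params["bgm_separation"]
print(uvr_params["model_size"])    # "UVR-MDX-NET-Inst_HQ_4"
print(uvr_params["segment_size"])  # 256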
modules/ui/htmls.py CHANGED
@@ -38,7 +38,7 @@ CSS = """
 """
 
 MARKDOWN = """
-### [Whisper Web-UI](https://github.com/jhj0517/Whsiper-WebUI)
+### [Whisper-WebUI](https://github.com/jhj0517/Whsiper-WebUI)
 """
 
 
modules/utils/paths.py CHANGED
@@ -7,10 +7,14 @@ FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
 INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
 NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
 DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
+UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
 CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
 DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
 TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
+UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
+UVR_INSTRUMENTAL_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "instrumental")
+UVR_VOCALS_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "vocals")
 
 for dir_path in [MODELS_DIR,
                  WHISPER_MODELS_DIR,
@@ -18,7 +22,10 @@ for dir_path in [MODELS_DIR,
                  INSANELY_FAST_WHISPER_MODELS_DIR,
                  NLLB_MODELS_DIR,
                  DIARIZATION_MODELS_DIR,
+                 UVR_MODELS_DIR,
                  CONFIGS_DIR,
                  OUTPUT_DIR,
-                 TRANSLATION_OUTPUT_DIR]:
+                 TRANSLATION_OUTPUT_DIR,
+                 UVR_INSTRUMENTAL_OUTPUT_DIR,
+                 UVR_VOCALS_OUTPUT_DIR]:
     os.makedirs(dir_path, exist_ok=True)
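
Taken together, the new constants resolve to the layout sketched below (assuming MODELS_DIR and OUTPUT_DIR sit under the WebUI root, as with the other paths in this file). The directories are created at import time by the os.makedirs loop; outputs/UVR/ itself is created implicitly as the parent of its two subdirectories.

models/UVR/MDX_Net_Models/   # UVR_MODELS_DIR
outputs/UVR/instrumental/    # UVR_INSTRUMENTAL_OUTPUT_DIR
outputs/UVR/vocals/          # UVR_VOCALS_OUTPUT_DIR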
modules/uvr/music_separator.py ADDED
@@ -0,0 +1,132 @@
+from typing import Optional, Union
+import numpy as np
+import torchaudio
+import soundfile as sf
+import os
+import torch
+import gc
+import gradio as gr
+from datetime import datetime
+
+from uvr.models import MDX, Demucs, VrNetwork, MDXC
+
+
+class MusicSeparator:
+    def __init__(self,
+                 model_dir: Optional[str] = None,
+                 output_dir: Optional[str] = None):
+        self.model = None
+        self.device = self.get_device()
+        self.available_devices = ["cpu", "cuda"]
+        self.model_dir = model_dir
+        self.output_dir = output_dir
+        self.audio_info = None
+        self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
+        self.default_model = self.available_models[0]
+        self.current_model_size = self.default_model
+        self.model_config = {
+            "segment": 256,
+            "split": True
+        }
+
+    def update_model(self,
+                     model_name: str = "UVR-MDX-NET-Inst_1",
+                     device: Optional[str] = None,
+                     segment_size: int = 256):
+        """
+        Update model with the given model name
+
+        Args:
+            model_name (str): Model name.
+            device (str): Device to use for the model.
+            segment_size (int): Segment size for the prediction.
+        """
+        if device is None:
+            device = self.device
+
+        self.device = device
+        self.model_config = {
+            "segment": segment_size,
+            "split": True
+        }
+        self.model = MDX(name=model_name,
+                         other_metadata=self.model_config,
+                         device=self.device,
+                         logger=None,
+                         model_dir=self.model_dir)
+
+    def separate(self,
+                 audio: Union[str, np.ndarray],
+                 model_name: str,
+                 device: Optional[str] = None,
+                 segment_size: int = 256,
+                 save_file: bool = False,
+                 progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Separate the background music from the audio.
+
+        Args:
+            audio (Union[str, np.ndarray]): Audio path or numpy array.
+            model_name (str): Model name.
+            device (str): Device to use for the model.
+            segment_size (int): Segment size for the prediction.
+            save_file (bool): Whether to save the separated audio to output path or not.
+            progress (gr.Progress): Gradio progress indicator.
+
+        Returns:
+            tuple[np.ndarray, np.ndarray]: Instrumental and vocals numpy arrays.
+        """
+        if isinstance(audio, str):
+            self.audio_info = torchaudio.info(audio)
+            sample_rate = self.audio_info.sample_rate
+            output_filename, ext = os.path.splitext(audio)
+            output_filename, ext = os.path.basename(audio), ".wav"
+        else:
+            sample_rate = 16000
+            timestamp = datetime.now().strftime("%m%d%H%M%S")
+            output_filename, ext = f"UVR-{timestamp}", ".wav"
+
+        model_config = {
+            "segment": segment_size,
+            "split": True
+        }
+
+        if (self.model is None or
+                self.current_model_size != model_name or
+                self.model_config != model_config or
+                self.audio_info.sample_rate != sample_rate or
+                self.device != device):
+            progress(0, desc="Initializing UVR Model..")
+            self.update_model(
+                model_name=model_name,
+                device=device,
+                segment_size=segment_size
+            )
+            self.model.sample_rate = sample_rate
+
+        progress(0, desc="Separating background music from the audio..")
+        result = self.model(audio)
+        instrumental, vocals = result["instrumental"].T, result["vocals"].T
+
+        if save_file:
+            instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
+            vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
+            sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
+            sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
+
+        return instrumental, vocals
+
+    @staticmethod
+    def get_device():
+        """Get device for the model"""
+        return "cuda" if torch.cuda.is_available() else "cpu"
+
+    def offload(self):
+        """Offload the model and free up the memory"""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.device == "cuda":
+            torch.cuda.empty_cache()
+            gc.collect()
+        self.audio_info = None
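
For reference, a minimal sketch of driving the new class directly, assuming a hypothetical local sample.wav and the UVR paths added in modules/utils/paths.py; the snippet is illustrative and not part of this commit. separate() lazily (re)loads the MDX weights only when the model name, segment config, sample rate, or device changes, and offload() frees them afterwards.

# Minimal sketch (not part of this commit): separate BGM from a local file.
from modules.utils.paths import UVR_MODELS_DIR, UVR_OUTPUT_DIR
from modules.uvr.music_separator import MusicSeparator

separator = MusicSeparator(model_dir=UVR_MODELS_DIR, output_dir=UVR_OUTPUT_DIR)
instrumental, vocals = separator.separate(
    audio="sample.wav",                  # hypothetical input path
    model_name="UVR-MDX-NET-Inst_HQ_4",  # one of separator.available_models
    device=separator.device,             # "cuda" if available, else "cpu"
    segment_size=256,
    save_file=True,                      # writes into outputs/UVR/{instrumental,vocals}/
)
separator.offload()                      # drop the model weights and free VRAM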
modules/whisper/faster_whisper_inference.py CHANGED
@@ -11,7 +11,7 @@ import whisper
 import gradio as gr
 from argparse import Namespace
 
-from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
 
@@ -20,11 +20,13 @@ class FasterWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir,
             output_dir=output_dir
         )
         self.model_dir = model_dir
modules/whisper/insanely_fast_whisper_inference.py CHANGED
@@ -11,7 +11,7 @@ import whisper
 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace
 
-from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
 
@@ -20,12 +20,14 @@ class InsanelyFastWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
         self.model_dir = model_dir
         os.makedirs(self.model_dir, exist_ok=True)
modules/whisper/whisper_Inference.py CHANGED
@@ -7,7 +7,7 @@ import torch
 import os
 from argparse import Namespace
 
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
 from modules.whisper.whisper_base import WhisperBase
 from modules.whisper.whisper_parameter import *
 
@@ -16,12 +16,14 @@ class WhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
 
     def transcribe(self,
modules/whisper/whisper_base.py CHANGED
@@ -2,6 +2,7 @@ import os
 import torch
 import whisper
 import gradio as gr
+import torchaudio
 from abc import ABC, abstractmethod
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
@@ -9,7 +10,9 @@ from datetime import datetime
 from faster_whisper.vad import VadOptions
 from dataclasses import astuple
 
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
+from modules.uvr.music_separator import MusicSeparator
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
 from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.utils.youtube_manager import get_ytdata, get_ytaudio
 from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
@@ -22,6 +25,7 @@ class WhisperBase(ABC):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         self.model_dir = model_dir
@@ -32,6 +36,10 @@ class WhisperBase(ABC):
             model_dir=diarization_model_dir
         )
         self.vad = SileroVAD()
+        self.music_separator = MusicSeparator(
+            model_dir=uvr_model_dir,
+            output_dir=os.path.join(output_dir, "UVR")
+        )
 
         self.model = None
         self.current_model_size = None
@@ -102,7 +110,26 @@ class WhisperBase(ABC):
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
             params.lang = language_code_dict[params.lang]
 
-        speech_chunks = None
+        if params.is_bgm_separate:
+            music, audio = self.music_separator.separate(
+                audio=audio,
+                model_name=params.uvr_model_size,
+                device=params.uvr_device,
+                segment_size=params.uvr_segment_size,
+                save_file=params.uvr_save_file,
+                progress=progress
+            )
+
+            if audio.ndim >= 2:
+                audio = audio.mean(axis=1)
+                if self.music_separator.audio_info is None:
+                    origin_sample_rate = 16000
+                else:
+                    origin_sample_rate = self.music_separator.audio_info.sample_rate
+                audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
+
+            self.music_separator.offload()
+
         if params.vad_filter:
             # Explicit value set for float('inf') from gr.Number()
             if params.max_speech_duration_s >= 9999:
@@ -437,12 +464,14 @@ class WhisperBase(ABC):
 
     @staticmethod
     def release_cuda_memory():
+        """Release memory"""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()
 
     @staticmethod
     def remove_input_files(file_paths: List[str]):
+        """Remove gradio cached files"""
         if not file_paths:
             return
 
@@ -455,9 +484,25 @@ class WhisperBase(ABC):
                          whisper_params: WhisperValues,
                          add_timestamp: bool
                          ):
+        """cache parameters to the yaml file"""
         cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
         cached_whisper_param = whisper_params.to_yaml()
         cached_yaml = {**cached_params, **cached_whisper_param}
         cached_yaml["whisper"]["add_timestamp"] = add_timestamp
 
         save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
+
+    @staticmethod
+    def resample_audio(audio: Union[str, np.ndarray],
+                       new_sample_rate: int = 16000,
+                       original_sample_rate: Optional[int] = None,) -> np.ndarray:
+        """Resamples audio to 16k sample rate, standard on Whisper model"""
+        if isinstance(audio, str):
+            audio, original_sample_rate = torchaudio.load(audio)
+        else:
+            if original_sample_rate is None:
+                raise ValueError("original_sample_rate must be provided when audio is numpy array.")
+            audio = torch.from_numpy(audio)
+        resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=new_sample_rate)
+        resampled_audio = resampler(audio).numpy()
+        return resampled_audio
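
A minimal sketch of the new resample_audio helper, assuming a hypothetical mono float32 array recorded at 44.1 kHz; since it is a @staticmethod it can be called on WhisperBase directly. Not part of this commit.

# Minimal sketch (not part of this commit): bring arbitrary-rate audio to 16 kHz.
import numpy as np
from modules.whisper.whisper_base import WhisperBase

audio_44k = np.zeros(44100, dtype=np.float32)  # one second of silence at 44.1 kHz (hypothetical)
audio_16k = WhisperBase.resample_audio(audio_44k, original_sample_rate=44100)
print(audio_16k.shape)  # roughly 16000 samples, the rate Whisper expects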
modules/whisper/whisper_factory.py CHANGED
@@ -2,7 +2,7 @@ from typing import Optional
 import os
 
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR)
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
@@ -17,6 +17,7 @@ class WhisperFactory:
                                  faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
                                  insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                                 uvr_model_dir: str = UVR_MODELS_DIR,
                                  output_dir: str = OUTPUT_DIR,
                                  ) -> "WhisperBase":
         """
@@ -37,6 +38,8 @@ class WhisperFactory:
             Directory path for the Insanely Fast Whisper model.
         diarization_model_dir : str
             Directory path for the diarization model.
+        uvr_model_dir : str
+            Directory path for the UVR model.
         output_dir : str
             Directory path where output files will be saved.
 
@@ -61,23 +64,27 @@ class WhisperFactory:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in whisper_typos:
             return WhisperInference(
                 model_dir=whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in insanely_fast_whisper_typos:
             return InsanelyFastWhisperInference(
                 model_dir=insanely_fast_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         else:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
            )
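
A minimal sketch of building an implementation through the factory with the new directory argument. The factory method name lies outside these hunks, so create_whisper_inference below is an assumption, as are the keyword names of its first parameters; not part of this commit.

# Minimal sketch (not part of this commit); the method name
# create_whisper_inference is an assumption, as it lies outside these hunks.
from modules.utils.paths import UVR_MODELS_DIR
from modules.whisper.whisper_factory import WhisperFactory

whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",  # matched against the typo lists handled above
    uvr_model_dir=UVR_MODELS_DIR,   # forwarded into WhisperBase -> MusicSeparator
)
print(whisper_inf.music_separator.available_models)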
modules/whisper/whisper_parameter.py CHANGED
@@ -47,6 +47,11 @@ class WhisperParameters:
     hotwords: gr.Textbox
     language_detection_threshold: gr.Number
     language_detection_segments: gr.Number
+    is_bgm_separate: gr.Checkbox
+    uvr_model_size: gr.Dropdown
+    uvr_device: gr.Dropdown
+    uvr_segment_size: gr.Number
+    uvr_save_file: gr.Checkbox
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -148,61 +153,76 @@ class WhisperParameters:
     diarization_device: gr.Dropdown
         This parameter is related with whisperx. Device to run diarization model
 
-    length_penalty:
+    length_penalty: gr.Number
         This parameter is related to faster-whisper. Exponential length penalty constant.
 
-    repetition_penalty:
+    repetition_penalty: gr.Number
         This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
         (set > 1 to penalize).
 
-    no_repeat_ngram_size:
+    no_repeat_ngram_size: gr.Number
        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
 
-    prefix:
+    prefix: gr.Textbox
        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
 
-    suppress_blank:
+    suppress_blank: gr.Checkbox
        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
 
-    suppress_tokens:
+    suppress_tokens: gr.Textbox
        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
        of symbols as defined in the model config.json file.
 
-    max_initial_timestamp:
+    max_initial_timestamp: gr.Number
        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
 
-    word_timestamps:
+    word_timestamps: gr.Checkbox
        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
        and dynamic time warping, and include the timestamps for each word in each segment.
 
-    prepend_punctuations:
+    prepend_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the next word.
 
-    append_punctuations:
+    append_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the previous word.
 
-    max_new_tokens:
+    max_new_tokens: gr.Number
        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
        the maximum will be set by the default max_length.
 
-    chunk_length:
+    chunk_length: gr.Number
        This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
        default chunk_length of the FeatureExtractor.
 
-    hallucination_silence_threshold:
+    hallucination_silence_threshold: gr.Number
        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
        (in seconds) when a possible hallucination is detected.
 
-    hotwords:
+    hotwords: gr.Textbox
        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
 
-    language_detection_threshold:
+    language_detection_threshold: gr.Number
        This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
 
-    language_detection_segments:
+    language_detection_segments: gr.Number
        This parameter is related to faster-whisper. Number of segments to consider for the language detection.
+
+    is_separate_bgm: gr.Checkbox
+        This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.
+
+    uvr_model_size: gr.Dropdown
+        This parameter is related to UVR. UVR model size.
+
+    uvr_device: gr.Dropdown
+        This parameter is related to UVR. Device to run UVR model.
+
+    uvr_segment_size: gr.Number
+        This parameter is related to UVR. Segment size for UVR model.
+
+    uvr_save_file: gr.Checkbox
+        This parameter is related to UVR. Boolean value that determines whether to save the file or not.
     """
 
     def as_list(self) -> list:
@@ -273,6 +293,11 @@ class WhisperValues:
     hotwords: Optional[str]
     language_detection_threshold: Optional[float]
     language_detection_segments: int
+    is_bgm_separate: bool
+    uvr_model_size: str
+    uvr_device: str
+    uvr_segment_size: int
+    uvr_save_file: bool
     """
     A data class to use Whisper parameters.
     """
@@ -323,6 +348,12 @@ class WhisperValues:
             "diarization": {
                 "is_diarize": self.is_diarize,
                 "hf_token": self.hf_token
-            }
+            },
+            "bgm_separation": {
+                "is_separate_bgm": self.is_bgm_separate,
+                "model_size": self.uvr_model_size,
+                "segment_size": self.uvr_segment_size,
+                "save_file": self.uvr_save_file,
+            },
         }
         return data
notebook/whisper-webui.ipynb CHANGED
@@ -58,7 +58,8 @@
         "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
         "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
         "!pip install tokenizers==0.19.1\n",
-        "!pip install pyannote.audio==3.3.1"
+        "!pip install pyannote.audio==3.3.1\n",
+        "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
       ]
     },
     {
@@ -96,7 +97,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 3,
       "metadata": {
         "id": "PQroYRRZzQiN",
         "cellView": "form"
requirements.txt CHANGED
@@ -12,4 +12,5 @@ transformers==4.42.3
 gradio==4.43.0
 pytubefix
 ruamel.yaml==0.18.6
-pyannote.audio==3.3.1
+pyannote.audio==3.3.1
+git+https://github.com/jhj0517/ultimatevocalremover_api.git