Commit 0c8c544 (unverified), committed by jhj0517
2 parents: ea7397a 2109221

Merge pull request #267 from jhj0517/feature/bgm-separation
README.md CHANGED
@@ -25,6 +25,7 @@ If you wish to try this on Colab, you can do it in [here](https://colab.research
 - Translate subtitle files using Facebook NLLB models
 - Translate subtitle files using DeepL API
 - Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
+- Pre-processing audio input to separate BGM with [UVR](https://github.com/Anjok07/ultimatevocalremovergui), [UVR-api](https://github.com/NextAudioGen/ultimatevocalremover_api).
 - Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
   - To download the pyannote model, you need to have a Huggingface token and manually accept their terms in the pages below.
     1. https://huggingface.co/pyannote/speaker-diarization-3.1
@@ -109,8 +110,6 @@ This is Whisper's original VRAM usage table for models.
 - [x] Integrate with faster-whisper
 - [x] Integrate with insanely-fast-whisper
 - [x] Integrate with whisperX ( Only speaker diarization part )
-- [ ] Add background music separation pre-processing with [MVSEP-MDX23](https://github.com/ZFTurbo/MVSEP-MDX23-music-separation-model)
+- [x] Add background music separation pre-processing with [UVR](https://github.com/Anjok07/ultimatevocalremovergui)
 - [ ] Add fast api script
 - [ ] Support real-time transcription for microphone
-
-
app.py CHANGED
@@ -4,7 +4,8 @@ import gradio as gr
 import yaml
 
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
 from modules.utils.files_manager import load_yaml
 from modules.whisper.whisper_factory import WhisperFactory
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
@@ -25,10 +26,9 @@ class App:
             whisper_model_dir=self.args.whisper_model_dir,
             faster_whisper_model_dir=self.args.faster_whisper_model_dir,
             insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
+            uvr_model_dir=self.args.uvr_model_dir,
             output_dir=self.args.output_dir,
         )
-        print(f"Use \"{self.args.whisper_type}\" implementation")
-        print(f"Device \"{self.whisper_inf.device}\" is detected")
         self.nllb_inf = NLLBInference(
             model_dir=self.args.nllb_model_dir,
             output_dir=os.path.join(self.args.output_dir, "translations")
@@ -37,11 +37,14 @@ class App:
             output_dir=os.path.join(self.args.output_dir, "translations")
         )
         self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        print(f"Use \"{self.args.whisper_type}\" implementation")
+        print(f"Device \"{self.whisper_inf.device}\" is detected")
 
     def create_whisper_parameters(self):
         whisper_params = self.default_params["whisper"]
         vad_params = self.default_params["vad"]
         diarization_params = self.default_params["diarization"]
+        uvr_params = self.default_params["bgm_separation"]
 
         with gr.Row():
             dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
@@ -127,6 +130,16 @@ class App:
                                          precision=0)
                 nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
 
+            with gr.Accordion("BGM Separation", open=False):
+                cb_bgm_separation = gr.Checkbox(label="Enable BGM separation", value=uvr_params["is_separate_bgm"],
+                                                interactive=True)
+                dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
+                                            choices=self.whisper_inf.music_separator.available_devices)
+                dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                                                choices=self.whisper_inf.music_separator.available_models)
+                nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
+                cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
+
             with gr.Accordion("VAD", open=False):
                 cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
                                             interactive=True)
@@ -173,7 +186,9 @@ class App:
                 hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
                 language_detection_threshold=nb_language_detection_threshold,
                 language_detection_segments=nb_language_detection_segments,
-                prompt_reset_on_temperature=sld_prompt_reset_on_temperature
+                prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
+                uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
+                uvr_save_file=cb_uvr_save_file
             ),
             dd_file_format,
             cb_timestamp
@@ -383,6 +398,8 @@ parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MOD
                     help='Directory path of the diarization model')
 parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
                     help='Directory path of the Facebook NLLB model')
+parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
+                    help='Directory path of the UVR model')
 parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
 _args = parser.parse_args()
 
configs/default_parameters.yaml CHANGED
@@ -44,6 +44,12 @@ diarization:
   is_diarize: false
   hf_token: ""
 
+bgm_separation:
+  is_separate_bgm: false
+  model_size: "UVR-MDX-NET-Inst_HQ_4"
+  segment_size: 256
+  save_file: true
+
 translation:
   deepl:
     api_key: ""
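
For reference, a minimal sketch of how the new bgm_separation block is read back; app.py consumes it the same way via self.default_params["bgm_separation"]. The snippet is illustrative and not part of this commit.

# Minimal sketch (not part of this commit): read the new bgm_separation defaults.
from modules.utils.files_manager import load_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH

default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
uvr_params = default_params["bgm_separation"]
print(uvr_params["model_size"])    # "UVR-MDX-NET-Inst_HQ_4"
print(uvr_params["segment_size"])  # 256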
modules/ui/htmls.py CHANGED
@@ -38,7 +38,7 @@ CSS = """
 """
 
 MARKDOWN = """
-### [Whisper Web-UI](https://github.com/jhj0517/Whsiper-WebUI)
+### [Whisper-WebUI](https://github.com/jhj0517/Whsiper-WebUI)
 """
 
 
modules/utils/paths.py CHANGED
@@ -7,10 +7,14 @@ FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
 INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
 NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
 DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
+UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
 CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
 DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
 TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
+UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
+UVR_INSTRUMENTAL_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "instrumental")
+UVR_VOCALS_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "vocals")
 
 for dir_path in [MODELS_DIR,
                  WHISPER_MODELS_DIR,
@@ -18,7 +22,10 @@ for dir_path in [MODELS_DIR,
                  INSANELY_FAST_WHISPER_MODELS_DIR,
                  NLLB_MODELS_DIR,
                  DIARIZATION_MODELS_DIR,
+                 UVR_MODELS_DIR,
                  CONFIGS_DIR,
                  OUTPUT_DIR,
-                 TRANSLATION_OUTPUT_DIR]:
+                 TRANSLATION_OUTPUT_DIR,
+                 UVR_INSTRUMENTAL_OUTPUT_DIR,
+                 UVR_VOCALS_OUTPUT_DIR]:
     os.makedirs(dir_path, exist_ok=True)
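
Taken together, the new constants resolve to the layout sketched below (assuming MODELS_DIR and OUTPUT_DIR sit under the WebUI root, as with the other paths in this file). The directories are created at import time by the os.makedirs loop; outputs/UVR/ itself is created implicitly as the parent of its two subdirectories.

models/UVR/MDX_Net_Models/   # UVR_MODELS_DIR
outputs/UVR/instrumental/    # UVR_INSTRUMENTAL_OUTPUT_DIR
outputs/UVR/vocals/          # UVR_VOCALS_OUTPUT_DIR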
modules/uvr/music_separator.py ADDED
@@ -0,0 +1,132 @@
+from typing import Optional, Union
+import numpy as np
+import torchaudio
+import soundfile as sf
+import os
+import torch
+import gc
+import gradio as gr
+from datetime import datetime
+
+from uvr.models import MDX, Demucs, VrNetwork, MDXC
+
+
+class MusicSeparator:
+    def __init__(self,
+                 model_dir: Optional[str] = None,
+                 output_dir: Optional[str] = None):
+        self.model = None
+        self.device = self.get_device()
+        self.available_devices = ["cpu", "cuda"]
+        self.model_dir = model_dir
+        self.output_dir = output_dir
+        self.audio_info = None
+        self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
+        self.default_model = self.available_models[0]
+        self.current_model_size = self.default_model
+        self.model_config = {
+            "segment": 256,
+            "split": True
+        }
+
+    def update_model(self,
+                     model_name: str = "UVR-MDX-NET-Inst_1",
+                     device: Optional[str] = None,
+                     segment_size: int = 256):
+        """
+        Update model with the given model name
+
+        Args:
+            model_name (str): Model name.
+            device (str): Device to use for the model.
+            segment_size (int): Segment size for the prediction.
+        """
+        if device is None:
+            device = self.device
+
+        self.device = device
+        self.model_config = {
+            "segment": segment_size,
+            "split": True
+        }
+        self.model = MDX(name=model_name,
+                         other_metadata=self.model_config,
+                         device=self.device,
+                         logger=None,
+                         model_dir=self.model_dir)
+
+    def separate(self,
+                 audio: Union[str, np.ndarray],
+                 model_name: str,
+                 device: Optional[str] = None,
+                 segment_size: int = 256,
+                 save_file: bool = False,
+                 progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Separate the background music from the audio.
+
+        Args:
+            audio (Union[str, np.ndarray]): Audio path or numpy array.
+            model_name (str): Model name.
+            device (str): Device to use for the model.
+            segment_size (int): Segment size for the prediction.
+            save_file (bool): Whether to save the separated audio to output path or not.
+            progress (gr.Progress): Gradio progress indicator.
+
+        Returns:
+            tuple[np.ndarray, np.ndarray]: Instrumental and vocals numpy arrays.
+        """
+        if isinstance(audio, str):
+            self.audio_info = torchaudio.info(audio)
+            sample_rate = self.audio_info.sample_rate
+            output_filename, ext = os.path.splitext(audio)
+            output_filename, ext = os.path.basename(audio), ".wav"
+        else:
+            sample_rate = 16000
+            timestamp = datetime.now().strftime("%m%d%H%M%S")
+            output_filename, ext = f"UVR-{timestamp}", ".wav"
+
+        model_config = {
+            "segment": segment_size,
+            "split": True
+        }
+
+        if (self.model is None or
+                self.current_model_size != model_name or
+                self.model_config != model_config or
+                self.audio_info.sample_rate != sample_rate or
+                self.device != device):
+            progress(0, desc="Initializing UVR Model..")
+            self.update_model(
+                model_name=model_name,
+                device=device,
+                segment_size=segment_size
+            )
+            self.model.sample_rate = sample_rate
+
+        progress(0, desc="Separating background music from the audio..")
+        result = self.model(audio)
+        instrumental, vocals = result["instrumental"].T, result["vocals"].T
+
+        if save_file:
+            instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
+            vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
+            sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
+            sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
+
+        return instrumental, vocals
+
+    @staticmethod
+    def get_device():
+        """Get device for the model"""
+        return "cuda" if torch.cuda.is_available() else "cpu"
+
+    def offload(self):
+        """Offload the model and free up the memory"""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.device == "cuda":
+            torch.cuda.empty_cache()
+            gc.collect()
+        self.audio_info = None
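
For reference, a minimal sketch of driving the new class directly, assuming a hypothetical local sample.wav and the UVR paths added in modules/utils/paths.py; the snippet is illustrative and not part of this commit. separate() lazily (re)loads the MDX weights only when the model name, segment config, sample rate, or device changes, and offload() frees them afterwards.

# Minimal sketch (not part of this commit): separate BGM from a local file.
from modules.utils.paths import UVR_MODELS_DIR, UVR_OUTPUT_DIR
from modules.uvr.music_separator import MusicSeparator

separator = MusicSeparator(model_dir=UVR_MODELS_DIR, output_dir=UVR_OUTPUT_DIR)
instrumental, vocals = separator.separate(
    audio="sample.wav",                  # hypothetical input path
    model_name="UVR-MDX-NET-Inst_HQ_4",  # one of separator.available_models
    device=separator.device,             # "cuda" if available, else "cpu"
    segment_size=256,
    save_file=True,                      # writes into outputs/UVR/{instrumental,vocals}/
)
separator.offload()                      # drop the model weights and free VRAM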
modules/whisper/faster_whisper_inference.py CHANGED
@@ -11,7 +11,7 @@ import whisper
 import gradio as gr
 from argparse import Namespace
 
-from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
 
@@ -20,11 +20,13 @@ class FasterWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir,
             output_dir=output_dir
         )
         self.model_dir = model_dir
modules/whisper/insanely_fast_whisper_inference.py CHANGED
@@ -11,7 +11,7 @@ import whisper
 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace
 
-from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
 
@@ -20,12 +20,14 @@ class InsanelyFastWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
         self.model_dir = model_dir
         os.makedirs(self.model_dir, exist_ok=True)
modules/whisper/whisper_Inference.py CHANGED
@@ -7,7 +7,7 @@ import torch
 import os
 from argparse import Namespace
 
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
 from modules.whisper.whisper_base import WhisperBase
 from modules.whisper.whisper_parameter import *
 
@@ -16,12 +16,14 @@ class WhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
 
     def transcribe(self,
modules/whisper/whisper_base.py CHANGED
@@ -2,6 +2,7 @@ import os
 import torch
 import whisper
 import gradio as gr
+import torchaudio
 from abc import ABC, abstractmethod
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
@@ -9,7 +10,9 @@ from datetime import datetime
 from faster_whisper.vad import VadOptions
 from dataclasses import astuple
 
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
+from modules.uvr.music_separator import MusicSeparator
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
 from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.utils.youtube_manager import get_ytdata, get_ytaudio
 from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
@@ -22,6 +25,7 @@ class WhisperBase(ABC):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         self.model_dir = model_dir
@@ -32,6 +36,10 @@ class WhisperBase(ABC):
             model_dir=diarization_model_dir
         )
         self.vad = SileroVAD()
+        self.music_separator = MusicSeparator(
+            model_dir=uvr_model_dir,
+            output_dir=os.path.join(output_dir, "UVR")
+        )
 
         self.model = None
         self.current_model_size = None
@@ -102,7 +110,26 @@ class WhisperBase(ABC):
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
             params.lang = language_code_dict[params.lang]
 
-        speech_chunks = None
+        if params.is_bgm_separate:
+            music, audio = self.music_separator.separate(
+                audio=audio,
+                model_name=params.uvr_model_size,
+                device=params.uvr_device,
+                segment_size=params.uvr_segment_size,
+                save_file=params.uvr_save_file,
+                progress=progress
+            )
+
+            if audio.ndim >= 2:
+                audio = audio.mean(axis=1)
+                if self.music_separator.audio_info is None:
+                    origin_sample_rate = 16000
+                else:
+                    origin_sample_rate = self.music_separator.audio_info.sample_rate
+                audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
+
+            self.music_separator.offload()
+
         if params.vad_filter:
             # Explicit value set for float('inf') from gr.Number()
             if params.max_speech_duration_s >= 9999:
@@ -437,12 +464,14 @@ class WhisperBase(ABC):
 
     @staticmethod
     def release_cuda_memory():
+        """Release memory"""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()
 
     @staticmethod
     def remove_input_files(file_paths: List[str]):
+        """Remove gradio cached files"""
         if not file_paths:
             return
 
@@ -455,9 +484,25 @@ class WhisperBase(ABC):
                          whisper_params: WhisperValues,
                          add_timestamp: bool
                          ):
+        """cache parameters to the yaml file"""
         cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
         cached_whisper_param = whisper_params.to_yaml()
         cached_yaml = {**cached_params, **cached_whisper_param}
         cached_yaml["whisper"]["add_timestamp"] = add_timestamp
 
         save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
+
+    @staticmethod
+    def resample_audio(audio: Union[str, np.ndarray],
+                       new_sample_rate: int = 16000,
+                       original_sample_rate: Optional[int] = None,) -> np.ndarray:
+        """Resamples audio to 16k sample rate, standard on Whisper model"""
+        if isinstance(audio, str):
+            audio, original_sample_rate = torchaudio.load(audio)
+        else:
+            if original_sample_rate is None:
+                raise ValueError("original_sample_rate must be provided when audio is numpy array.")
+            audio = torch.from_numpy(audio)
+        resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=new_sample_rate)
+        resampled_audio = resampler(audio).numpy()
+        return resampled_audio
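
A minimal sketch of the new resample_audio helper, assuming a hypothetical mono float32 array recorded at 44.1 kHz; since it is a @staticmethod it can be called on WhisperBase directly. Not part of this commit.

# Minimal sketch (not part of this commit): bring arbitrary-rate audio to 16 kHz.
import numpy as np
from modules.whisper.whisper_base import WhisperBase

audio_44k = np.zeros(44100, dtype=np.float32)  # one second of silence at 44.1 kHz (hypothetical)
audio_16k = WhisperBase.resample_audio(audio_44k, original_sample_rate=44100)
print(audio_16k.shape)  # roughly 16000 samples, the rate Whisper expects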
modules/whisper/whisper_factory.py CHANGED
@@ -2,7 +2,7 @@ from typing import Optional
 import os
 
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR)
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
@@ -17,6 +17,7 @@ class WhisperFactory:
                                  faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
                                  insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                                 uvr_model_dir: str = UVR_MODELS_DIR,
                                  output_dir: str = OUTPUT_DIR,
                                  ) -> "WhisperBase":
         """
@@ -37,6 +38,8 @@ class WhisperFactory:
             Directory path for the Insanely Fast Whisper model.
         diarization_model_dir : str
             Directory path for the diarization model.
+        uvr_model_dir : str
+            Directory path for the UVR model.
         output_dir : str
             Directory path where output files will be saved.
 
@@ -61,23 +64,27 @@ class WhisperFactory:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in whisper_typos:
             return WhisperInference(
                 model_dir=whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in insanely_fast_whisper_typos:
             return InsanelyFastWhisperInference(
                 model_dir=insanely_fast_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         else:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
            )
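
A minimal sketch of building an implementation through the factory with the new directory argument. The factory method name lies outside these hunks, so create_whisper_inference below is an assumption, as are the keyword names of its first parameters; not part of this commit.

# Minimal sketch (not part of this commit); the method name
# create_whisper_inference is an assumption, as it lies outside these hunks.
from modules.utils.paths import UVR_MODELS_DIR
from modules.whisper.whisper_factory import WhisperFactory

whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",  # matched against the typo lists handled above
    uvr_model_dir=UVR_MODELS_DIR,   # forwarded into WhisperBase -> MusicSeparator
)
print(whisper_inf.music_separator.available_models)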
modules/whisper/whisper_parameter.py CHANGED
@@ -47,6 +47,11 @@ class WhisperParameters:
     hotwords: gr.Textbox
     language_detection_threshold: gr.Number
     language_detection_segments: gr.Number
+    is_bgm_separate: gr.Checkbox
+    uvr_model_size: gr.Dropdown
+    uvr_device: gr.Dropdown
+    uvr_segment_size: gr.Number
+    uvr_save_file: gr.Checkbox
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -148,61 +153,76 @@ class WhisperParameters:
     diarization_device: gr.Dropdown
         This parameter is related with whisperx. Device to run diarization model
 
-    length_penalty:
+    length_penalty: gr.Number
         This parameter is related to faster-whisper. Exponential length penalty constant.
 
-    repetition_penalty:
+    repetition_penalty: gr.Number
         This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
         (set > 1 to penalize).
 
-    no_repeat_ngram_size:
+    no_repeat_ngram_size: gr.Number
        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
 
-    prefix:
+    prefix: gr.Textbox
        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
 
-    suppress_blank:
+    suppress_blank: gr.Checkbox
        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
 
-    suppress_tokens:
+    suppress_tokens: gr.Textbox
        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
        of symbols as defined in the model config.json file.
 
-    max_initial_timestamp:
+    max_initial_timestamp: gr.Number
        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
 
-    word_timestamps:
+    word_timestamps: gr.Checkbox
        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
        and dynamic time warping, and include the timestamps for each word in each segment.
 
-    prepend_punctuations:
+    prepend_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the next word.
 
-    append_punctuations:
+    append_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the previous word.
 
-    max_new_tokens:
+    max_new_tokens: gr.Number
        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
        the maximum will be set by the default max_length.
 
-    chunk_length:
+    chunk_length: gr.Number
        This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
        default chunk_length of the FeatureExtractor.
 
-    hallucination_silence_threshold:
+    hallucination_silence_threshold: gr.Number
        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
        (in seconds) when a possible hallucination is detected.
 
-    hotwords:
+    hotwords: gr.Textbox
        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
 
-    language_detection_threshold:
+    language_detection_threshold: gr.Number
        This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
 
-    language_detection_segments:
+    language_detection_segments: gr.Number
        This parameter is related to faster-whisper. Number of segments to consider for the language detection.
+
+    is_separate_bgm: gr.Checkbox
+        This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.
+
+    uvr_model_size: gr.Dropdown
+        This parameter is related to UVR. UVR model size.
+
+    uvr_device: gr.Dropdown
+        This parameter is related to UVR. Device to run UVR model.
+
+    uvr_segment_size: gr.Number
+        This parameter is related to UVR. Segment size for UVR model.
+
+    uvr_save_file: gr.Checkbox
+        This parameter is related to UVR. Boolean value that determines whether to save the file or not.
     """
 
     def as_list(self) -> list:
@@ -273,6 +293,11 @@ class WhisperValues:
     hotwords: Optional[str]
     language_detection_threshold: Optional[float]
     language_detection_segments: int
+    is_bgm_separate: bool
+    uvr_model_size: str
+    uvr_device: str
+    uvr_segment_size: int
+    uvr_save_file: bool
     """
     A data class to use Whisper parameters.
     """
@@ -323,6 +348,12 @@ class WhisperValues:
             "diarization": {
                 "is_diarize": self.is_diarize,
                 "hf_token": self.hf_token
-            }
+            },
+            "bgm_separation": {
+                "is_separate_bgm": self.is_bgm_separate,
+                "model_size": self.uvr_model_size,
+                "segment_size": self.uvr_segment_size,
+                "save_file": self.uvr_save_file,
+            },
         }
         return data
notebook/whisper-webui.ipynb CHANGED
@@ -58,7 +58,8 @@
         "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
         "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
         "!pip install tokenizers==0.19.1\n",
-        "!pip install pyannote.audio==3.3.1"
+        "!pip install pyannote.audio==3.3.1\n",
+        "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
       ]
     },
     {
@@ -96,7 +97,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 3,
       "metadata": {
         "id": "PQroYRRZzQiN",
         "cellView": "form"
requirements.txt CHANGED
@@ -12,4 +12,5 @@ transformers==4.42.3
 gradio==4.43.0
 pytubefix
 ruamel.yaml==0.18.6
-pyannote.audio==3.3.1
+pyannote.audio==3.3.1
+git+https://github.com/jhj0517/ultimatevocalremover_api.git