Spaces:
Running
Running
jhj0517
commited on
Commit
·
6ff3ca6
1
Parent(s):
3ec9a9b
Add music separation pre-process to whisper base
Browse files
modules/whisper/whisper_base.py
CHANGED
@@ -9,7 +9,9 @@ from datetime import datetime
|
|
9 |
from faster_whisper.vad import VadOptions
|
10 |
from dataclasses import astuple
|
11 |
|
12 |
-
from modules.
|
|
|
|
|
13 |
from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
|
14 |
from modules.utils.youtube_manager import get_ytdata, get_ytaudio
|
15 |
from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
|
@@ -22,6 +24,7 @@ class WhisperBase(ABC):
|
|
22 |
def __init__(self,
|
23 |
model_dir: str = WHISPER_MODELS_DIR,
|
24 |
diarization_model_dir: str = DIARIZATION_MODELS_DIR,
|
|
|
25 |
output_dir: str = OUTPUT_DIR,
|
26 |
):
|
27 |
self.model_dir = model_dir
|
@@ -32,6 +35,10 @@ class WhisperBase(ABC):
|
|
32 |
model_dir=diarization_model_dir
|
33 |
)
|
34 |
self.vad = SileroVAD()
|
|
|
|
|
|
|
|
|
35 |
|
36 |
self.model = None
|
37 |
self.current_model_size = None
|
@@ -102,7 +109,15 @@ class WhisperBase(ABC):
|
|
102 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
103 |
params.lang = language_code_dict[params.lang]
|
104 |
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
if params.vad_filter:
|
107 |
# Explicit value set for float('inf') from gr.Number()
|
108 |
if params.max_speech_duration_s >= 9999:
|
|
|
9 |
from faster_whisper.vad import VadOptions
|
10 |
from dataclasses import astuple
|
11 |
|
12 |
+
from modules.uvr.music_separator import MusicSeparator
|
13 |
+
from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
|
14 |
+
UVR_MODELS_DIR)
|
15 |
from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
|
16 |
from modules.utils.youtube_manager import get_ytdata, get_ytaudio
|
17 |
from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
|
|
|
24 |
def __init__(self,
|
25 |
model_dir: str = WHISPER_MODELS_DIR,
|
26 |
diarization_model_dir: str = DIARIZATION_MODELS_DIR,
|
27 |
+
uvr_model_dir: str = UVR_MODELS_DIR,
|
28 |
output_dir: str = OUTPUT_DIR,
|
29 |
):
|
30 |
self.model_dir = model_dir
|
|
|
35 |
model_dir=diarization_model_dir
|
36 |
)
|
37 |
self.vad = SileroVAD()
|
38 |
+
self.music_separator = MusicSeparator(
|
39 |
+
model_dir=uvr_model_dir,
|
40 |
+
output_dir=os.path.join(output_dir, "UVR")
|
41 |
+
)
|
42 |
|
43 |
self.model = None
|
44 |
self.current_model_size = None
|
|
|
109 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
110 |
params.lang = language_code_dict[params.lang]
|
111 |
|
112 |
+
if params.is_bgm_separate:
|
113 |
+
music, audio = self.music_separator.separate(
|
114 |
+
audio_file_path=audio,
|
115 |
+
model_name=params.uvr_model_size,
|
116 |
+
device=params.uvr_device,
|
117 |
+
segment_size=params.uvr_segment_size,
|
118 |
+
)
|
119 |
+
self.music_separator.offload()
|
120 |
+
|
121 |
if params.vad_filter:
|
122 |
# Explicit value set for float('inf') from gr.Number()
|
123 |
if params.max_speech_duration_s >= 9999:
|