from typing import Optional, Union, List, Dict
import numpy as np
import torchaudio
import soundfile as sf
import os
import torch
import gc
import gradio as gr
from datetime import datetime

from uvr.models import MDX, Demucs, VrNetwork, MDXC

from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
from modules.utils.files_manager import load_yaml, save_yaml, is_video
from modules.diarize.audio_loader import load_audio


class MusicSeparator:
    def __init__(self,
                 model_dir: Optional[str] = None,
                 output_dir: Optional[str] = None):
        self.model = None
        self.device = self.get_device()
        self.available_devices = ["cpu", "cuda"]
        self.model_dir = model_dir
        self.output_dir = output_dir
        instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
        vocals_output_dir = os.path.join(self.output_dir, "vocals")
        os.makedirs(instrumental_output_dir, exist_ok=True)
        os.makedirs(vocals_output_dir, exist_ok=True)
        self.audio_info = None
        self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
        self.default_model = self.available_models[0]
        self.current_model_size = self.default_model
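        # Default settings passed to the MDX wrapper: "segment" is the chunk
        # length used when splitting the audio for inference, and "split"
        # enables that chunked processing.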
        self.model_config = {
            "segment": 256,
            "split": True
        }

    def update_model(self,
                     model_name: str = "UVR-MDX-NET-Inst_1",
                     device: Optional[str] = None,
                     segment_size: int = 256):
        """
        Update the model with the given model name.

        Args:
            model_name (str): Model name.
            device (Optional[str]): Device to use for the model. Defaults to the auto-detected device.
            segment_size (int): Segment size for the prediction.
        """
        if device is None:
            device = self.device

        self.device = device
        self.model_config = {
            "segment": segment_size,
            "split": True
        }
        self.model = MDX(name=model_name,
                         other_metadata=self.model_config,
                         device=self.device,
                         logger=None,
                         model_dir=self.model_dir)

    def separate(self,
                 audio: Union[str, np.ndarray],
                 model_name: str,
                 device: Optional[str] = None,
                 segment_size: int = 256,
                 save_file: bool = False,
                 progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
        """
        Separate the background music from the audio.

        Args:
            audio (Union[str, np.ndarray]): Audio path or numpy array.
            model_name (str): Model name.
            device (Optional[str]): Device to use for the model.
            segment_size (int): Segment size for the prediction.
            save_file (bool): Whether to save the separated audio to the output path.
            progress (gr.Progress): Gradio progress indicator.

        Returns:
            A tuple of
                np.ndarray: Instrumental numpy array.
                np.ndarray: Vocals numpy array.
                file_paths: List of file paths where the separated audio is saved. Empty when save_file is False.
        """
        if isinstance(audio, str):
            output_filename, ext = os.path.basename(audio), ".wav"
            output_filename, orig_ext = os.path.splitext(output_filename)

            if is_video(audio):
                audio = load_audio(audio)
                sample_rate = 16000
            else:
                self.audio_info = torchaudio.info(audio)
                sample_rate = self.audio_info.sample_rate
        else:
            timestamp = datetime.now().strftime("%m%d%H%M%S")
            output_filename, ext = f"UVR-{timestamp}", ".wav"
            sample_rate = 16000

        model_config = {
            "segment": segment_size,
            "split": True
        }
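        # Lazily (re)build the model whenever the requested model name, segment
        # configuration, sample rate, or device differs from the one currently loaded.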
        if (self.model is None or
                self.current_model_size != model_name or
                self.model_config != model_config or
                self.model.sample_rate != sample_rate or
                self.device != device):
            progress(0, desc="Initializing UVR Model..")
            self.update_model(
                model_name=model_name,
                device=device,
                segment_size=segment_size
            )
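        # Keep the model's sample rate in sync with the source audio; the same rate
        # is used below when writing the separated stems to disk.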
        self.model.sample_rate = sample_rate

        progress(0, desc="Separating background music from the audio..")
        result = self.model(audio)
        instrumental, vocals = result["instrumental"].T, result["vocals"].T

        file_paths = []
        if save_file:
            instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
            vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
            sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
            sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
            file_paths += [instrumental_output_path, vocals_output_path]

        return instrumental, vocals, file_paths

    def separate_files(self,
                       files: List,
                       model_name: str,
                       device: Optional[str] = None,
                       segment_size: int = 256,
                       save_file: bool = True,
                       progress: gr.Progress = gr.Progress()) -> List[str]:
        """Separate the background music from multiple audio files.
        Returns only the file paths of the last processed file's instrumental and vocals,
        which are displayed in gr.Audio()."""
        self.cache_parameters(model_size=model_name, segment_size=segment_size)

        for file_path in files:
            instrumental, vocals, file_paths = self.separate(
                audio=file_path,
                model_name=model_name,
                device=device,
                segment_size=segment_size,
                save_file=save_file,
                progress=progress
            )

        return file_paths

    @staticmethod
    def get_device():
        """Get the device for the model."""
        return "cuda" if torch.cuda.is_available() else "cpu"

    def offload(self):
        """Offload the model and free up the memory."""
        if self.model is not None:
            del self.model
            self.model = None
        if self.device == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
        self.audio_info = None

    @staticmethod
    def cache_parameters(model_size: str,
                         segment_size: int):
        """Cache the BGM-separation parameters to the default parameters config file."""
        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_uvr_params = cached_params["bgm_separation"]
        uvr_params_to_cache = {
            "model_size": model_size,
            "segment_size": segment_size
        }
        cached_uvr_params = {**cached_uvr_params, **uvr_params_to_cache}
        cached_params["bgm_separation"] = cached_uvr_params
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
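

# --- Minimal usage sketch (illustrative, not part of the module's public API) ---
# Assumes the `uvr` package and its model weights are available locally;
# "models/UVR", "outputs", and "sample.wav" are hypothetical paths to adjust.
if __name__ == "__main__":
    separator = MusicSeparator(model_dir="models/UVR", output_dir="outputs")
    instrumental, vocals, paths = separator.separate(
        audio="sample.wav",
        model_name=separator.default_model,
        device=separator.device,
        segment_size=256,
        save_file=True
    )
    print(f"Saved separated stems to: {paths}")
    separator.offload()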