import gradio as gr
from diffusers import AudioLDMPipeline

# Alternative schedulers, kept for experimentation. To swap, uncomment one import
# here and the matching `from_config` line below (and comment out the active one).
# The recommended "fast" scheduler:
# from diffusers import DPMSolverMultistepScheduler
# The default AudioLDM scheduler:
# from diffusers import DDIMScheduler
# from diffusers import DDPMScheduler
# from diffusers import DEISMultistepScheduler
# from diffusers import DPMSolverSinglestepScheduler
# from diffusers import HeunDiscreteScheduler
from diffusers import KDPM2DiscreteScheduler
# from diffusers import KDPM2AncestralDiscreteScheduler
# from diffusers import LMSDiscreteScheduler
# from diffusers import PNDMScheduler
# from diffusers import EulerDiscreteScheduler
# from diffusers import EulerAncestralDiscreteScheduler
# from diffusers import UniPCMultistepScheduler

from transformers import AutoProcessor, ClapModel
import torch

# import scipy  # only needed if you also want to write .wav files to disk (see texttoaudio)
device="cpu" | |
repo_id = "cvssp/audioldm-s-full-v2" | |
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float32) | |
#pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config) | |
pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) | |
#pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) | |
pipe = pipe.to(device) | |
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device) | |
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full", sampling_rate=16000) | |
generator = torch.Generator(device) | |
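
# texttoaudio drives the whole flow: run the diffusion pipeline on the prompt,
# optionally generate several candidate waveforms, and (when there is more than one)
# let CLAP pick the candidate that best matches the text.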
def texttoaudio(prompt, neg_prompt, seed, inf_steps, guidance_scale, n_candidates):
    if not prompt:
        raise gr.Error("Please provide a text input.")
    waveforms = pipe(
        prompt,
        negative_prompt=neg_prompt,
        num_inference_steps=int(inf_steps),
        guidance_scale=guidance_scale,
        audio_length_in_s=5.0,
        generator=generator.manual_seed(int(seed)),
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
    )["audios"]
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(prompt, waveforms)
    else:
        waveform = waveforms[0]
    # To also save the result as a .wav file, uncomment the scipy import above and:
    # scipy.io.wavfile.write("output.wav", rate=16000, data=waveform)
    # Gradio's audio output expects a (sample_rate, waveform) tuple; AudioLDM generates 16 kHz audio.
    return (16000, waveform)
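
# CLAP re-ranking: embed the prompt and every candidate waveform with the CLAP model,
# then keep the waveform whose audio embedding is most similar to the text embedding.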
def score_waveforms(text, waveforms):
    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True, sampling_rate=16000)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits_per_text = clap_model(**inputs).logits_per_text  # audio-text similarity scores
    probs = logits_per_text.softmax(dim=-1)  # softmax over candidates gives per-waveform probabilities
    most_probable = torch.argmax(probs)  # index of the most likely audio waveform
    waveform = waveforms[most_probable]
    return waveform
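
# Interface inputs, in order: prompt, negative prompt, seed, inference steps,
# guidance scale, number of candidate waveforms. The output is a playable audio widget.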
iface = gr.Interface(
    fn=texttoaudio,
    title="AudioLDM Testing Playground",
    inputs=["text", "text", "number", "number", "number", "number"],
    outputs="audio",
)
iface.launch()
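
# To expose the demo beyond localhost, standard launch options can be passed instead,
# e.g. iface.launch(share=True) for a temporary public Gradio link, or
# iface.launch(server_name="0.0.0.0", server_port=7860) to bind a specific host/port.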