Spaces:
Running
Running
import gradio as gr | |
from transformers import AutoProcessor, BarkModel | |
import scipy.io.wavfile | |
import torch | |
import os | |
from typing import Optional | |
import numpy as np | |
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Bark processor and model once at import time so every request
# reuses them. Weights are requested as float16 on CUDA and float32 on CPU.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained(
    "suno/bark",
    torch_dtype={"cuda": torch.float16, "cpu": torch.float32}[DEVICE],
)
model.to(DEVICE)
model.eval()  # inference only: put the model in evaluation mode

if DEVICE == "cuda":
    # Turn on cuDNN's benchmark mode for GPU runs.
    torch.backends.cudnn.benchmark = True

# Directory holding previously generated WAV files; created on first run.
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
def get_cache_path(text: str, voice_preset: str) -> str:
    """Return the cache file path for a (text, voice preset) pair.

    The file name is the MD5 hex digest of ``"{text}_{voice_preset}"`` with a
    ``.wav`` extension, located inside ``CACHE_DIR``. The same inputs always
    map to the same path.
    """
    import hashlib

    key_bytes = f"{text}_{voice_preset}".encode()
    digest = hashlib.md5(key_bytes).hexdigest()
    file_name = f"{digest}.wav"
    return os.path.join(CACHE_DIR, file_name)
# inference_mode disables autograd tracking for the whole call; it is more
# efficient than no_grad for pure inference.
@torch.inference_mode()
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2") -> Optional[str]:
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Results are cached on disk at the path given by ``get_cache_path``; when
    that file already exists it is returned without running the model.

    Args:
        text: Text to speak.
        voice_preset: Speaker preset name forwarded to the processor.

    Returns:
        Path of the WAV file, or ``None`` when ``text`` is empty/blank or when
        generation fails (the error is printed rather than raised).
    """
    # Nothing to synthesize: skip the model and avoid caching a useless file.
    if not text or not text.strip():
        return None

    try:
        # Serve from the cache when this exact request was generated before.
        cache_path = get_cache_path(text, voice_preset)
        if os.path.exists(cache_path):
            return cache_path

        # Tokenize the text and move tensor inputs to the model's device.
        inputs = processor(text, voice_preset=voice_preset)
        inputs = {
            k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        # Mixed precision is only enabled on CUDA; on CPU the context is a
        # no-op. (torch.autocast replaces the deprecated torch.cuda.amp form.)
        with torch.autocast(device_type=DEVICE, enabled=DEVICE == "cuda"):
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=2.5,
                temperature=0.7,
            )

        # Cast to float32 before leaving torch: the model may emit float16 on
        # CUDA, which is not a sample type scipy's WAV writer supports.
        audio_array = audio_array.cpu().float().numpy().squeeze()

        # Keep samples inside the valid [-1, 1] range for float WAV data.
        audio_array = np.clip(audio_array, -1, 1)

        sample_rate = model.generation_config.sample_rate

        # Write to a temporary file and rename it into place so a failed or
        # partial write can never be picked up later as a cache hit.
        tmp_path = f"{cache_path}.tmp"
        try:
            scipy.io.wavfile.write(tmp_path, rate=sample_rate, data=audio_array)
            os.replace(tmp_path, cache_path)
        finally:
            # Only still present if the write or the rename failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        return cache_path
    except Exception as e:
        # Best-effort boundary for the UI: report the failure, return no audio.
        print(f"Error generating audio: {str(e)}")
        return None
# Voice presets offered in the UI: v2/hi_speaker_1 through v2/hi_speaker_5.
voice_presets = [f"v2/hi_speaker_{speaker_id}" for speaker_id in range(1, 6)]
# Build the Gradio UI: a text box and a voice dropdown feed text_to_speech,
# and the file path it returns is handed to the audio component.
text_input = gr.Textbox(label="Enter text (Hindi or English)")
voice_input = gr.Dropdown(
    label="Select Voice",
    choices=voice_presets,
    value="v2/hi_speaker_2",
)
audio_output = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=text_to_speech,
    inputs=[text_input, voice_input],
    outputs=audio_output,
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
    # NOTE(review): no `examples` are passed here — confirm this flag is still wanted.
    cache_examples=True,
)

# Start the app with default launch settings, only when run as a script.
if __name__ == "__main__":
    demo.launch()