Spaces:
Running
Running
import os | |
import sys | |
import logging | |
import gradio as gr | |
import torch | |
import scipy.io.wavfile | |
import warnings | |
from functools import lru_cache | |
from typing import Optional | |
# Root logging at INFO, plus a module-level logger for this app.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Silence every Python warning for the lifetime of the process.
warnings.simplefilter("ignore")
def check_dependencies():
    """Report whether the Bark classes can be imported from ``transformers``.

    Returns:
        True when the import succeeds; False otherwise, after logging the
        import error.
    """
    try:
        # Probe import only; the names are re-imported at module level later.
        from transformers import AutoProcessor, BarkModel  # noqa: F401
    except ImportError as exc:
        logger.error(f"Error importing required modules: {exc!s}")
        return False
    else:
        return True
# Stop right away, with an install hint, when the Bark classes are missing.
dependencies_ok = check_dependencies()
if not dependencies_ok:
    for hint in (
        "Required dependencies not found. Please install them using:",
        "pip install -r requirements.txt",
    ):
        logger.error(hint)
    sys.exit(1)

# Safe to import for real now that the probe above succeeded.
from transformers import AutoProcessor, BarkModel
# Module-wide holders for the Bark processor and model; both start empty
# and are filled in by initialize_model().
processor = model = None
def initialize_model():
    """Load the Bark processor and model once and return them.

    The first successful call downloads/loads ``suno/bark``, moves the model
    to CUDA in half precision when a GPU is available (otherwise keeps it on
    the CPU), switches it to eval mode and optionally wraps it with
    ``torch.compile``. Later calls return the already-loaded objects.

    Returns:
        A ``(processor, model)`` tuple.

    Raises:
        Whatever ``from_pretrained`` or the device transfer raises. In that
        case the module globals are left untouched so a later call retries
        from scratch.
    """
    global processor, model

    # Fast path: everything is already loaded.
    if processor is not None and model is not None:
        return processor, model

    logger.info("Initializing model and processor...")

    # Build into locals first. Publishing to the globals only at the very end
    # means a failure part-way through (e.g. running out of GPU memory in
    # .to()) cannot leave a half-configured model behind for later callers.
    loaded_processor = AutoProcessor.from_pretrained("suno/bark")
    loaded_model = BarkModel.from_pretrained("suno/bark")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        # Half precision is only applied on the GPU path.
        loaded_model = loaded_model.half()
    loaded_model = loaded_model.to(device)

    # Inference only: eval mode and no gradient tracking.
    loaded_model.eval()
    torch.set_grad_enabled(False)

    # torch.compile exists from PyTorch 2.0 on; treat it as best effort.
    if hasattr(torch, "compile"):
        try:
            loaded_model = torch.compile(loaded_model)
            logger.info("Model compiled successfully")
        except Exception as e:
            logger.warning("Could not compile model: %s", e)

    processor, model = loaded_processor, loaded_model
    logger.info("Model initialized on %s", device)
    return processor, model
def preprocess_text(text: str, voice_preset: str):
    """Run the Bark processor on ``text`` with the given ``voice_preset``.

    The processor comes from ``initialize_model()``. No caching is applied
    here: every call invokes the processor again.

    Returns:
        Whatever the processor returns for this text and preset.
    """
    bark_processor = initialize_model()[0]
    encoded = bark_processor(text, voice_preset=voice_preset)
    return encoded
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
    """Synthesize ``text`` with Bark and return the path of the written WAV.

    Args:
        text: Text to speak. ``None`` and whitespace-only input are rejected.
        voice_preset: Bark speaker preset handed to the processor.
        history: Unused; kept so existing callers that pass it keep working.

    Returns:
        Path of the WAV file written inside the ``outputs`` directory.

    Raises:
        gr.Error: On any failure, carrying the underlying message so the
            Gradio UI can show it. The original exception is chained.
    """
    # Function-scope import: only the filename digest below needs it.
    import hashlib

    try:
        # Guard against None as well as blank input so the user sees the
        # intended message rather than an AttributeError text.
        if text is None or not text.strip():
            raise ValueError("Please enter some text")

        # Only the model is needed here; preprocess_text fetches the processor.
        _, model = initialize_model()
        device = next(model.parameters()).device

        inputs = preprocess_text(text, voice_preset)
        # Move every tensor-like entry to the model's device; leave the rest.
        inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}

        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=False,  # deterministic generation
                num_beams=1,  # no beam search
            )

        # The model runs in half precision on CUDA; cast to float32 so the
        # file is a standard 32-bit float WAV instead of a 16-bit float one.
        audio_array = audio_array.cpu().float().numpy().squeeze()
        sample_rate = model.generation_config.sample_rate

        os.makedirs("outputs", exist_ok=True)

        # Builtin hash() is salted per process and may be negative, so use a
        # stable digest of (preset, text) to name the file.
        digest = hashlib.sha256(
            f"{voice_preset}\0{text}".encode("utf-8", errors="replace")
        ).hexdigest()[:16]
        output_path = os.path.join("outputs", f"audio_{digest}.wav")

        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
        return output_path
    except Exception as e:
        logger.error("Error in text_to_speech: %s", e)
        raise gr.Error(str(e)) from e
# Hindi Bark speaker presets offered in the voice dropdown (speakers 1-5).
voice_presets = [f"v2/hi_speaker_{speaker}" for speaker in range(1, 6)]
# Build the Gradio UI: text box and voice dropdown on the left, audio player
# on the right, plus two clickable examples.
with gr.Blocks(analytics_enabled=False) as demo:
    gr.Markdown("# Bark Text-to-Speech (Optimized)")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text (Hindi or English)",
                placeholder="तुम बहुत अच्छे हो...",
                lines=3,
            )
            voice_input = gr.Dropdown(
                choices=voice_presets,
                value="v2/hi_speaker_2",
                label="Select Voice",
            )
            submit_btn = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")

    # Caching example outputs requires the function that produces them, so
    # fn must be passed alongside outputs when cache_examples=True.
    gr.Examples(
        examples=[
            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"],
        ],
        inputs=[text_input, voice_input],
        outputs=audio_output,
        fn=text_to_speech,
        cache_examples=True,
    )

    # Wire the button to the synthesis function.
    submit_btn.click(
        fn=text_to_speech,
        inputs=[text_input, voice_input],
        outputs=audio_output,
    )
# Script entry point: warm up the model, then serve the UI.
if __name__ == "__main__":
    # Load the model before the first request arrives.
    initialize_model()

    # Queueing is switched on through Blocks.queue(); launch() takes neither
    # cache_examples (that belongs to gr.Examples) nor, in current Gradio,
    # enable_queue, so passing them would raise a TypeError.
    demo.queue()
    demo.launch(show_error=True)  # surface handler errors in the browser