"""Gradio app that converts text to speech with the Suno Bark model.

Generated audio is cached on disk as WAV files keyed by a hash of the
input text and the selected voice preset, so repeated requests are served
without running the model again.
"""

import contextlib
import hashlib
import os
from typing import Optional

import gradio as gr
import numpy as np
import scipy.io.wavfile
import torch
from transformers import AutoProcessor, BarkModel

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# The processor and model are loaded once at import time and shared by all
# requests. Half precision is used on CUDA, full precision on CPU.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained(
    "suno/bark",
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
model.to(DEVICE)

if DEVICE == "cuda":
    # Let cuDNN benchmark and pick kernels for the observed input shapes.
    torch.backends.cudnn.benchmark = True

model.eval()  # Inference only: switch off training-time behaviour.

# Directory holding previously generated audio files.
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)


def get_cache_path(text: str, voice_preset: str) -> str:
    """Return the cache file path for a text / voice preset pair.

    The file name is the MD5 hex digest of ``"{text}_{voice_preset}"``.
    MD5 is used only as a cache key here, not for security.

    Args:
        text: The text to be spoken.
        voice_preset: The Bark voice preset identifier.

    Returns:
        Path to a ``.wav`` file inside ``CACHE_DIR``.
    """
    hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
    return os.path.join(CACHE_DIR, f"{hash_key}.wav")


def _to_device(value):
    """Move ``value`` to ``DEVICE`` if it exposes ``.to``; else return it as-is.

    NOTE(review): the processor output may contain nested containers (for
    example the voice preset's history prompt) that are not plain tensors
    but still provide ``.to`` -- confirm against the installed
    ``transformers`` version.
    """
    return value.to(DEVICE) if hasattr(value, "to") else value


@torch.inference_mode()  # Disables autograd tracking for the whole call.
def text_to_speech(
    text: str, voice_preset: str = "v2/hi_speaker_2"
) -> Optional[str]:
    """Synthesize speech for ``text`` and return the path to a WAV file.

    A cached file is returned when one already exists for the same text and
    voice preset. Otherwise the audio is generated, written to the cache and
    the new path is returned.

    Args:
        text: The text to convert to speech.
        voice_preset: The Bark voice preset identifier.

    Returns:
        Path to the WAV file, or ``None`` when ``text`` is empty or
        whitespace-only, or when generation fails (the error is printed).
    """
    # Nothing to synthesize; avoid running the model and caching the result.
    if not text or not text.strip():
        return None

    try:
        cache_path = get_cache_path(text, voice_preset)
        if os.path.exists(cache_path):
            return cache_path

        inputs = processor(text, voice_preset=voice_preset)
        inputs = {key: _to_device(value) for key, value in inputs.items()}

        # Autocast only applies on CUDA. On CPU no extra context is needed
        # because inference_mode already disables gradient tracking.
        if DEVICE == "cuda":
            generation_ctx = torch.autocast(device_type="cuda")
        else:
            generation_ctx = contextlib.nullcontext()

        with generation_ctx:
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=2.5,
                temperature=0.7,
            )

        # On CUDA the model runs in float16. Half-precision samples are not
        # a standard WAV sample format, so convert to float32 before saving.
        audio_array = audio_array.cpu().numpy().squeeze().astype(np.float32)

        # Clip samples to the valid floating-point audio range [-1, 1].
        audio_array = np.clip(audio_array, -1, 1)

        sample_rate = model.generation_config.sample_rate
        scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)
        return cache_path
    except Exception as e:
        # Top-level boundary for the UI callback: report and return None
        # rather than propagating the exception to the caller.
        print(f"Error generating audio: {e}")
        return None


# Voice presets offered in the UI dropdown.
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5",
]

# Gradio interface: a text box and a voice dropdown in, an audio player out.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter text (Hindi or English)"),
        gr.Dropdown(
            choices=voice_presets,
            value="v2/hi_speaker_2",
            label="Select Voice",
        ),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
    cache_examples=True,
)

if __name__ == "__main__":
    demo.launch()