import gradio as gr
from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile
import torch
import os
import hashlib
import numpy as np
import warnings

warnings.filterwarnings('ignore')

# Basic device setup
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Model initialization with basic settings
processor = AutoProcessor.from_pretrained(
    "suno/bark",
    trust_remote_code=True
)
model = BarkModel.from_pretrained(
    "suno/bark",
    torch_dtype=torch.float32,  # Using float32 for stability
    trust_remote_code=True
)

# Basic model optimization
model.to(DEVICE)
model.eval()

# Define cache directory in the allowed space
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)


def clean_text(text):
    """Clean and prepare text for processing."""
    if not isinstance(text, str):
        return ""
    return text.strip()


def get_cache_path(text: str, voice_preset: str) -> str:
    """Generate a unique cache path for a text/voice combination."""
    hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
    return os.path.join(CACHE_DIR, f"{hash_key}.wav")


def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
    """Convert text to speech using the Bark model."""
    try:
        # Clean and validate input
        text = clean_text(text)
        if not text:
            return None

        # Generate cache path and reuse previously generated audio if present
        cache_path = get_cache_path(text, voice_preset)
        if os.path.exists(cache_path):
            return cache_path

        # Process the text
        inputs = processor(text, voice_preset=voice_preset)

        # Move tensor inputs to the target device
        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}

        # Generate audio
        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                temperature=0.7
            )

        # Post-process the audio
        audio_array = audio_array.cpu().numpy().squeeze()
        audio_array = np.clip(audio_array, -1, 1)

        # Save the audio to the cache
        sample_rate = model.generation_config.sample_rate
        scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)

        return cache_path

    except Exception as e:
        print(f"Error in text_to_speech: {str(e)}")
        return None


# Voice presets
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5"
]

# Create Gradio interface
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(
            label="Enter text (Hindi or English)",
            placeholder="Type your text here...",
            lines=3
        ),
        gr.Dropdown(
            choices=voice_presets,
            value="v2/hi_speaker_2",
            label="Select Voice"
        )
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text."
)

# Launch the app
demo.launch()