"""Gradio front end for text-to-speech generation with the Bark model."""

import hashlib
import logging
import os
import sys
import warnings
from functools import lru_cache
from typing import Optional

import gradio as gr
import scipy.io.wavfile
import torch

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress warnings
warnings.filterwarnings('ignore')


def check_dependencies():
    """Return True if the Bark classes can be imported from transformers.

    On ImportError the failure is logged and False is returned.
    """
    try:
        from transformers import AutoProcessor, BarkModel  # noqa: F401
    except ImportError as e:
        logger.error("Error importing required modules: %s", e)
        return False
    return True


if not check_dependencies():
    logger.error("Required dependencies not found. Please install them using:")
    logger.error("pip install -r requirements.txt")
    sys.exit(1)

from transformers import AutoProcessor, BarkModel  # noqa: E402

# Global variables for model and processor (populated by initialize_model)
processor = None
model = None


def initialize_model():
    """Load the Bark processor and model once and return them.

    The first call loads "suno/bark", moves the model to CUDA in half
    precision when CUDA is available (otherwise it stays on the CPU), puts it
    in eval mode, disables gradient tracking globally, and attempts
    ``torch.compile`` when that function exists. Later calls return the
    already-loaded module-level objects.

    Returns:
        A ``(processor, model)`` tuple.
    """
    global processor, model

    # Initialize processor and model only once
    if processor is None or model is None:
        logger.info("Initializing model and processor...")

        processor = AutoProcessor.from_pretrained("suno/bark")
        model = BarkModel.from_pretrained("suno/bark")

        # Move model to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cuda":
            # Use half-precision floating point numbers on the GPU
            model = model.half()
        model = model.to(device)

        # Inference only: eval mode, and gradients off for the whole process
        model.eval()
        torch.set_grad_enabled(False)

        # Optional: use torch.compile when this PyTorch build provides it
        if hasattr(torch, 'compile'):
            try:
                model = torch.compile(model)
                logger.info("Model compiled successfully")
            except Exception as e:
                # Best effort: fall back to the uncompiled model
                logger.warning(f"Could not compile model: {e}")

        logger.info(f"Model initialized on {device}")

    return processor, model


# Cache the text preprocessing step
@lru_cache(maxsize=128)
def preprocess_text(text: str, voice_preset: str):
    """Run the Bark processor on ``text`` with ``voice_preset``.

    Results are memoized (up to 128 distinct argument pairs), so callers must
    not mutate the returned object in place.
    """
    proc, _ = initialize_model()
    return proc(text, voice_preset=voice_preset)


def _output_path(text: str, voice_preset: str) -> str:
    """Return a stable path under ``outputs/`` for this text/voice pair."""
    # hashlib instead of hash(): the built-in string hash is salted per
    # process and may be negative, so it cannot give reproducible file names.
    key = f"{voice_preset}\x00{text}".encode("utf-8")
    digest = hashlib.sha256(key).hexdigest()[:16]
    return os.path.join("outputs", f"audio_{digest}.wav")


def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
    """Generate speech for ``text`` and return the path of the WAV file.

    Args:
        text: Text to speak; must contain at least one non-whitespace
            character.
        voice_preset: Bark voice preset name passed to the processor.
        history: Unused; kept so existing callers keep working.

    Returns:
        Path of the WAV file written below the ``outputs`` directory.

    Raises:
        gr.Error: On any failure (including empty input); the message is the
            text of the underlying exception.
    """
    try:
        # Treat None the same as an empty string
        if not text or not text.strip():
            raise ValueError("Please enter some text")

        # Initialize model if not already initialized
        _, tts_model = initialize_model()
        device = next(tts_model.parameters()).device

        # Preprocess text (cached), then build a new dict with every value
        # that supports .to() moved to the model's device. The cached object
        # itself is left untouched.
        encoded = preprocess_text(text, voice_preset)
        inputs = {
            key: value.to(device) if hasattr(value, 'to') else value
            for key, value in encoded.items()
        }

        with torch.inference_mode():
            audio = tts_model.generate(
                **inputs,
                do_sample=False,  # Greedy decoding
                num_beams=1,  # No beam search
            )

        # Cast to float32 before leaving torch: on CUDA the model runs in
        # half precision and scipy.io.wavfile.write rejects float16 arrays.
        audio_array = audio.cpu().float().numpy().squeeze()

        # Get sample rate from model config
        sample_rate = tts_model.generation_config.sample_rate

        # Create output directory if it doesn't exist
        os.makedirs("outputs", exist_ok=True)
        output_path = _output_path(text, voice_preset)

        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
        return output_path

    except Exception as exc:
        logger.exception("Error in text_to_speech: %s", exc)
        raise gr.Error(str(exc)) from exc


# Define available voice presets
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5",
]

# Create Gradio interface
with gr.Blocks(analytics_enabled=False) as demo:
    gr.Markdown("# Bark Text-to-Speech (Optimized)")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text (Hindi or English)",
                placeholder="तुम बहुत अच्छे हो...",
                lines=3,
            )
            voice_input = gr.Dropdown(
                choices=voice_presets,
                value="v2/hi_speaker_2",
                label="Select Voice",
            )
            submit_btn = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")

    # Add examples. Caching example outputs requires the function that
    # produces them, so fn is passed explicitly.
    gr.Examples(
        examples=[
            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"],
        ],
        inputs=[text_input, voice_input],
        outputs=audio_output,
        fn=text_to_speech,
        cache_examples=True,  # Cache example outputs
    )

    # Connect components
    submit_btn.click(
        fn=text_to_speech,
        inputs=[text_input, voice_input],
        outputs=audio_output,
    )

# Launch the app
if __name__ == "__main__":
    # Initialize model at startup
    initialize_model()

    # Queueing is enabled through queue(); launch() has no cache_examples
    # parameter, and example caching is configured on gr.Examples above.
    demo.queue()
    demo.launch(
        show_error=True,  # Show errors for debugging
    )