File size: 5,894 Bytes
e2e5380
 
 
28e403b
 
e2e5380
 
 
 
28e403b
e2e5380
 
 
28e403b
e2e5380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e403b
e2e5380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e403b
e2e5380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e403b
 
 
 
 
 
 
 
 
 
e2e5380
 
 
9ce6d3a
e2e5380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e403b
e2e5380
28e403b
e2e5380
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import hashlib
import logging
import os
import sys
import warnings
from functools import lru_cache
from typing import Optional

import gradio as gr
import scipy.io.wavfile
import torch

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress warnings
warnings.filterwarnings('ignore')

def check_dependencies():
    """Return True if the transformers Bark classes are importable.

    Logs the import failure so the caller can print install
    instructions before exiting.
    """
    try:
        from transformers import AutoProcessor, BarkModel  # noqa: F401
    except ImportError as e:
        logger.error(f"Error importing required modules: {str(e)}")
        return False
    return True

# Fail fast at import time: without transformers there is nothing to run,
# so print install instructions and exit before building any UI.
if not check_dependencies():
    logger.error("Required dependencies not found. Please install them using:")
    logger.error("pip install -r requirements.txt")
    sys.exit(1)

# Safe to import here -- check_dependencies() just proved these resolve.
from transformers import AutoProcessor, BarkModel

# Global variables for model and processor
# Lazily populated by initialize_model(); both stay None until first use.
processor = None
model = None

def initialize_model():
    """Lazily load the Bark processor and model (module-level singleton).

    The first call loads ``suno/bark``, moves the model to GPU in half
    precision when CUDA is available, switches to eval mode, disables
    autograd globally, and attempts ``torch.compile`` as a best-effort
    optimization.  Later calls return the cached objects unchanged.

    Returns:
        tuple: ``(processor, model)`` ready for inference.
    """
    global processor, model

    # Already initialized -- return the cached pair immediately.
    if processor is not None and model is not None:
        return processor, model

    logger.info("Initializing model and processor...")

    processor = AutoProcessor.from_pretrained("suno/bark")
    model = BarkModel.from_pretrained("suno/bark")

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    if use_cuda:
        # fp16 roughly halves GPU memory and speeds up inference.
        model = model.half()
    model = model.to(device)

    # Inference only: eval mode and no autograd bookkeeping anywhere.
    model.eval()
    torch.set_grad_enabled(False)

    # torch.compile exists on PyTorch 2.0+; treat failure as non-fatal.
    if hasattr(torch, 'compile'):
        try:
            model = torch.compile(model)
            logger.info("Model compiled successfully")
        except Exception as e:
            logger.warning(f"Could not compile model: {e}")

    logger.info(f"Model initialized on {device}")
    return processor, model

# Memoize tokenization so repeated (text, voice) pairs skip the processor.
@lru_cache(maxsize=128)
def preprocess_text(text: str, voice_preset: str):
    """Tokenize *text* for the given Bark voice preset (results cached)."""
    proc, _ = initialize_model()
    return proc(text, voice_preset=voice_preset)

def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
    """Generate speech audio for *text* using a Bark voice preset.

    Args:
        text: Input text (Hindi or English); must be non-empty.
        voice_preset: Bark speaker preset identifier.
        history: Unused; kept for Gradio callback compatibility.

    Returns:
        str: Path of the written ``.wav`` file under ``outputs/``.

    Raises:
        gr.Error: Wraps any underlying failure so Gradio shows it in the UI.
    """
    try:
        if not text.strip():
            raise ValueError("Please enter some text")

        # Initialize model if not already initialized
        processor, model = initialize_model()

        # Run on whatever device the model actually lives on.
        device = next(model.parameters()).device

        # Preprocess text (cached via lru_cache)
        inputs = preprocess_text(text, voice_preset)

        # Move tensor inputs to the model's device; pass others through.
        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}

        # inference_mode is slightly faster than no_grad for pure inference.
        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=False,  # Deterministic generation is faster
                num_beams=1,      # No beam search for faster generation
            )

        # BUG FIX: on GPU the model runs in fp16 and scipy's wavfile writer
        # does not accept float16 data -- upcast to float32 before saving.
        audio_array = audio_array.float().cpu().numpy().squeeze()

        # Get sample rate from model config
        sample_rate = model.generation_config.sample_rate

        # Create output directory if it doesn't exist
        os.makedirs("outputs", exist_ok=True)

        # BUG FIX: built-in hash() is salted per process (PYTHONHASHSEED),
        # so filenames changed every run; use a stable content digest.
        digest = hashlib.sha1(f"{text}|{voice_preset}".encode("utf-8")).hexdigest()[:16]
        output_path = os.path.join("outputs", f"audio_{digest}.wav")

        # Save audio file
        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)

        return output_path

    except Exception as e:
        logger.error(f"Error in text_to_speech: {str(e)}")
        raise gr.Error(str(e))

# Define available voice presets
# Hindi speaker presets shipped with suno/bark; shown in the UI dropdown.
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5"
]

# Create Gradio interface: text + voice inputs on the left, audio output
# on the right, with cached examples and a generate button.
with gr.Blocks(analytics_enabled=False) as demo:
    gr.Markdown("# Bark Text-to-Speech (Optimized)")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text (Hindi or English)",
                placeholder="तुम बहुत अच्छे हो...",
                lines=3
            )
            voice_input = gr.Dropdown(
                choices=voice_presets,
                value="v2/hi_speaker_2",
                label="Select Voice"
            )
            submit_btn = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")

    # BUG FIX: gr.Examples requires `fn` when `cache_examples=True` (and
    # when `outputs` is given); without it Gradio raises at startup.
    gr.Examples(
        examples=[
            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
        ],
        fn=text_to_speech,
        inputs=[text_input, voice_input],
        outputs=audio_output,
        cache_examples=True  # Cache example outputs
    )

    # Connect components: button click runs TTS and fills the audio player.
    submit_btn.click(
        fn=text_to_speech,
        inputs=[text_input, voice_input],
        outputs=audio_output
    )

# Launch the app with optimized settings
if __name__ == "__main__":
    # Warm the model at startup so the first request isn't slow.
    initialize_model()

    # BUG FIX: `enable_queue` and `cache_examples` are not valid launch()
    # keyword arguments in current Gradio (launch() raises TypeError);
    # request queueing is enabled via demo.queue() instead.
    demo.queue()
    demo.launch(
        show_error=True,  # Show errors for debugging
    )