import gradio as gr
from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile
import torch
import os
from typing import Optional
import numpy as np

# Choose the compute device once, at import time. When CUDA is available,
# cuDNN benchmark mode is switched on as well.
if torch.cuda.is_available():
    DEVICE = "cuda"
    torch.backends.cudnn.benchmark = True
else:
    DEVICE = "cpu"

# Load the Bark processor and model a single time at module level so that
# every call shares them. Weights are requested as float16 on CUDA and
# float32 on CPU.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained(
    "suno/bark",
    torch_dtype={"cuda": torch.float16, "cpu": torch.float32}[DEVICE],
)
model.to(DEVICE)
model.eval()  # Switch the model to evaluation mode

# Directory holding generated WAV files; created on import if it is missing.
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

def get_cache_path(text: str, voice_preset: str) -> str:
    """Return the cache file path for a (text, voice preset) pair.

    The file name is the MD5 hex digest of ``"<text>_<voice_preset>"``
    followed by ``.wav``, and the file lives inside ``CACHE_DIR``.
    """
    import hashlib

    key_bytes = "{}_{}".format(text, voice_preset).encode()
    digest = hashlib.md5(key_bytes).hexdigest()
    file_name = digest + ".wav"
    return os.path.join(CACHE_DIR, file_name)

@torch.inference_mode()  # More efficient than no_grad for inference
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2") -> Optional[str]:
    """Synthesize ``text`` with the Bark model and return the WAV file path.

    Results are cached on disk: the path comes from ``get_cache_path`` and,
    if a file already exists there, it is returned without running the model.

    Args:
        text: The text to speak. ``None`` or a blank/whitespace-only string
            yields ``None`` without invoking the model.
        voice_preset: Bark voice preset name forwarded to the processor.

    Returns:
        Path of the cached WAV file, or ``None`` when there is nothing to
        synthesize or when any step fails (the error is printed).
    """
    # Nothing to synthesize: skip the expensive model call entirely.
    if text is None or (isinstance(text, str) and not text.strip()):
        return None

    try:
        # Check cache first
        cache_path = get_cache_path(text, voice_preset)
        if os.path.exists(cache_path):
            return cache_path

        # Tokenize the text and attach the voice preset
        inputs = processor(text, voice_preset=voice_preset)

        # Move tensor inputs to the model's device; leave other values as-is
        inputs = {
            k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        # torch.cuda.amp.autocast() is deprecated in favour of torch.autocast.
        # On CPU the no_grad context is kept only as a harmless placeholder
        # (the function already runs under inference_mode).
        generation_ctx = (
            torch.autocast(device_type="cuda")
            if DEVICE == "cuda"
            else torch.no_grad()
        )
        with generation_ctx:
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=2.5,
                temperature=0.7,
            )

        # The model is loaded in float16 on CUDA, and float16 is not among the
        # sample formats documented for scipy.io.wavfile.write, so cast to
        # float32 before leaving torch.
        audio_array = audio_array.float().cpu().numpy().squeeze()

        # Clip samples into the [-1, 1] range expected for float WAV data
        audio_array = np.clip(audio_array, -1.0, 1.0)

        # Get sample rate from model config
        sample_rate = model.generation_config.sample_rate

        # Write to a temporary file and rename it into place. A failed or
        # interrupted write must never leave a partial file at cache_path,
        # because the existence check above would then serve it as a hit.
        tmp_path = f"{cache_path}.{os.getpid()}.tmp"
        try:
            scipy.io.wavfile.write(tmp_path, rate=sample_rate, data=audio_array)
            os.replace(tmp_path, cache_path)
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

        return cache_path

    except Exception as e:
        # Top-level boundary for the UI callback: report and return no audio.
        print(f"Error generating audio: {str(e)}")
        return None

# Voice presets offered in the UI: Hindi speakers 1 through 5.
voice_presets = [f"v2/hi_speaker_{index}" for index in range(1, 6)]

# Build the Gradio UI: a text box and a voice selector are passed to
# text_to_speech, and the file path it returns is shown in an audio component.
text_input = gr.Textbox(label="Enter text (Hindi or English)")
voice_input = gr.Dropdown(
    choices=voice_presets,
    value="v2/hi_speaker_2",
    label="Select Voice",
)
speech_output = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=text_to_speech,
    inputs=[text_input, voice_input],
    outputs=speech_output,
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
    # NOTE(review): cache_examples is enabled but no `examples` are passed
    # here; confirm whether this setting has any effect.
    cache_examples=True,
)

# Start the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()