import gradio as gr
from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile
import torch
import os
import hashlib
import numpy as np
import warnings

warnings.filterwarnings('ignore')

# Basic device setup
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Model initialization with basic settings
processor = AutoProcessor.from_pretrained(
    "suno/bark",
    trust_remote_code=True
)
model = BarkModel.from_pretrained(
    "suno/bark",
    torch_dtype=torch.float32,  # Using float32 for stability
    trust_remote_code=True
)

# Basic model optimization
model.to(DEVICE)
model.eval()

# Define cache directory in the allowed space
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)


def clean_text(text):
    """Clean and prepare text for processing."""
    if not isinstance(text, str):
        return ""
    return text.strip()


def get_cache_path(text: str, voice_preset: str) -> str:
    """Generate a unique cache path for a text/voice combination."""
    hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
    return os.path.join(CACHE_DIR, f"{hash_key}.wav")


def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
    """Convert text to speech using the Bark model."""
    try:
        # Clean and validate input
        text = clean_text(text)
        if not text:
            return None

        # Generate cache path and reuse previously generated audio if present
        cache_path = get_cache_path(text, voice_preset)
        if os.path.exists(cache_path):
            return cache_path

        # Process the text
        inputs = processor(text, voice_preset=voice_preset)

        # Move tensor inputs to the target device
        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}

        # Generate audio
        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                temperature=0.7
            )

        # Post-process the audio
        audio_array = audio_array.cpu().numpy().squeeze()
        audio_array = np.clip(audio_array, -1, 1)

        # Save the audio to the cache
        sample_rate = model.generation_config.sample_rate
        scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)

        return cache_path

    except Exception as e:
        print(f"Error in text_to_speech: {str(e)}")
        return None


# Voice presets
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5"
]

# Create Gradio interface
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(
            label="Enter text (Hindi or English)",
            placeholder="Type your text here...",
            lines=3
        ),
        gr.Dropdown(
            choices=voice_presets,
            value="v2/hi_speaker_2",
            label="Select Voice"
        )
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text."
)

# Launch the app
demo.launch()