import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutomaticSpeechRecognitionPipeline,
)

# Fine-tuned Nepali Whisper checkpoints
model_urls = [
    "kiranpantha/whisper-tiny-ne",
    "kiranpantha/whisper-base-ne",
    "kiranpantha/whisper-small-np",
    "kiranpantha/whisper-medium-nepali",
    "kiranpantha/whisper-large-v3-nepali",
    "kiranpantha/whisper-large-v3-turbo-nepali",
]

# Cache of (processor, model, device) tuples, keyed by model name
model_cache = {}


def load_model(model_name):
    """Loads and caches the processor and model, moving the model to GPU when available."""
    if model_name not in model_cache:
        # The fine-tuned repos reuse the upstream OpenAI processors, so map e.g.
        # "kiranpantha/whisper-tiny-ne" -> "openai/whisper-tiny". "-nepali" must be
        # stripped before "-ne", otherwise "whisper-medium-nepali" would become
        # "whisper-mediumpali".
        processor_name = (
            model_name.replace("kiranpantha", "openai")
            .replace("-nepali", "")
            .replace("-ne", "")
            .replace("-np", "")
        )
        processor = AutoProcessor.from_pretrained(processor_name)
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)
        model_cache[model_name] = (processor, model, device)
    return model_cache[model_name]


def create_pipeline(model_name):
    """Creates an ASR pipeline configured for Nepali transcription."""
    processor, model, device = load_model(model_name)
    return AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        device=device,
        # Whisper accepts full language names; "nepali" maps to the "ne" token.
        generate_kwargs={"task": "transcribe", "language": "nepali"},
    )


def process_audio(model_url, audio_chunk):
    """Transcribes one audio chunk, returning the text or an error message."""
    try:
        # Gradio's numpy audio format is (sample_rate, array), not (array, sample_rate).
        sample_rate, audio_array = audio_chunk

        # Whisper expects float32 waveforms in [-1, 1]; Gradio delivers integer PCM.
        if np.issubdtype(audio_array.dtype, np.integer):
            audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
        else:
            audio_array = audio_array.astype(np.float32)

        # Downmix stereo (samples, channels) to mono.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Whisper models are trained on 16 kHz audio; resample if needed.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            audio_array = resampler(torch.from_numpy(audio_array)).numpy()

        asr_pipeline = create_pipeline(model_url)
        return asr_pipeline({"raw": audio_array, "sampling_rate": 16000})["text"]
    except Exception as e:
        return f"Error: {e}"


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Nepali Speech Recognition with Whisper Models")
    model_dropdown = gr.Dropdown(
        choices=model_urls,
        label="Select Model",
        value=model_urls[0],
    )
    audio_input = gr.Audio(
        sources=["microphone"],  # streaming input requires the microphone source
        type="numpy",
        label="Input Audio",
        streaming=True,
    )
    output_text = gr.Textbox(label="Transcription")
    # Each streamed chunk is transcribed independently; no state carries across chunks.
    audio_input.stream(
        fn=process_audio,
        inputs=[model_dropdown, audio_input],
        outputs=output_text,
    )

demo.launch()
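
# --- Optional offline sanity check (a minimal sketch, not part of the app) ---
# Feeds one second of a synthetic 440 Hz stereo tone at 44.1 kHz through
# process_audio, exercising the int16 -> float32, stereo -> mono, and resampling
# paths without a microphone. Assumes the selected checkpoint can be downloaded;
# since a pure tone contains no speech, the transcription itself is expected to
# be empty or meaningless. Uncomment and run instead of demo.launch() to try it.
#
# sr = 44100
# t = np.linspace(0, 1, sr, endpoint=False)
# tone = (0.1 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
# stereo = np.stack([tone, tone], axis=1)  # shape (samples, channels)
# print(process_audio(model_urls[0], (sr, stereo)))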