import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutomaticSpeechRecognitionPipeline,
)

# Fine-tuned Nepali Whisper checkpoints
model_urls = [
    "kiranpantha/whisper-tiny-ne",
    "kiranpantha/whisper-base-ne",
    "kiranpantha/whisper-small-np",
    "kiranpantha/whisper-medium-nepali",
    "kiranpantha/whisper-large-v3-nepali",
    "kiranpantha/whisper-large-v3-turbo-nepali",
]

# Cache of (processor, model, device) tuples, keyed by model name
model_cache = {}


def load_model(model_name):
    """Loads and caches the processor and model, moving the model to GPU when available."""
    if model_name not in model_cache:
        # The fine-tuned repos reuse the upstream OpenAI processors, so map e.g.
        # "kiranpantha/whisper-tiny-ne" -> "openai/whisper-tiny". "-nepali" must be
        # stripped before "-ne", otherwise "whisper-medium-nepali" would become
        # "whisper-mediumpali".
        processor_name = (
            model_name.replace("kiranpantha", "openai")
            .replace("-nepali", "")
            .replace("-ne", "")
            .replace("-np", "")
        )
        processor = AutoProcessor.from_pretrained(processor_name)
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)
        model_cache[model_name] = (processor, model, device)
    return model_cache[model_name]


def create_pipeline(model_name):
    """Creates an ASR pipeline configured for Nepali transcription."""
    processor, model, device = load_model(model_name)
    return AutomaticSpeechRecognitionPipeline(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        device=device,
        # Whisper accepts full language names; "nepali" maps to the "ne" token.
        generate_kwargs={"task": "transcribe", "language": "nepali"},
    )


def process_audio(model_url, audio_chunk):
    """Transcribes one audio chunk, returning the text or an error message."""
    try:
        # Gradio's numpy audio format is (sample_rate, array), not (array, sample_rate).
        sample_rate, audio_array = audio_chunk

        # Whisper expects float32 waveforms in [-1, 1]; Gradio delivers integer PCM.
        if np.issubdtype(audio_array.dtype, np.integer):
            audio_array = audio_array.astype(np.float32) / np.iinfo(audio_array.dtype).max
        else:
            audio_array = audio_array.astype(np.float32)

        # Downmix stereo (samples, channels) to mono.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Whisper models are trained on 16 kHz audio; resample if needed.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            audio_array = resampler(torch.from_numpy(audio_array)).numpy()

        asr_pipeline = create_pipeline(model_url)
        return asr_pipeline({"raw": audio_array, "sampling_rate": 16000})["text"]
    except Exception as e:
        return f"Error: {e}"


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Nepali Speech Recognition with Whisper Models")
    model_dropdown = gr.Dropdown(
        choices=model_urls,
        label="Select Model",
        value=model_urls[0],
    )
    audio_input = gr.Audio(
        sources=["microphone"],  # streaming input requires the microphone source
        type="numpy",
        label="Input Audio",
        streaming=True,
    )
    output_text = gr.Textbox(label="Transcription")
    # Each streamed chunk is transcribed independently; no state carries across chunks.
    audio_input.stream(
        fn=process_audio,
        inputs=[model_dropdown, audio_input],
        outputs=output_text,
    )

demo.launch()
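
# --- Optional offline sanity check (a minimal sketch, not part of the app) ---
# Feeds one second of a synthetic 440 Hz stereo tone at 44.1 kHz through
# process_audio, exercising the int16 -> float32, stereo -> mono, and resampling
# paths without a microphone. Assumes the selected checkpoint can be downloaded;
# since a pure tone contains no speech, the transcription itself is expected to
# be empty or meaningless. Uncomment and run instead of demo.launch() to try it.
#
# sr = 44100
# t = np.linspace(0, 1, sr, endpoint=False)
# tone = (0.1 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
# stereo = np.stack([tone, tone], axis=1)  # shape (samples, channels)
# print(process_audio(model_urls[0], (sr, stereo)))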