import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio

# Load pre-trained model and processor
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()  # inference only; disables dropout


def speech_to_text(audio):
    try:
        if audio is None:
            return "No audio file provided."

        # Load the audio file from the path Gradio provides
        waveform, rate = torchaudio.load(audio)

        # Downmix to mono if the audio has multiple channels
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to the 16 kHz rate the model was trained on
        if rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
            waveform = resampler(waveform)

        # Convert the waveform into model inputs
        inputs = processor(
            waveform.squeeze().numpy(),
            return_tensors="pt",
            sampling_rate=16000,
        )

        # Perform inference
        with torch.no_grad():
            logits = model(**inputs).logits

        # Decode the highest-probability token IDs into text
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        return transcription
    except Exception as e:
        return f"Error during transcription: {e}"


# Create Gradio interface
iface = gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(type="filepath", label="Input Audio"),
    outputs=gr.Textbox(label="Transcription"),
    live=True,
    title="Speech to Text",
    description="Speak into your microphone and get the transcribed text.",
)

# Launch the interface
iface.launch()