File size: 1,618 Bytes
4bc0171
4af88fe
4bc0171
 
b869493
4af88fe
4bc0171
4af88fe
4bc0171
 
 
b869493
fb2299d
 
 
b869493
fb2299d
b869493
 
 
 
 
 
 
 
 
 
4af88fe
b869493
 
 
 
 
 
 
4af88fe
b869493
 
 
 
4bc0171
 
 
 
fb2299d
 
4bc0171
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio

# Load the pre-trained wav2vec2 ASR model and its processor once at import
# time so every request reuses them (loading is slow and memory-heavy).
# "base-960h" is trained on 16 kHz LibriSpeech audio — inputs must be 16 kHz.
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

def speech_to_text(audio):
    """Transcribe an audio file to text with the module-level wav2vec2 model.

    Args:
        audio: Filesystem path to the recorded audio (Gradio ``type="filepath"``),
            or ``None`` when no recording was supplied.

    Returns:
        str: The transcription, or a human-readable error message on failure
        (the Gradio textbox output, so we never raise to the caller).
    """
    try:
        if audio is None:
            return "No audio file provided."

        # Load audio; returns (channels, samples) tensor and the native rate.
        waveform, rate = torchaudio.load(audio)

        # Downmix multi-channel audio to mono — the model expects one channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # wav2vec2-base-960h was trained on 16 kHz audio. Resample only when
        # necessary (the original resampled unconditionally, wasting work on
        # inputs that are already 16 kHz).
        target_rate = 16000
        if rate != target_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=rate, new_freq=target_rate
            )
            waveform = resampler(waveform)

        # Convert to the model's expected input format.
        inputs = processor(
            waveform.squeeze().numpy(),
            return_tensors="pt",
            sampling_rate=target_rate,
        )

        # Inference only — no gradients needed.
        with torch.no_grad():
            logits = model(**inputs).logits

        # Greedy CTC decode: argmax per frame, then collapse via the processor.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        return transcription
    except Exception as e:
        # Prefix so a failure is not mistaken for an actual transcription
        # in the output textbox (the original returned the bare message).
        return f"Error: {e}"

# Create the Gradio interface: a single audio input (delivered to the
# callback as a file path) mapped to a text output. live=True re-runs the
# callback automatically whenever the input changes, instead of waiting
# for a submit click.
iface = gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(type="filepath", label="Input Audio"),
    outputs=gr.Textbox(label="Transcription"),
    live=True,
    title="Speech to Text",
    description="Speak into your microphone and get the transcribed text."
)

# Launch the local web server (blocks until the app is stopped).
iface.launch()