harshananddev committed on
Commit
4bc0171
·
verified ·
1 Parent(s): f646a77

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
3
+ import torch
4
+ import torchaudio
5
+
6
# Checkpoint identifier shared by the tokenizer and the CTC model below.
model_name = "facebook/wav2vec2-base-960h"

# Load the pre-trained tokenizer first, then the pre-trained model, both from
# the same checkpoint.
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (Wav2Vec2Tokenizer, Wav2Vec2ForCTC)
)
10
+
11
def speech_to_text(audio):
    """Transcribe a recorded audio clip with the module-level wav2vec2 model.

    The clip is loaded with torchaudio, averaged down to a single channel,
    resampled to 16 kHz, run through ``model`` and decoded by taking the
    highest-scoring token at every step.

    Args:
        audio: The recording supplied by the audio input. Accepted forms are
            a file-like object exposing ``.name`` (the path of the recording),
            a filesystem path (``str`` or ``os.PathLike``), or ``None`` when
            no recording is available.

    Returns:
        The transcription as a string, or an empty string when there is
        nothing to transcribe.
    """
    import os

    target_rate = 16000

    # The callback can be invoked without a recording (presumably when the
    # clip is cleared while the interface runs live); `None.name` would raise.
    if audio is None:
        return ""

    # Depending on the Gradio version the value is either a temp-file object
    # or a plain path, so accept both instead of assuming `.name` exists.
    if isinstance(audio, (str, os.PathLike)):
        audio_path = audio
    else:
        audio_path = audio.name

    waveform, rate = torchaudio.load(audio_path)

    # Nothing was captured: skip inference rather than feed an empty tensor.
    if waveform.numel() == 0:
        return ""

    # Mix multi-channel audio down to mono by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Only build a resampler when the rate actually differs from the target.
    if rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=rate, new_freq=target_rate
        )
        waveform = resampler(waveform)

    # squeeze(0) removes only the channel axis; a bare squeeze() would also
    # collapse a one-sample clip into a 0-d array.
    inputs = tokenizer(
        waveform.squeeze(0).numpy(),
        return_tensors="pt",
        sampling_rate=target_rate,
    )

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: pick the most likely token id at each step.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0]

    return transcription
35
+
36
# Microphone input whose recording is handed to the callback as a file.
microphone_input = gr.inputs.Audio(source="microphone", type="file")

# Wire the microphone input to `speech_to_text` and show the result as text.
iface = gr.Interface(
    fn=speech_to_text,
    inputs=microphone_input,
    outputs="text",
    title="Speech to Text",
    description="Speak into your microphone and get the transcribed text.",
    live=True,
)

# Start serving the interface.
iface.launch()