harshananddev committed
Commit 4af88fe · verified · 1 Parent(s): b869493

Update app.py

Files changed (1): app.py +7 -8
app.py CHANGED
@@ -1,12 +1,11 @@
 import gradio as gr
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torch
 import torchaudio
-
-# Install the necessary packages
 import subprocess
 import sys
 
+# Function to install required packages
 def install(package):
     subprocess.check_call([sys.executable, "-m", "pip", "install", package])
 
@@ -15,9 +14,9 @@ install("torch")
 install("torchaudio")
 install("gradio")
 
-# Load pre-trained model and tokenizer
+# Load pre-trained model and processor
 model_name = "facebook/wav2vec2-base-960h"
-tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
+processor = Wav2Vec2Processor.from_pretrained(model_name)
 model = Wav2Vec2ForCTC.from_pretrained(model_name)
 
 def speech_to_text(audio):
@@ -34,7 +33,7 @@ def speech_to_text(audio):
         waveform = resampler(waveform)
 
         # Tokenize the waveform
-        inputs = tokenizer(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000)
+        inputs = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000)
 
         # Perform inference
         with torch.no_grad():
@@ -42,7 +41,7 @@ def speech_to_text(audio):
 
         # Decode the output
         predicted_ids = torch.argmax(logits, dim=-1)
-        transcription = tokenizer.batch_decode(predicted_ids)[0]
+        transcription = processor.batch_decode(predicted_ids)[0]
 
         return transcription
     except Exception as e:
@@ -51,7 +50,7 @@ def speech_to_text(audio):
 # Create Gradio interface
 iface = gr.Interface(
     fn=speech_to_text,
-    inputs=gr.inputs.Audio(source="microphone", type="file"),
+    inputs=gr.Audio(source="microphone", type="file"),
     outputs="text",
     live=True,
     title="Speech to Text",