ylacombe committed on
Commit
875286c
·
1 Parent(s): 14e7f92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -8
app.py CHANGED
@@ -14,6 +14,10 @@ import os
14
  import gradio as gr
15
  import uuid
16
  import io
 
 
 
 
17
 
18
 
19
  set_seed(0)
@@ -35,6 +39,7 @@ speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys()
35
 
36
  SAMPLE_RATE = 24_000
37
 
 
38
 
39
  # import model
40
  if device == "cpu":
@@ -55,18 +60,26 @@ def generate_audio(text, voice_preset = None, lag = 0):
55
  inputs = processor(sentences, voice_preset=voice_preset).to(device)
56
  # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
57
 
58
- waveform = bark.generate(
59
- **inputs, coarse_temperature = 0.8, semantic_temperature = 0.5
60
  )
61
-
62
- return (SAMPLE_RATE, waveform.squeeze().cpu().numpy())
63
-
 
 
 
 
 
 
 
 
64
 
65
 
66
  # Gradio blocks demo
67
  with gr.Blocks() as demo_blocks:
68
  gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
69
- gr.HTML("""<h3 style="text-align:center;">📢Audio Streaming powered by Gradio 🦾! </h3>""")
70
  with gr.Group():
71
  with gr.Row():
72
  inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
@@ -81,7 +94,9 @@ with gr.Blocks() as demo_blocks:
81
  btn = gr.Button("Bark with Vocos TTS")
82
 
83
  with gr.Row():
84
- out_audio = gr.Audio(type="numpy", autoplay=True)
85
- btn.click(generate_audio, [inp_text, dd], out_audio)
 
 
86
 
87
  demo_blocks.queue().launch(debug=True)
 
14
  import gradio as gr
15
  import uuid
16
  import io
17
+ from vocos import Vocos
18
+
19
+ import os
20
+ os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
21
 
22
 
23
  set_seed(0)
 
39
 
40
  SAMPLE_RATE = 24_000
41
 
42
+ vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2").to(device)
43
 
44
  # import model
45
  if device == "cpu":
 
60
  inputs = processor(sentences, voice_preset=voice_preset).to(device)
61
  # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
62
 
63
+ fine_output = bark.generate(
64
+ **inputs, coarse_temperature = 0.8, temperature = 0.5, do_sample=True
65
  )
66
+
67
+ print("Fine tokens generated")
68
+
69
+ with torch.no_grad():
70
+
71
+ encodec_waveform = bark.codec_decode(fine_output)
72
+
73
+ features = vocos.codes_to_features(fine_output.transpose(0,1))
74
+ vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
75
+
76
+ return (SAMPLE_RATE, encodec_waveform.cpu().squeeze().numpy()), (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
77
 
78
 
79
  # Gradio blocks demo
80
  with gr.Blocks() as demo_blocks:
81
  gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
82
+ gr.HTML("""<h3 style="text-align:center;">📢Vocos-enhanced TTS 🦾! </h3>""")
83
  with gr.Group():
84
  with gr.Row():
85
  inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
 
94
  btn = gr.Button("Bark with Vocos TTS")
95
 
96
  with gr.Row():
97
+ out_audio_encodec = gr.Audio(type="numpy", autoplay=False, label="original output", show_label=True)
98
+ out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)
99
+
100
+ btn.click(generate_audio, [inp_text, dd], [out_audio_encodec, out_audio_vocos])
101
 
102
  demo_blocks.queue().launch(debug=True)