Spaces:

hf-audio
/

vocos-bark

Runtime error

App Files Community

ylacombe committed on Oct 13, 2023

Commit

875286c

1 Parent(s): 14e7f92

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -8

app.py CHANGED Viewed

@@ -14,6 +14,10 @@ import os
 import gradio as gr
 import uuid
 import io
 set_seed(0)
@@ -35,6 +39,7 @@ speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys()
 SAMPLE_RATE = 24_000
 # import model
 if device == "cpu":
@@ -55,18 +60,26 @@ def generate_audio(text, voice_preset = None, lag = 0):
     inputs = processor(sentences, voice_preset=voice_preset).to(device)
     # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
-    waveform = bark.generate(
-        **inputs, coarse_temperature = 0.8, semantic_temperature = 0.5
     )
-    return (SAMPLE_RATE, waveform.squeeze().cpu().numpy())
 # Gradio blocks demo
 with gr.Blocks() as demo_blocks:
     gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
-    gr.HTML("""<h3 style="text-align:center;">📢Audio Streaming powered by Gradio 🦾! </h3>""")
     with gr.Group():
       with gr.Row():
         inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
@@ -81,7 +94,9 @@ with gr.Blocks() as demo_blocks:
         btn = gr.Button("Bark with Vocos TTS")
     with gr.Row():
-        out_audio = gr.Audio(type="numpy", autoplay=True)
-        btn.click(generate_audio, [inp_text, dd], out_audio)
 demo_blocks.queue().launch(debug=True)

 import gradio as gr
 import uuid
 import io
+from vocos import Vocos
+import os
+os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
 set_seed(0)
 SAMPLE_RATE = 24_000
+vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2").to(device)
 # import model
 if device == "cpu":
     inputs = processor(sentences, voice_preset=voice_preset).to(device)
     # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
+    fine_output = bark.generate(
+        **inputs, coarse_temperature = 0.8, temperature = 0.5, do_sample=True
     )
+    print("Fine tokens generated")
+    with torch.no_grad():
+        encodec_waveform = bark.codec_decode(fine_output)
+        features = vocos.codes_to_features(fine_output.transpose(0,1))
+        vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
+    return (SAMPLE_RATE, encodec_waveform.cpu().squeeze().numpy()), (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
 # Gradio blocks demo
 with gr.Blocks() as demo_blocks:
     gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
+    gr.HTML("""<h3 style="text-align:center;">📢Vocos-enhanced TTS 🦾! </h3>""")
     with gr.Group():
       with gr.Row():
         inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
         btn = gr.Button("Bark with Vocos TTS")
     with gr.Row():
+        out_audio_encodec = gr.Audio(type="numpy", autoplay=False, label="original output", show_label=True)
+        out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)
+        btn.click(generate_audio, [inp_text, dd], [out_audio_encodec, out_audio_vocos])
 demo_blocks.queue().launch(debug=True)