import os
import torch
import gradio as gr
import numpy as np
import os.path as op
import pyarabic.araby as araby
from artst.tasks.artst import ArTSTTask
from transformers import SpeechT5HifiGan
from artst.models.artst import ArTSTTransformerModel
from fairseq.tasks.hubert_pretraining import LabelEncoder
from fairseq.data.audio.speech_to_text_dataset import get_features_or_waveform
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the ArTST checkpoint fine-tuned for TTS on ClArTTS and configure the task
# for text-to-speech (t2s) decoding with the Arabic subword tokenizer.
checkpoint = torch.load('ckpts/clartts_tts.pt')
checkpoint['cfg']['task'].t5_task = 't2s'
checkpoint['cfg']['task'].bpe_tokenizer = "utils/arabic.model"
checkpoint['cfg']['task'].data = "utils/"
checkpoint['cfg']['model'].mask_prob = 0.5
checkpoint['cfg']['task'].mask_prob = 0.5
task = ArTSTTask.setup_task(checkpoint['cfg']['task'])

# Pre-computed speaker embedding used to condition the synthesized voice.
emb_path = 'embs/clartts.npy'

# Build the model from the checkpoint config and load the fine-tuned weights.
model = ArTSTTransformerModel.build_model(checkpoint['cfg']['model'], task)
model.load_state_dict(checkpoint['model'])

# Tokenizer and label encoder map raw text to dictionary indices for the encoder.
checkpoint['cfg']['task'].bpe_tokenizer = task.build_bpe(checkpoint['cfg']['model'])
tokenizer = checkpoint['cfg']['task'].bpe_tokenizer
processor = LabelEncoder(task.dicts['text'])

# HiFi-GAN vocoder turns the predicted spectrogram into a waveform.
vocoder = SpeechT5HifiGan.from_pretrained('microsoft/speecht5_hifigan').to(device)
def get_embs(emb_path):
    # Load the pre-computed speaker embedding and add a batch dimension.
    spkembs = get_features_or_waveform(emb_path)
    spkembs = torch.from_numpy(spkembs).float().unsqueeze(0)
    return spkembs
def process_text(text):
    # Strip diacritics, tokenize into subword units, and map them to dictionary ids.
    text = araby.strip_diacritics(text)
    return processor(tokenizer.encode(text)).reshape(1, -1)
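# Illustrative note (not part of the original file): because of the strip_diacritics
# call above, a fully vocalized prompt and its plain form produce the same token ids,
# e.g. araby.strip_diacritics("السَّلامُ عَلَيْكُمْ") returns "السلام عليكم".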
net_input = {}
def inference(text, spkr=emb_path):
    # Return an empty 16 kHz clip for blank input.
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))
    net_input['src_tokens'] = process_text(text)
    net_input['spkembs'] = get_embs(spkr)
    # Generate the spectrogram with ArTST, then vocode it into a waveform.
    outs, _, attn = task.generate_speech(
        [model],
        net_input,
    )
    with torch.no_grad():
        gen_audio = vocoder(outs.to(device))
    # Scale to 16-bit PCM for Gradio's numpy audio output.
    speech = (gen_audio.cpu().numpy() * 32767).astype(np.int16)
    return (16000, speech)
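
# Convenience sketch (not part of the original Space): run the same pipeline outside
# the Gradio UI and save the result as a 16 kHz mono WAV using only the standard
# library. The function name and default output path are illustrative.
def synthesize_to_file(text, path="artst_sample.wav"):
    import wave
    sr, wav = inference(text)
    with wave.open(path, "wb") as f:
        f.setnchannels(1)       # mono
        f.setsampwidth(2)       # 16-bit samples
        f.setframerate(sr)      # 16000 Hz, as returned by inference()
        f.writeframes(wav.tobytes())
    return path
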
text_box = gr.Textbox(max_lines=2, label="Arabic Text", rtl=True)
out = gr.Audio(label="Synthesized Audio", type="numpy")
title="ArTST: Arabic Speech Synthesis"
description="ArTST: Arabic text and speech transformer based on the T5 transformer. This space demonstarates the TTS checkpoint finetuned on \
the Classical Arabic Text-To-Speech (CLARTTS) dataset. The model is pre-trained on the MGB-2 dataset."
examples=["ูุฃู ูุฑุงู ุงูู
ุฃููู ูู ุงูุนุงุฏุฉ ูู
ุฌุงูุจุฉ ู
ุง ุตุงุฑ ู
ุชููุง ุนููู ุจุงูู
ูุงุถุนุฉ",\
"ูู
ู ูุทูู ุญูู
ุชู ุฃู ุฌุนู ููู ุนุจุงุฏุฉ ุญุงูุชูู",\
"ูู
ู ููู
ุนุฏู ุงูุฅูุณุงู ู
ุน ู
ู ูููู"]
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2310.16621">ArTST paper</a> |
<a href="https://github.com/mbzuai-nlp/ArTST">GitHub</a> |
<a href="https://huggingface.co/MBZUAI/ArTST">Weights and Tokenizer</a></p>
<pre>
@inproceedings{toyin-etal-2023-artst,
    title = "{A}r{TST}: {A}rabic Text and Speech Transformer",
    author = "Toyin, Hawau and
      Djanibekov, Amirbek and
      Kulkarni, Ajinkya and
      Aldarmaki, Hanan",
    editor = "Sawaf, Hassan and
      El-Beltagy, Samhaa and
      Zaghouani, Wajdi and
      Magdy, Walid and
      Abdelali, Ahmed and
      Tomeh, Nadi and
      Abu Farha, Ibrahim and
      Habash, Nizar and
      Khalifa, Salam and
      Keleg, Amr and
      Haddad, Hatem and
      Zitouni, Imed and
      Mrini, Khalil and
      Almatham, Rawan",
    booktitle = "Proceedings of ArabicNLP 2023",
    month = dec,
    year = "2023",
    address = "Singapore (Hybrid)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.arabicnlp-1.5",
    pages = "41--51"
}
</pre>
<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a>.</p>
<p>ArTST is based on <a href="https://arxiv.org/abs/2110.07205">SpeechT5 architecture</a>.</p>
</div>
"""
demo = gr.Interface(inference, inputs=text_box, outputs=out, title=title,
                    description=description, examples=examples, article=article)
if __name__ == "__main__":
demo.launch(share=True)