Spaces:
Lippppxy
/
Runtime error

File size: 10,529 Bytes
17f98b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a021ec
f1db410
 
 
 
 
17f98b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# coding=utf-8
import os
import re
import argparse
import utils
import commons
import json
import torch
import gradio as gr
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
from torch import no_grad, LongTensor
import gradio.processing_utils as gr_processing_utils
import logging
# Raise the 'numba' logger threshold so only WARNING and above are emitted.
logging.getLogger('numba').setLevel(logging.WARNING)
# True only when the SYSTEM environment variable equals "spaces"; read by the
# TTS callback and the UI labels to enforce/advertise the input length cap.
limitation = os.getenv("SYSTEM") == "spaces"  # limit text and audio length in huggingface spaces

# Hyperparameters shared by every model, loaded once at import time from the
# JSON config through the project's `utils` helper.
hps_ms = utils.get_hparams_from_file(r'config/config.json')

# Keep a handle on Gradio's stock implementation so the wrapper below can
# delegate to it before post-processing the result.
audio_postprocess_ori = gr.Audio.postprocess


def audio_postprocess(self, y):
    """Replacement for ``gr.Audio.postprocess``.

    Runs the stock post-processing first; when that yields a result, its
    ``"name"`` entry is handed to gradio's ``encode_url_or_file_to_base64``
    helper and that value is returned instead. A ``None`` result is passed
    through unchanged.
    """
    processed = audio_postprocess_ori(self, y)
    if processed is not None:
        return gr_processing_utils.encode_url_or_file_to_base64(processed["name"])
    return None


# Install the wrapper on the class so every gr.Audio component uses it.
gr.Audio.postprocess = audio_postprocess

def get_text(text, hps, is_symbol):
    """Convert ``text`` into a ``LongTensor`` of symbol ids.

    Args:
        text: Input string (raw text, or symbols when ``is_symbol`` is true).
        hps: Hyperparameter object providing ``symbols``,
            ``data.text_cleaners`` and ``data.add_blank``.
        is_symbol: When true, no text cleaners are applied.

    Returns:
        ``(ids, clean_text)`` where ``ids`` is the tensor built from the
        sequence returned by ``text_to_sequence`` and ``clean_text`` is the
        second value that function returns.
    """
    cleaners = [] if is_symbol else hps.data.text_cleaners
    sequence, clean_text = text_to_sequence(text, hps.symbols, cleaners)
    if hps.data.add_blank:
        # Insert id 0 between the symbol ids.
        sequence = commons.intersperse(sequence, 0)
    return LongTensor(sequence), clean_text

def create_tts_fn(net_g_ms, speaker_id):
    """Build the Gradio callback that synthesizes speech for one speaker.

    Args:
        net_g_ms: Loaded synthesizer whose ``infer`` method is called.
        speaker_id: Speaker id passed to the model as ``sid``.

    Returns:
        A function ``tts_fn(text, language, noise_scale, noise_scale_w,
        length_scale, is_symbol)`` returning ``(message, audio)``. ``audio``
        is ``(22050, numpy_array)`` on success and ``None`` when the text is
        rejected for being too long.
    """
    # Raw string on purpose: "\[" inside a normal literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    lang_tag_pattern = re.compile(r"\[([A-Z]{2})\]")

    def tts_fn(text, language, noise_scale, noise_scale_w, length_scale, is_symbol):
        # Newlines become spaces, then carriage returns and all spaces are removed.
        text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
        if limitation:
            # Language tags such as [ZH] / [JA] do not count towards the limit.
            text_len = len(lang_tag_pattern.sub("", text))
            max_len = 300 if is_symbol else 100
            if text_len > max_len:
                return "Error: Text is too long", None
        if not is_symbol:
            # `language` is the dropdown index: 0 wraps as Chinese, 1 as
            # Japanese; any other value leaves the text untouched.
            if language == 0:
                text = f"[ZH]{text}[ZH]"
            elif language == 1:
                text = f"[JA]{text}[JA]"
        stn_tst, clean_text = get_text(text, hps_ms, is_symbol)
        with no_grad():
            # `device` is the module-level global assigned in the script entry point.
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                                   length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()

        return "Success", (22050, audio)
    return tts_fn

def create_to_symbol_fn(hps):
    """Build the callback fired when the "Symbol input" checkbox changes.

    Args:
        hps: Hyperparameter object; ``hps.data.text_cleaners`` is passed to
            ``_clean_text`` when converting text to symbols.

    Returns:
        ``to_symbol_fn(is_symbol_input, input_text, temp_text, temp_lang)``
        which returns a pair of values. When ``is_symbol_input`` is true the
        pair is ``(cleaned text, input_text)``; otherwise it is
        ``(temp_text, temp_text)``.
    """
    def to_symbol_fn(is_symbol_input, input_text, temp_text, temp_lang):
        if not is_symbol_input:
            return temp_text, temp_text

        # Wrap the text in the tag matching the selected language, if any.
        if temp_lang == 'Chinese':
            tag = '[ZH]'
        elif temp_lang == "Japanese":
            tag = '[JA]'
        else:
            tag = ''
        tagged = f'{tag}{input_text}{tag}' if tag else input_text
        return _clean_text(tagged, hps.data.text_cleaners), input_text

    return to_symbol_fn
def change_lang(language):
    """Return slider defaults and a language label for a dropdown index.

    Args:
        language: Selected index; 0 maps to Chinese, 1 to Japanese and any
            other value to "Mix".

    Returns:
        ``(noise_scale, noise_scale_w, length_scale, label)``.
    """
    # (length_scale, label) per index; position in the tuple is the index.
    presets = ((1.2, "Chinese"), (1, "Japanese"))
    for index, (length_scale, label) in enumerate(presets):
        if language == index:
            return 0.6, 0.668, length_scale, label
    return 0.6, 0.668, 1, "Mix"

# JavaScript snippet run in the browser by the "Download Audio" button.
# It is a str.format template: `{audio_id}` is substituted by the caller and
# the doubled braces `{{ }}` are literal JS braces after formatting.
# The script looks up the audio element under `#tts-audio-<audio_id>` and the
# textarea under `#input-text-<audio_id>`, then clicks a temporary <a> whose
# href is the audio source and whose download name is the first 20 characters
# of the input text plus '.wav'.
download_audio_js = """
() =>{{
    let root = document.querySelector("body > gradio-app");
    if (root.shadowRoot != null)
        root = root.shadowRoot;
    let audio = root.querySelector("#tts-audio-{audio_id}").querySelector("audio");
    let text = root.querySelector("#input-text-{audio_id}").querySelector("textarea");
    if (audio == undefined)
        return;
    text = text.value;
    if (text == undefined)
        text = Math.floor(Math.random()*100000000);
    audio = audio.src;
    let oA = document.createElement("a");
    oA.download = text.substr(0, 20)+'.wav';
    oA.href = audio;
    document.body.appendChild(oA);
    oA.click();
    oA.remove();
}}
"""

if __name__ == '__main__':
    # Script entry point: parse CLI flags, load every enabled model listed in
    # pretrained_models/info.json, build one Gradio tab per model and launch.
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--api', action="store_true", default=False)
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()
    # Module-level global on purpose: the tts_fn closures read `device` when called.
    device = torch.device(args.device)

    models = []
    with open("pretrained_models/info.json", "r", encoding="utf-8") as f:
        models_info = json.load(f)
    for i, info in models_info.items():
        if not info['enable']:
            continue
        sid = info['sid']
        name_en = info['name_en']
        name_zh = info['name_zh']
        title = info['title']
        # Only build a cover path when the entry names a cover file, so the
        # UI can skip the <img> tag instead of pointing at a directory.
        cover = f"pretrained_models/{i}/{info['cover']}" if info['cover'] else None
        example = info['example']
        language = info['language']
        net_g_ms = SynthesizerTrn(
            len(hps_ms.symbols),
            hps_ms.data.filter_length // 2 + 1,
            hps_ms.train.segment_size // hps_ms.data.hop_length,
            n_speakers=hps_ms.data.n_speakers if info['type'] == "multi" else 0,
            **hps_ms.model)
        utils.load_checkpoint(f'pretrained_models/{i}/{i}.pth', net_g_ms, None)
        _ = net_g_ms.eval().to(device)
        models.append((sid, name_en, name_zh, title, cover, example, language, net_g_ms, create_tts_fn(net_g_ms, sid), create_to_symbol_fn(hps_ms)))
    with gr.Blocks() as app:
        gr.Markdown(
            # The <img> tag must be closed and followed by a blank line,
            # otherwise the Markdown headings below are swallowed by the tag.
            '<img src="https://i.ibb.co/mJm8Qnn/a311e52b-4c25-4c0c-be11-ee2cac05c86c.jpg">\n\n'
            "# <center>𝘼𝙄 𝙏𝙚𝙭𝙩 𝙏𝙤 𝙎𝙥𝙚𝙚𝙘𝙝 𝘼𝙣𝙞𝙢𝙚 𝘾𝙝𝙖𝙧𝙖𝙘𝙩𝙚𝙧\n"
            "## <center>𝘣𝘺 𝘈𝘭𝘪𝘱\n"
            "## <center>ᴘʟᴇᴀꜱᴇ ᴜꜱᴇ ɪᴛ ᴡɪꜱᴇʟʏ, ᴀɴᴅ ᴅᴏɴ'ᴛ ʜᴜʀᴛ ᴏʀ ᴏꜰꜰᴇɴᴅ ᴏᴛʜᴇʀ ᴘᴇᴏᴘʟᴇ ᴏʀ ɢʀᴏᴜᴘꜱ\n"
            "[Visit Instagram]"
            "(https://www.instagram.com/liip.qdry/)"
        )

        with gr.Tabs():
            with gr.TabItem("EN"):
                for (sid, name_en, name_zh, title, cover, example, language, net_g_ms, tts_fn, to_symbol_fn) in models:
                    # Element ids cannot contain spaces; the same suffix is
                    # used by the Python components and the JS snippets.
                    elem_suffix = name_en.replace(' ', '')
                    with gr.TabItem(name_en):
                        with gr.Row():
                            # Build the optional image separately: adjacent
                            # string literals bind tighter than `if/else`, so
                            # an inline conditional would drop the title or
                            # the closing </div>.
                            cover_html = f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else ""
                            gr.Markdown(
                                '<div align="center">'
                                f'<a><strong>{title}</strong></a>'
                                f'{cover_html}'
                                '</div>'
                            )
                        with gr.Row():
                            with gr.Column():
                                input_text = gr.Textbox(label="Text (100 words limitation)" if limitation else "Text", lines=5, value=example, elem_id=f"input-text-en-{elem_suffix}")
                                # type="index": callbacks receive the position of the choice, not its label.
                                lang = gr.Dropdown(label="Language", choices=["Chinese", "Japanese", "Mix(wrap the Chinese text with [ZH][ZH], wrap the Japanese text with [JA][JA])"],
                                            type="index", value=language)
                                # Holds the language label consumed by to_symbol_fn.
                                temp_lang = gr.Variable(value=language)
                                with gr.Accordion(label="Advanced Options", open=False):
                                    # Stash passed to / returned from to_symbol_fn when toggling symbol input.
                                    temp_text_var = gr.Variable()
                                    symbol_input = gr.Checkbox(value=False, label="Symbol input")
                                    symbol_list = gr.Dataset(label="Symbol list", components=[input_text],
                                                             samples=[[x] for x in hps_ms.symbols])
                                    symbol_list_json = gr.Json(value=hps_ms.symbols, visible=False)
                                btn = gr.Button(value="Generate", variant="primary")
                                with gr.Row():
                                    ns = gr.Slider(label="noise_scale", minimum=0.1, maximum=1.0, step=0.1, value=0.6, interactive=True)
                                    nsw = gr.Slider(label="noise_scale_w", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
                                    ls = gr.Slider(label="length_scale", minimum=0.1, maximum=2.0, step=0.1, value=1.2 if language=="Chinese" else 1, interactive=True)
                            with gr.Column():
                                o1 = gr.Textbox(label="Output Message")
                                o2 = gr.Audio(label="Output Audio", elem_id=f"tts-audio-en-{elem_suffix}")
                                download = gr.Button("Download Audio")
                            btn.click(tts_fn, inputs=[input_text, lang,  ns, nsw, ls, symbol_input], outputs=[o1, o2], api_name=f"tts-{name_en}")
                            download.click(None, [], [], _js=download_audio_js.format(audio_id=f"en-{elem_suffix}"))
                            lang.change(change_lang, inputs=[lang], outputs=[ns, nsw, ls, temp_lang])
                            symbol_input.change(
                                to_symbol_fn,
                                [symbol_input, input_text, temp_text_var, temp_lang],
                                [input_text, temp_text_var]
                            )
                            # Client-side only: insert the clicked symbol at the
                            # textarea's current selection and return the new text.
                            symbol_list.click(None, [symbol_list, symbol_list_json], [input_text],
                                              _js=f"""
                            (i,symbols) => {{
                                let root = document.querySelector("body > gradio-app");
                                if (root.shadowRoot != null)
                                    root = root.shadowRoot;
                                let text_input = root.querySelector("#input-text-en-{elem_suffix}").querySelector("textarea");
                                let startPos = text_input.selectionStart;
                                let endPos = text_input.selectionEnd;
                                let oldTxt = text_input.value;
                                let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
                                text_input.value = result;
                                let x = window.scrollX, y = window.scrollY;
                                text_input.focus();
                                text_input.selectionStart = startPos + symbols[i].length;
                                text_input.selectionEnd = startPos + symbols[i].length;
                                text_input.blur();
                                window.scrollTo(x, y);
                                return text_input.value;
                            }}""")
    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)