File size: 4,842 Bytes
9e130e4
 
 
 
 
1a4473d
 
 
 
 
 
 
 
 
 
 
9e130e4
1a4473d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e130e4
 
1a4473d
9e130e4
1a4473d
 
 
 
9e130e4
 
 
 
 
 
 
 
 
 
 
 
 
 
1a4473d
 
 
 
 
 
 
 
 
9e130e4
1a4473d
 
 
 
 
 
 
9e130e4
 
1a4473d
 
9e130e4
 
1a4473d
 
9e130e4
 
1a4473d
 
9e130e4
 
1a4473d
 
9e130e4
1a4473d
 
 
 
 
 
 
9e130e4
 
 
 
 
1a4473d
9e130e4
1a4473d
 
 
 
 
 
 
 
 
 
 
9e130e4
 
 
 
1a4473d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import tempfile
from io import BytesIO

import gradio as gr
from gtts import gTTS
from pydub import AudioSegment

def custom_tts(
    text1, accent1,
    text2, accent2,
    text3, accent3,
    text4, accent4,
    text5, accent5,
    text6, accent6,
    text7, accent7,
    text8, accent8,
    text9, accent9,
    text10, accent10
):
    """Synthesize up to ten dialogue lines and join them into a single MP3.

    Each ``textN`` is converted to speech with gTTS using the voice selected
    by the matching ``accentN``; the clips are concatenated in order, each
    followed by 500 ms of silence.

    Args:
        text1..text10: Dialogue texts. Entries that are ``None``, empty, or
            whitespace-only are skipped.
        accent1..accent10: Accent names ("British", "American" or
            "Australian"). Any other value falls back to the American
            setting (``lang="en"``, ``tld="com"``).

    Returns:
        The path of the generated MP3 file, or ``None`` when no dialogue
        contains any text (nothing is synthesized or written in that case).
    """
    # gTTS language code and Google top-level domain used for each accent.
    accent_mapping = {
        "British": ("en", "co.uk"),
        "American": ("en", "com"),
        "Australian": ("en", "com.au")
    }

    # Pair every dialogue text with its selected accent, in display order.
    dialogues = [
        (text1, accent1),
        (text2, accent2),
        (text3, accent3),
        (text4, accent4),
        (text5, accent5),
        (text6, accent6),
        (text7, accent7),
        (text8, accent8),
        (text9, accent9),
        (text10, accent10)
    ]

    # Drop missing/blank entries up front; `text and ...` guards against None,
    # on which a bare `.strip()` would raise AttributeError.
    spoken = [(text, accent) for text, accent in dialogues if text and text.strip()]
    if not spoken:
        # Nothing to say: report "no audio" rather than exporting an empty file.
        return None

    combined_audio = AudioSegment.silent(duration=0)  # empty starting segment

    for text, accent in spoken:
        lang, tld = accent_mapping.get(accent, ("en", "com"))
        tts = gTTS(text, lang=lang, tld=tld)
        # Keep the synthesized MP3 in memory instead of a per-clip temp file.
        audio_file = BytesIO()
        tts.write_to_fp(audio_file)
        audio_file.seek(0)
        tts_audio = AudioSegment.from_file(audio_file, format="mp3")
        # Append the clip followed by a 500 ms pause.
        combined_audio += tts_audio + AudioSegment.silent(duration=500)

    # Write to a unique file per call: a fixed file name would let concurrent
    # requests overwrite each other's output. The handle is closed before
    # exporting so the path can be reopened on every platform. The file is
    # intentionally not deleted here because the caller still has to read it.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        output_file = tmp.name
    combined_audio.export(output_file, format="mp3")
    return output_file

# Accent options offered by every dialogue dropdown.
ACCENT_CHOICES = ("British", "American", "Australian")


def _dialogue_row(index):
    """Create one UI row holding a dialogue textbox and its accent dropdown.

    Must be called inside an active ``gr.Blocks`` context so the components
    are attached to that layout.

    Args:
        index: 1-based dialogue number used in the labels and placeholder.

    Returns:
        A ``(textbox, dropdown)`` tuple of the created components.
    """
    with gr.Row():
        textbox = gr.Textbox(
            label=f"Dialogue {index}",
            placeholder=f"Enter text for Dialogue {index}",
        )
        dropdown = gr.Dropdown(
            label=f"Accent for Dialogue {index}",
            choices=list(ACCENT_CHOICES),
            value="British",
        )
    return textbox, dropdown


with gr.Blocks() as demo:
    # Heading (Korean): "Generate speech by choosing an accent for each of
    # the 10 dialogue input fields".
    gr.Markdown("## Custom TTS: 10개의 대화문 입력란에서 악센트를 선택하여 음성 생성하기")

    # Rows are created in order so the page lists Dialogue 1 through 10.
    text1, accent1 = _dialogue_row(1)
    text2, accent2 = _dialogue_row(2)
    text3, accent3 = _dialogue_row(3)
    text4, accent4 = _dialogue_row(4)
    text5, accent5 = _dialogue_row(5)
    text6, accent6 = _dialogue_row(6)
    text7, accent7 = _dialogue_row(7)
    text8, accent8 = _dialogue_row(8)
    text9, accent9 = _dialogue_row(9)
    text10, accent10 = _dialogue_row(10)

    output_audio = gr.Audio(label="Generated Speech", type="filepath")
    generate_button = gr.Button("Generate Speech")

    # The input order must match custom_tts's positional parameters:
    # (text1, accent1, text2, accent2, ..., text10, accent10).
    generate_button.click(
        custom_tts,
        inputs=[
            text1, accent1,
            text2, accent2,
            text3, accent3,
            text4, accent4,
            text5, accent5,
            text6, accent6,
            text7, accent7,
            text8, accent8,
            text9, accent9,
            text10, accent10
        ],
        outputs=output_audio
    )

# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()