Saurabh Kumar committed on
Commit
cfbcc10
·
verified ·
1 Parent(s): c8797a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -0
app.py CHANGED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import python_multipart
4
+ import os
5
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
+ from datasets import load_dataset, Audio
7
+ import numpy as np
8
+ from speechbrain.inference import EncoderClassifier
9
+
10
# Text-to-speech stack: the stock SpeechT5 processor and HiFi-GAN vocoder,
# combined with a Hindi fine-tuned SpeechT5 acoustic model.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("Sana1207/Hindi_SpeechT5_finetuned")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker encoder (x-vector model); runs on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

_xvector_repo = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=_xvector_repo,
    savedir=os.path.join("/tmp", _xvector_repo),  # cache checkpoint under /tmp
    run_opts={"device": device},
)
22
+
23
def create_speaker_embedding(waveform):
    """Return an L2-normalised speaker embedding for one waveform.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (the caller
            below passes the 16 kHz array of a dataset sample).

    Returns:
        A NumPy array: the output of ``speaker_model.encode_batch``
        normalised along axis 2, with singleton axes squeezed away.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


# Derive the speaker embedding from one dataset sample.
#
# This must run AFTER create_speaker_embedding is defined: module-level code
# executes top to bottom, so calling the helper before its ``def`` raised
# NameError, which the broad ``except`` below swallowed -- the app then always
# used the random fallback voice.
# NOTE(review): this materialises the whole "validated" split just to read a
# single sample; consider streaming if start-up time matters.
try:
    dataset = load_dataset(
        "mozilla-foundation/common_voice_17_0",
        "hi",
        split="validated",
        trust_remote_code=True,
    )
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    sample = dataset[0]
    # The helper returns a squeezed NumPy vector, but generate_speech() is fed
    # a (1, N) float tensor (same layout as the fallback), so convert and add
    # the batch axis here.
    speaker_embedding = torch.tensor(
        create_speaker_embedding(sample["audio"]["array"]),
        dtype=torch.float32,
    ).unsqueeze(0)
except Exception as e:
    # Best-effort: any failure (download, auth, decoding) must not stop the
    # app from starting, so report it and fall back to a random voice.
    print(f"Error loading dataset: {e}")
    # Use a random speaker embedding as fallback
    speaker_embedding = torch.randn(1, 512)
40
+
41
# Devanagari -> Latin substitutions applied, in order, before tokenisation.
# Built once at import time instead of on every request.
_DEVANAGARI_TO_LATIN = (
    # Nukta consonants MUST come first: when encoded as base consonant +
    # nukta sign, the single-character rules below would otherwise rewrite the
    # two halves separately (e.g. "k" + "ng") and these rules would never match.
    ("क़", "q"),
    ("ज़", "z"),
    ("ड़", "r"),
    ("ढ़", "rh"),
    ("फ़", "f"),
    # Independent vowels.
    ("अ", "a"),
    ("आ", "aa"),
    ("इ", "i"),
    ("ई", "ee"),
    ("उ", "u"),
    ("ऋ", "ri"),
    ("ए", "ae"),
    ("ऐ", "ai"),
    ("ऑ", "au"),
    ("ओ", "o"),
    ("औ", "au"),
    # Consonants.
    ("क", "k"),
    ("ख", "kh"),
    ("ग", "g"),
    ("घ", "gh"),
    ("च", "ch"),
    ("छ", "chh"),
    ("ज", "j"),
    ("झ", "jh"),
    ("ञ", "gna"),
    ("ट", "t"),
    ("ठ", "th"),
    ("ड", "d"),
    ("ढ", "dh"),
    ("ण", "nr"),
    ("त", "t"),
    ("थ", "th"),
    ("द", "d"),
    ("ध", "dh"),
    ("न", "n"),
    ("प", "p"),
    ("फ", "ph"),
    ("ब", "b"),
    ("भ", "bh"),
    ("म", "m"),
    ("य", "ya"),
    ("र", "r"),
    ("ल", "l"),
    ("व", "w"),
    ("श", "sha"),
    ("ष", "sh"),
    ("स", "s"),
    ("ह", "ha"),
    # Signs and dependent vowels.
    # NOTE(review): a bare nukta mapping to "ng" looks unusual -- confirm it
    # matches the normalisation used when the model was fine-tuned.
    ("़", "ng"),
    ("्", ""),
    ("ऽ", ""),
    ("ा", "a"),
    ("ि", "i"),
    ("ी", "ee"),
    ("ु", "u"),
    ("ॅ", "n"),
    ("े", "e"),
    ("ै", "oi"),
    ("ो", "o"),
    ("ौ", "ou"),
    ("ॉ", "r"),
    ("ू", "uh"),
    ("ृ", "ri"),
    ("ं", "n"),
    # Sentence terminators: the Devanagari danda as well as the ASCII pipe
    # that is often typed in its place.
    ("।", "."),
    ("|", "."),
)


def text_to_speech(text):
    """Synthesise speech for Hindi ``text``.

    The text is first transliterated to Latin characters using
    ``_DEVANAGARI_TO_LATIN``, then tokenised with ``processor`` and passed to
    ``model.generate_speech`` together with the module-level
    ``speaker_embedding`` and ``vocoder``.

    Args:
        text: Input string, typically Devanagari.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``sample_rate`` is 16000 and
        ``samples`` is the generated waveform as a NumPy array -- the form
        handed to the Gradio "audio" output.
    """
    for src, dst in _DEVANAGARI_TO_LATIN:
        text = text.replace(src, dst)

    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
    return (16000, speech.numpy())
117
+
118
# Gradio UI: a single text box in, a single audio player out, backed by
# text_to_speech.
iface = gr.Interface(
    text_to_speech,
    "text",
    "audio",
    title="Hindi Text-to-Speech",
    description="Enter Hindi text to convert it into an Audio",
)

# Start the web app; share=True asks Gradio for a public share link as well.
iface.launch(share=True)