ayush2607 committed on
Commit
cddcc30
·
verified ·
1 Parent(s): 27d6995

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -94
app.py CHANGED
@@ -1,117 +1,61 @@
1
  import gradio as gr
2
- from transformers import AutoProcessor, BarkModel
3
- import scipy.io.wavfile
4
  import torch
5
- import os
 
6
  import numpy as np
7
- import warnings
8
- warnings.filterwarnings('ignore')
9
-
10
- # Basic device setup
11
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
12
- print(f"Using device: {DEVICE}")
13
-
14
- # Model initialization with basic settings
15
- processor = AutoProcessor.from_pretrained(
16
- "suno/bark",
17
- trust_remote_code=True
18
- )
19
-
20
- model = BarkModel.from_pretrained(
21
- "suno/bark",
22
- torch_dtype=torch.float32, # Using float32 for stability
23
- trust_remote_code=True
24
- )
25
-
26
- # Basic model optimization
27
- model.to(DEVICE)
28
- model.eval()
29
-
30
- # Define cache directory in the allowed space
31
- CACHE_DIR = "audio_cache"
32
- os.makedirs(CACHE_DIR, exist_ok=True)
33
-
34
- def clean_text(text):
35
- """Clean and prepare text for processing."""
36
- if not isinstance(text, str):
37
- return ""
38
- return text.strip()
39
 
40
- def get_cache_path(text: str, voice_preset: str) -> str:
41
- """Generate a unique cache path."""
42
- import hashlib
43
- hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
44
- return os.path.join(CACHE_DIR, f"{hash_key}.wav")
45
 
46
- def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
47
- """Convert text to speech using Bark model."""
48
  try:
49
- # Clean and validate input
50
- text = clean_text(text)
51
- if not text:
52
- return None
53
-
54
- # Generate cache path
55
- cache_path = get_cache_path(text, voice_preset)
56
-
57
- # Process the text
58
- inputs = processor(text, voice_preset=voice_preset)
59
-
60
- # Move inputs to device
61
- inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
62
- for k, v in inputs.items()}
63
-
64
- # Generate audio
65
- with torch.inference_mode():
66
- audio_array = model.generate(
67
- **inputs,
68
- do_sample=True,
69
- temperature=0.7
70
- )
71
-
72
- # Process the audio
73
- audio_array = audio_array.cpu().numpy().squeeze()
74
- audio_array = np.clip(audio_array, -1, 1)
75
 
76
- # Save the audio
77
- sample_rate = model.generation_config.sample_rate
78
- scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)
79
 
80
- return cache_path
 
 
81
 
 
82
  except Exception as e:
83
- print(f"Error in text_to_speech: {str(e)}")
84
- return None
85
 
86
- # Voice presets
87
- voice_presets = [
88
- "v2/hi_speaker_1",
89
- "v2/hi_speaker_2",
90
- "v2/hi_speaker_3",
91
- "v2/hi_speaker_4",
92
- "v2/hi_speaker_5"
93
- ]
94
 
95
  # Create Gradio interface
96
  demo = gr.Interface(
97
  fn=text_to_speech,
98
  inputs=[
99
  gr.Textbox(
100
- label="Enter text (Hindi or English)",
101
- placeholder="Type your text here...",
102
- lines=3
103
- ),
104
- gr.Dropdown(
105
- choices=voice_presets,
106
- value="v2/hi_speaker_2",
107
- label="Select Voice"
108
  )
109
  ],
110
  outputs=gr.Audio(label="Generated Speech"),
111
- title="Bark Text-to-Speech",
112
- description="Convert text to speech using the Bark model. Supports Hindi and English text."
113
-
 
 
 
114
  )
115
 
116
  # Launch the app
117
- demo.launch()
 
 
1
  import gradio as gr
 
 
2
  import torch
3
+ from transformers import AutoProcessor, AutoModel
4
+ import scipy.io.wavfile as wavfile
5
  import numpy as np
6
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Initialize model and processor
9
def load_model(model_name="suno/bark-small"):
    """Load the Bark processor and model for a given checkpoint.

    Args:
        model_name: Checkpoint identifier passed unchanged to both
            ``AutoProcessor.from_pretrained`` and
            ``AutoModel.from_pretrained``. Defaults to ``"suno/bark-small"``,
            the checkpoint that was previously hard-coded, so existing
            zero-argument calls behave exactly as before.

    Returns:
        A ``(processor, model)`` tuple.
    """
    # Both objects must come from the same checkpoint, so the identifier is
    # taken once from the parameter instead of being repeated as a literal.
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return processor, model
13
 
14
+ # Text to speech function
15
def text_to_speech(text):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    Uses the module-level ``processor`` and ``model`` objects.

    Args:
        text: The text to speak. Anything that is not a non-blank string is
            treated as empty input.

    Returns:
        Path to a newly created WAV file holding the generated audio, or
        ``None`` when ``text`` is empty so the audio output is left blank.

    Raises:
        gr.Error: If generation or writing the file fails. The output of
            this function is wired to an audio component, so returning an
            error message string (as before) would be handled as if it were
            an audio file path; raising surfaces the message in the UI.
    """
    # Function-scope import keeps this block self-contained without having
    # to modify the file-level import list.
    import tempfile

    # Guard before touching the model: nothing to synthesize.
    if not isinstance(text, str) or not text.strip():
        return None

    try:
        # Generate speech
        inputs = processor(
            text=[text.strip()],
            return_tensors="pt",
        )
        # Pure inference: no gradients are needed, so skip autograd tracking.
        with torch.inference_mode():
            speech_values = model.generate(**inputs, do_sample=True)

        # Convert to numpy and normalize: clamp samples to [-1, 1] and store
        # them as float32 before writing the WAV file.
        audio_data = speech_values.cpu().numpy().squeeze()
        audio_data = np.clip(audio_data, -1.0, 1.0).astype(np.float32)
        sampling_rate = model.generation_config.sample_rate

        # Use a unique file per call; a fixed name would let concurrent
        # requests overwrite each other's audio. The handle is closed before
        # writing so the path can be reopened on every platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            temp_path = tmp.name
        wavfile.write(temp_path, sampling_rate, audio_data)

        return temp_path
    except Exception as e:
        raise gr.Error(f"Error generating speech: {e}") from e
 
35
 
36
# Load the processor and model once at import time; text_to_speech reads
# them as module-level globals on every request.
print("Loading models...")
processor, model = load_model()
print("Models loaded successfully!")

# Sample sentences offered as clickable examples in the UI. The first one is
# also shown as the textbox placeholder.
_EXAMPLE_SENTENCES = (
    "इस योजना से संबंधित लाभों का विवरण प्राप्त कर सकते",
    "नमस्ते, आप कैसे हैं?",
)

# Create Gradio interface: one text input, one audio output.
_text_input = gr.Textbox(
    label="Enter text (Hindi supported)",
    placeholder=_EXAMPLE_SENTENCES[0],
)
_audio_output = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_input],
    outputs=_audio_output,
    title="Hindi Text-to-Speech using Bark",
    description="Generate natural-sounding speech from Hindi text using the Bark model.",
    # Each example is a one-element row matching the single input component.
    examples=[[sentence] for sentence in _EXAMPLE_SENTENCES],
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()