ayush2607 committed
Commit e2e5380 · verified · 1 Parent(s): 9ce6d3a

Update app.py

Files changed (1):
  1. app.py +165 -35
app.py CHANGED
@@ -1,31 +1,127 @@
+import os
+import sys
+import logging
 import gradio as gr
-from transformers import AutoProcessor, BarkModel
-import scipy.io.wavfile
 import torch
-import os
+import scipy.io.wavfile
+import warnings
+from functools import lru_cache
+from typing import Optional
 
-# Initialize model and processor
-processor = AutoProcessor.from_pretrained("suno/bark")
-model = BarkModel.from_pretrained("suno/bark")
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-def text_to_speech(text, voice_preset="v2/hi_speaker_2"):
-    # Generate audio from text
-    inputs = processor(text, voice_preset=voice_preset)
-
-    # Generate audio
-    audio_array = model.generate(**inputs)
-    audio_array = audio_array.cpu().numpy().squeeze()
-
-    # Get sample rate from model config
-    sample_rate = model.generation_config.sample_rate
-
-    # Create temporary file path
-    output_path = "temp_audio.wav"
+# Suppress warnings
+warnings.filterwarnings('ignore')
+
+def check_dependencies():
+    try:
+        from transformers import AutoProcessor, BarkModel
+        return True
+    except ImportError as e:
+        logger.error(f"Error importing required modules: {str(e)}")
+        return False
+
+if not check_dependencies():
+    logger.error("Required dependencies not found. Please install them using:")
+    logger.error("pip install -r requirements.txt")
+    sys.exit(1)
+
+from transformers import AutoProcessor, BarkModel
+
+# Global variables for model and processor
+processor = None
+model = None
+
+def initialize_model():
+    global processor, model
 
-    # Save audio file
-    scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
+    # Initialize processor and model only once
+    if processor is None or model is None:
+        logger.info("Initializing model and processor...")
+
+        # Load processor
+        processor = AutoProcessor.from_pretrained("suno/bark")
+
+        # Load model with optimizations
+        model = BarkModel.from_pretrained("suno/bark")
+
+        # Move model to GPU if available
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        if device == "cuda":
+            # Use half-precision floating point numbers
+            model = model.half()
+
+        model = model.to(device)
+
+        # Enable model optimization
+        model.eval()
+        torch.set_grad_enabled(False)
+
+        # Optional: Use torch.compile for PyTorch 2.0+
+        if hasattr(torch, 'compile'):
+            try:
+                model = torch.compile(model)
+                logger.info("Model compiled successfully")
+            except Exception as e:
+                logger.warning(f"Could not compile model: {e}")
+
+        logger.info(f"Model initialized on {device}")
 
-    return output_path
+    return processor, model
+
+# Cache the text preprocessing step
+@lru_cache(maxsize=128)
+def preprocess_text(text: str, voice_preset: str):
+    processor, _ = initialize_model()
+    return processor(text, voice_preset=voice_preset)
+
+def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
+    try:
+        if not text.strip():
+            raise ValueError("Please enter some text")
+
+        # Initialize model if not already initialized
+        processor, model = initialize_model()
+
+        # Get device
+        device = next(model.parameters()).device
+
+        # Preprocess text (cached)
+        inputs = preprocess_text(text, voice_preset)
+
+        # Move inputs to the same device as model
+        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+
+        # Generate audio with optimized settings
+        with torch.inference_mode():  # Faster than no_grad()
+            audio_array = model.generate(
+                **inputs,
+                do_sample=False,  # Deterministic generation is faster
+                num_beams=1,  # No beam search for faster generation
+            )
+
+        # Move to CPU and convert to numpy
+        audio_array = audio_array.cpu().numpy().squeeze()
+
+        # Get sample rate from model config
+        sample_rate = model.generation_config.sample_rate
+
+        # Create output directory if it doesn't exist
+        os.makedirs("outputs", exist_ok=True)
+
+        # Generate unique filename based on text hash
+        output_path = os.path.join("outputs", f"audio_{hash(text)}_{hash(voice_preset)}.wav")
+
+        # Save audio file
+        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
+
+        return output_path
+
+    except Exception as e:
+        logger.error(f"Error in text_to_speech: {str(e)}")
+        raise gr.Error(str(e))
 
 # Define available voice presets
 voice_presets = [
@@ -36,19 +132,53 @@ voice_presets = [
     "v2/hi_speaker_5"
 ]
 
-# Create Gradio interface
-demo = gr.Interface(
-    fn=text_to_speech,
-    inputs=[
-        gr.Textbox(label="Enter text (Hindi or English)"),
-        gr.Dropdown(choices=voice_presets, value="v2/hi_speaker_2", label="Select Voice")
-    ],
-    outputs=gr.Audio(label="Generated Speech"),
-    title="Bark Text-to-Speech",
-    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
+# Create Gradio interface with optimized settings
+with gr.Blocks(analytics_enabled=False) as demo:
+    gr.Markdown("# Bark Text-to-Speech (Optimized)")
 
-)
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Enter text (Hindi or English)",
+                placeholder="तुम बहुत अच्छे हो...",
+                lines=3
+            )
+            voice_input = gr.Dropdown(
+                choices=voice_presets,
+                value="v2/hi_speaker_2",
+                label="Select Voice"
+            )
+            submit_btn = gr.Button("Generate Speech")
+
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Speech")
+
+    # Add examples
+    gr.Examples(
+        examples=[
+            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
+            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
+        ],
+        inputs=[text_input, voice_input],
+        outputs=audio_output,
+        cache_examples=True  # Cache example outputs
+    )
+
+    # Connect components
+    submit_btn.click(
+        fn=text_to_speech,
+        inputs=[text_input, voice_input],
+        outputs=audio_output
+    )
 
-# Launch the app
+# Launch the app with optimized settings
 if __name__ == "__main__":
-    demo.launch()
+    # Initialize model at startup
+    initialize_model()
+
+    # Launch with optimized settings
+    demo.launch(
+        enable_queue=True,  # Enable queue for better handling of multiple requests
+        cache_examples=True,  # Cache example outputs
+        show_error=True,  # Show errors for debugging
+    )
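
Note on the final launch block: enable_queue is a legacy Gradio 3.x launch() flag that has since been removed, and cache_examples is a setting of gr.Interface / gr.Examples rather than of launch() (the gr.Examples block above already sets it), so demo.launch(enable_queue=True, cache_examples=True, ...) may fail with a TypeError depending on which Gradio version the Space pins. A minimal sketch of an equivalent startup for a recent Gradio release, assuming queueing is enabled via Blocks.queue() and no other launch settings are needed:

if __name__ == "__main__":
    # Warm up the model once at startup, as in the commit
    initialize_model()

    # Enable request queueing on the Blocks object (replaces enable_queue=True)
    demo.queue()

    # Keep error reporting; drop the launch() keywords that recent Gradio does not accept
    demo.launch(show_error=True)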