ayush2607 committed on
Commit
d82b8b6
·
verified ·
1 Parent(s): 41f16a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -131
app.py CHANGED
@@ -1,104 +1,73 @@
1
- import os
2
- import sys
3
- import logging
4
  import gradio as gr
5
- import torch
6
  import scipy.io.wavfile
7
- import warnings
8
- from functools import lru_cache
9
  from typing import Optional
 
10
 
11
- # Configure logging
12
- logging.basicConfig(level=logging.INFO)
13
- logger = logging.getLogger(__name__)
14
-
15
- # Suppress warnings
16
- warnings.filterwarnings('ignore')
17
-
18
- def check_dependencies():
19
- try:
20
- from transformers import AutoProcessor, BarkModel
21
- return True
22
- except ImportError as e:
23
- logger.error(f"Error importing required modules: {str(e)}")
24
- return False
25
-
26
- if not check_dependencies():
27
- logger.error("Required dependencies not found. Please install them using:")
28
- logger.error("pip install -r requirements.txt")
29
- sys.exit(1)
30
 
31
- from transformers import AutoProcessor, BarkModel
 
 
 
32
 
33
- # Global variables for model and processor
34
- processor = None
35
- model = None
 
36
 
37
- def initialize_model():
38
- global processor, model
39
-
40
- if processor is None or model is None:
41
- logger.info("Initializing model and processor...")
42
-
43
- processor = AutoProcessor.from_pretrained("suno/bark")
44
- model = BarkModel.from_pretrained("suno/bark")
45
-
46
- device = "cuda" if torch.cuda.is_available() else "cpu"
47
- if device == "cuda":
48
- model = model.half()
49
-
50
- model = model.to(device)
51
- model.eval()
52
-
53
- torch.set_grad_enabled(False)
54
-
55
- if hasattr(torch, 'compile'):
56
- try:
57
- model = torch.compile(model)
58
- logger.info("Model compiled successfully")
59
- except Exception as e:
60
- logger.warning(f"Could not compile model: {e}")
61
-
62
- logger.info(f"Model initialized on {device}")
63
-
64
- return processor, model
65
 
66
- @lru_cache(maxsize=128)
67
- def preprocess_text(text: str, voice_preset: str):
68
- processor, _ = initialize_model()
69
- return processor(text, voice_preset=voice_preset)
 
70
 
71
- def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
 
72
  try:
73
- if not text.strip():
74
- raise ValueError("Please enter some text")
 
 
75
 
76
- processor, model = initialize_model()
77
- device = next(model.parameters()).device
78
 
79
- inputs = preprocess_text(text, voice_preset)
80
- inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
 
81
 
82
- with torch.inference_mode():
83
- audio_array = model.generate(
84
- **inputs,
85
- do_sample=False,
86
- num_beams=1,
87
- )
88
 
 
89
  audio_array = audio_array.cpu().numpy().squeeze()
90
- sample_rate = model.generation_config.sample_rate
91
 
92
- os.makedirs("outputs", exist_ok=True)
93
- output_path = os.path.join("outputs", f"audio_{hash(text)}_{hash(voice_preset)}.wav")
94
 
95
- scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
 
96
 
97
- return output_path
98
-
 
 
 
99
  except Exception as e:
100
- logger.error(f"Error in text_to_speech: {str(e)}")
101
- raise gr.Error(str(e))
102
 
103
  # Define available voice presets
104
  voice_presets = [
@@ -109,54 +78,20 @@ voice_presets = [
109
  "v2/hi_speaker_5"
110
  ]
111
 
112
- # Create Gradio interface
113
- with gr.Blocks() as demo:
114
- gr.Markdown("# Bark Text-to-Speech (Optimized)")
115
-
116
- with gr.Row():
117
- with gr.Column():
118
- text_input = gr.Textbox(
119
- label="Enter text (Hindi or English)",
120
- placeholder="तुम बहुत अच्छे हो...",
121
- lines=3
122
- )
123
- voice_input = gr.Dropdown(
124
- choices=voice_presets,
125
- value="v2/hi_speaker_2",
126
- label="Select Voice"
127
- )
128
- submit_btn = gr.Button("Generate Speech")
129
-
130
- with gr.Column():
131
- audio_output = gr.Audio(label="Generated Speech")
132
-
133
- # Fixed Examples implementation
134
- gr.Examples(
135
- examples=[
136
- ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
137
- ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
138
- ],
139
- inputs=[text_input, voice_input],
140
- outputs=audio_output,
141
- fn=text_to_speech, # Add the function reference
142
- cache_examples=True
143
- )
144
-
145
- # Connect components
146
- submit_btn.click(
147
- fn=text_to_speech,
148
- inputs=[text_input, voice_input],
149
- outputs=audio_output
150
- )
151
 
152
- # Launch the app
153
  if __name__ == "__main__":
154
- # Initialize model at startup
155
- initialize_model()
156
-
157
- # Launch with optimized settings
158
- demo.launch(
159
- enable_queue=True,
160
- show_error=True,
161
- share=True # Enable sharing (optional)
162
- )
 
 
 
 
1
  import gradio as gr
2
+ from transformers import AutoProcessor, BarkModel
3
  import scipy.io.wavfile
4
+ import torch
5
+ import os
6
  from typing import Optional
7
+ import numpy as np
8
 
9
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

_ON_GPU = DEVICE == "cuda"

# Load the Bark processor and model once, at import time, so every request
# reuses the same instances. Weights are requested in half precision on the
# GPU and in full precision on the CPU.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained(
    "suno/bark",
    torch_dtype=torch.float16 if _ON_GPU else torch.float32,
)
model.to(DEVICE)

if _ON_GPU:
    # Turn on cuDNN's benchmark mode for GPU runs.
    torch.backends.cudnn.benchmark = True
model.eval()  # evaluation mode

# Directory holding generated WAV files; created up front so writers can
# assume it exists.
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
def get_cache_path(text: str, voice_preset: str) -> str:
    """Return the cache file path for a (text, voice preset) pair.

    The file name is the hexadecimal MD5 digest of ``"<text>_<voice_preset>"``
    followed by ``.wav``, located inside ``CACHE_DIR``. The digest is used
    only to derive a file name from the request, so identical requests map to
    the same file.
    """
    import hashlib

    cache_key = f"{text}_{voice_preset}".encode()
    digest = hashlib.md5(cache_key).hexdigest()
    filename = f"{digest}.wav"
    return os.path.join(CACHE_DIR, filename)
31
 
32
@torch.inference_mode()  # autograd is disabled for the whole call
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2") -> Optional[str]:
    """Synthesize ``text`` with Bark and return the path of the WAV file.

    Results are cached on disk at the path given by ``get_cache_path``: a
    repeated request for the same text and voice preset returns the existing
    file without running the model again. Generation uses sampling
    (``do_sample=True``), so the first result produced for a given request is
    the one the cache keeps serving.

    Args:
        text: Text to speak. Empty or whitespace-only input is rejected.
        voice_preset: Voice preset identifier forwarded to the processor.

    Returns:
        Path to the generated (or cached) WAV file, or ``None`` when the input
        is empty or generation fails. Failures are printed, not raised.
    """
    try:
        # Reject empty input before it reaches the cache or the model.
        if not text or not text.strip():
            raise ValueError("Please enter some text")

        # Serve from the cache when this request was already synthesized.
        cache_path = get_cache_path(text, voice_preset)
        if os.path.exists(cache_path):
            return cache_path

        # Tokenize the text and attach the voice preset.
        inputs = processor(text, voice_preset=voice_preset)

        # Move tensor inputs to the model's device; leave other values as-is.
        inputs = {
            k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        # Mixed precision on GPU. On CPU, no_grad() only fills the context
        # slot: gradients are already disabled by the decorator.
        with torch.cuda.amp.autocast() if DEVICE == "cuda" else torch.no_grad():
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                guidance_scale=2.5,
                temperature=0.7,
            )

        # Move to CPU and convert to a NumPy array.
        audio_array = audio_array.cpu().numpy().squeeze()

        # Clip samples to [-1, 1] and store them as float32: the model may run
        # in half precision, and float16 is not a sample format documented for
        # scipy.io.wavfile.write.
        audio_array = np.clip(audio_array, -1, 1).astype(np.float32)

        # Sample rate comes from the model's generation config.
        sample_rate = model.generation_config.sample_rate

        # Write to a temporary name and rename into place, so a failed or
        # interrupted write never leaves a partial file that the existence
        # check above would later return as a valid cache hit.
        tmp_path = f"{cache_path}.tmp"
        scipy.io.wavfile.write(tmp_path, rate=sample_rate, data=audio_array)
        os.replace(tmp_path, cache_path)

        return cache_path

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error generating audio: {str(e)}")
        return None
71
 
72
  # Define available voice presets
73
  voice_presets = [
 
78
  "v2/hi_speaker_5"
79
  ]
80
 
81
# Build the Gradio UI: a text box and a voice dropdown feed text_to_speech,
# and the file path it returns is shown in the audio component.
_text_box = gr.Textbox(label="Enter text (Hindi or English)")
_voice_dropdown = gr.Dropdown(
    choices=voice_presets,
    value="v2/hi_speaker_2",
    label="Select Voice",
)
_audio_player = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_box, _voice_dropdown],
    outputs=_audio_player,
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
    # NOTE(review): no `examples` are passed, so there may be nothing for
    # cache_examples to cache - confirm against the installed Gradio version.
    cache_examples=True,
)

# Start the app only when run as a script; launch() uses its default options.
if __name__ == "__main__":
    demo.launch()
97
+