File size: 5,894 Bytes
e2e5380
 
 
28e403b
 
e2e5380
 
 
 
28e403b
e2e5380
 
 
28e403b
e2e5380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e403b
e2e5380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e403b
e2e5380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e403b
 
 
 
 
 
 
 
 
 
e2e5380
 
 
9ce6d3a
e2e5380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e403b
e2e5380
28e403b
e2e5380
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import hashlib
import logging
import os
import sys
import warnings
from functools import lru_cache
from typing import Optional

import gradio as gr
import scipy.io.wavfile
import torch

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress warnings
warnings.filterwarnings('ignore')

def check_dependencies():
    """Return True if the transformers Bark classes are importable.

    Logs the import failure so the caller can print install
    instructions before exiting.
    """
    try:
        from transformers import AutoProcessor, BarkModel  # noqa: F401
    except ImportError as e:
        logger.error(f"Error importing required modules: {str(e)}")
        return False
    return True

# Fail fast at import time: without transformers there is nothing to run,
# so print install instructions and exit before building any UI.
if not check_dependencies():
    logger.error("Required dependencies not found. Please install them using:")
    logger.error("pip install -r requirements.txt")
    sys.exit(1)

# Safe to import here -- check_dependencies() just proved these resolve.
from transformers import AutoProcessor, BarkModel

# Global variables for model and processor
# Lazily populated by initialize_model(); both stay None until first use.
processor = None
model = None

def initialize_model():
    """Lazily load the Bark processor and model (module-level singleton).

    The first call loads ``suno/bark``, moves the model to GPU in half
    precision when CUDA is available, switches to eval mode, disables
    autograd globally, and attempts ``torch.compile`` as a best-effort
    optimization.  Later calls return the cached objects unchanged.

    Returns:
        tuple: ``(processor, model)`` ready for inference.
    """
    global processor, model

    # Already initialized -- return the cached pair immediately.
    if processor is not None and model is not None:
        return processor, model

    logger.info("Initializing model and processor...")

    processor = AutoProcessor.from_pretrained("suno/bark")
    model = BarkModel.from_pretrained("suno/bark")

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    if use_cuda:
        # fp16 roughly halves GPU memory and speeds up inference.
        model = model.half()
    model = model.to(device)

    # Inference only: eval mode and no autograd bookkeeping anywhere.
    model.eval()
    torch.set_grad_enabled(False)

    # torch.compile exists on PyTorch 2.0+; treat failure as non-fatal.
    if hasattr(torch, 'compile'):
        try:
            model = torch.compile(model)
            logger.info("Model compiled successfully")
        except Exception as e:
            logger.warning(f"Could not compile model: {e}")

    logger.info(f"Model initialized on {device}")
    return processor, model

# Memoize tokenization so repeated (text, voice) pairs skip the processor.
@lru_cache(maxsize=128)
def preprocess_text(text: str, voice_preset: str):
    """Tokenize *text* for the given Bark voice preset (results cached)."""
    proc, _ = initialize_model()
    return proc(text, voice_preset=voice_preset)

def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
    """Generate speech audio for *text* using a Bark voice preset.

    Args:
        text: Input text (Hindi or English); must be non-empty.
        voice_preset: Bark speaker preset identifier.
        history: Unused; kept for Gradio callback compatibility.

    Returns:
        str: Path of the written ``.wav`` file under ``outputs/``.

    Raises:
        gr.Error: Wraps any underlying failure so Gradio shows it in the UI.
    """
    try:
        if not text.strip():
            raise ValueError("Please enter some text")

        # Initialize model if not already initialized
        processor, model = initialize_model()

        # Run on whatever device the model actually lives on.
        device = next(model.parameters()).device

        # Preprocess text (cached via lru_cache)
        inputs = preprocess_text(text, voice_preset)

        # Move tensor inputs to the model's device; pass others through.
        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}

        # inference_mode is slightly faster than no_grad for pure inference.
        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=False,  # Deterministic generation is faster
                num_beams=1,      # No beam search for faster generation
            )

        # BUG FIX: on GPU the model runs in fp16 and scipy's wavfile writer
        # does not accept float16 data -- upcast to float32 before saving.
        audio_array = audio_array.float().cpu().numpy().squeeze()

        # Get sample rate from model config
        sample_rate = model.generation_config.sample_rate

        # Create output directory if it doesn't exist
        os.makedirs("outputs", exist_ok=True)

        # BUG FIX: built-in hash() is salted per process (PYTHONHASHSEED),
        # so filenames changed every run; use a stable content digest.
        digest = hashlib.sha1(f"{text}|{voice_preset}".encode("utf-8")).hexdigest()[:16]
        output_path = os.path.join("outputs", f"audio_{digest}.wav")

        # Save audio file
        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)

        return output_path

    except Exception as e:
        logger.error(f"Error in text_to_speech: {str(e)}")
        raise gr.Error(str(e))

# Define available voice presets
# Hindi speaker presets shipped with suno/bark; shown in the UI dropdown.
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5"
]

# Create Gradio interface: text + voice inputs on the left, audio output
# on the right, with cached examples and a generate button.
with gr.Blocks(analytics_enabled=False) as demo:
    gr.Markdown("# Bark Text-to-Speech (Optimized)")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text (Hindi or English)",
                placeholder="तुम बहुत अच्छे हो...",
                lines=3
            )
            voice_input = gr.Dropdown(
                choices=voice_presets,
                value="v2/hi_speaker_2",
                label="Select Voice"
            )
            submit_btn = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")

    # BUG FIX: gr.Examples requires `fn` when `cache_examples=True` (and
    # when `outputs` is given); without it Gradio raises at startup.
    gr.Examples(
        examples=[
            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
        ],
        fn=text_to_speech,
        inputs=[text_input, voice_input],
        outputs=audio_output,
        cache_examples=True  # Cache example outputs
    )

    # Connect components: button click runs TTS and fills the audio player.
    submit_btn.click(
        fn=text_to_speech,
        inputs=[text_input, voice_input],
        outputs=audio_output
    )

# Launch the app with optimized settings
if __name__ == "__main__":
    # Warm the model at startup so the first request isn't slow.
    initialize_model()

    # BUG FIX: `enable_queue` and `cache_examples` are not valid launch()
    # keyword arguments in current Gradio (launch() raises TypeError);
    # request queueing is enabled via demo.queue() instead.
    demo.queue()
    demo.launch(
        show_error=True,  # Show errors for debugging
    )