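"""Gradio demo: Bark text-to-speech (suno/bark).

Offers Hindi voice presets, lazy one-time model initialization, optional
half-precision GPU inference, and lru_cache-backed text preprocessing.
"""
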
import os
import sys
import logging
import gradio as gr
import torch
import scipy.io.wavfile
import warnings
from functools import lru_cache
from typing import Optional
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Suppress warnings
warnings.filterwarnings('ignore')

def check_dependencies():
    try:
        from transformers import AutoProcessor, BarkModel
        return True
    except ImportError as e:
        logger.error(f"Error importing required modules: {str(e)}")
        return False

if not check_dependencies():
    logger.error("Required dependencies not found. Please install them using:")
    logger.error("pip install -r requirements.txt")
    sys.exit(1)

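# (requirements.txt presumably pins at least: torch, transformers, gradio, scipy)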
from transformers import AutoProcessor, BarkModel
# Global variables for model and processor
processor = None
model = None

def initialize_model():
    global processor, model
    # Initialize processor and model only once
    if processor is None or model is None:
        logger.info("Initializing model and processor...")
        # Load processor
        processor = AutoProcessor.from_pretrained("suno/bark")
        # Load model with optimizations
        model = BarkModel.from_pretrained("suno/bark")
        # Move model to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cuda":
            # Use half-precision floating point numbers
            model = model.half()
        model = model.to(device)
        # Enable model optimization
        model.eval()
        torch.set_grad_enabled(False)
        # Optional: use torch.compile for PyTorch 2.0+
        if hasattr(torch, 'compile'):
            try:
                model = torch.compile(model)
                logger.info("Model compiled successfully")
            except Exception as e:
                logger.warning(f"Could not compile model: {e}")
        logger.info(f"Model initialized on {device}")
    return processor, model

# Cache the text preprocessing step
@lru_cache(maxsize=128)
def preprocess_text(text: str, voice_preset: str):
    processor, _ = initialize_model()
    return processor(text, voice_preset=voice_preset)
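
# Note: lru_cache keys on (text, voice_preset), which are hashable strings; the
# cached result is shared between calls, so it must not be mutated in place
# (text_to_speech below builds a fresh dict before moving tensors to the device).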

def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
    try:
        if not text.strip():
            raise ValueError("Please enter some text")
        # Initialize model if not already initialized
        processor, model = initialize_model()
        # Get device
        device = next(model.parameters()).device
        # Preprocess text (cached)
        inputs = preprocess_text(text, voice_preset)
        # Move inputs to the same device as the model
        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
        # Generate audio with optimized settings
        with torch.inference_mode():  # Faster than no_grad()
            audio_array = model.generate(
                **inputs,
                do_sample=False,  # Deterministic generation is faster
                num_beams=1,      # No beam search for faster generation
            )
        # Move to CPU, upcast to float32 (scipy cannot write the float16 WAV
        # data produced by the half-precision model), and convert to numpy
        audio_array = audio_array.cpu().float().numpy().squeeze()
        # Get sample rate from model config
        sample_rate = model.generation_config.sample_rate
        # Create output directory if it doesn't exist
        os.makedirs("outputs", exist_ok=True)
        # Unique filename per (text, preset) pair; note that hash() is salted
        # per process, so filenames are only stable within a single run
        output_path = os.path.join("outputs", f"audio_{hash(text)}_{hash(voice_preset)}.wav")
        # Save audio file
        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
        return output_path
    except Exception as e:
        logger.error(f"Error in text_to_speech: {str(e)}")
        raise gr.Error(str(e))

# Define available voice presets
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5"
]
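# Bark's preset library also includes English voices (e.g. "v2/en_speaker_6")
# that could be added here, since the textbox accepts English input too.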

# Create Gradio interface with optimized settings
with gr.Blocks(analytics_enabled=False) as demo:
    gr.Markdown("# Bark Text-to-Speech (Optimized)")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text (Hindi or English)",
                placeholder="तुम बहुत अच्छे हो...",  # Hindi: "You are very nice..."
                lines=3
            )
            voice_input = gr.Dropdown(
                choices=voice_presets,
                value="v2/hi_speaker_2",
                label="Select Voice"
            )
            submit_btn = gr.Button("Generate Speech")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
    # Add examples
    gr.Examples(
        examples=[
            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],  # Hindi version of the English example below
            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
        ],
        inputs=[text_input, voice_input],
        outputs=audio_output,
        fn=text_to_speech,   # fn is required when cache_examples=True
        cache_examples=True  # Cache example outputs
    )
    # Connect components
    submit_btn.click(
        fn=text_to_speech,
        inputs=[text_input, voice_input],
        outputs=audio_output
    )

# Launch the app with optimized settings
if __name__ == "__main__":
    # Initialize model at startup
    initialize_model()
    # Enable the queue for better handling of multiple requests
    # (enable_queue= is no longer a launch() argument, and cache_examples
    # belongs to gr.Examples above, not to launch())
    demo.queue()
    demo.launch(
        show_error=True,  # Show errors for debugging
    )
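
# Usage (assuming this file is the Space's app.py):
#   python app.py
# Programmatic use:
#   wav_path = text_to_speech("You are very nice", "v2/hi_speaker_1")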