Bark_tts_hindi / app.py
ayush2607's picture
Update app.py
e2e5380 verified
raw
history blame
5.89 kB
import os
import sys
import logging
import gradio as gr
import torch
import scipy.io.wavfile
import warnings
from functools import lru_cache
from typing import Optional
# Module-level logger; basicConfig requests INFO-level output on the root logger.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Install a blanket "ignore" filter so library warnings do not clutter the output.
warnings.filterwarnings(action='ignore')
def check_dependencies():
    """Report whether the Bark classes can be imported from ``transformers``.

    Returns:
        bool: True when ``AutoProcessor`` and ``BarkModel`` import cleanly;
        False when the import raises ImportError (the error is logged).
    """
    try:
        # Imported only as a probe; the names are not used here.
        from transformers import AutoProcessor, BarkModel
    except ImportError as import_failure:
        logger.error(f"Error importing required modules: {str(import_failure)}")
        return False
    else:
        return True
# Stop at import time, with install instructions, when the probe import fails.
if not check_dependencies():
    logger.error("Required dependencies not found. Please install them using:")
    logger.error("pip install -r requirements.txt")
    # Same effect as sys.exit(1): raises SystemExit with exit status 1.
    raise SystemExit(1)
from transformers import AutoProcessor, BarkModel
# Shared processor/model handles; both stay None until initialize_model() sets them.
processor, model = None, None
def initialize_model():
    """Load the Bark processor and model once and return the shared pair.

    The first call loads ``suno/bark``, moves the model to CUDA (in half
    precision) when a GPU is available, switches it to eval mode, disables
    gradient tracking via ``torch.set_grad_enabled(False)`` and, when
    ``torch.compile`` exists, attempts to compile the model. Later calls
    return the already-initialised globals without reloading.

    Everything is built in local variables and only published to the module
    globals at the very end, so a failure part-way through (for example
    while moving the model to the GPU) never leaves a half-configured model
    behind for later calls to pick up.

    Returns:
        tuple: ``(processor, model)``.
    """
    global processor, model

    # Fast path: both objects were already published by an earlier call.
    if processor is not None and model is not None:
        return processor, model

    logger.info("Initializing model and processor...")

    loaded_processor = AutoProcessor.from_pretrained("suno/bark")
    loaded_model = BarkModel.from_pretrained("suno/bark")

    # Prefer the GPU; half precision is only applied on CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        loaded_model = loaded_model.half()
    loaded_model = loaded_model.to(device)

    # Inference-only setup: eval mode and no gradient tracking.
    loaded_model.eval()
    torch.set_grad_enabled(False)

    # torch.compile only exists on PyTorch 2.0+; compilation is best-effort.
    if hasattr(torch, 'compile'):
        try:
            loaded_model = torch.compile(loaded_model)
            logger.info("Model compiled successfully")
        except Exception as e:
            logger.warning(f"Could not compile model: {e}")

    # Publish only after every step above has succeeded.
    processor, model = loaded_processor, loaded_model
    logger.info(f"Model initialized on {device}")
    return processor, model
# Memoise processor output for up to 128 distinct (text, voice_preset) pairs
@lru_cache(maxsize=128)
def preprocess_text(text: str, voice_preset: str):
    """Run the Bark processor on *text* with the given voice preset.

    Args:
        text: Text to encode.
        voice_preset: Preset name forwarded to the processor.

    Returns:
        Whatever the processor call returns. Repeated calls with the same
        arguments hand back the same cached object, so callers should not
        mutate it.
    """
    bark_processor = initialize_model()[0]
    encoded_inputs = bark_processor(text, voice_preset=voice_preset)
    return encoded_inputs
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
    """Synthesize *text* with Bark and return the path of the written WAV file.

    Args:
        text: Text to speak. Empty, whitespace-only or missing text is rejected.
        voice_preset: Bark voice preset forwarded to the processor.
        history: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        str: Path of the generated WAV file inside the ``outputs`` directory.

    Raises:
        gr.Error: For invalid input or any failure during generation or
            writing; the original exception is chained as the cause.
    """
    # Local import so this function does not depend on a file-level import.
    import hashlib

    try:
        # Also guards against None, which has no .strip().
        if not text or not text.strip():
            raise ValueError("Please enter some text")

        # Make sure the model is loaded; the processor is used via preprocess_text.
        _, model = initialize_model()
        device = next(model.parameters()).device

        # Cached preprocessing; build a new dict so the cached object is not mutated.
        inputs = preprocess_text(text, voice_preset)
        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}

        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=False,  # greedy decoding, no sampling
                num_beams=1,      # no beam search
            )

        # The model runs in half precision on CUDA and scipy's WAV writer does
        # not accept float16, so cast to float32 before converting to numpy.
        audio_array = audio_array.float().cpu().numpy().squeeze()
        sample_rate = model.generation_config.sample_rate

        os.makedirs("outputs", exist_ok=True)

        # Built-in hash() is salted per process and may be negative; a SHA-256
        # digest gives the same filename for the same request across restarts.
        request_key = f"{voice_preset}\x00{text}".encode("utf-8")
        digest = hashlib.sha256(request_key).hexdigest()[:16]
        output_path = os.path.join("outputs", f"audio_{digest}.wav")

        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
        return output_path
    except Exception as e:
        logger.error("Error in text_to_speech: %s", e)
        # Surface the failure in the UI while keeping the original traceback chained.
        raise gr.Error(str(e)) from e
# Hindi Bark speaker presets 1 through 5, offered as the dropdown choices
voice_presets = [f"v2/hi_speaker_{speaker_index}" for speaker_index in range(1, 6)]
# Build the Gradio UI: text + voice inputs on the left, generated audio on the right.
with gr.Blocks(analytics_enabled=False) as demo:
    gr.Markdown("# Bark Text-to-Speech (Optimized)")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter text (Hindi or English)",
                placeholder="तुम बहुत अच्छे हो...",
                lines=3
            )
            voice_input = gr.Dropdown(
                choices=voice_presets,
                value="v2/hi_speaker_2",
                label="Select Voice"
            )
            submit_btn = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")

    # Clickable examples. Caching needs the function that produces the outputs:
    # with cache_examples=True and no `fn`, Gradio raises a ValueError while
    # building the UI. Note that caching runs text_to_speech on every example
    # when the interface is built.
    gr.Examples(
        examples=[
            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
        ],
        inputs=[text_input, voice_input],
        outputs=audio_output,
        fn=text_to_speech,
        cache_examples=True  # Cache example outputs
    )

    # Wire the button to the synthesis function.
    submit_btn.click(
        fn=text_to_speech,
        inputs=[text_input, voice_input],
        outputs=audio_output
    )
# Script entry point: warm up the model, then serve the UI.
if __name__ == "__main__":
    # Load the model before the first request so it does not pay the start-up cost.
    initialize_model()

    # Request queuing is enabled through queue(); the `enable_queue` launch
    # argument is deprecated/removed in current Gradio releases.
    demo.queue()

    # launch() has no `cache_examples` parameter (passing it raises TypeError);
    # example caching is configured on gr.Examples instead.
    demo.launch(
        show_error=True,  # Show errors for debugging
    )