Spaces:

ayush2607
/

Bark_tts_hindi

Running

App Files Files Community

Bark_tts_hindi / app.py

ayush2607

Update app.py

e2e5380 verified 4 months ago

raw

history blame

5.89 kB

	import os
	import sys
	import logging
	import gradio as gr
	import torch
	import scipy.io.wavfile
	import warnings
	from functools import lru_cache
	from typing import Optional

	# Send log records at INFO level and above to the root handler.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(name=__name__)

	# Silence every warning category for the lifetime of the process.
	warnings.filterwarnings(action='ignore')

	def check_dependencies() -> bool:
	    """Report whether the `transformers` classes used by this app import cleanly.

	    Returns:
	        True when `AutoProcessor` and `BarkModel` can be imported; False
	        otherwise, after logging the import error.
	    """
	    try:
	        # Probe import only: the imported names are not used in this function.
	        from transformers import AutoProcessor, BarkModel
	    except ImportError as e:
	        logger.error(f"Error importing required modules: {str(e)}")
	        return False
	    else:
	        return True

	# Abort startup with install instructions when the required imports fail.
	if not check_dependencies():
	    for message in (
	        "Required dependencies not found. Please install them using:",
	        "pip install -r requirements.txt",
	    ):
	        logger.error(message)
	    sys.exit(1)

	from transformers import AutoProcessor, BarkModel

	# Module-level singletons; both start unset and are filled by initialize_model().
	processor, model = None, None

	def initialize_model():
	    """Load the Bark processor and model on first use and return the shared pair.

	    The loaded objects are stored in the module-level `processor` and `model`
	    globals; later calls return them without reloading. As a side effect the
	    first call disables gradient tracking globally via
	    `torch.set_grad_enabled(False)`.

	    Returns:
	        The `(processor, model)` tuple held in the module globals.
	    """
	    global processor, model

	    # Fast path: both singletons were already created by an earlier call.
	    if processor is not None and model is not None:
	        return processor, model

	    logger.info("Initializing model and processor...")

	    processor = AutoProcessor.from_pretrained("suno/bark")
	    model = BarkModel.from_pretrained("suno/bark")

	    # Prefer the GPU when one is visible; half precision is applied only there.
	    use_cuda = torch.cuda.is_available()
	    device = "cuda" if use_cuda else "cpu"
	    if use_cuda:
	        model = model.half()
	    model = model.to(device)

	    # Inference-only setup: eval mode and no autograd bookkeeping.
	    model.eval()
	    torch.set_grad_enabled(False)

	    # torch.compile only exists on PyTorch 2.0+; failure to wrap is non-fatal.
	    if hasattr(torch, 'compile'):
	        try:
	            model = torch.compile(model)
	            logger.info("Model compiled successfully")
	        except Exception as e:
	            logger.warning(f"Could not compile model: {e}")

	    logger.info(f"Model initialized on {device}")
	    return processor, model

	# Memoize processor output for repeated (text, voice_preset) pairs.
	@lru_cache(maxsize=128)
	def preprocess_text(text: str, voice_preset: str):
	    """Run the shared Bark processor on `text` with the given voice preset.

	    Results are memoized for up to 128 distinct argument pairs, so a cache hit
	    returns the very same object that an earlier call produced.

	    Args:
	        text: Text to encode.
	        voice_preset: Voice preset name forwarded to the processor.

	    Returns:
	        Whatever the processor call returns for these arguments.
	    """
	    shared_processor = initialize_model()[0]
	    encoded = shared_processor(text, voice_preset=voice_preset)
	    return encoded

	def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
	    """Synthesize speech for `text` and return the path of the written WAV file.

	    Args:
	        text: Text to speak; must contain at least one non-whitespace character.
	        voice_preset: Voice preset name forwarded to the processor.
	        history: Unused; kept so existing callers remain compatible.

	    Returns:
	        Path of the WAV file written under the ``outputs`` directory.

	    Raises:
	        gr.Error: For empty input or any failure during generation; the
	            message is the text of the underlying exception.
	    """
	    try:
	        # Treat None the same as a blank string instead of failing on .strip().
	        if not text or not text.strip():
	            raise ValueError("Please enter some text")

	        # Load (or reuse) the shared model; the processor is used via preprocess_text().
	        _, model = initialize_model()

	        # Run on whichever device the model's parameters live on.
	        device = next(model.parameters()).device

	        # Preprocess text (memoized per text/preset pair).
	        inputs = preprocess_text(text, voice_preset)

	        # Build a new mapping with every movable value placed on the model's device.
	        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}

	        # Greedy decoding without beam search, with autograd disabled.
	        with torch.inference_mode():
	            audio_array = model.generate(
	                **inputs,
	                do_sample=False,
	                num_beams=1,
	            )

	        # The model runs in half precision on CUDA, and scipy's WAV writer
	        # rejects float16 data, so cast to float32 before converting to numpy.
	        audio_array = audio_array.cpu().to(torch.float32).numpy().squeeze()

	        # Sample rate comes from the model's generation config.
	        sample_rate = model.generation_config.sample_rate

	        os.makedirs("outputs", exist_ok=True)

	        # File name is derived from the hashes of the text and the preset.
	        output_path = os.path.join("outputs", f"audio_{hash(text)}_{hash(voice_preset)}.wav")

	        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)

	        return output_path

	    except Exception as e:
	        # UI boundary: log, then surface the message to the user as a Gradio
	        # error while keeping the original exception as the cause.
	        logger.error(f"Error in text_to_speech: {str(e)}")
	        raise gr.Error(str(e)) from e

	# Hindi speaker presets 1 through 5 offered in the voice dropdown.
	voice_presets = [f"v2/hi_speaker_{index}" for index in range(1, 6)]

	# Build the Gradio UI: text and voice inputs on the left, audio player on the right.
	with gr.Blocks(analytics_enabled=False) as demo:
	    gr.Markdown("# Bark Text-to-Speech (Optimized)")

	    with gr.Row():
	        with gr.Column():
	            text_input = gr.Textbox(
	                label="Enter text (Hindi or English)",
	                placeholder="तुम बहुत अच्छे हो...",
	                lines=3
	            )
	            voice_input = gr.Dropdown(
	                choices=voice_presets,
	                value="v2/hi_speaker_2",
	                label="Select Voice"
	            )
	            submit_btn = gr.Button("Generate Speech")

	        with gr.Column():
	            audio_output = gr.Audio(label="Generated Speech")

	    # Clickable examples. Caching example outputs requires the function that
	    # produces them, so `fn` must be passed alongside `cache_examples=True`.
	    gr.Examples(
	        examples=[
	            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
	            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
	        ],
	        inputs=[text_input, voice_input],
	        outputs=audio_output,
	        fn=text_to_speech,
	        cache_examples=True
	    )

	    # Generate speech from the current inputs when the button is clicked.
	    submit_btn.click(
	        fn=text_to_speech,
	        inputs=[text_input, voice_input],
	        outputs=audio_output
	    )

	# Start the app when this file is executed as a script.
	if __name__ == "__main__":
	    # Load the model up front so the first request does not pay for it.
	    initialize_model()

	    # Request queueing is enabled through Blocks.queue(); `enable_queue` is a
	    # deprecated launch() argument and `cache_examples` is not a launch()
	    # argument at all (example caching is configured on gr.Examples).
	    demo.queue()

	    # show_error surfaces exceptions in the UI for debugging.
	    demo.launch(show_error=True)