Spaces:

ayush2607
/

Bark_tts_hindi

Running

App Files Files Community

Bark_tts_hindi / app.py

ayush2607

Update app.py

d82b8b6 verified 4 months ago

raw

history blame

3.27 kB

	import gradio as gr
	from transformers import AutoProcessor, BarkModel
	import scipy.io.wavfile
	import torch
	import os
	from typing import Optional
	import numpy as np

	# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
	DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

	# Half precision is requested only for CUDA; the CPU path uses float32.
	_MODEL_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

	# Processor and model are created once at import time and shared by every
	# request handled by this module.
	processor = AutoProcessor.from_pretrained("suno/bark")
	model = BarkModel.from_pretrained("suno/bark", torch_dtype=_MODEL_DTYPE)
	model.to(DEVICE)

	if DEVICE == "cuda":
	    # Turn on cuDNN benchmark mode (only set when running on CUDA).
	    torch.backends.cudnn.benchmark = True

	# Put the model in evaluation mode.
	# NOTE(review): the source this was recovered from had lost its indentation,
	# so it is unclear whether this call originally sat inside the CUDA branch.
	# It is applied unconditionally here; confirm that is the intent.
	model.eval()

	# Directory (relative to the working directory) that holds generated WAV
	# files; it is created on import if it does not exist yet.
	CACHE_DIR = "audio_cache"
	os.makedirs(name=CACHE_DIR, exist_ok=True)

	def get_cache_path(text: str, voice_preset: str) -> str:
	    """Return the cache file path for a text / voice-preset pair.

	    The file name is the MD5 hex digest of ``"{text}_{voice_preset}"``
	    followed by ``.wav``, joined onto ``CACHE_DIR``. The digest is used only
	    to derive a file name.
	    """
	    import hashlib

	    cache_key = f"{text}_{voice_preset}"
	    digest = hashlib.md5(cache_key.encode()).hexdigest()
	    file_name = f"{digest}.wav"
	    return os.path.join(CACHE_DIR, file_name)

	@torch.inference_mode()  # Disables autograd tracking for the whole call
	def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2") -> Optional[str]:
	    """Synthesize ``text`` with Bark and return the path of the WAV file.

	    Results are cached on disk: the target path comes from ``get_cache_path``
	    and, when that file already exists, it is returned without running the
	    model again.

	    Args:
	        text: Prompt to speak. Empty or whitespace-only input is rejected.
	        voice_preset: Bark voice preset name handed to the processor.

	    Returns:
	        Path of the generated (or cached) WAV file, or ``None`` when the input
	        is empty or generation fails. Failures are printed, never raised.
	    """
	    # Nothing to synthesize: skip the (expensive) model call entirely.
	    if not text or not text.strip():
	        return None

	    try:
	        cache_path = get_cache_path(text, voice_preset)
	        if os.path.exists(cache_path):
	            return cache_path

	        inputs = processor(text, voice_preset=voice_preset)

	        # Move every value that supports ``.to`` onto the model's device, not
	        # only bare tensors.
	        # NOTE(review): with a voice preset the processor output is expected
	        # to carry a nested, non-tensor container (``history_prompt``) that an
	        # ``isinstance(..., torch.Tensor)`` check would leave on the CPU;
	        # confirm against the Bark processor documentation.
	        inputs = {
	            name: value.to(DEVICE) if hasattr(value, "to") else value
	            for name, value in inputs.items()
	        }

	        # Mixed precision applies only on CUDA. ``torch.autocast`` replaces the
	        # deprecated ``torch.cuda.amp.autocast``. On CPU ``no_grad`` is kept as
	        # a harmless placeholder context (inference_mode already covers it).
	        if DEVICE == "cuda":
	            generation_context = torch.autocast(device_type="cuda")
	        else:
	            generation_context = torch.no_grad()

	        with generation_context:
	            audio_array = model.generate(
	                **inputs,
	                do_sample=True,
	                guidance_scale=2.5,
	                temperature=0.7,
	            )

	        # A half-precision model yields float16 samples, which
	        # scipy.io.wavfile.write does not accept; widen to float32 first.
	        audio_array = audio_array.cpu().to(torch.float32).numpy().squeeze()

	        # Clamp samples to [-1, 1], the range used for floating-point WAV data.
	        audio_array = np.clip(audio_array, -1, 1)

	        sample_rate = model.generation_config.sample_rate

	        # Write to a temporary sibling and rename it into place, so a failed
	        # or partial write never leaves a broken file at ``cache_path`` that
	        # later cache hits would hand back.
	        tmp_path = f"{cache_path}.{os.getpid()}.tmp"
	        try:
	            scipy.io.wavfile.write(tmp_path, rate=sample_rate, data=audio_array)
	            os.replace(tmp_path, cache_path)
	        finally:
	            # Only still present when the write or the rename failed.
	            if os.path.exists(tmp_path):
	                os.remove(tmp_path)

	        return cache_path

	    except Exception as e:
	        # UI boundary: report the failure and return None instead of raising.
	        print(f"Error generating audio: {str(e)}")
	        return None

	# Voice presets offered in the UI: Hindi speakers 1 through 5.
	voice_presets = [f"v2/hi_speaker_{speaker_id}" for speaker_id in range(1, 6)]

	# Build the Gradio UI: a text box and a voice selector feed text_to_speech,
	# and the returned file path is rendered by an audio component.
	_text_input = gr.Textbox(label="Enter text (Hindi or English)")
	_voice_input = gr.Dropdown(
	    choices=voice_presets,
	    value="v2/hi_speaker_2",
	    label="Select Voice",
	)
	_audio_output = gr.Audio(label="Generated Speech")

	# NOTE(review): cache_examples=True is passed without any `examples`;
	# verify that the flag has an effect in the Gradio version in use.
	demo = gr.Interface(
	    fn=text_to_speech,
	    inputs=[_text_input, _voice_input],
	    outputs=_audio_output,
	    title="Bark Text-to-Speech",
	    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
	    cache_examples=True,
	)

	# Start the app only when this file is executed as a script; launch() is
	# called with its default arguments.
	if __name__ == "__main__":
	    demo.launch()