Spaces:

ayush2607
/

Bark_tts_hindi

Running

App Files Files Community

Bark_tts_hindi / app.py

ayush2607

Update app.py

27d6995 verified 4 months ago

raw

history blame

3.15 kB

	import gradio as gr
	from transformers import AutoProcessor, BarkModel
	import scipy.io.wavfile
	import torch
	import os
	import numpy as np
	import warnings
	# Silence every warning category for the lifetime of the process
	warnings.filterwarnings("ignore")

	# Pick the compute device: GPU when CUDA is usable, otherwise CPU
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"
	print("Using device: " + DEVICE)

	# Load the Bark processor and model weights from the Hugging Face Hub.
	# Both classes are imported from the transformers library itself, so the
	# `trust_remote_code` opt-in is not needed; leaving it off means no code
	# from the model repository is ever executed on load.
	processor = AutoProcessor.from_pretrained("suno/bark")

	model = BarkModel.from_pretrained(
	    "suno/bark",
	    torch_dtype=torch.float32,  # Using float32 for stability
	)

	# Place the model on the selected device and switch it to evaluation mode
	model = model.to(DEVICE).eval()

	# Directory (relative to the working directory) that holds generated audio;
	# it is created up front and an already-existing directory is accepted.
	CACHE_DIR = "audio_cache"
	os.makedirs(name=CACHE_DIR, exist_ok=True)

	def clean_text(text):
	    """Return *text* stripped of surrounding whitespace.

	    Anything that is not a ``str`` yields the empty string.
	    """
	    return text.strip() if isinstance(text, str) else ""

	def get_cache_path(text: str, voice_preset: str) -> str:
	    """Build the cache file path for a text/voice combination.

	    The file name is the MD5 hex digest of ``"<text>_<voice_preset>"``
	    followed by ``.wav``, and the file lives inside ``CACHE_DIR``.
	    """
	    import hashlib

	    key_material = "{}_{}".format(text, voice_preset).encode()
	    digest = hashlib.md5(key_material).hexdigest()
	    file_name = digest + ".wav"
	    return os.path.join(CACHE_DIR, file_name)

	def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
	    """Convert text to speech using the Bark model.

	    Audio is cached on disk per (text, voice preset) pair: a repeated
	    request returns the previously generated file instead of running the
	    model again.

	    Args:
	        text: Text to synthesize. Non-string or whitespace-only input is
	            treated as empty.
	        voice_preset: Bark voice preset name handed to the processor.

	    Returns:
	        Path of the WAV file inside ``CACHE_DIR``, or ``None`` when the
	        input is empty or generation fails (the error is printed, not
	        raised).
	    """
	    try:
	        # Clean and validate input
	        text = clean_text(text)
	        if not text:
	            return None

	        cache_path = get_cache_path(text, voice_preset)

	        # Cache hit: this text/voice pair was already synthesized
	        if os.path.isfile(cache_path):
	            return cache_path

	        # Tokenize the text and attach the voice preset
	        inputs = processor(text, voice_preset=voice_preset)

	        # Move tensor inputs to the model's device; leave other values as-is
	        inputs = {
	            k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
	            for k, v in inputs.items()
	        }

	        # Generate audio without autograd bookkeeping
	        with torch.inference_mode():
	            audio_array = model.generate(
	                **inputs,
	                do_sample=True,
	                temperature=0.7,
	            )

	        # Bring the waveform to the CPU, drop singleton dims, bound to [-1, 1]
	        audio_array = audio_array.cpu().numpy().squeeze()
	        audio_array = np.clip(audio_array, -1, 1)

	        # Write to a temporary name first and rename afterwards, so a failed
	        # write never leaves a truncated file that would count as a cache hit
	        sample_rate = model.generation_config.sample_rate
	        tmp_path = f"{cache_path}.tmp"
	        scipy.io.wavfile.write(tmp_path, rate=sample_rate, data=audio_array)
	        os.replace(tmp_path, cache_path)

	        return cache_path

	    except Exception as e:
	        # UI boundary: report the failure and return no audio rather than crash
	        print(f"Error in text_to_speech: {str(e)}")
	        return None

	# Voice presets offered in the UI: Hindi speakers 1 through 5
	voice_presets = [f"v2/hi_speaker_{index}" for index in range(1, 6)]

	# Build the UI components first, then wire them into the interface
	text_input = gr.Textbox(
	    label="Enter text (Hindi or English)",
	    placeholder="Type your text here...",
	    lines=3,
	)
	voice_input = gr.Dropdown(
	    choices=voice_presets,
	    value="v2/hi_speaker_2",
	    label="Select Voice",
	)
	audio_output = gr.Audio(label="Generated Speech")

	# Interface that feeds the text box and voice picker into text_to_speech
	demo = gr.Interface(
	    fn=text_to_speech,
	    inputs=[text_input, voice_input],
	    outputs=audio_output,
	    title="Bark Text-to-Speech",
	    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
	)

	# Start serving the app
	demo.launch()