Bark_tts_hindi / app.py
ayush2607's picture
Update app.py
27d6995 verified
raw
history blame
3.15 kB
import gradio as gr
from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile
import torch
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
print(f"Using device: {DEVICE}")

# Load the Bark processor and the model weights.
processor = AutoProcessor.from_pretrained("suno/bark", trust_remote_code=True)
model = BarkModel.from_pretrained(
    "suno/bark",
    trust_remote_code=True,
    torch_dtype=torch.float32,  # Using float32 for stability
)

# Place the model on the selected device and switch it to evaluation mode.
model = model.to(DEVICE)
model.eval()

# Relative directory that holds the generated WAV files; created on start-up
# if it does not exist yet.
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
def clean_text(text):
    """Return *text* with leading and trailing whitespace removed.

    Any value that is not a ``str`` (for example ``None``) yields the empty
    string, so callers can validate the result with a plain truthiness check.
    """
    return text.strip() if isinstance(text, str) else ""
def get_cache_path(text: str, voice_preset: str) -> str:
    """Return the WAV file path inside ``CACHE_DIR`` for a text/voice pair.

    The file name is the MD5 hex digest of ``"<text>_<voice_preset>"``
    followed by ``.wav``. MD5 is used here only to derive a file name from
    the inputs.
    """
    from hashlib import md5

    cache_key = f"{text}_{voice_preset}"
    file_name = md5(cache_key.encode()).hexdigest() + ".wav"
    return os.path.join(CACHE_DIR, file_name)
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
    """Convert text to speech with the Bark model and return a WAV file path.

    Args:
        text: Text to synthesise. It is passed through ``clean_text`` first.
        voice_preset: Bark voice preset identifier handed to the processor.

    Returns:
        Path of the WAV file for this text/voice pair, or ``None`` when the
        cleaned text is empty or when synthesis fails (the error is printed
        rather than raised, so the UI simply shows no audio).

    Note:
        A file already present at the cache path is returned without running
        the model again, so repeated requests for the same text and voice
        yield the same audio even though generation itself is sampled.
    """
    try:
        # Clean and validate input
        text = clean_text(text)
        if not text:
            return None

        # Reuse the audio generated earlier for the same text/voice pair.
        cache_path = get_cache_path(text, voice_preset)
        if os.path.isfile(cache_path):
            return cache_path

        # Process the text
        inputs = processor(text, voice_preset=voice_preset)

        # Move inputs to the model's device. Every value exposing ``.to`` is
        # moved, not only bare tensors.
        # NOTE(review): the Bark processor appears to return the voice preset
        # as a nested container under "history_prompt"; a tensor-only check
        # would leave it on the CPU. Confirm against the installed
        # transformers version.
        inputs = {
            key: value.to(DEVICE) if hasattr(value, "to") else value
            for key, value in inputs.items()
        }

        # Generate audio
        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                temperature=0.7,
            )

        # Drop singleton dimensions and clamp samples to [-1, 1].
        audio_array = audio_array.cpu().numpy().squeeze()
        audio_array = np.clip(audio_array, -1, 1)

        # Write under a temporary name and rename afterwards, so an
        # interrupted write never leaves a truncated file at the cache path
        # where a later call would return it as a cache hit.
        sample_rate = model.generation_config.sample_rate
        partial_path = f"{cache_path}.part"
        scipy.io.wavfile.write(partial_path, rate=sample_rate, data=audio_array)
        os.replace(partial_path, cache_path)
        return cache_path
    except Exception as e:
        # UI boundary: report the failure and return no audio.
        print(f"Error in text_to_speech: {str(e)}")
        return None
# Voice presets offered in the dropdown
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5",
]

# Input and output components of the interface
text_input = gr.Textbox(
    lines=3,
    placeholder="Type your text here...",
    label="Enter text (Hindi or English)",
)
voice_input = gr.Dropdown(
    label="Select Voice",
    value="v2/hi_speaker_2",
    choices=voice_presets,
)
audio_output = gr.Audio(label="Generated Speech")

# Wire the components to text_to_speech
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[text_input, voice_input],
    outputs=audio_output,
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
)

# Launch the app
demo.launch()