import gradio as gr
from transformers import AutoProcessor, BarkModel
import scipy.io.wavfile
import torch
import os
import hashlib
import numpy as np
import warnings

warnings.filterwarnings('ignore')

# Basic device setup
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
# Model initialization with basic settings
processor = AutoProcessor.from_pretrained(
    "suno/bark",
    trust_remote_code=True
)
model = BarkModel.from_pretrained(
    "suno/bark",
    torch_dtype=torch.float32,  # Using float32 for stability
    trust_remote_code=True
)

# Basic model optimization
model.to(DEVICE)
model.eval()
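# Optional, untested tweak: on a CUDA device the model can be run in half
# precision to roughly halve memory use. float32 is kept above for stability;
# uncomment the lines below to try fp16 instead.
# if DEVICE == "cuda":
#     model = model.half()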
# Define cache directory in the allowed space
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
def clean_text(text):
    """Clean and prepare text for processing."""
    if not isinstance(text, str):
        return ""
    return text.strip()


def get_cache_path(text: str, voice_preset: str) -> str:
    """Generate a unique cache path."""
    hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
    return os.path.join(CACHE_DIR, f"{hash_key}.wav")
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
    """Convert text to speech using the Bark model."""
    try:
        # Clean and validate input
        text = clean_text(text)
        if not text:
            return None

        # Reuse a previously generated file if this text/voice pair is cached
        cache_path = get_cache_path(text, voice_preset)
        if os.path.exists(cache_path):
            return cache_path

        # Process the text
        inputs = processor(text, voice_preset=voice_preset)

        # Move inputs to device
        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}

        # Generate audio
        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                temperature=0.7
            )

        # Post-process the audio
        audio_array = audio_array.cpu().numpy().squeeze()
        audio_array = np.clip(audio_array, -1, 1)

        # Save the audio
        sample_rate = model.generation_config.sample_rate
        scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)
        return cache_path
    except Exception as e:
        print(f"Error in text_to_speech: {e}")
        return None
# Voice presets
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5"
]
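# Note: Bark also ships English presets (e.g. "v2/en_speaker_0" through
# "v2/en_speaker_9"); they could be appended to the list above if an
# English-sounding voice is preferred for English input.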
# Create Gradio interface
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(
            label="Enter text (Hindi or English)",
            placeholder="Type your text here...",
            lines=3
        ),
        gr.Dropdown(
            choices=voice_presets,
            value="v2/hi_speaker_2",
            label="Select Voice"
        )
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text."
)

# Launch the app
demo.launch()
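# Quick local sanity check (hypothetical example text), bypassing the UI.
# demo.launch() above blocks, so this is left commented out:
# path = text_to_speech("Namaste, aap kaise hain?", "v2/hi_speaker_2")
# print(f"Wrote audio to: {path}")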