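"""Gradio demo for Suno's Bark text-to-speech model.

Loads suno/bark once at startup (float16 when CUDA is available), caches
generated WAV files keyed by an MD5 hash of the text and voice preset,
and serves a simple text-plus-voice-preset interface.
"""
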
import contextlib
import hashlib
import os
from typing import Optional

import gradio as gr
import numpy as np
import scipy.io.wavfile
import torch
from transformers import AutoProcessor, BarkModel

# Check for CUDA availability and set device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model and processor globally with optimizations
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32)
model.to(DEVICE)

# Enable model optimizations
if DEVICE == "cuda":
    torch.backends.cudnn.benchmark = True
model.eval()  # Set to evaluation mode

# Cache for storing generated audio files
CACHE_DIR = "audio_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
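# Note: cached files are never evicted; clear audio_cache/ manually if it grows large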

def get_cache_path(text: str, voice_preset: str) -> str:
    """Generate a unique cache path for the given text and voice preset."""
    hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
    return os.path.join(CACHE_DIR, f"{hash_key}.wav")

@torch.inference_mode()  # More efficient than no_grad for inference
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2") -> Optional[str]:
    try:
        # Check cache first
        cache_path = get_cache_path(text, voice_preset)
        if os.path.exists(cache_path):
            return cache_path

        # Generate audio from text
        inputs = processor(text, voice_preset=voice_preset)
        
        # Move inputs to device
        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v 
                 for k, v in inputs.items()}
        
        # Generate audio; autocast on CUDA to match the float16 weights
        # (gradients are already disabled by @torch.inference_mode())
        autocast_ctx = (torch.autocast(device_type="cuda", dtype=torch.float16)
                        if DEVICE == "cuda" else contextlib.nullcontext())
        with autocast_ctx:
            audio_array = model.generate(**inputs,
                                         do_sample=True,
                                         guidance_scale=2.5,
                                         temperature=0.7)
        
        # Move to CPU and convert to float32 numpy; scipy's WAV writer
        # does not accept the float16 arrays the CUDA model produces
        audio_array = audio_array.cpu().numpy().squeeze().astype(np.float32)

        # Clip to the valid [-1, 1] range for floating-point WAV data
        audio_array = np.clip(audio_array, -1, 1)
        
        # Get sample rate from model config
        sample_rate = model.generation_config.sample_rate
        
        # Save audio file to cache
        scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)
        
        return cache_path
    
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        return None

# Define available voice presets
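# (Bark's speaker library also includes English presets such as
# v2/en_speaker_0 through v2/en_speaker_9, which could be added here.)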
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5"
]

# Create Gradio interface with optimized settings
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter text (Hindi or English)"),
        gr.Dropdown(choices=voice_presets, value="v2/hi_speaker_2", label="Select Voice")
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Bark Text-to-Speech",
    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
)

# Launch the app with optimized settings
if __name__ == "__main__":
    demo.launch()