"""Gradio demo that converts Hindi or English text to speech with Bark."""

import os
import tempfile

import gradio as gr
import scipy.io.wavfile
import torch
from transformers import AutoProcessor, BarkModel

# Initialize model and processor once at import time so that every request
# reuses the same loaded weights.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")

# Voice used when the caller (or the UI) does not pick one explicitly.
DEFAULT_VOICE_PRESET = "v2/hi_speaker_2"

# Voice presets offered in the UI dropdown.
voice_presets = [
    "v2/hi_speaker_1",
    "v2/hi_speaker_2",
    "v2/hi_speaker_3",
    "v2/hi_speaker_4",
    "v2/hi_speaker_5",
]


def text_to_speech(text: str, voice_preset: str = DEFAULT_VOICE_PRESET) -> str:
    """Synthesize speech for *text* and return the path of the WAV file.

    Args:
        text: The text to speak.
        voice_preset: Bark voice preset identifier passed to the processor.

    Returns:
        Path to a newly created WAV file containing the generated audio.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Surface a readable message in the UI instead of sending an empty
        # prompt to the model.
        raise gr.Error("Please enter some text to convert to speech.")

    # Tokenize the text and attach the chosen voice preset.
    inputs = processor(text, voice_preset=voice_preset)

    # Inference only: make sure autograd does not track the generation pass.
    with torch.no_grad():
        audio_array = model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().squeeze()

    # Get sample rate from model config.
    sample_rate = model.generation_config.sample_rate

    # Use a unique file per call: a single shared "temp_audio.wav" would let
    # overlapping requests overwrite each other's output and depends on the
    # current working directory being writable.
    fd, output_path = tempfile.mkstemp(prefix="bark_", suffix=".wav")
    os.close(fd)  # scipy reopens the file by path

    scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
    return output_path


# Create Gradio interface
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(
            label="Enter text (Hindi or English)",
            placeholder="तुम बहुत अच्छे हो...",
        ),
        gr.Dropdown(
            choices=voice_presets,
            value=DEFAULT_VOICE_PRESET,
            label="Select Voice",
        ),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Bark Text-to-Speech",
    description=(
        "Convert text to speech using the Bark model. "
        "Supports Hindi and English text."
    ),
    examples=[
        ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
        ["You are very nice and I am also nice like you", "v2/hi_speaker_1"],
    ],
)

# Launch the app
if __name__ == "__main__":
    demo.launch()