Spaces:

ayush2607
/

Bark_tts_hindi

Running

App Files Files Community

ayush2607 committed on Oct 24, 2024

Commit

cddcc30

verified ·

1 Parent(s): 27d6995

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -94

app.py CHANGED Viewed

@@ -1,117 +1,61 @@
 import gradio as gr
-from transformers import AutoProcessor, BarkModel
-import scipy.io.wavfile
 import torch
-import os
 import numpy as np
-import warnings
-warnings.filterwarnings('ignore')
-# Basic device setup
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {DEVICE}")
-# Model initialization with basic settings
-processor = AutoProcessor.from_pretrained(
-    "suno/bark",
-    trust_remote_code=True
-)
-model = BarkModel.from_pretrained(
-    "suno/bark",
-    torch_dtype=torch.float32,  # Using float32 for stability
-    trust_remote_code=True
-)
-# Basic model optimization
-model.to(DEVICE)
-model.eval()
-# Define cache directory in the allowed space
-CACHE_DIR = "audio_cache"
-os.makedirs(CACHE_DIR, exist_ok=True)
-def clean_text(text):
-    """Clean and prepare text for processing."""
-    if not isinstance(text, str):
-        return ""
-    return text.strip()
-def get_cache_path(text: str, voice_preset: str) -> str:
-    """Generate a unique cache path."""
-    import hashlib
-    hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
-    return os.path.join(CACHE_DIR, f"{hash_key}.wav")
-def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
-    """Convert text to speech using Bark model."""
     try:
-        # Clean and validate input
-        text = clean_text(text)
-        if not text:
-            return None
-        # Generate cache path
-        cache_path = get_cache_path(text, voice_preset)
-        # Process the text
-        inputs = processor(text, voice_preset=voice_preset)
-        # Move inputs to device
-        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
-                 for k, v in inputs.items()}
-        # Generate audio
-        with torch.inference_mode():
-            audio_array = model.generate(
-                **inputs,
-                do_sample=True,
-                temperature=0.7
-            )
-        # Process the audio
-        audio_array = audio_array.cpu().numpy().squeeze()
-        audio_array = np.clip(audio_array, -1, 1)
-        # Save the audio
-        sample_rate = model.generation_config.sample_rate
-        scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)
-        return cache_path
     except Exception as e:
-        print(f"Error in text_to_speech: {str(e)}")
-        return None
-# Voice presets
-voice_presets = [
-    "v2/hi_speaker_1",
-    "v2/hi_speaker_2",
-    "v2/hi_speaker_3",
-    "v2/hi_speaker_4",
-    "v2/hi_speaker_5"
-]
 # Create Gradio interface
 demo = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(
-            label="Enter text (Hindi or English)",
-            placeholder="Type your text here...",
-            lines=3
-        ),
-        gr.Dropdown(
-            choices=voice_presets,
-            value="v2/hi_speaker_2",
-            label="Select Voice"
         )
     ],
     outputs=gr.Audio(label="Generated Speech"),
-    title="Bark Text-to-Speech",
-    description="Convert text to speech using the Bark model. Supports Hindi and English text."
 )
 # Launch the app
-demo.launch()

 import gradio as gr
 import torch
+from transformers import AutoProcessor, AutoModel
+import scipy.io.wavfile as wavfile
 import numpy as np
+import os
+# Initialize model and processor
+def load_model():
+    """Load the Bark processor and model for the "suno/bark-small" checkpoint.
+
+    Returns:
+        A ``(processor, model)`` tuple, both created with ``from_pretrained``.
+    """
+    checkpoint = "suno/bark-small"
+    bark_processor = AutoProcessor.from_pretrained(checkpoint)
+    bark_model = AutoModel.from_pretrained(checkpoint)
+    return bark_processor, bark_model
+# Text to speech function
+def text_to_speech(text):
+    """Synthesize speech for ``text`` and return the path of a WAV file.
+
+    Uses the module-level ``processor`` and ``model``.
+
+    Args:
+        text: Text to speak, as entered in the Gradio textbox.
+
+    Returns:
+        Path of a newly written WAV file containing the generated audio.
+
+    Raises:
+        gr.Error: If ``text`` is blank or generation fails. The audio output
+            would interpret a returned error string as a file path, so the
+            failure is raised instead and Gradio shows the message to the user.
+    """
+    import tempfile
+
+    # Reject blank input up front, outside the try block, so this message is
+    # not re-wrapped by the generic handler below.
+    if not isinstance(text, str) or not text.strip():
+        raise gr.Error("Please enter some text to convert to speech.")
     try:
+        # Generate speech
+        inputs = processor(
+            text=[text],
+            return_tensors="pt",
+        )
+        # Inference only: no gradients are needed for generation.
+        with torch.inference_mode():
+            speech_values = model.generate(**inputs, do_sample=True)
+        # Convert to numpy and drop singleton dimensions
+        audio_data = speech_values.cpu().numpy().squeeze()
+        sampling_rate = model.generation_config.sample_rate
+        # Reserve a unique file name per request; a single shared name would
+        # let concurrent requests overwrite each other's audio.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            temp_path = tmp.name
+        wavfile.write(temp_path, sampling_rate, audio_data)
+        return temp_path
     except Exception as e:
+        raise gr.Error(f"Error generating speech: {str(e)}") from e
+# Load models globally
+# Runs once when the module is imported; text_to_speech reads the resulting
+# `processor` and `model` globals on every call.
+print("Loading models...")
+processor, model = load_model()
+print("Models loaded successfully!")
 # Create Gradio interface
+# One text box in, one audio player out. Whatever text_to_speech returns
+# (on success, the path of the WAV file it wrote) is handed to gr.Audio.
 demo = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(
+            label="Enter text (Hindi supported)",
+            placeholder="इस योजना से संबंधित लाभों का विवरण प्राप्त कर सकते"
         )
     ],
     outputs=gr.Audio(label="Generated Speech"),
+    title="Hindi Text-to-Speech using Bark",
+    description="Generate natural-sounding speech from Hindi text using the Bark model.",
+    # Each example is a one-element list: one value for the single input.
+    examples=[
+        ["इस योजना से संबंधित लाभों का विवरण प्राप्त कर सकते"],
+        ["नमस्ते, आप कैसे हैं?"],
+    ]
 )
 # Launch the app
+# Guarded so that importing this module does not start the Gradio server.
+if __name__ == "__main__":
+    demo.launch()