Spaces:

ayush2607
/

Bark_tts_hindi

Running

App Files Community

ayush2607 committed on Oct 24, 2024

Commit

d82b8b6

verified ·

1 Parent(s): 41f16a2

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -131

app.py CHANGED Viewed

@@ -1,104 +1,73 @@
-import os
-import sys
-import logging
 import gradio as gr
-import torch
 import scipy.io.wavfile
-import warnings
-from functools import lru_cache
 from typing import Optional
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Suppress warnings
-warnings.filterwarnings('ignore')
-def check_dependencies():
-    try:
-        from transformers import AutoProcessor, BarkModel
-        return True
-    except ImportError as e:
-        logger.error(f"Error importing required modules: {str(e)}")
-        return False
-if not check_dependencies():
-    logger.error("Required dependencies not found. Please install them using:")
-    logger.error("pip install -r requirements.txt")
-    sys.exit(1)
-from transformers import AutoProcessor, BarkModel
-# Global variables for model and processor
-processor = None
-model = None
-def initialize_model():
-    global processor, model
-    if processor is None or model is None:
-        logger.info("Initializing model and processor...")
-        processor = AutoProcessor.from_pretrained("suno/bark")
-        model = BarkModel.from_pretrained("suno/bark")
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        if device == "cuda":
-            model = model.half()
-        model = model.to(device)
-        model.eval()
-        torch.set_grad_enabled(False)
-        if hasattr(torch, 'compile'):
-            try:
-                model = torch.compile(model)
-                logger.info("Model compiled successfully")
-            except Exception as e:
-                logger.warning(f"Could not compile model: {e}")
-        logger.info(f"Model initialized on {device}")
-    return processor, model
-@lru_cache(maxsize=128)
-def preprocess_text(text: str, voice_preset: str):
-    processor, _ = initialize_model()
-    return processor(text, voice_preset=voice_preset)
-def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
     try:
-        if not text.strip():
-            raise ValueError("Please enter some text")
-        processor, model = initialize_model()
-        device = next(model.parameters()).device
-        inputs = preprocess_text(text, voice_preset)
-        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
-        with torch.inference_mode():
-            audio_array = model.generate(
-                **inputs,
-                do_sample=False,
-                num_beams=1,
-            )
         audio_array = audio_array.cpu().numpy().squeeze()
-        sample_rate = model.generation_config.sample_rate
-        os.makedirs("outputs", exist_ok=True)
-        output_path = os.path.join("outputs", f"audio_{hash(text)}_{hash(voice_preset)}.wav")
-        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
-        return output_path
     except Exception as e:
-        logger.error(f"Error in text_to_speech: {str(e)}")
-        raise gr.Error(str(e))
 # Define available voice presets
 voice_presets = [
@@ -109,54 +78,20 @@ voice_presets = [
     "v2/hi_speaker_5"
 ]
-# Create Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# Bark Text-to-Speech (Optimized)")
-    with gr.Row():
-        with gr.Column():
-            text_input = gr.Textbox(
-                label="Enter text (Hindi or English)",
-                placeholder="तुम बहुत अच्छे हो...",
-                lines=3
-            )
-            voice_input = gr.Dropdown(
-                choices=voice_presets,
-                value="v2/hi_speaker_2",
-                label="Select Voice"
-            )
-            submit_btn = gr.Button("Generate Speech")
-        with gr.Column():
-            audio_output = gr.Audio(label="Generated Speech")
-    # Fixed Examples implementation
-    gr.Examples(
-        examples=[
-            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
-            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
-        ],
-        inputs=[text_input, voice_input],
-        outputs=audio_output,
-        fn=text_to_speech,  # Add the function reference
-        cache_examples=True
-    )
-    # Connect components
-    submit_btn.click(
-        fn=text_to_speech,
-        inputs=[text_input, voice_input],
-        outputs=audio_output
-    )
-# Launch the app
 if __name__ == "__main__":
-    # Initialize model at startup
-    initialize_model()
-    # Launch with optimized settings
-    demo.launch(
-        enable_queue=True,
-        show_error=True,
-        share=True  # Enable sharing (optional)
-    )

 import gradio as gr
+from transformers import AutoProcessor, BarkModel
 import scipy.io.wavfile
+import torch
+import os
 from typing import Optional
+import numpy as np
+# Check for CUDA availability and set device
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Initialize model and processor globally with optimizations
+processor = AutoProcessor.from_pretrained("suno/bark")
+model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32)
+model.to(DEVICE)
+# Enable model optimizations
+if DEVICE == "cuda":
+    torch.backends.cudnn.benchmark = True
+model.eval()  # Set to evaluation mode
+# Cache for storing generated audio files
+CACHE_DIR = "audio_cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
+def get_cache_path(text: str, voice_preset: str) -> str:
+    """Generate a unique cache path for the given text and voice preset."""
+    import hashlib
+    hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
+    return os.path.join(CACHE_DIR, f"{hash_key}.wav")
+@torch.inference_mode()  # More efficient than no_grad for inference
+def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2") -> Optional[str]:
     try:
+        # Check cache first
+        cache_path = get_cache_path(text, voice_preset)
+        if os.path.exists(cache_path):
+            return cache_path
+        # Generate audio from text
+        inputs = processor(text, voice_preset=voice_preset)
+        # Move inputs to device
+        inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
+                 for k, v in inputs.items()}
+        # Generate audio with optimized settings
+        with torch.cuda.amp.autocast() if DEVICE == "cuda" else torch.no_grad():
+            audio_array = model.generate(**inputs,
+                                      do_sample=True,
+                                      guidance_scale=2.5,
+                                      temperature=0.7)
+        # Move to CPU and convert to numpy
         audio_array = audio_array.cpu().numpy().squeeze()
+        # Normalize audio
+        audio_array = np.clip(audio_array, -1, 1)
+        # Get sample rate from model config
+        sample_rate = model.generation_config.sample_rate
+        # Save audio file to cache
+        scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)
+        return cache_path
     except Exception as e:
+        print(f"Error generating audio: {str(e)}")
+        return None
 # Define available voice presets
 voice_presets = [
     "v2/hi_speaker_5"
 ]
+# Create Gradio interface with optimized settings
+demo = gr.Interface(
+    fn=text_to_speech,
+    inputs=[
+        gr.Textbox(label="Enter text (Hindi or English)"),
+        gr.Dropdown(choices=voice_presets, value="v2/hi_speaker_2", label="Select Voice")
+    ],
+    outputs=gr.Audio(label="Generated Speech"),
+    title="Bark Text-to-Speech",
+    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
+    cache_examples=True,
+)
+# Launch the app with optimized settings
 if __name__ == "__main__":
+    demo.launch()