ayush2607 committed
Commit e2e5380 · verified · 1 Parent(s): 9ce6d3a

Update app.py

Files changed (1):
  1. app.py +165 -35
app.py CHANGED
@@ -1,31 +1,127 @@
+import os
+import sys
+import logging
 import gradio as gr
-from transformers import AutoProcessor, BarkModel
-import scipy.io.wavfile
 import torch
-import os
+import scipy.io.wavfile
+import warnings
+from functools import lru_cache
+from typing import Optional
 
-# Initialize model and processor
-processor = AutoProcessor.from_pretrained("suno/bark")
-model = BarkModel.from_pretrained("suno/bark")
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-def text_to_speech(text, voice_preset="v2/hi_speaker_2"):
-    # Generate audio from text
-    inputs = processor(text, voice_preset=voice_preset)
-
-    # Generate audio
-    audio_array = model.generate(**inputs)
-    audio_array = audio_array.cpu().numpy().squeeze()
-
-    # Get sample rate from model config
-    sample_rate = model.generation_config.sample_rate
-
-    # Create temporary file path
-    output_path = "temp_audio.wav"
+# Suppress warnings
+warnings.filterwarnings('ignore')
+
+def check_dependencies():
+    try:
+        from transformers import AutoProcessor, BarkModel
+        return True
+    except ImportError as e:
+        logger.error(f"Error importing required modules: {str(e)}")
+        return False
+
+if not check_dependencies():
+    logger.error("Required dependencies not found. Please install them using:")
+    logger.error("pip install -r requirements.txt")
+    sys.exit(1)
+
+from transformers import AutoProcessor, BarkModel
+
+# Global variables for model and processor
+processor = None
+model = None
+
+def initialize_model():
+    global processor, model
 
-    # Save audio file
-    scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
+    # Initialize processor and model only once
+    if processor is None or model is None:
+        logger.info("Initializing model and processor...")
+
+        # Load processor
+        processor = AutoProcessor.from_pretrained("suno/bark")
+
+        # Load model with optimizations
+        model = BarkModel.from_pretrained("suno/bark")
+
+        # Move model to GPU if available
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        if device == "cuda":
+            # Use half-precision floating point numbers
+            model = model.half()
+
+        model = model.to(device)
+
+        # Enable model optimization
+        model.eval()
+        torch.set_grad_enabled(False)
+
+        # Optional: Use torch.compile for PyTorch 2.0+
+        if hasattr(torch, 'compile'):
+            try:
+                model = torch.compile(model)
+                logger.info("Model compiled successfully")
+            except Exception as e:
+                logger.warning(f"Could not compile model: {e}")
+
+        logger.info(f"Model initialized on {device}")
 
-    return output_path
+    return processor, model
+
+# Cache the text preprocessing step
+@lru_cache(maxsize=128)
+def preprocess_text(text: str, voice_preset: str):
+    processor, _ = initialize_model()
+    return processor(text, voice_preset=voice_preset)
+
+def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2", history: Optional[list] = None):
+    try:
+        if not text.strip():
+            raise ValueError("Please enter some text")
+
+        # Initialize model if not already initialized
+        processor, model = initialize_model()
+
+        # Get device
+        device = next(model.parameters()).device
+
+        # Preprocess text (cached)
+        inputs = preprocess_text(text, voice_preset)
+
+        # Move inputs to the same device as model
+        inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+
+        # Generate audio with optimized settings
+        with torch.inference_mode():  # Faster than no_grad()
+            audio_array = model.generate(
+                **inputs,
+                do_sample=False,  # Deterministic generation is faster
+                num_beams=1,  # No beam search for faster generation
+            )
+
+        # Move to CPU and convert to numpy
+        audio_array = audio_array.cpu().numpy().squeeze()
+
+        # Get sample rate from model config
+        sample_rate = model.generation_config.sample_rate
+
+        # Create output directory if it doesn't exist
+        os.makedirs("outputs", exist_ok=True)
+
+        # Generate unique filename based on text hash
+        output_path = os.path.join("outputs", f"audio_{hash(text)}_{hash(voice_preset)}.wav")
+
+        # Save audio file
+        scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_array)
+
+        return output_path
+
+    except Exception as e:
+        logger.error(f"Error in text_to_speech: {str(e)}")
+        raise gr.Error(str(e))
 
 # Define available voice presets
 voice_presets = [
@@ -36,19 +132,53 @@ voice_presets = [
     "v2/hi_speaker_5"
 ]
 
-# Create Gradio interface
-demo = gr.Interface(
-    fn=text_to_speech,
-    inputs=[
-        gr.Textbox(label="Enter text (Hindi or English)"),
-        gr.Dropdown(choices=voice_presets, value="v2/hi_speaker_2", label="Select Voice")
-    ],
-    outputs=gr.Audio(label="Generated Speech"),
-    title="Bark Text-to-Speech",
-    description="Convert text to speech using the Bark model. Supports Hindi and English text.",
+# Create Gradio interface with optimized settings
+with gr.Blocks(analytics_enabled=False) as demo:
+    gr.Markdown("# Bark Text-to-Speech (Optimized)")
 
-)
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Enter text (Hindi or English)",
+                placeholder="तुम बहुत अच्छे हो...",
+                lines=3
+            )
+            voice_input = gr.Dropdown(
+                choices=voice_presets,
+                value="v2/hi_speaker_2",
+                label="Select Voice"
+            )
+            submit_btn = gr.Button("Generate Speech")
+
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Speech")
+
+    # Add examples
+    gr.Examples(
+        examples=[
+            ["तुम बहुत अच्छे हो और मैं भी तुम्हारी तरह अच्छा हूँ", "v2/hi_speaker_2"],
+            ["You are very nice and I am also nice like you", "v2/hi_speaker_1"]
+        ],
+        inputs=[text_input, voice_input],
+        outputs=audio_output,
+        cache_examples=True  # Cache example outputs
+    )
+
+    # Connect components
+    submit_btn.click(
+        fn=text_to_speech,
+        inputs=[text_input, voice_input],
+        outputs=audio_output
+    )
 
-# Launch the app
+# Launch the app with optimized settings
 if __name__ == "__main__":
-    demo.launch()
+    # Initialize model at startup
+    initialize_model()
+
+    # Launch with optimized settings
+    demo.launch(
+        enable_queue=True,  # Enable queue for better handling of multiple requests
+        cache_examples=True,  # Cache example outputs
+        show_error=True,  # Show errors for debugging
+    )
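
Note on the final launch block: enable_queue is a legacy Gradio 3.x launch() flag that has since been removed, and cache_examples is a setting of gr.Interface / gr.Examples rather than of launch() (the gr.Examples block above already sets it), so demo.launch(enable_queue=True, cache_examples=True, ...) may fail with a TypeError depending on which Gradio version the Space pins. A minimal sketch of an equivalent startup for a recent Gradio release, assuming queueing is enabled via Blocks.queue() and no other launch settings are needed:

if __name__ == "__main__":
    # Warm up the model once at startup, as in the commit
    initialize_model()

    # Enable request queueing on the Blocks object (replaces enable_queue=True)
    demo.queue()

    # Keep error reporting; drop the launch() keywords that recent Gradio does not accept
    demo.launch(show_error=True)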