Spaces:

ayush2607
/

Bark_tts_hindi

Running

App Files Files Community

ayush2607 committed on Oct 24, 2024

Commit

27d6995

verified ·

1 Parent(s): a3d5303

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -95

app.py CHANGED Viewed

@@ -3,70 +3,39 @@ from transformers import AutoProcessor, BarkModel
 import scipy.io.wavfile
 import torch
 import os
-from typing import Optional
 import numpy as np
-from concurrent.futures import ThreadPoolExecutor
 import warnings
 warnings.filterwarnings('ignore')
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {DEVICE}")
-# Initialize model and processor with HF-optimized settings
 processor = AutoProcessor.from_pretrained(
     "suno/bark",
-    use_fast=True,
     trust_remote_code=True
 )
 model = BarkModel.from_pretrained(
     "suno/bark",
-    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-    low_cpu_mem_usage=True,
     trust_remote_code=True
 )
-# Optimize model based on device
-if DEVICE == "cuda":
-    model = model.half()
-    torch.backends.cudnn.benchmark = True
-    torch.backends.cudnn.enabled = True
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-else:
-    model = torch.quantization.quantize_dynamic(
-        model, {torch.nn.Linear}, dtype=torch.qint8
-    )
 model.to(DEVICE)
 model.eval()
-# Cache in HF Space-friendly location
-CACHE_DIR = "/tmp/audio_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
MAX_TEXT_LENGTH = 200


def chunk_text(text: str, max_length: int = MAX_TEXT_LENGTH) -> list[str]:
    """Split text into chunks of at most ``max_length`` characters.

    Text that already fits is returned unchanged as a single-element list.
    Longer text is split on sentence terminators (``.`` and the Devanagari
    danda ``।``, which is normalised to ``.``) and the sentences are greedily
    packed into chunks. Every emitted sentence ends with a ``.``.

    A single sentence longer than ``max_length`` is never cut mid-sentence;
    it is emitted as its own (oversized) chunk.

    Args:
        text: The text to split.
        max_length: Maximum chunk length in characters, including the
            sentence-ending period.

    Returns:
        A non-empty list of chunks with surrounding whitespace stripped
        (except when the input is returned unchanged).
    """
    if len(text) <= max_length:
        return [text]

    # The danda ends a Hindi sentence; treat it like a full stop.
    fragments = text.replace('।', '.').split('.')

    chunks: list[str] = []
    current = ""
    for fragment in fragments:
        # Trailing or consecutive terminators yield empty fragments; giving
        # them a period of their own would emit stray "." characters.
        if not fragment.strip():
            continue
        sentence = fragment + "."
        # Measure the sentence *with* its period so a chunk cannot end up
        # one character over the limit.
        if len(current) + len(sentence) <= max_length:
            current += sentence
            continue
        if current.strip():
            chunks.append(current.strip())
        current = sentence
    if current.strip():
        chunks.append(current.strip())

    # Input made only of terminators/whitespace: fall back to the raw text so
    # callers that index chunks[0] still get one element.
    return chunks or [text]
 def get_cache_path(text: str, voice_preset: str) -> str:
     """Generate a unique cache path."""
@@ -74,65 +43,42 @@ def get_cache_path(text: str, voice_preset: str) -> str:
     hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
     return os.path.join(CACHE_DIR, f"{hash_key}.wav")
-def process_chunk(chunk: str, voice_preset: str) -> np.ndarray:
-    """Process a single text chunk."""
     try:
-        inputs = processor(chunk, voice_preset=voice_preset)
         inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}
-        with torch.inference_mode(), torch.cuda.amp.autocast() if DEVICE == "cuda" else torch.no_grad():
             audio_array = model.generate(
                 **inputs,
                 do_sample=True,
-                guidance_scale=2.0,
-                temperature=0.7,
             )
-        return audio_array.cpu().numpy().squeeze()
-    except Exception as e:
-        print(f"Error processing chunk: {str(e)}")
-        return np.zeros(0)
-@torch.inference_mode()
-def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2") -> Optional[str]:
-    try:
-        if not text.strip():
-            return None
-        # Clear old cache files
-        for file in os.listdir(CACHE_DIR):
-            if file.endswith('.wav'):
-                try:
-                    os.remove(os.path.join(CACHE_DIR, file))
-                except:
-                    pass
-        cache_path = get_cache_path(text, voice_preset)
-        # Process text
-        chunks = chunk_text(text)
-        # Process chunks based on length
-        if len(chunks) > 1:
-            with ThreadPoolExecutor(max_workers=2) as executor:
-                audio_chunks = list(executor.map(
-                    lambda x: process_chunk(x, voice_preset),
-                    chunks
-                ))
-            audio_array = np.concatenate([chunk for chunk in audio_chunks if chunk.size > 0])
-        else:
-            audio_array = process_chunk(chunks[0], voice_preset)
-        if audio_array.size == 0:
-            return None
-        # Normalize and save
         audio_array = np.clip(audio_array, -1, 1)
         sample_rate = model.generation_config.sample_rate
         scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)
         return cache_path
     except Exception as e:
         print(f"Error in text_to_speech: {str(e)}")
         return None
@@ -153,7 +99,7 @@ demo = gr.Interface(
         gr.Textbox(
             label="Enter text (Hindi or English)",
             placeholder="Type your text here...",
-            lines=4
         ),
         gr.Dropdown(
             choices=voice_presets,
@@ -162,14 +108,10 @@ demo = gr.Interface(
         )
     ],
     outputs=gr.Audio(label="Generated Speech"),
-    title="🎧 Bark Text-to-Speech",
-    description="""Convert text to speech using the Bark model.
-    \n- Supports both Hindi and English text
-    \n- Multiple voice options available
-    \n- For best results, keep text length moderate""",
-    cache_examples=True,
 )
-# Launch for HF Spaces
 demo.launch()

 import scipy.io.wavfile
 import torch
 import os
 import numpy as np
 import warnings
 warnings.filterwarnings('ignore')
+# Basic device setup
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {DEVICE}")
+# Model initialization with basic settings
 processor = AutoProcessor.from_pretrained(
     "suno/bark",
     trust_remote_code=True
 )
 model = BarkModel.from_pretrained(
     "suno/bark",
+    torch_dtype=torch.float32,  # Using float32 for stability
     trust_remote_code=True
 )
+# Move the model to the selected device and switch it to eval (inference) mode
 model.to(DEVICE)
 model.eval()
+# Cache directory for generated audio files, relative to the app's working directory
+CACHE_DIR = "audio_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed.

    Anything that is not a ``str`` (for example ``None``) is mapped to the
    empty string so callers can use a plain truthiness check.
    """
    return text.strip() if isinstance(text, str) else ""
 def get_cache_path(text: str, voice_preset: str) -> str:
     """Build the WAV file path used for a given text and voice preset.

     The file name is the MD5 hex digest of ``"{text}_{voice_preset}"``, so
     the same text/preset pair always maps to the same file inside
     ``CACHE_DIR``. MD5 is used only to derive a file name, not for security.

     Args:
         text: The text to be synthesized.
         voice_preset: Name of the voice preset.

     Returns:
         Path of the ``.wav`` file inside ``CACHE_DIR``.
     """
     # Imported here so the function does not rely on a module-level import
     # of hashlib that is not visible alongside this definition.
     import hashlib

     cache_key = f"{text}_{voice_preset}"
     digest = hashlib.md5(cache_key.encode()).hexdigest()
     return os.path.join(CACHE_DIR, f"{digest}.wav")
def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
    """Synthesize speech for ``text`` with the Bark model and save it as WAV.

    The input is first passed through ``clean_text``. The generated waveform
    is clipped to the range [-1, 1] and written to the path returned by
    ``get_cache_path``.

    Args:
        text: Text to speak.
        voice_preset: Voice preset name forwarded to the processor.

    Returns:
        Path of the written WAV file, or ``None`` when the cleaned text is
        empty or when any exception occurs (the error message is printed).
    """
    try:
        prompt = clean_text(text)
        if not prompt:
            return None

        wav_path = get_cache_path(prompt, voice_preset)

        # Tokenize, then move only the tensor-valued entries to the device;
        # every other entry is passed through untouched.
        encoded = processor(prompt, voice_preset=voice_preset)
        model_inputs = {}
        for key, value in encoded.items():
            model_inputs[key] = (
                value.to(DEVICE) if isinstance(value, torch.Tensor) else value
            )

        with torch.inference_mode():
            generated = model.generate(
                **model_inputs,
                do_sample=True,
                temperature=0.7,
            )

        # Bring the result back to the CPU as a NumPy array, drop singleton
        # dimensions and clip the samples to [-1, 1] before writing.
        waveform = np.clip(generated.cpu().numpy().squeeze(), -1, 1)

        scipy.io.wavfile.write(
            wav_path,
            rate=model.generation_config.sample_rate,
            data=waveform,
        )
        return wav_path
    except Exception as e:
        print(f"Error in text_to_speech: {str(e)}")
        return None
         gr.Textbox(
             label="Enter text (Hindi or English)",
             placeholder="Type your text here...",
+            lines=3
         ),
         gr.Dropdown(
             choices=voice_presets,
         )
     ],
     outputs=gr.Audio(label="Generated Speech"),
+    title="Bark Text-to-Speech",
+    description="Convert text to speech using the Bark model. Supports Hindi and English text."
 )
+# Launch the app
 demo.launch()