ayush2607 committed (verified)
Commit 27d6995 · Parent(s): a3d5303

Update app.py

Files changed (1):
  1. app.py +37 -95
app.py CHANGED
@@ -3,70 +3,39 @@ from transformers import AutoProcessor, BarkModel
 import scipy.io.wavfile
 import torch
 import os
-from typing import Optional
 import numpy as np
-from concurrent.futures import ThreadPoolExecutor
 import warnings
 warnings.filterwarnings('ignore')
 
+# Basic device setup
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {DEVICE}")
 
-# Initialize model and processor with HF-optimized settings
+# Model initialization with basic settings
 processor = AutoProcessor.from_pretrained(
     "suno/bark",
-    use_fast=True,
     trust_remote_code=True
 )
 
 model = BarkModel.from_pretrained(
     "suno/bark",
-    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-    low_cpu_mem_usage=True,
+    torch_dtype=torch.float32,  # Using float32 for stability
     trust_remote_code=True
 )
 
-# Optimize model based on device
-if DEVICE == "cuda":
-    model = model.half()
-    torch.backends.cudnn.benchmark = True
-    torch.backends.cudnn.enabled = True
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-else:
-    model = torch.quantization.quantize_dynamic(
-        model, {torch.nn.Linear}, dtype=torch.qint8
-    )
-
+# Basic model optimization
 model.to(DEVICE)
 model.eval()
 
-# Cache in HF Space-friendly location
-CACHE_DIR = "/tmp/audio_cache"
+# Define cache directory in the allowed space
+CACHE_DIR = "audio_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
-MAX_TEXT_LENGTH = 200
 
-def chunk_text(text: str) -> list[str]:
-    """Split text into smaller chunks at sentence boundaries."""
-    if len(text) <= MAX_TEXT_LENGTH:
-        return [text]
-
-    sentences = text.replace('।', '.').split('.')
-    chunks = []
-    current_chunk = ""
-
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) <= MAX_TEXT_LENGTH:
-            current_chunk += sentence + "."
-        else:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sentence + "."
-
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-
-    return chunks
+def clean_text(text):
+    """Clean and prepare text for processing."""
+    if not isinstance(text, str):
+        return ""
+    return text.strip()
 
 def get_cache_path(text: str, voice_preset: str) -> str:
     """Generate a unique cache path."""
@@ -74,65 +43,42 @@ def get_cache_path(text: str, voice_preset: str) -> str:
     hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
     return os.path.join(CACHE_DIR, f"{hash_key}.wav")
 
-def process_chunk(chunk: str, voice_preset: str) -> np.ndarray:
-    """Process a single text chunk."""
+def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2"):
+    """Convert text to speech using Bark model."""
     try:
-        inputs = processor(chunk, voice_preset=voice_preset)
+        # Clean and validate input
+        text = clean_text(text)
+        if not text:
+            return None
+
+        # Generate cache path
+        cache_path = get_cache_path(text, voice_preset)
+
+        # Process the text
+        inputs = processor(text, voice_preset=voice_preset)
+
+        # Move inputs to device
         inputs = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
                   for k, v in inputs.items()}
 
-        with torch.inference_mode(), torch.cuda.amp.autocast() if DEVICE == "cuda" else torch.no_grad():
+        # Generate audio
+        with torch.inference_mode():
             audio_array = model.generate(
                 **inputs,
                 do_sample=True,
-                guidance_scale=2.0,
-                temperature=0.7,
+                temperature=0.7
             )
 
-        return audio_array.cpu().numpy().squeeze()
-    except Exception as e:
-        print(f"Error processing chunk: {str(e)}")
-        return np.zeros(0)
-
-@torch.inference_mode()
-def text_to_speech(text: str, voice_preset: str = "v2/hi_speaker_2") -> Optional[str]:
-    try:
-        if not text.strip():
-            return None
-
-        # Clear old cache files
-        for file in os.listdir(CACHE_DIR):
-            if file.endswith('.wav'):
-                try:
-                    os.remove(os.path.join(CACHE_DIR, file))
-                except:
-                    pass
-
-        cache_path = get_cache_path(text, voice_preset)
-
-        # Process text
-        chunks = chunk_text(text)
-
-        # Process chunks based on length
-        if len(chunks) > 1:
-            with ThreadPoolExecutor(max_workers=2) as executor:
-                audio_chunks = list(executor.map(
-                    lambda x: process_chunk(x, voice_preset),
-                    chunks
-                ))
-            audio_array = np.concatenate([chunk for chunk in audio_chunks if chunk.size > 0])
-        else:
-            audio_array = process_chunk(chunks[0], voice_preset)
-
-        if audio_array.size == 0:
-            return None
-
-        # Normalize and save
+        # Process the audio
+        audio_array = audio_array.cpu().numpy().squeeze()
         audio_array = np.clip(audio_array, -1, 1)
+
+        # Save the audio
         sample_rate = model.generation_config.sample_rate
         scipy.io.wavfile.write(cache_path, rate=sample_rate, data=audio_array)
 
         return cache_path
+
     except Exception as e:
         print(f"Error in text_to_speech: {str(e)}")
         return None
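
A minimal usage sketch for the rewritten text_to_speech; the preset comes from the function's default, and the sample sentence is illustrative only:

# Returns the path of the generated .wav on success, or None on failure or empty input.
path = text_to_speech("नमस्ते, आप कैसे हैं?", voice_preset="v2/hi_speaker_2")
if path:
    print(f"Audio written to {path}")
else:
    print("Generation failed or input was empty")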
@@ -153,7 +99,7 @@ demo = gr.Interface(
         gr.Textbox(
             label="Enter text (Hindi or English)",
             placeholder="Type your text here...",
-            lines=4
+            lines=3
         ),
         gr.Dropdown(
             choices=voice_presets,
@@ -162,14 +108,10 @@ demo = gr.Interface(
         )
     ],
     outputs=gr.Audio(label="Generated Speech"),
-    title="🎧 Bark Text-to-Speech",
-    description="""Convert text to speech using the Bark model.
-    \n- Supports both Hindi and English text
-    \n- Multiple voice options available
-    \n- For best results, keep text length moderate""",
+    title="Bark Text-to-Speech",
+    description="Convert text to speech using the Bark model. Supports Hindi and English text."
 
-    cache_examples=True,
 )
 
-# Launch for HF Spaces
+# Launch the app
 demo.launch()
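
One behavioral change worth flagging: the old text_to_speech purged every cached .wav before each request, while the rewrite never deletes anything, so audio_cache/ now grows without bound. A hypothetical pruning helper, not part of this commit, could cap it:

import os

def prune_cache(cache_dir: str = "audio_cache", keep: int = 50) -> None:
    """Delete the oldest cached .wav files, keeping the `keep` most recent."""
    wavs = sorted(
        (os.path.join(cache_dir, f) for f in os.listdir(cache_dir) if f.endswith(".wav")),
        key=os.path.getmtime,
        reverse=True,
    )
    for stale in wavs[keep:]:
        try:
            os.remove(stale)
        except OSError:
            pass  # Best-effort cleanup; the file may be in use or already gone.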
 
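
For reference, get_cache_path as it stands after this commit, made self-contained: neither side of the diff shows an `import hashlib`, so the import presumably sits on one of the unchanged lines between the hunks.

import hashlib
import os

CACHE_DIR = "audio_cache"

def get_cache_path(text: str, voice_preset: str) -> str:
    """Generate a unique cache path."""
    hash_key = hashlib.md5(f"{text}_{voice_preset}".encode()).hexdigest()
    return os.path.join(CACHE_DIR, f"{hash_key}.wav")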