ylacombe committed
Commit 8281081 · 1 Parent(s): 875286c

Update app.py

Files changed (1)
  1. app.py +208 -101
app.py CHANGED
@@ -1,102 +1,209 @@
  import torch
- from threading import Thread
-
- from transformers import AutoProcessor
- from transformers import set_seed
-
- from vocos_bark import BarkModel
- from scipy.io.wavfile import write
- from pydub import AudioSegment
-
- import numpy as np
-
- import os
- import gradio as gr
- import uuid
- import io
- from vocos import Vocos
-
- import os
- os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
-
-
- set_seed(0)
-
- def _grab_best_device(use_gpu=True):
-     if torch.cuda.device_count() > 0 and use_gpu:
-         device = "cuda"
-     else:
-         device = "cpu"
-     return device
-
- device = _grab_best_device()
-
- HUB_PATH = "suno/bark"
-
- processor = AutoProcessor.from_pretrained(HUB_PATH)
-
- speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys() if "speaker" in key])
-
- SAMPLE_RATE = 24_000
-
- vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2").to(device)
-
- # import model
- if device == "cpu":
-     bark = BarkModel.from_pretrained(HUB_PATH)
- else:
-     bark = BarkModel.from_pretrained(HUB_PATH).to(device)
-     bark = bark.to_bettertransformer()
-
-
- # streaming inference
- def generate_audio(text, voice_preset = None, lag = 0):
-     if voice_preset not in speaker_embeddings:
-         voice_preset = None
-
-     sentences = [
-         text,
-     ]
-     inputs = processor(sentences, voice_preset=voice_preset).to(device)
-     # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
-
-     fine_output = bark.generate(
-         **inputs, coarse_temperature = 0.8, temperature = 0.5, do_sample=True
-     )
-
-     print("Fine tokens generated")
-
-     with torch.no_grad():
-
-         encodec_waveform = bark.codec_decode(fine_output)
-
-         features = vocos.codes_to_features(fine_output.transpose(0,1))
-         vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
-
-     return (SAMPLE_RATE, encodec_waveform.cpu().squeeze().numpy()), (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
-
-
- # Gradio blocks demo
- with gr.Blocks() as demo_blocks:
-     gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
-     gr.HTML("""<h3 style="text-align:center;">📢Vocos-enhanced TTS 🦾! </h3>""")
-     with gr.Group():
-         with gr.Row():
-             inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
-             dd = gr.Dropdown(
-                 speaker_embeddings,
-                 value=None,
-                 label="Available voice presets",
-                 info="Defaults to no speaker embeddings!"
-             )
-
-         with gr.Row():
-             btn = gr.Button("Bark with Vocos TTS")
-
-         with gr.Row():
-             out_audio_encodec = gr.Audio(type="numpy", autoplay=False, label="original output", show_label=True)
-             out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)
-
-     btn.click(generate_audio, [inp_text, dd], [out_audio_encodec, out_audio_vocos])
-
- demo_blocks.queue().launch(debug=True)
+ from typing import Dict, Optional, Tuple, Union
+
+ from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel, BarkPreTrainedModel
+ from transformers.models.bark.generation_configuration_bark import (
+     BarkCoarseGenerationConfig,
+     BarkFineGenerationConfig,
+     BarkSemanticGenerationConfig,
+ )
+ from transformers import BarkConfig, AutoModel
+ from transformers.modeling_utils import get_parameter_device
+ from transformers.utils import (
+     is_accelerate_available,
+ )
+
  import torch
+
+ class BarkModel(BarkPreTrainedModel):
+     config_class = BarkConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.semantic = BarkSemanticModel(config.semantic_config)
+         self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
+         self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
+
+         self.codec_model = AutoModel.from_config(config.codec_config)
+
+         self.config = config
+
+     @property
+     def device(self) -> torch.device:
+         """
+         `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+         device).
+         """
+         # for BarkModel, the device must be checked on its sub-models
+         # if the semantic model has an `_hf_hook`, it has been offloaded, so the device must be read from the hook
+         if not hasattr(self.semantic, "_hf_hook"):
+             return get_parameter_device(self)
+         for module in self.semantic.modules():
+             if (
+                 hasattr(module, "_hf_hook")
+                 and hasattr(module._hf_hook, "execution_device")
+                 and module._hf_hook.execution_device is not None
+             ):
+                 return torch.device(module._hf_hook.execution_device)
+
+     def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
+         r"""
+         Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
+         method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains on the GPU
+         until the next sub-model runs.
+
+         Args:
+             gpu_id (`int`, *optional*, defaults to 0):
+                 GPU id on which the sub-models will be loaded and offloaded.
+         """
+         if is_accelerate_available():
+             from accelerate import cpu_offload_with_hook
+         else:
+             raise ImportError("`enable_cpu_offload` requires `accelerate`.")
+
+         device = torch.device(f"cuda:{gpu_id}")
+
+         if self.device.type != "cpu":
+             self.to("cpu")
+             torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+         # this layer is used outside the first forward pass of the semantic model, so it needs to be loaded before it
+         self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
+
+         hook = None
+         for cpu_offloaded_model in [
+             self.semantic,
+             self.coarse_acoustics,
+             self.fine_acoustics,
+         ]:
+             _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+         self.fine_acoustics_hook = hook
+
+         _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook)
+
+         # We'll offload the last model manually.
+         self.codec_model_hook = hook
+
+     def codec_decode(self, fine_output):
+         """Turn quantized audio codes into audio array using encodec."""
+
+         fine_output = fine_output.transpose(0, 1)
+         emb = self.codec_model.quantizer.decode(fine_output)
+         out = self.codec_model.decoder(emb)
+         audio_arr = out.squeeze(1)  # squeeze the codebook dimension
+
+         return audio_arr
+
+     @torch.no_grad()
+     def generate(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         history_prompt: Optional[Dict[str, torch.Tensor]] = None,
+         **kwargs,
+     ) -> torch.LongTensor:
+         """
+         Generates audio codes from an input prompt and an additional optional `Bark` speaker prompt.
+
+         Args:
+             input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
+                 Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
+                 longest generation among the batch.
+             history_prompt (`Optional[Dict[str, torch.Tensor]]`, *optional*):
+                 Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
+             kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
+
+                 - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
+                 - With a *semantic_*, *coarse_* or *fine_* prefix, they will be passed to the `generate` method of
+                   the semantic, coarse or fine sub-model respectively, and take priority over the corresponding
+                   keyword without a prefix.
+
+                 This means you can, for example, specify a generation strategy for all sub-models except one.
+         Returns:
+             torch.LongTensor: The generated fine acoustic codes, to be turned into a waveform with `codec_decode`
+             or an external vocoder.
+
+         Example:
+
+         ```python
+         >>> from transformers import AutoProcessor
+
+         >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
+         >>> model = BarkModel.from_pretrained("suno/bark-small")
+
+         >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
+         >>> voice_preset = "v2/en_speaker_6"
+
+         >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)
+
+         >>> fine_codes = model.generate(**inputs, semantic_max_new_tokens=100)
+         >>> audio_array = model.codec_decode(fine_codes).cpu().numpy().squeeze()
+         ```
+         """
+         # TODO (joao): workaround until nested generation config is compatible with PreTrainedModel
+         # todo: dict
+         semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
+         coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
+         fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)
+
+         kwargs_semantic = {
+             # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
+             "attention_mask": kwargs.pop("attention_mask", None)
+         }
+         kwargs_coarse = {}
+         kwargs_fine = {}
+         for key, value in kwargs.items():
+             if key.startswith("semantic_"):
+                 key = key[len("semantic_") :]
+                 kwargs_semantic[key] = value
+             elif key.startswith("coarse_"):
+                 key = key[len("coarse_") :]
+                 kwargs_coarse[key] = value
+             elif key.startswith("fine_"):
+                 key = key[len("fine_") :]
+                 kwargs_fine[key] = value
+             else:
+                 # if the key is already in a sub-model's kwargs, it has been set with a
+                 # submodel-specific value and we don't override it
+                 if key not in kwargs_semantic:
+                     kwargs_semantic[key] = value
+                 if key not in kwargs_coarse:
+                     kwargs_coarse[key] = value
+                 if key not in kwargs_fine:
+                     kwargs_fine[key] = value
+
+         # 1. Generate from the semantic model
+         semantic_output = self.semantic.generate(
+             input_ids,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             **kwargs_semantic,
+         )
+
+         # 2. Generate from the coarse model
+         coarse_output = self.coarse_acoustics.generate(
+             semantic_output,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             coarse_generation_config=coarse_generation_config,
+             codebook_size=self.generation_config.codebook_size,
+             **kwargs_coarse,
+         )
+
+         # 3. "Generate" from the fine model
+         output = self.fine_acoustics.generate(
+             coarse_output,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             coarse_generation_config=coarse_generation_config,
+             fine_generation_config=fine_generation_config,
+             codebook_size=self.generation_config.codebook_size,
+             **kwargs_fine,
+         )
+
+         if getattr(self, "fine_acoustics_hook", None) is not None:
+             # manually offload fine_acoustics to CPU and load codec_model onto the GPU,
+             # since Bark doesn't use codec_model's forward pass
+             self.fine_acoustics_hook.offload()
+             self.codec_model = self.codec_model.to(self.device)
+
+         return output
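
Note on the new `generate`: unlike the stock `transformers` `BarkModel.generate`, it stops at the fine acoustic codes and never calls `codec_decode` itself, so the caller picks the decoder. A minimal usage sketch, assuming the class above is importable (e.g. `from app import BarkModel`, since this commit moves the definition into `app.py`) and that the `suno/bark` checkpoint used by the previous version of the app loads through the inherited `from_pretrained`:

```python
import torch
from transformers import AutoProcessor

from app import BarkModel  # assumed import path; the class is defined in app.py above

processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")

if torch.cuda.is_available():
    # each sub-model is moved to cuda:0 only while it runs; the rest stay on CPU
    model.enable_cpu_offload(gpu_id=0)

inputs = processor("Hello, my dog is cute", voice_preset="v2/en_speaker_6")

# unprefixed kwargs reach every sub-model's generate(); a prefixed kwarg wins over
# its unprefixed counterpart for that one sub-model (same sampling setup as the
# deleted generate_audio function)
fine_codes = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.5,
    coarse_temperature=0.8,  # overrides `temperature` for the coarse model only
)
```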
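The deleted demo decoded those codes twice, once with Bark's bundled EnCodec decoder and once with Vocos, and returned both waveforms for comparison. A condensed sketch of that dual decode path, continuing from the sketch above (`model`, `fine_codes` are hypothetical names from that sketch); the Vocos calls, checkpoint, transpose and `bandwidth_id` value are copied from the deleted code:

```python
import torch
from vocos import Vocos

vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2")

SAMPLE_RATE = 24_000  # Bark/EnCodec output rate, as in the deleted app

with torch.no_grad():
    # built-in path: EnCodec quantizer + decoder (codec_decode above)
    encodec_waveform = model.codec_decode(fine_codes)

    # Vocos path: the same fine codes, transposed exactly as the deleted app did
    features = vocos.codes_to_features(fine_codes.transpose(0, 1))
    vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2]))
```

Feeding one token tensor to two vocoders is the point of the refactor: the EnCodec output serves as the baseline and the Vocos output as the enhanced rendering of the same generation.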