yasserrmd committed on
Commit
c31b600
·
verified ·
1 Parent(s): 74836a0

Update generate_audio.py

Browse files
Files changed (1) hide show
  1. generate_audio.py +37 -34
generate_audio.py CHANGED
@@ -31,15 +31,16 @@ class TTSGenerator:
31
 
32
  # Set device
33
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
34
-
 
35
  # Load Parler model and tokenizer for Speaker 1
36
- self.parler_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(self.device)
37
- self.parler_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
38
  self.speaker1_description = """
39
- Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
40
  """
41
  self.speaker2_description = """
42
- Gary's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
43
  """
44
 
45
  # Load Bark model and processor for Speaker 2
@@ -76,20 +77,21 @@ class TTSGenerator:
76
  # generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
77
  # audio_arr = generation.cpu().numpy().squeeze()
78
  # return audio_arr, self.parler_model.config.sampling_rate
79
- input_ids = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).input_ids.to(self.device)
80
- attention_mask_input = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).attention_mask.to(self.device)
81
-
82
- prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
83
- attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
84
-
85
- # Pass all required arguments to generate() for reliable behavior
86
- generation = self.parler_model.generate(
87
- input_ids=input_ids,
88
- attention_mask=attention_mask_input, # Set attention mask for input IDs
89
- prompt_input_ids=prompt_input_ids,
90
- prompt_attention_mask=attention_mask_prompt # Set prompt attention mask
91
- )
92
- audio_arr = generation.cpu().numpy().squeeze()
 
93
  return audio_arr, self.parler_model.config.sampling_rate
94
 
95
  #@spaces.GPU(duration=30)
@@ -104,21 +106,21 @@ class TTSGenerator:
104
  np.array: Audio array.
105
  int: Sampling rate.
106
  """
107
-
108
- input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt", padding=True).input_ids.to(self.device)
109
- attention_mask_input = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).attention_mask.to(self.device)
110
-
111
- prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
112
- attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
113
-
114
- # Pass all required arguments to generate() for reliable behavior
115
- generation = self.parler_model.generate(
116
- input_ids=input_ids,
117
- attention_mask=attention_mask_input, # Set attention mask for input IDs
118
- prompt_input_ids=prompt_input_ids,
119
- prompt_attention_mask=attention_mask_prompt # Set prompt attention mask
120
- )
121
- audio_arr = generation.cpu().numpy().squeeze()
122
 
123
  # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
124
  # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
@@ -175,6 +177,7 @@ class TTSGenerator:
175
  final_audio = audio_segment
176
  else:
177
  final_audio += audio_segment
 
178
 
179
  # Export final audio to MP3
180
  final_audio.export(self.output_audio_path, format="mp3", bitrate="192k", parameters=["-q:a", "0"])
 
31
 
32
  # Set device
33
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
34
+
35
+
36
  # Load Parler model and tokenizer for Speaker 1
37
+ self.parler_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(self.device)
38
+ self.parler_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")
39
  self.speaker1_description = """
40
+ Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise and very clear audio.
41
  """
42
  self.speaker2_description = """
43
+ Gary's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise and very clear audio.
44
  """
45
 
46
  # Load Bark model and processor for Speaker 2
 
77
  # generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
78
  # audio_arr = generation.cpu().numpy().squeeze()
79
  # return audio_arr, self.parler_model.config.sampling_rate
80
+ with torch.no_grad():
81
+ input_ids = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).input_ids.to(self.device)
82
+ attention_mask_input = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).attention_mask.to(self.device)
83
+
84
+ prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
85
+ attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
86
+
87
+ # Pass all required arguments to generate() for reliable behavior
88
+ generation = self.parler_model.generate(
89
+ input_ids=input_ids,
90
+ attention_mask=attention_mask_input, # Set attention mask for input IDs
91
+ prompt_input_ids=prompt_input_ids,
92
+ prompt_attention_mask=attention_mask_prompt # Set prompt attention mask
93
+ )
94
+ audio_arr = generation.cpu().numpy().squeeze()
95
  return audio_arr, self.parler_model.config.sampling_rate
96
 
97
  #@spaces.GPU(duration=30)
 
106
  np.array: Audio array.
107
  int: Sampling rate.
108
  """
109
+ with torch.no_grad():
110
+ input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt", padding=True).input_ids.to(self.device)
111
+ attention_mask_input = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).attention_mask.to(self.device)
112
+
113
+ prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
114
+ attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
115
+
116
+ # Pass all required arguments to generate() for reliable behavior
117
+ generation = self.parler_model.generate(
118
+ input_ids=input_ids,
119
+ attention_mask=attention_mask_input, # Set attention mask for input IDs
120
+ prompt_input_ids=prompt_input_ids,
121
+ prompt_attention_mask=attention_mask_prompt # Set prompt attention mask
122
+ )
123
+ audio_arr = generation.cpu().numpy().squeeze()
124
 
125
  # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
126
  # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
 
177
  final_audio = audio_segment
178
  else:
179
  final_audio += audio_segment
180
+ torch.cuda.empty_cache()
181
 
182
  # Export final audio to MP3
183
  final_audio.export(self.output_audio_path, format="mp3", bitrate="192k", parameters=["-q:a", "0"])