Update RingFormer/meldataset.py

RingFormer/meldataset.py (CHANGED: +33 -16)
@@ -17,11 +17,24 @@ import soundfile as sf
 def normalize_audio(wav):
     return wav / torch.max(torch.abs(torch.from_numpy(wav)))  # Correct peak normalization
 
-def load_wav(full_path):
-    data, sampling_rate = librosa.load(full_path, sr=
+def load_wav_librosa(full_path):
+    data, sampling_rate = librosa.load(full_path, sr=44100)
     return data, sampling_rate
 
 
+
+def load_wav_scipy(full_path):
+    sampling_rate, data = read(full_path)
+    return data, sampling_rate
+
+def load_wav(full_path):
+    try:
+        return load_wav_scipy(full_path)
+    except:
+        # print('using librosa...')
+        return load_wav_librosa(full_path)
+
+
 def dynamic_range_compression(x, C=1, clip_val=1e-5):
     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
 
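A note on the loader fallback above: scipy.io.wavfile.read returns raw integer PCM (e.g. int16) for typical WAV files, while librosa.load returns float32 already scaled to [-1, 1] and resampled to 44100 Hz, so the two branches hand back differently scaled arrays before the later `audio / MAX_WAV_VALUE` step (MAX_WAV_VALUE is typically 32768.0 in HiFi-GAN-style datasets). The sketch below is a hypothetical variant, not code from this commit, that normalizes both paths to float32 peak scale; the name load_wav_any and the example path are made up for illustration.

import numpy as np
import librosa
from scipy.io.wavfile import read

def load_wav_any(full_path, target_sr=44100):
    # Hypothetical helper (not in the commit): return float32 audio in [-1, 1]
    # regardless of which backend succeeded.
    try:
        sampling_rate, data = read(full_path)  # integer PCM for most WAV files
        if np.issubdtype(data.dtype, np.integer):
            data = data.astype(np.float32) / np.iinfo(data.dtype).max
        else:
            data = data.astype(np.float32)
    except Exception:
        # non-WAV container or unreadable header: librosa already returns float32
        data, sampling_rate = librosa.load(full_path, sr=target_sr)
    return data, sampling_rate

# Illustrative usage (path is hypothetical):
# wav, sr = load_wav_any("data/example.wav")
# print(wav.dtype, float(np.abs(wav).max()), sr)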
@@ -54,12 +67,16 @@ hann_window = {}
 
 def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
 
-    y = torch.clamp(y, min=-1, max=1)
 
-
-
-
-
+    # y = torch.clamp(y, -1, 1)
+
+
+    # if torch.min(y) < -1.:
+    #     # y = torch.clamp(y, min = -1)
+    #     # print('min value is ', torch.min(y))
+    # if torch.max(y) > 1.:
+    #     y = torch.clamp(y, max = -1)
+    #     print('max value is ', torch.max(y))
 
     global mel_basis, hann_window
     if fmax not in mel_basis:
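With the clamp and the min/max checks commented out above, out-of-range samples now pass straight into the STFT. If a guard is ever wanted again, a non-clamping range check at the call site is one option; the helper below is only a sketch under that assumption and is not part of the commit.

import torch

def check_unit_range(y, name="audio"):
    # Report, rather than clamp, samples outside [-1, 1] so data problems stay visible.
    lo, hi = y.min().item(), y.max().item()
    if lo < -1.0 or hi > 1.0:
        print(f"{name} out of range: min={lo:.4f}, max={hi:.4f}")
    return y

# y = check_unit_range(torch.FloatTensor(audio).unsqueeze(0))
# mel = mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax)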
@@ -87,12 +104,10 @@ to_mel = torchaudio.transforms.MelSpectrogram(
     sample_rate=44_100, n_mels=128, n_fft=2048, win_length=2048, hop_length=512)
 
 
-
-# to_mel = torchaudio.transforms.MelSpectrogram(
-#     sample_rate=24000, n_mels=80, n_fft=2048, win_length=1200, hop_length=300, center='center')
+# to_mel = torchaudio.transforms.MelSpectrogram(n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
 
 mean, std = -4, 4
-
+5
 def preproces(wave,to_mel=to_mel, device='cpu'):
 
     to_mel = to_mel.to(device)
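For orientation, the torchaudio transform kept above produces a (..., n_mels, frames) power mel spectrogram, and the fixed `mean, std = -4, 4` pair points at the usual log-compress-then-standardize step. The body of preproces is only partially shown in this diff, so the sketch below is an assumed approximation of that pipeline using the same transform parameters, not the repository's exact function.

import torch
import torchaudio

to_mel = torchaudio.transforms.MelSpectrogram(
    sample_rate=44_100, n_mels=128, n_fft=2048, win_length=2048, hop_length=512)
mean, std = -4, 4

def preprocess_sketch(wave, device='cpu'):
    # wave: float tensor in [-1, 1], shape (samples,) or (channels, samples)
    mel = to_mel.to(device)(wave.to(device))
    # log-compress with a small floor, then standardize with the fixed stats above
    log_mel = torch.log(torch.clamp(mel, min=1e-5))
    return (log_mel - mean) / std

# x = torch.randn(1, 44_100)          # one second of noise as a stand-in signal
# print(preprocess_sketch(x).shape)   # torch.Size([1, 128, 87]) with these settings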
@@ -142,17 +157,19 @@ class MelDataset(torch.utils.data.Dataset):
         filename = self.audio_files[index]
         if self._cache_ref_count == 0:
             audio, sampling_rate = load_wav(filename)
-
-
+            audio = audio / MAX_WAV_VALUE
+            if not self.fine_tuning:
+                audio = normalize(audio) * 0.95
             self.cached_wav = audio
             if sampling_rate != self.sampling_rate:
-
-
+                audio = librosa.resample(audio, orig_sr= sampling_rate, target_sr= self.sampling_rate)
+                # raise ValueError("{} SR doesn't match target {} SR, {}".format(
+                #     sampling_rate, self.sampling_rate, filename))
             self._cache_ref_count = self.n_cache_reuse
         else:
             audio = self.cached_wav
             self._cache_ref_count -= 1
-
+
         audio = torch.FloatTensor(audio)
         audio = audio.unsqueeze(0)
 
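The __getitem__ change above resamples mismatched files on the fly instead of raising. As a quick standalone illustration of how `librosa.resample` rescales the array length (the 22050 Hz source rate is just an example, not taken from the commit):

import numpy as np
import librosa

target_sr = 44100
orig_sr = 22050                                                # example source rate only

audio = np.random.uniform(-1, 1, orig_sr).astype(np.float32)   # one second of noise
resampled = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

# Length scales by target_sr / orig_sr; the duration stays about one second.
print(len(audio), len(resampled))                              # 22050 44100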