File size: 5,049 Bytes
5a9b731
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Standard library
import os
import random
import string

# Third-party
import numpy as np
import torch
import torchaudio
from audiotools import AudioSignal
from TTS.api import TTS

# Project-local
#Andy edited: import losses
import audio_diffusion_attacks_forhf.src.losses
# Bind the short name `losses` that the code below actually uses
# (the plain `import a.b.c` form above does not create it).
from audio_diffusion_attacks_forhf.src import losses

class XTTS_Eval:
    """Evaluate voice-cloning leakage with XTTS v2.

    Clones a speaker from both an original and a "protected" (perturbed)
    recording, then measures L1 and multi-scale mel-spectrogram distances
    between the inputs and the cloned generations.  Larger distances between
    the protected generation and the original speaker indicate the
    protection succeeded in degrading the clone.
    """

    def __init__(self, input_sample_rate, text="The quick brown fox jumps over the lazy dog."):
        """
        Args:
            input_sample_rate: sample rate (Hz) of the audio tensors passed
                to ``eval``; generations are resampled to this rate.
            text: sentence synthesized when cloning each voice.
        """
        self.model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
        self.model = self.model.to(device='cuda')
        self.text = text
        self.input_sample_rate = input_sample_rate
        # Multi-resolution mel distance used as a perceptual similarity metric.
        self.mel_loss = losses.MelSpectrogramLoss(
            n_mels=[5, 10, 20, 40, 80, 160, 320],
            window_lengths=[32, 64, 128, 256, 512, 1024, 2048],
            mel_fmin=[0, 0, 0, 0, 0, 0, 0],
            pow=1.0,
            clamp_eps=1.0e-5,
            mag_weight=0.0,
        )

    def eval(self, original_audio, protected_audio):
        """Compare XTTS generations cloned from original vs. protected audio.

        Args:
            original_audio: batched waveform tensor; only element 0 is used,
                giving a (channels, n_samples) tensor.
            protected_audio: protected counterpart, same layout.

        Returns:
            Tuple ``(eval_dict, unprotected_gen, protected_gen)`` where
            ``eval_dict`` maps metric names to scalar tensors and the two
            generations are trimmed to a common length.
        """
        original_audio = original_audio[0]
        protected_audio = protected_audio[0]

        unprotected_gen = self.generate_audio(original_audio).to(device='cuda')
        protected_gen = self.generate_audio(protected_audio).to(device='cuda')

        # All four signals can have different lengths; trim every
        # reference-vs-generation comparison to the common minimum so the
        # elementwise metrics below are well-defined for every pair.
        match_len = min(original_audio.shape[1], protected_audio.shape[1],
                        unprotected_gen.shape[1], protected_gen.shape[1])
        s_original_audio = original_audio[:, :match_len]
        s_protected_audio = protected_audio[:, :match_len]
        s_unprotected_gen = unprotected_gen[:, :match_len]
        # BUG FIX: the original code sliced `unprotected_gen` here, so every
        # "protected generation" metric silently compared the unprotected
        # generation instead of the protected one.
        s_protected_gen = protected_gen[:, :match_len]

        # The two generations are compared to each other at their own common
        # length (they may be longer than the reference recordings).
        gen_len = min(protected_gen.shape[1], unprotected_gen.shape[1])
        protected_gen = protected_gen[:, :gen_len]
        unprotected_gen = unprotected_gen[:, :gen_len]

        eval_dict = {}
        # Difference between original and unprotected gen
        eval_dict["original_unprotectedgen_l1"] = torch.mean(torch.abs(s_original_audio - s_unprotected_gen))
        eval_dict["original_unprotectedgen_mel"] = self.mel_loss(AudioSignal(s_original_audio, self.input_sample_rate), AudioSignal(s_unprotected_gen, self.input_sample_rate))
        # Difference between original and protected gen
        eval_dict["original_protectedgen_l1"] = torch.mean(torch.abs(s_original_audio - s_protected_gen))
        eval_dict["original_protectedgen_mel"] = self.mel_loss(AudioSignal(s_original_audio, self.input_sample_rate), AudioSignal(s_protected_gen, self.input_sample_rate))
        # Difference between protected and protected gen
        eval_dict["protected_protectedgen_l1"] = torch.mean(torch.abs(s_protected_audio - s_protected_gen))
        eval_dict["protected_protectedgen_mel"] = self.mel_loss(AudioSignal(s_protected_audio, self.input_sample_rate), AudioSignal(s_protected_gen, self.input_sample_rate))
        # Difference between unprotected gen and protected gen
        eval_dict["protectedgen_unprotectedgen_l1"] = torch.mean(torch.abs(protected_gen - unprotected_gen))
        eval_dict["protectedgen_unprotectedgen_mel"] = self.mel_loss(AudioSignal(protected_gen, self.input_sample_rate), AudioSignal(unprotected_gen, self.input_sample_rate))
        return eval_dict, unprotected_gen, protected_gen

    def generate_audio(self, audio):
        """Clone the speaker in ``audio`` and return stereo speech.

        XTTS conditions on a speaker wav file on disk, so the tensor is
        round-tripped through a uniquely-named temporary file under
        ``test_audio/``.

        Args:
            audio: speaker reference waveform at ``self.input_sample_rate``.
                NOTE(review): the reshape below assumes it is stereo,
                i.e. (2, n_samples) — confirm with callers.

        Returns:
            (2, n_samples) tensor of generated speech resampled to
            ``self.input_sample_rate``.
        """
        # BUG FIX: create the scratch directory instead of crashing on a
        # fresh checkout where test_audio/ does not exist yet.
        os.makedirs("test_audio", exist_ok=True)
        random_str = ''.join(random.choices(string.ascii_uppercase + string.digits, k=50))
        speaker_path = f"test_audio/{random_str}.wav"
        torchaudio.save(speaker_path,
                        torch.reshape(audio.detach().cpu(), (2, audio.shape[1])),
                        self.input_sample_rate, format="wav")
        # Fixed seed so XTTS sampling is deterministic across calls,
        # making the protected/unprotected generations comparable.
        torch.manual_seed(0)
        try:
            wav = self.model.tts(text=self.text,
                                 speaker_wav=speaker_path,
                                 language="en")
        finally:
            # BUG FIX: previously the temp file leaked whenever tts() raised.
            os.remove(speaker_path)
        wav = torch.from_numpy(np.array(wav))
        # Duplicate the mono generation into both stereo channels.
        stereo_wave = torch.zeros((2, wav.shape[0]))
        stereo_wave[:, :] = wav
        # XTTS v2 emits 24 kHz audio; resample to the evaluation rate.
        transform = torchaudio.transforms.Resample(24000, self.input_sample_rate)
        return transform(stereo_wave)

# # Init TTS
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
#
# # Run TTS
# # ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
# # Text to speech list of amplitude values as output
# # wav = tts.tts(text="Hello world!", speaker_wav=, language="en")
# # Text to speech to a file
# tts.tts_to_file(text="Hello world!", 
#                 speaker_wav="/media/willie/1caf5422-4135-4f2c-9619-c44041b51146/audio_data/DS_10283_3443/VCTK-Corpus-0.92/wav48_silence_trimmed/p227/p227_023_mic1.flac", 
#                 language="en", 
#                 file_path="/home/willie/eclipse-workspace/audio_diffusion_attacks/src/test_audio/speech/output.wav")