fffiloni committed on
Commit
84a8081
·
1 Parent(s): 1062de0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -0
app.py CHANGED
@@ -60,4 +60,77 @@ def wav_bytes_from_spectrogram_image(image: Image.Image) -> T.Tuple[io.BytesIO,
60
 
61
  return wav_bytes
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  gr.Interface(fn=wav_bytes_from_spectrogram_image, inputs=[gr.Image()], outputs=[gr.Audio()]).launch()
 
60
 
61
  return wav_bytes
62
 
63
def spectrogram_from_image(
    image: Image.Image, max_volume: float = 50, power_for_image: float = 0.25
) -> np.ndarray:
    """
    Compute a spectrogram magnitude array from a spectrogram image.

    The image is flipped vertically, reduced to a single channel, inverted
    (``255 - value``), rescaled so that an inverted value of 255 maps to
    ``max_volume``, and finally raised to the power ``1 / power_for_image``.

    Args:
        image: The spectrogram image (anything ``np.array`` can convert).
            Either a multi-channel ``(height, width, channels)`` image, of
            which only channel 0 is used, or a single-channel
            ``(height, width)`` image.
        max_volume: Value the fully inverted pixel (original value 0) is
            scaled to before the power curve is reversed.
        power_for_image: Exponent of the power curve being reversed. Must be
            non-zero, since its reciprocal is used as the exponent.

    Returns:
        A 2-D float array of shape ``(height, width)`` with the rows in
        reverse order relative to the image.

    TODO(hayk): Add image_from_spectrogram and call this out as the reverse.
    """
    # Convert to a numpy array of floats
    pixels = np.array(image).astype(np.float32)

    # Take a single channel. Single-channel (grayscale) images arrive as 2-D
    # arrays and have no channel axis to index, so use them as they are.
    channel = pixels if pixels.ndim == 2 else pixels[:, :, 0]

    # Flip Y so the first output row corresponds to the bottom image row
    data = channel[::-1]

    # Invert
    data = 255 - data

    # Rescale to max volume
    data = data * max_volume / 255

    # Reverse the power curve
    data = np.power(data, 1 / power_for_image)

    return data
86
+
87
def waveform_from_spectrogram(
    Sxx: np.ndarray,
    n_fft: int,
    hop_length: int,
    win_length: int,
    num_samples: int,
    sample_rate: int,
    mel_scale: bool = True,
    n_mels: int = 512,
    max_mel_iters: int = 200,
    num_griffin_lim_iters: int = 32,
    device: str = "cuda:0",
) -> np.ndarray:
    """
    Reconstruct a waveform from a spectrogram.

    This is an approximate inverse of spectrogram_from_waveform, using the Griffin-Lim algorithm
    to approximate the phase.

    Args:
        Sxx: Spectrogram magnitudes. When ``mel_scale`` is true it is first
            passed through an inverse mel transform.
        n_fft: FFT size; also determines the number of STFT bins
            (``n_fft // 2 + 1``) targeted by the inverse mel transform.
        hop_length: Hop length handed to Griffin-Lim.
        win_length: Window length handed to Griffin-Lim.
        num_samples: Currently unused by this function; kept for interface
            compatibility.
        sample_rate: Sample rate handed to the inverse mel transform.
        mel_scale: Whether ``Sxx`` is mel-scaled and must be inverted first.
        n_mels: Number of mel bins of ``Sxx`` (only used when ``mel_scale``).
        max_mel_iters: Iteration cap for the inverse mel transform.
        num_griffin_lim_iters: Number of Griffin-Lim iterations.
        device: Torch device to compute on. If a CUDA device is requested
            but CUDA is not available, the computation falls back to ``"cpu"``.

    Returns:
        The reconstructed waveform as a numpy array.
    """
    # The default device is a GPU; on CPU-only hosts moving tensors to it would
    # raise, so degrade to CPU instead of failing.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"

    Sxx_torch = torch.from_numpy(Sxx).to(device)

    # TODO(hayk): Make this a class that caches the two things

    if mel_scale:
        # NOTE(review): `max_iter` appears to have been dropped from
        # InverseMelScale in newer torchaudio releases — confirm against the
        # pinned torchaudio version.
        mel_inv_scaler = torchaudio.transforms.InverseMelScale(
            n_mels=n_mels,
            sample_rate=sample_rate,
            f_min=0,
            f_max=10000,
            n_stft=n_fft // 2 + 1,
            norm=None,
            mel_scale="htk",
            max_iter=max_mel_iters,
        ).to(device)

        Sxx_torch = mel_inv_scaler(Sxx_torch)

    # power=1.0: the input is treated as magnitudes rather than power values
    griffin_lim = torchaudio.transforms.GriffinLim(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        power=1.0,
        n_iter=num_griffin_lim_iters,
    ).to(device)

    # Move the result back to host memory before converting to numpy
    waveform = griffin_lim(Sxx_torch).cpu().numpy()

    return waveform
134
+
135
+
136
  gr.Interface(fn=wav_bytes_from_spectrogram_image, inputs=[gr.Image()], outputs=[gr.Audio()]).launch()