Update app.py
app.py CHANGED
@@ -60,4 +60,77 @@ def wav_bytes_from_spectrogram_image(image: Image.Image) -> T.Tuple[io.BytesIO,

     return wav_bytes

+def spectrogram_from_image(
+    image: Image.Image, max_volume: float = 50, power_for_image: float = 0.25
+) -> np.ndarray:
+    """
+    Compute a spectrogram magnitude array from a spectrogram image.
+    TODO(hayk): Add image_from_spectrogram and call this out as the reverse.
+    """
+    # Convert to a numpy array of floats
+    data = np.array(image).astype(np.float32)
+
+    # Flip Y take a single channel
+    data = data[::-1, :, 0]
+
+    # Invert
+    data = 255 - data
+
+    # Rescale to max volume
+    data = data * max_volume / 255
+
+    # Reverse the power curve
+    data = np.power(data, 1 / power_for_image)
+
+    return data
+
+def waveform_from_spectrogram(
+    Sxx: np.ndarray,
+    n_fft: int,
+    hop_length: int,
+    win_length: int,
+    num_samples: int,
+    sample_rate: int,
+    mel_scale: bool = True,
+    n_mels: int = 512,
+    max_mel_iters: int = 200,
+    num_griffin_lim_iters: int = 32,
+    device: str = "cuda:0",
+) -> np.ndarray:
+    """
+    Reconstruct a waveform from a spectrogram.
+    This is an approximate inverse of spectrogram_from_waveform, using the Griffin-Lim algorithm
+    to approximate the phase.
+    """
+    Sxx_torch = torch.from_numpy(Sxx).to(device)
+
+    # TODO(hayk): Make this a class that caches the two things
+
+    if mel_scale:
+        mel_inv_scaler = torchaudio.transforms.InverseMelScale(
+            n_mels=n_mels,
+            sample_rate=sample_rate,
+            f_min=0,
+            f_max=10000,
+            n_stft=n_fft // 2 + 1,
+            norm=None,
+            mel_scale="htk",
+            max_iter=max_mel_iters,
+        ).to(device)
+
+        Sxx_torch = mel_inv_scaler(Sxx_torch)
+
+    griffin_lim = torchaudio.transforms.GriffinLim(
+        n_fft=n_fft,
+        win_length=win_length,
+        hop_length=hop_length,
+        power=1.0,
+        n_iter=num_griffin_lim_iters,
+    ).to(device)
+
+    waveform = griffin_lim(Sxx_torch).cpu().numpy()
+
+    return waveform
+
+
 gr.Interface(fn=wav_bytes_from_spectrogram_image, inputs=[gr.Image()], outputs=[gr.Audio()]).launch()
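For readers skimming the diff, the sketch below shows how the two new helpers could be composed end to end: spectrogram image, to magnitude spectrogram, to waveform, to an in-memory WAV. Only spectrogram_from_image and waveform_from_spectrogram come from the code above; the wrapper name audio_from_spectrogram_image, the parameter values (sample_rate, n_fft, hop_length), the CPU fallback, and the scipy-based WAV serialization are illustrative assumptions, not taken from this commit.

# Hypothetical glue code, NOT part of this commit: parameter values, the
# device fallback, and the scipy WAV serialization are assumptions chosen
# for illustration. spectrogram_from_image and waveform_from_spectrogram
# are the functions added in the diff above.
import io

import numpy as np
import torch
from PIL import Image
from scipy.io import wavfile


def audio_from_spectrogram_image(image: Image.Image) -> io.BytesIO:
    # Image -> magnitude spectrogram (frequency bins x time frames)
    Sxx = spectrogram_from_image(image, max_volume=50, power_for_image=0.25)

    sample_rate = 44_100  # assumed; must match whatever produced the image
    n_fft = 8192          # assumed
    hop_length = 512      # assumed

    # Spectrogram -> waveform via inverse mel scaling + Griffin-Lim.
    # num_samples is required by the signature but unused in the body above,
    # so a rough estimate from the frame count is passed here.
    samples = waveform_from_spectrogram(
        Sxx=Sxx,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        num_samples=Sxx.shape[-1] * hop_length,
        sample_rate=sample_rate,
        mel_scale=True,
        n_mels=512,  # assumes a 512-pixel-tall spectrogram image
        device="cuda:0" if torch.cuda.is_available() else "cpu",
    )

    # Waveform -> in-memory WAV file (float32 PCM via scipy)
    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, samples.astype(np.float32))
    wav_bytes.seek(0)
    return wav_bytes

In the Space itself, wav_bytes_from_spectrogram_image (the function wired into gr.Interface) presumably performs an equivalent chain with its own parameter values; consult the full app.py for the real ones.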