Upload folder using huggingface_hub
- README.md +2 -8
- __pycache__/custom_scaler.cpython-312.pyc +0 -0
- __pycache__/functions.cpython-312.pyc +0 -0
- __pycache__/unet.cpython-312.pyc +0 -0
- app.py +42 -0
- custom_scaler.py +35 -0
- examples/VoiceBank+DEMAND_test_sample_male_1.wav +0 -0
- examples/durim_test_sample_1.wav +0 -0
- examples/durim_test_sample_2.wav +0 -0
- functions.py +83 -0
- model.pth +3 -0
- requirements.txt +3 -0
- test.py +18 -0
- unet.py +75 -0
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: blue
-colorTo: yellow
+title: Speech_Enhancement_Demo
+app_file: app.py
 sdk: gradio
 sdk_version: 5.5.0
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/custom_scaler.cpython-312.pyc
ADDED
Binary file (2.67 kB).

__pycache__/functions.cpython-312.pyc
ADDED
Binary file (4.85 kB).

__pycache__/unet.cpython-312.pyc
ADDED
Binary file (5.6 kB).
app.py
ADDED
@@ -0,0 +1,42 @@
import gradio as gr
import torch
from functions import *
from unet import UNet
from custom_scaler import min_max_scaler

model = UNet()
model_state_dict = torch.load("huggingface/model.pth", map_location="cpu")
model.load_state_dict(model_state_dict["model_state_dict"])

scaler = min_max_scaler()
scaler.fit()

with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(
            """
            # Speech enhancement demonstration

            Hello!

            This is a demo for a speech enhancement model trained to reduce background noise and ensure the intelligibility of a single speaker.

            Feel free to upload your own audio file or try one of our example files to see how it works!
            """
        )
    with gr.Row():
        with gr.Column():
            audio_path = gr.Audio(sources="upload", type="filepath", label="Upload your audio here", format="wav")
        with gr.Column():
            enhanced_audio = gr.Audio(sources=None, label="Enhanced audio will be found here", format="wav")
    with gr.Row():
        files = gr.FileExplorer(label="Example files", file_count="single", root_dir="huggingface/examples", interactive=True)
        files.change(fn=return_input, inputs=files, outputs=audio_path)
        files.change(fn=return_input, inputs=None, outputs=enhanced_audio)
    with gr.Row():
        submit_audio = gr.Button(value="Submit audio for enhancement")
        submit_audio.click(fn=lambda x: predict(x, model, scaler), inputs=audio_path, outputs=enhanced_audio, trigger_mode="once")

demo.launch(share=True)
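The click handler above simply forwards the uploaded file path to predict() together with the module-level model and scaler. Note that the "huggingface/..." prefixes in app.py assume the repository was cloned into a folder named huggingface; when running from the repository root, the checkpoint and examples live at model.pth and examples/. A minimal sketch of driving the same pipeline without the Gradio UI (the paths below are assumptions for a repo-root checkout):

import torch
from functions import predict
from unet import UNet
from custom_scaler import min_max_scaler

# load the trained UNet from the checkpoint shipped with the Space
model = UNet()
state = torch.load("model.pth", map_location="cpu")
model.load_state_dict(state["model_state_dict"])
model.eval()

# scaler defaults match the [-100, 0] dB range produced by functions.amplitude_to_db
scaler = min_max_scaler()
scaler.fit()

# run one of the bundled example files through the enhancement pipeline
out_path = predict("examples/durim_test_sample_1.wav", model, scaler)
print("enhanced audio written to", out_path)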
custom_scaler.py
ADDED
@@ -0,0 +1,35 @@
import torch

class min_max_scaler():
    def __init__(self, upper_bound=1, lower_bound=0):
        self.upper = upper_bound
        self.lower = lower_bound
        self.minimum = torch.ones(1) * torch.inf
        self.maximum = -torch.ones(1) * torch.inf

    def fit(self, set_maximum=0.0, set_minimum=-100.0):
        """Set the min and max values used for scaling.

        Since dB spectrograms are on the scale [-100, 0] by default, the default values are set accordingly.

        Args:
            set_maximum (float, optional): set the maximum value manually. Defaults to 0.0.
            set_minimum (float, optional): set the minimum value manually. Defaults to -100.0.

        Returns:
            None
        """
        if set_minimum is not None and set_maximum is not None:
            self.minimum = set_minimum
            self.maximum = set_maximum
        return None

    def transform(self, spectrogram):
        if self.minimum == torch.inf:
            raise ValueError("Cannot transform before scaler is fitted with min-max values")
        return (self.upper - self.lower) * (spectrogram - self.minimum) / (self.maximum - self.minimum) + self.lower

    def inverse_transform(self, spectrogram):
        if self.minimum == torch.inf:
            raise ValueError("Cannot inverse transform before scaler is fitted with min-max values")
        return (spectrogram - self.lower) * (self.maximum - self.minimum) / (self.upper - self.lower) + self.minimum
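Since fit() defaults to a [-100, 0] dB input range and a [0, 1] output range, transform() and inverse_transform() are exact inverses of each other. A small round-trip sketch, with values chosen to sit at the ends and middle of the default range:

import torch
from custom_scaler import min_max_scaler

scaler = min_max_scaler()   # upper_bound=1, lower_bound=0
scaler.fit()                # minimum=-100.0, maximum=0.0

db = torch.tensor([-100.0, -50.0, 0.0])
scaled = scaler.transform(db)                # tensor([0.0, 0.5, 1.0])
restored = scaler.inverse_transform(scaled)
assert torch.allclose(restored, db)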
examples/VoiceBank+DEMAND_test_sample_male_1.wav
ADDED
Binary file (112 kB).

examples/durim_test_sample_1.wav
ADDED
Binary file (300 kB).

examples/durim_test_sample_2.wav
ADDED
Binary file (347 kB).
functions.py
ADDED
@@ -0,0 +1,83 @@
import torchaudio
from torch import hamming_window, log10, no_grad, exp


def return_input(user_input=None):
    # default of None lets this double as a "clear the output" callback with no inputs
    if user_input is None:
        return None
    return user_input

def load_audio(audio_path):
    audio_tensor, sr = torchaudio.load(audio_path)
    audio_tensor = torchaudio.functional.resample(audio_tensor, sr, 16000)
    return audio_tensor

def load_audio_numpy(audio_path):
    audio_tensor, sr = torchaudio.load(audio_path)
    audio_tensor = torchaudio.functional.resample(audio_tensor, sr, 16000)
    audio_array = audio_tensor.numpy()
    return (16000, audio_array.ravel())

def audio_to_spectrogram(audio):
    transform_fn = torchaudio.transforms.Spectrogram(n_fft=512, hop_length=512//4, power=None, window_fn=hamming_window)
    spectrogram = transform_fn(audio)
    return spectrogram

def extract_magnitude_and_phase(spectrogram):
    magnitude, phase = spectrogram.abs(), spectrogram.angle()
    return magnitude, phase

def amplitude_to_db(magnitude_spec):
    max_amplitude = magnitude_spec.max()
    db_spectrogram = torchaudio.functional.amplitude_to_DB(magnitude_spec, 20, 10e-10, log10(max_amplitude), 100.0)
    return db_spectrogram, max_amplitude

def min_max_scaling(spectrogram, scaler):
    # Min-max scaling (the soundness of the math is questionable because each spectrogram's own max value is used during decibel scaling)
    spectrogram = scaler.transform(spectrogram)
    return spectrogram

def inverse_min_max(spectrogram, scaler):
    spectrogram = scaler.inverse_transform(spectrogram)
    return spectrogram

def db_to_amplitude(db_spectrogram, max_amplitude):
    return max_amplitude * 10**(db_spectrogram/20)

def reconstruct_complex_spectrogram(magnitude, phase):
    return magnitude * exp(1j*phase)

def inverse_fft(spectrogram):
    inverse_fn = torchaudio.transforms.InverseSpectrogram(n_fft=512, hop_length=512//4, window_fn=hamming_window)
    return inverse_fn(spectrogram)

def transform_audio(audio, scaler):
    spectrogram = audio_to_spectrogram(audio)
    magnitude, phase = extract_magnitude_and_phase(spectrogram)
    db_spectrogram, max_amplitude = amplitude_to_db(magnitude)
    db_spectrogram = min_max_scaling(db_spectrogram, scaler)
    return db_spectrogram.unsqueeze(0), phase, max_amplitude

def spectrogram_to_audio(db_spectrogram, scaler, phase, max_amplitude):
    db_spectrogram = db_spectrogram.squeeze(0)
    db_spectrogram = inverse_min_max(db_spectrogram, scaler)
    spectrogram = db_to_amplitude(db_spectrogram, max_amplitude)
    complex_spec = reconstruct_complex_spectrogram(spectrogram, phase)
    audio = inverse_fft(complex_spec)
    return audio

def save_audio(audio):
    torchaudio.save("enhanced_audio.wav", audio, 16000)
    return "enhanced_audio.wav"

def predict(user_input, model, scaler):
    audio = load_audio(user_input)
    spectrogram, phase, max_amplitude = transform_audio(audio, scaler)

    with no_grad():
        enhanced_spectrogram = model.forward(spectrogram)
    enhanced_audio = spectrogram_to_audio(enhanced_spectrogram, scaler, phase, max_amplitude)
    enhanced_audio_path = save_audio(enhanced_audio)
    return enhanced_audio_path
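transform_audio() and spectrogram_to_audio() are meant to be inverses of each other up to the lossy steps in between (the 100 dB dynamic-range clamp and STFT edge effects), so the chain can be sanity-checked without the model in the loop. A sketch of that check, using one of the bundled example files (the path is an assumption for a repo-root checkout):

from functions import load_audio, transform_audio, spectrogram_to_audio
from custom_scaler import min_max_scaler

scaler = min_max_scaler()
scaler.fit()

audio = load_audio("examples/durim_test_sample_1.wav")         # (channels, samples), resampled to 16 kHz
spec, phase, max_amp = transform_audio(audio, scaler)          # spec: (1, channels, 257, frames), scaled to [0, 1]
restored = spectrogram_to_audio(spec, scaler, phase, max_amp)  # back to (channels, samples)

print(audio.shape, restored.shape)   # lengths agree up to STFT edge effects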
model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1b7f1b84b4e0520ab3d07e3dad89f1dee2f8e74a845a1c6c5ced5e482227e4b1
size 137041542
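The file above is a Git LFS pointer rather than the weights themselves; the actual ~137 MB checkpoint is fetched by LFS and is loaded in app.py as a dictionary with a "model_state_dict" entry. A quick way to inspect it once the real file is present (the path assumes a repo-root checkout):

import torch

ckpt = torch.load("model.pth", map_location="cpu")
print(list(ckpt.keys()))   # should include "model_state_dict", per app.py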
requirements.txt
ADDED
@@ -0,0 +1,3 @@
torch==2.3.0
torchaudio==2.3.0
gradio==5.5.0
test.py
ADDED
@@ -0,0 +1,18 @@
import torchaudio
import torch
import torch.nn as nn
import numpy as np
from functions import *
from unet import UNet

"""
model = UNet()
model_state_dict = torch.load("huggingface/model.pth", map_location="cpu")
model.load_state_dict(model_state_dict["model_state_dict"])
print("# of trainable parameters =", sum(p.numel() for p in model.parameters() if p.requires_grad))

audio = load_audio("huggingface/p232_001.wav")
enhanced = predict("huggingface/p232_001.wav", model)
print(enhanced.shape)"""
string = "C:/Users/durim/Documents/KTH/Master_År2/DT2119-SSR/project_feature_extraction.ipynb"
print("/".join(string.split("/")[:-1]))
unet.py
ADDED
@@ -0,0 +1,75 @@
import torch
import torch.nn as nn

class EncodingBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(EncodingBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.activation = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv1(x)
        x = self.activation(x)
        x = self.conv2(x)
        x = self.activation(x)
        skip_connection = x
        x = self.pool(x)
        return x, skip_connection

class DecodingBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DecodingBlock, self).__init__()
        self.conv_transpose = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=5, stride=2, padding=2)
        self.conv1 = nn.Conv2d(out_channels * 2, out_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.activation = nn.ReLU(inplace=True)

    def forward(self, x, skip_connection):
        x = self.conv_transpose(x)
        pd = (0, skip_connection.size(-1) - x.size(-1), 0, skip_connection.size(-2) - x.size(-2))
        x = nn.functional.pad(x, pd, mode='constant', value=0)
        x = torch.cat((x, skip_connection), dim=1)
        x = self.conv1(x)
        x = self.activation(x)
        x = self.conv2(x)
        x = self.activation(x)
        return x

class UNet(nn.Module):
    def __init__(self, init_features=32, bottleneck_size=512):
        super(UNet, self).__init__()
        self.encoding_block1 = EncodingBlock(1, init_features)
        self.encoding_block2 = EncodingBlock(init_features, init_features*2)
        self.encoding_block3 = EncodingBlock(init_features*2, init_features*4)
        self.encoding_block4 = EncodingBlock(init_features*4, init_features*8)

        self.bottleneck_conv1 = nn.Conv2d(init_features*8, bottleneck_size, kernel_size=3, padding=1)
        self.bottleneck_conv2 = nn.Conv2d(bottleneck_size, bottleneck_size, kernel_size=3, padding=1)

        self.decoding_block4 = DecodingBlock(bottleneck_size, init_features*8)
        self.decoding_block3 = DecodingBlock(init_features*8, init_features*4)
        self.decoding_block2 = DecodingBlock(init_features*4, init_features*2)
        self.decoding_block1 = DecodingBlock(init_features*2, init_features)

        self.final_conv = nn.Conv2d(init_features, 1, kernel_size=1)

    def forward(self, x):
        x, skip1 = self.encoding_block1(x)
        x, skip2 = self.encoding_block2(x)
        x, skip3 = self.encoding_block3(x)
        x, skip4 = self.encoding_block4(x)

        x = self.bottleneck_conv1(x)
        x = nn.ReLU(inplace=True)(x)
        x = self.bottleneck_conv2(x)
        x = nn.ReLU(inplace=True)(x)

        x = self.decoding_block4(x, skip4)
        x = self.decoding_block3(x, skip3)
        x = self.decoding_block2(x, skip2)
        x = self.decoding_block1(x, skip1)

        x = self.final_conv(x)
        return x
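Because every convolution is padded and each DecodingBlock pads its upsampled feature map back to the matching skip connection's spatial size before concatenation, the network returns a tensor with the same shape as its input. A shape-check sketch with a dummy input shaped like the output of functions.transform_audio (1 batch, 1 channel, 257 frequency bins from the 512-point FFT, and an arbitrary number of frames):

import torch
from unet import UNet

model = UNet()
dummy = torch.randn(1, 1, 257, 200)   # (batch, channel, freq_bins, frames)
with torch.no_grad():
    out = model(dummy)
print(out.shape)   # torch.Size([1, 1, 257, 200]), same shape as the input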