Spaces:

BorisovMaksim
/

denoising

Runtime error

App Files Files Community

BorisovMaksim committed on May 2, 2023

Commit

bd0a813

1 Parent(s): 3f204d4

fixes

Browse files

Files changed (11) hide show

.gitignore +1 -0
EDA.ipynb +0 -0
README.md +4 -4
app.py +9 -7
datasets.py +39 -0
denoisers/SpectralGating.py +1 -1
denoisers/__pycache__/SpectralGating.cpython-38.pyc +0 -0
denoisers/demucs.py +33 -24
evaluation.py +2 -2
metrics.py +5 -4
train.py +121 -0

.gitignore CHANGED Viewed

@@ -2,3 +2,4 @@
 .ipynb_checkpoints/**
 nohup.out
 __pycache__/**

 .ipynb_checkpoints/**
 nohup.out
 __pycache__/**
+cache_wav/

EDA.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

README.md CHANGED Viewed

@@ -2,9 +2,9 @@
 # Testing
-|                 |    valentini_PESQ    | valentini_STOI |
-|:---------------:|:--------------------:|:--------------:|
-| ideal denoising |        1.9709        |     0.9211     |
-|    baseline     |        1.5693        |     0.9504     |

 # Testing
+|                 | valentini_PESQ | valentini_STOI |
+|:---------------:|:--------------:|:--------------:|
+| ideal denoising |     1.9709     |     0.9211     |
+|    baseline     |     1.7433     |     0.8844     |

app.py CHANGED Viewed

@@ -9,35 +9,37 @@ import logging
 import gradio as gr
 from denoisers.SpectralGating import SpectralGating
 model = SpectralGating()
 def denoising_transform(audio):
-    src_path = "cache_wav/source/{}.wav".format(str(uuid.uuid4()))
-    tgt_path = "cache_wav/target/{}.wav".format(str(uuid.uuid4()))
-    # os.rename(audio.name, src_path)
     (ffmpeg.input(audio)
             .output(src_path, acodec='pcm_s16le', ac=1, ar=22050)
             .run()
     )
-    model.predict(src_path, tgt_path)
     return tgt_path
 inputs = gr.inputs.Audio(label="Source Audio", source="microphone", type='filepath')
 outputs = gr.outputs.Audio(label="Target Audio", type='filepath')
-title = "Chinese-to-English Direct Speech-to-Speech Translation (BETA)"
 #"""
 gr.Interface(
     denoising_transform, inputs, outputs, title=title,
-    allow_flagging='never',
 ).launch(
     server_name='localhost',
     server_port=7871,
     #ssl_keyfile='example.key',
     #ssl_certfile="example.crt",
 )

 import gradio as gr
 from denoisers.SpectralGating import SpectralGating
 model = SpectralGating()
 def denoising_transform(audio):
+    src_path = "cache_wav/original/{}.wav".format(str(uuid.uuid4()))
+    tgt_path = "cache_wav/denoised/{}.wav".format(str(uuid.uuid4()))
     (ffmpeg.input(audio)
             .output(src_path, acodec='pcm_s16le', ac=1, ar=22050)
             .run()
     )
+    model.predict(audio, tgt_path)
     return tgt_path
+    # model.predict(src_path, tgt_path)
+    # return tgt_path
 inputs = gr.inputs.Audio(label="Source Audio", source="microphone", type='filepath')
 outputs = gr.outputs.Audio(label="Target Audio", type='filepath')
+title = "Denoising"
 #"""
 gr.Interface(
     denoising_transform, inputs, outputs, title=title,
+    allow_flagging='never'
 ).launch(
     server_name='localhost',
     server_port=7871,
+    share=True
     #ssl_keyfile='example.key',
     #ssl_certfile="example.crt",
 )

datasets.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import torch
+from torch.utils.data import Dataset
+from pathlib import Path
+from utils import load_wav
+class Valentini(Dataset):
+    """Paired noisy/clean recordings read from a Valentini dataset directory.
+
+    Each item is a ``(noisy_wav, clean_wav)`` tuple loaded with ``load_wav``.
+    The first 20% of the sorted file list forms the validation split and the
+    remaining 80% the training split.
+    """
+
+    def __init__(self, dataset_path='/media/public/datasets/denoising/DS_10283_2791/', transform=None,
+                 valid=False):
+        """Collect the file lists for one split.
+
+        Args:
+            dataset_path: Directory containing ``clean_trainset_56spk_wav``
+                and ``noisy_trainset_56spk_wav``.
+            transform: Optional callable applied to both waveforms of a pair.
+            valid: Select the validation split when true, else the training
+                split.
+        """
+        clean_path = Path(dataset_path) / 'clean_trainset_56spk_wav'
+        noisy_path = Path(dataset_path) / 'noisy_trainset_56spk_wav'
+        # glob() yields entries in arbitrary, filesystem-dependent order.
+        # Sorting keeps the two listings aligned index-by-index and makes the
+        # train/valid split reproducible between runs.
+        # (assumes a clean file and its noisy version share a file name -
+        # TODO confirm against the dataset layout)
+        clean_wavs = sorted(clean_path.glob("*"))
+        noisy_wavs = sorted(noisy_path.glob("*"))
+        # Compare the full listings: two equal-length slices would hide a
+        # mismatch between the directories.
+        assert len(clean_wavs) == len(noisy_wavs)
+        valid_threshold = int(len(clean_wavs) * 0.2)
+        if valid:
+            self.clean_wavs = clean_wavs[:valid_threshold]
+            self.noisy_wavs = noisy_wavs[:valid_threshold]
+        else:
+            self.clean_wavs = clean_wavs[valid_threshold:]
+            self.noisy_wavs = noisy_wavs[valid_threshold:]
+        self.transform = transform
+
+    def __len__(self):
+        """Return the number of pairs in the selected split."""
+        return len(self.clean_wavs)
+
+    def __getitem__(self, idx):
+        """Return the ``(noisy_wav, clean_wav)`` pair at position ``idx``."""
+        noisy_wav = load_wav(self.noisy_wavs[idx])
+        clean_wav = load_wav(self.clean_wavs[idx])
+        if self.transform:
+            # Both waveforms must receive the same random transform. Replay
+            # the RNG state instead of reseeding: reseeding from randint(100)
+            # allowed only 100 distinct outcomes and overwrote the global
+            # random stream on every item.
+            # (assumes the transform draws from torch's default CPU
+            # generator - TODO confirm for transforms other than RandomCrop)
+            rng_state = torch.get_rng_state()
+            noisy_wav = self.transform(noisy_wav)
+            torch.set_rng_state(rng_state)
+            clean_wav = self.transform(clean_wav)
+        return noisy_wav, clean_wav

denoisers/SpectralGating.py CHANGED Viewed

@@ -16,7 +16,7 @@ class SpectralGating(torch.nn.Module):
         data, rate = torchaudio.load(wav_path)
         reduced_noise = torch.Tensor(nr.reduce_noise(y=data, sr=rate))
         torchaudio.save(out_path, reduced_noise, rate)
-        return reduced_noise

         data, rate = torchaudio.load(wav_path)
         reduced_noise = torch.Tensor(nr.reduce_noise(y=data, sr=rate))
         torchaudio.save(out_path, reduced_noise, rate)
+        return out_path

denoisers/__pycache__/SpectralGating.cpython-38.pyc ADDED Viewed

Binary file (1.08 kB). View file

denoisers/demucs.py CHANGED Viewed

@@ -1,36 +1,34 @@
 import torch
 class Encoder(torch.nn.Module):
-    def __init__(self, in_channels, out_channels,
-                 kernel_size_1=8, stride_1=4,
-                 kernel_size_2=1, stride_2=1):
         super(Encoder, self).__init__()
         self.conv1 = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
-                                     kernel_size=kernel_size_1, stride=stride_1)
         self.relu1 = torch.nn.ReLU()
         self.conv2 = torch.nn.Conv1d(in_channels=out_channels, out_channels=2 * out_channels,
-                                     kernel_size=kernel_size_2, stride=stride_2)
-        self.glu = torch.nn.GLU()
     def forward(self, x):
         x = self.relu1(self.conv1(x))
         x = self.glu(self.conv2(x))
         return x
 class Decoder(torch.nn.Module):
-    def __init__(self, in_channels, out_channels,
-                 kernel_size_1=3, stride_1=1,
-                 kernel_size_2=8, stride_2=4):
         super(Decoder, self).__init__()
         self.conv1 = torch.nn.Conv1d(in_channels=in_channels, out_channels=2 * in_channels,
-                                     kernel_size=kernel_size_1, stride=stride_1)
-        self.glu = torch.nn.GLU()
         self.conv2 = torch.nn.ConvTranspose1d(in_channels=in_channels, out_channels=out_channels,
-                                              kernel_size=kernel_size_2, stride=stride_2)
         self.relu = torch.nn.ReLU()
     def forward(self, x):
@@ -40,28 +38,39 @@ class Decoder(torch.nn.Module):
 class Demucs(torch.nn.Module):
-    def __init__(self):
         super(Demucs, self).__init__()
-        self.encoder1 = Encoder(in_channels=1, out_channels=64)
-        self.encoder2 = Encoder(in_channels=64, out_channels=128)
-        self.encoder3 = Encoder(in_channels=128, out_channels=256)
-        self.lstm = torch.nn.LSTM(input_size=256, hidden_size=256, num_layers=2)
-        self.decoder1 = Decoder(in_channels=256, out_channels=128)
-        self.decoder2 = Decoder(in_channels=128, out_channels=64)
-        self.decoder3 = Decoder(in_channels=64, out_channels=1)
     def forward(self, x):
         out1 = self.encoder1(x)
         out2 = self.encoder2(out1)
         out3 = self.encoder3(out2)
-        x = self.lstm(out3)
         x = self.decoder1(x + out3)
         x = self.decoder2(x + out2)
-        x = self.decoder3(x + out1)
         return x

 import torch
+from torch.nn.functional import pad
 class Encoder(torch.nn.Module):
+    def __init__(self, in_channels, out_channels):
         super(Encoder, self).__init__()
         self.conv1 = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
+                                     kernel_size=8, stride=2)
         self.relu1 = torch.nn.ReLU()
         self.conv2 = torch.nn.Conv1d(in_channels=out_channels, out_channels=2 * out_channels,
+                                     kernel_size=1, stride=1)
+        self.glu = torch.nn.GLU(dim=-2)
     def forward(self, x):
         x = self.relu1(self.conv1(x))
+        if x.shape[-1] % 2 == 1:
+            x = pad(x, (0, 1))
         x = self.glu(self.conv2(x))
         return x
 class Decoder(torch.nn.Module):
+    def __init__(self, in_channels, out_channels):
         super(Decoder, self).__init__()
         self.conv1 = torch.nn.Conv1d(in_channels=in_channels, out_channels=2 * in_channels,
+                                     kernel_size=1, stride=1)
+        self.glu = torch.nn.GLU(dim=-2)
         self.conv2 = torch.nn.ConvTranspose1d(in_channels=in_channels, out_channels=out_channels,
+                                              kernel_size=8, stride=2)
         self.relu = torch.nn.ReLU()
     def forward(self, x):
 class Demucs(torch.nn.Module):
+    def __init__(self, H):
         super(Demucs, self).__init__()
+        self.encoder1 = Encoder(in_channels=1, out_channels=H)
+        self.encoder2 = Encoder(in_channels=H, out_channels=2*H)
+        self.encoder3 = Encoder(in_channels=2*H, out_channels=4*H)
+        self.lstm = torch.nn.LSTM(
+                                  input_size=4*H,
+                                  hidden_size=4*H, num_layers=2, batch_first=True)
+        self.decoder1 = Decoder(in_channels=4*H, out_channels=2*H)
+        self.decoder2 = Decoder(in_channels=2*H, out_channels=H)
+        self.decoder3 = Decoder(in_channels=H, out_channels=1)
     def forward(self, x):
         out1 = self.encoder1(x)
         out2 = self.encoder2(out1)
         out3 = self.encoder3(out2)
+        x, _ = self.lstm(out3.permute(0, 2, 1))
+        x = x.permute(0, 2, 1)
         x = self.decoder1(x + out3)
+        x = x[:, :, :out2.shape[-1]]
         x = self.decoder2(x + out2)
+        x = x[:, :, :-1]
+        out1 = out1[:, :, :-1]
+        if x.shape[-1] > out1.shape[-1]:
+            x = x[:, :, :out1.shape[-1]]
+        elif x.shape[-1] < out1.shape[-1]:
+            out1 = out1[:, :, :x.shape[-1]]
+        x = self.decoder3(x + out1)
         return x

evaluation.py CHANGED Viewed

@@ -28,10 +28,10 @@ def evaluate_on_dataset(model_name, dataset_path, dataset_type):
         noisy_wav = load_wav(noisy_path)
         if model_name is None:
-            scores = metrics.calculate(noisy_wav, clean_wav)
         else:
             denoised_wav = model(noisy_wav)
-            scores = metrics.calculate(noisy_wav, denoised_wav)
         mean_scores['PESQ'] += scores['PESQ']
         mean_scores['STOI'] += scores['STOI']

         noisy_wav = load_wav(noisy_path)
         if model_name is None:
+            scores = metrics.calculate(denoised=noisy_wav, clean=clean_wav)
         else:
             denoised_wav = model(noisy_wav)
+            scores = metrics.calculate(denoised=denoised_wav, clean=clean_wav)
         mean_scores['PESQ'] += scores['PESQ']
         mean_scores['STOI'] += scores['STOI']

metrics.py CHANGED Viewed

@@ -2,16 +2,17 @@ from torchmetrics.audio.pesq import PerceptualEvaluationSpeechQuality
 from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility
 import torch
 import torchaudio
 class Metrics:
     def __init__(self, rate=16000):
         self.nb_pesq = PerceptualEvaluationSpeechQuality(rate, 'wb')
         self.stoi = ShortTimeObjectiveIntelligibility(rate, False)
-    def calculate(self, preds, target):
-        return {'PESQ': self.nb_pesq(preds, target),
-                'STOI': self.stoi(preds, target)}

 from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility
 import torch
 import torchaudio
+from torchmetrics import SignalNoiseRatio
 class Metrics:
     def __init__(self, rate=16000):
         self.nb_pesq = PerceptualEvaluationSpeechQuality(rate, 'wb')
         self.stoi = ShortTimeObjectiveIntelligibility(rate, False)
+        self.snr = SignalNoiseRatio()
+    def calculate(self, denoised, clean):
+        return {'PESQ': self.nb_pesq(denoised, clean),
+                'STOI': self.stoi(denoised, clean)}

train.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import os
+from torch.utils.tensorboard import SummaryWriter
+import torch
+from torch.nn import Sequential
+from torch.utils.data import DataLoader
+from datasets import Valentini
+from datetime import datetime
+from torchvision.transforms import RandomCrop
+from utils import load_wav
+from denoisers.demucs import Demucs
+from pathlib import Path
+os.environ['CUDA_VISIBLE_DEVICES'] = "1"
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model = Demucs(H=64).to(device)
+DATASET_PATH = Path('/media/public/datasets/denoising/DS_10283_2791/')
+VALID_WAVS = {'hard': 'p257_171.wav',
+              'medium': 'p232_071.wav',
+              'easy': 'p232_284.wav'}
+MAX_SECONDS = 3.2
+SAMPLE_RATE = 16000
+transform = Sequential(RandomCrop((1, int(MAX_SECONDS * SAMPLE_RATE)), pad_if_needed=True))
+training_loader = DataLoader(Valentini(valid=False, transform=transform), batch_size=12, shuffle=True)
+validation_loader = DataLoader(Valentini(valid=True, transform=transform), batch_size=12, shuffle=True)
+optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
+loss_fn = torch.nn.MSELoss()
+def train_one_epoch(epoch_index, tb_writer):
+    """Run one optimisation pass over ``training_loader``.
+
+    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and
+    ``device``. Every ``log_interval`` batches the mean loss of that window
+    is printed and written to TensorBoard under ``Loss/train``.
+
+    Args:
+        epoch_index: Zero-based epoch number, used to compute the global
+            step for TensorBoard.
+        tb_writer: Writer exposing ``add_scalar``.
+
+    Returns:
+        The mean loss of the most recently reported window, or ``0.`` when
+        the loader yields fewer than ``log_interval`` batches.
+    """
+    log_interval = 1000  # batches between two progress reports
+    running_loss = 0.
+    last_loss = 0.
+    for i, data in enumerate(training_loader):
+        inputs, labels = data
+        inputs, labels = inputs.to(device), labels.to(device)
+        optimizer.zero_grad()
+        outputs = model(inputs)
+        loss = loss_fn(outputs, labels)
+        loss.backward()
+        optimizer.step()
+        running_loss += loss.item()
+        if i % log_interval == log_interval - 1:
+            # Divide by the number of batches actually accumulated; the old
+            # divisor of 100 over a 1000-batch window inflated the value 10x.
+            last_loss = running_loss / log_interval  # loss per batch
+            print('  batch {} loss: {}'.format(i + 1, last_loss))
+            tb_x = epoch_index * len(training_loader) + i + 1
+            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
+            running_loss = 0.
+    return last_loss
+def train():
+    """Train the module-level ``model`` and log progress to TensorBoard.
+
+    Logs the noisy reference clips from ``VALID_WAVS`` once, then for each
+    epoch: trains on ``training_loader``, computes the mean loss over
+    ``validation_loader``, logs both losses and the model output for the
+    reference clips, and saves the weights under ``checkpoints/`` whenever
+    the validation loss improves.
+    """
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    writer = SummaryWriter('runs/denoising_trainer_{}'.format(timestamp))
+    EPOCHS = 5
+    best_vloss = 1_000_000.
+    # torch.save does not create missing directories, so make sure the
+    # checkpoint folder exists before the first improvement is saved.
+    os.makedirs('checkpoints', exist_ok=True)
+    for tag, wav_path in VALID_WAVS.items():
+        wav = load_wav(DATASET_PATH / 'noisy_testset_wav' / wav_path)
+        writer.add_audio(tag=tag, snd_tensor=wav, sample_rate=SAMPLE_RATE)
+    writer.flush()
+    for epoch in range(EPOCHS):
+        print('EPOCH {}:'.format(epoch + 1))
+        # Make sure gradient tracking is on, and do a pass over the data
+        model.train(True)
+        avg_loss = train_one_epoch(epoch, writer)
+        # We don't need gradients on to do reporting
+        model.train(False)
+        running_vloss = 0.0
+        num_vbatches = 0
+        with torch.no_grad():
+            for vdata in validation_loader:
+                vinputs, vlabels = vdata
+                vinputs, vlabels = vinputs.to(device), vlabels.to(device)
+                voutputs = model(vinputs)
+                # .item() keeps the running sum a plain float instead of a
+                # tensor living on the training device.
+                running_vloss += loss_fn(voutputs, vlabels).item()
+                num_vbatches += 1
+            # Guard against an empty loader, which previously raised a
+            # NameError on the undefined loop index.
+            avg_vloss = running_vloss / max(num_vbatches, 1)
+            print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
+            writer.add_scalars('Training vs. Validation Loss',
+                               {'Training': avg_loss, 'Validation': avg_vloss},
+                               epoch + 1)
+            for tag, wav_path in VALID_WAVS.items():
+                wav = load_wav(DATASET_PATH / 'noisy_testset_wav' / wav_path)
+                # Add batch and channel axes before feeding the model.
+                wav = torch.reshape(wav, (1, 1, -1)).to(device)
+                prediction = model(wav)
+                writer.add_audio(tag=f"Model predicted {tag} on epoch {epoch}",
+                                 snd_tensor=prediction,
+                                 sample_rate=SAMPLE_RATE)
+            writer.flush()
+            if avg_vloss < best_vloss:
+                best_vloss = avg_vloss
+                model_path = 'checkpoints/model_{}_{}'.format(timestamp, epoch)
+                torch.save(model.state_dict(), model_path)
+    # Release the event file once training is finished.
+    writer.close()
+if __name__ == '__main__':
+    train()