Docker_v

Sleeping

+import math
+import os
+import librosa
+import numpy as np
+import onnxruntime as ort
+from numpy.fft import rfft
+from numpy.lib.stride_tricks import as_strided
+class PLCMOSEstimator():
+    def __init__(self, model_version=1):
+        """
+        Initialize a PLC-MOS model of a given version. There are currently three models available, v0 (intrusive)
+        and v1 (both non-intrusive and intrusive available). The default is to use the v1 models.
+        """
+        self.model_version = model_version
+        model_paths = [
+            # v0 model:
+            [("models/plcmos_v0.onnx", 999999999999), (None, 0)],
+            # v1 models:
+            [("models/plcmos_v1_intrusive.onnx", 768),
+             ("models/plcmos_v1_nonintrusive.onnx", 999999999999)],
+        ]
+        self.sessions = []
+        self.max_lens = []
+        options = ort.SessionOptions()
+        options.intra_op_num_threads = 8
+        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        for path, max_len in model_paths[model_version]:
+            if not path is None:
+                file_dir = os.path.dirname(os.path.realpath(__file__))
+                self.sessions.append(ort.InferenceSession(
+                    os.path.join(file_dir, path), options))
+                self.max_lens.append(max_len)
+            else:
+                self.sessions.append(None)
+                self.max_lens.append(0)
+    def logpow_dns(self, sig, floor=-30.):
+        """
+        Compute log power of complex spectrum.
+        Floor any -`np.inf` value to (nonzero minimum + `floor`) dB.
+        If all values are 0s, floor all values to -80 dB.
+        """
+        log10e = np.log10(np.e)
+        pspec = sig.real ** 2 + sig.imag ** 2
+        zeros = pspec == 0
+        logp = np.empty_like(pspec)
+        if np.any(~zeros):
+            logp[~zeros] = np.log(pspec[~zeros])
+            logp[zeros] = np.log(pspec[~zeros].min()) + floor / 10 / log10e
+        else:
+            logp.fill(-80 / 10 / log10e)
+        return logp
+    def hop2hsize(self, wind, hop):
+        """
+        Convert hop fraction to integer size if necessary.
+        """
+        if hop >= 1:
+            assert type(hop) == int, "Hop size must be integer!"
+            return hop
+        else:
+            assert 0 < hop < 1, "Hop fraction has to be in range (0,1)!"
+            return int(len(wind) * hop)
+    def stana(self, sig, sr, wind, hop, synth=False, center=False):
+        """
+        Short term analysis by windowing
+        """
+        ssize = len(sig)
+        fsize = len(wind)
+        hsize = self.hop2hsize(wind, hop)
+        if synth:
+            sstart = hsize - fsize  # int(-fsize * (1-hfrac))
+        elif center:
+            sstart = -int(len(wind) / 2)  # odd window centered at exactly n=0
+        else:
+            sstart = 0
+        send = ssize
+        nframe = math.ceil((send - sstart) / hsize)
+        # Calculate zero-padding sizes
+        zpleft = -sstart
+        zpright = (nframe - 1) * hsize + fsize - zpleft - ssize
+        if zpleft > 0 or zpright > 0:
+            sigpad = np.zeros(ssize + zpleft + zpright, dtype=sig.dtype)
+            sigpad[zpleft:len(sigpad) - zpright] = sig
+        else:
+            sigpad = sig
+        return as_strided(sigpad, shape=(nframe, fsize),
+                          strides=(sig.itemsize * hsize, sig.itemsize)) * wind
+    def stft(self, sig, sr, wind, hop, nfft):
+        """
+        Compute STFT: window + rfft
+        """
+        frames = self.stana(sig, sr, wind, hop, synth=True)
+        return rfft(frames, n=nfft)
+    def stft_transform(self, audio, dft_size=512, hop_fraction=0.5, sr=16000):
+        """
+        Compute STFT parameters, then compute STFT
+        """
+        window = np.hamming(dft_size + 1)
+        window = window[:-1]
+        amp = np.abs(self.stft(audio, sr, window, hop_fraction, dft_size))
+        feat = self.logpow_dns(amp, floor=-120.)
+        return feat / 20.
+    def run(self, audio_degraded, audio_clean=None, combined=False):
+        """
+        Run the PLCMOS model and return the MOS for the given audio. If a clean audio file is passed and the
+        selected model version has an intrusive version, that version will be used, otherwise, the nonintrusive
+        model will be used. If combined is set to true (default), the mean of intrusive and nonintrusive models
+        results will be returned, when both are available
+        For intrusive models, the clean reference should be the unprocessed audio file the degraded audio is
+        based on. It is not required to be aligned with the degraded audio.
+        Audio data should be 16kHz, mono, [-1, 1] range.
+        """
+        audio_features_degraded = np.float32(self.stft_transform(audio_degraded))[
+            np.newaxis, np.newaxis, ...]
+        assert len(
+            audio_features_degraded) <= self.max_lens[0], "Maximum input length exceeded"
+        if audio_clean is None:
+            combined = False
+        mos = 0
+        session = self.sessions[0]
+        assert not session is None, "Intrusive model not available for this model version."
+        audio_features_clean = np.float32(self.stft_transform(audio_clean))[
+            np.newaxis, np.newaxis, ...]
+        assert len(
+            audio_features_clean) <= self.max_lens[0], "Maximum input length exceeded"
+        onnx_inputs = {"degraded_audio": audio_features_degraded,
+                       "clean_audio": audio_features_clean}
+        mos = float(session.run(None, onnx_inputs)[0])
+        session = self.sessions[1]
+        assert not session is None, "Nonintrusive model not available for this model version."
+        onnx_inputs = {"degraded_audio": audio_features_degraded}
+        mos_2 = float(session.run(None, onnx_inputs)[0])
+        mos = [mos, mos_2]
+        return mos

utils/__init__.py ADDED Viewed

File without changes

utils/stft.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import torch
+import torch.nn as nn
+class STFTMag(nn.Module):
+    def __init__(self,
+                 nfft=1024,
+                 hop=256):
+        super().__init__()
+        self.nfft = nfft
+        self.hop = hop
+        self.register_buffer('window', torch.hann_window(nfft), False)
+    # x: [B,T] or [T]
+    @torch.no_grad()
+    def forward(self, x):
+        stft = torch.stft(x.cpu(),
+                          self.nfft,
+                          self.hop,
+                          window=self.window,
+                          )  # return_complex=False)  #[B, F, TT,2]
+        mag = torch.norm(stft, p=2, dim=-1)  # [B, F, TT]
+        return mag

utils/tblogger.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from os import path
+import librosa as rosa
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning.utilities import rank_zero_only
+from utils.stft import STFTMag
+matplotlib.use('Agg')
+class TensorBoardLoggerExpanded(TensorBoardLogger):
+    def __init__(self, sr=16000):
+        super().__init__(save_dir='lightning_logs', default_hp_metric=False, name='')
+        self.sr = sr
+        self.stftmag = STFTMag()
+    def fig2np(self, fig):
+        data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
+        data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
+        return data
+    def plot_spectrogram_to_numpy(self, y, y_low, y_recon, step):
+        name_list = ['y', 'y_low', 'y_recon']
+        fig = plt.figure(figsize=(9, 15))
+        fig.suptitle(f'Epoch_{step}')
+        for i, yy in enumerate([y, y_low, y_recon]):
+            if yy.dim() == 1:
+                yy = self.stftmag(yy)
+            ax = plt.subplot(3, 1, i + 1)
+            ax.set_title(name_list[i])
+            plt.imshow(rosa.amplitude_to_db(yy.numpy(),
+                                            ref=np.max, top_db=80.),
+                       # vmin = -20,
+                       vmax=0.,
+                       aspect='auto',
+                       origin='lower',
+                       interpolation='none')
+            plt.colorbar()
+            plt.xlabel('Frames')
+            plt.ylabel('Channels')
+            plt.tight_layout()
+        fig.canvas.draw()
+        data = self.fig2np(fig)
+        plt.close()
+        return data
+    @rank_zero_only
+    def log_spectrogram(self, y, y_low, y_recon, epoch):
+        y, y_low, y_recon = y.detach().cpu(), y_low.detach().cpu(), y_recon.detach().cpu()
+        spec_img = self.plot_spectrogram_to_numpy(y, y_low, y_recon, epoch)
+        self.experiment.add_image(path.join(self.save_dir, 'result'),
+                                  spec_img,
+                                  epoch,
+                                  dataformats='HWC')
+        self.experiment.flush()
+        return
+    @rank_zero_only
+    def log_audio(self, y, y_low, y_recon, epoch):
+        y, y_low, y_recon = y.detach().cpu(), y_low.detach().cpu(), y_recon.detach().cpu(),
+        name_list = ['y', 'y_low', 'y_recon']
+        for n, yy in zip(name_list, [y, y_low, y_recon]):
+            self.experiment.add_audio(n, yy, epoch, self.sr)
+        self.experiment.flush()
+        return

utils/utils.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import librosa
+import librosa.display
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
+from config import CONFIG
+def mkdir_p(mypath):
+    """Creates a directory. equivalent to using mkdir -p on the command line"""
+    from errno import EEXIST
+    from os import makedirs, path
+    try:
+        makedirs(mypath)
+    except OSError as exc:  # Python >2.5
+        if exc.errno == EEXIST and path.isdir(mypath):
+            pass
+        else:
+            raise
+def visualize(target, input, recon, path):
+    sr = CONFIG.DATA.sr
+    window_size = 1024
+    window = np.hanning(window_size)
+    stft_hr = librosa.core.spectrum.stft(target, n_fft=window_size, hop_length=512, window=window)
+    stft_hr = 2 * np.abs(stft_hr) / np.sum(window)
+    stft_lr = librosa.core.spectrum.stft(input, n_fft=window_size, hop_length=512, window=window)
+    stft_lr = 2 * np.abs(stft_lr) / np.sum(window)
+    stft_recon = librosa.core.spectrum.stft(recon, n_fft=window_size, hop_length=512, window=window)
+    stft_recon = 2 * np.abs(stft_recon) / np.sum(window)
+    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharey=True, sharex=True, figsize=(16, 10))
+    ax1.title.set_text('Target signal')
+    ax2.title.set_text('Lossy signal')
+    ax3.title.set_text('Reconstructed signal')
+    canvas = FigureCanvas(fig)
+    p = librosa.display.specshow(librosa.amplitude_to_db(stft_hr), ax=ax1, y_axis='linear', x_axis='time', sr=sr)
+    p = librosa.display.specshow(librosa.amplitude_to_db(stft_lr), ax=ax2, y_axis='linear', x_axis='time', sr=sr)
+    p = librosa.display.specshow(librosa.amplitude_to_db(stft_recon), ax=ax3, y_axis='linear', x_axis='time', sr=sr)
+    mkdir_p(path)
+    fig.savefig(os.path.join(path, 'spec.png'))
+def get_power(x, nfft):
+    S = librosa.stft(x, n_fft=nfft)
+    S = np.log(np.abs(S) ** 2 + 1e-8)
+    return S
+def LSD(x_hr, x_pr):
+    S1 = get_power(x_hr, nfft=2048)
+    S2 = get_power(x_pr, nfft=2048)
+    lsd = np.mean(np.sqrt(np.mean((S1 - S2) ** 2 + 1e-8, axis=-1)), axis=0)
+    S1 = S1[-(len(S1) - 1) // 2:, :]
+    S2 = S2[-(len(S2) - 1) // 2:, :]
+    lsd_high = np.mean(np.sqrt(np.mean((S1 - S2) ** 2 + 1e-8, axis=-1)), axis=0)
+    return lsd, lsd_high