import os

# Silence TensorFlow's native (C++) INFO and WARNING log output.
# NOTE: this must be assigned *before* TensorFlow is imported for the first
# time; once the native runtime is loaded the logging threshold has already
# been read and changing the variable no longer has any effect. That is why
# the third-party imports below deliberately come after this statement.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import librosa  # noqa: E402
import numpy as np  # noqa: E402
import scipy  # noqa: E402
from tensorflow import keras  # noqa: E402
from tensorflow.keras import Model, layers  # noqa: E402
from tensorflow.keras.constraints import max_norm  # noqa: E402
from tensorflow.keras.layers import Dense, Dropout, Conv2D  # noqa: E402
from tensorflow.keras.layers import LSTM, TimeDistributed, Bidirectional  # noqa: E402

from ... import Metric  # noqa: E402
class MOSNet(Metric):
    """Metric wrapper around the pre-trained MOSNet CNN-BLSTM network.

    The network takes a magnitude spectrogram of shape
    ``(frames, FFT_SIZE // 2 + 1)`` and produces two outputs: an
    utterance-level score (the average over frames, output ``'avg'``) and
    per-frame scores (output ``'frame'``). Pre-trained weights are read from
    ``cnn_blstm.h5`` located next to this source file.
    """

    def __init__(self, window, hop=None):
        """Build the network and load its pre-trained weights.

        Args:
            window: forwarded unchanged to ``Metric.__init__``.
            hop: forwarded unchanged to ``Metric.__init__``
                (defaults to ``None``).
        """
        super().__init__(name='MOSNet', window=window, hop=hop)

        # constants
        # NOTE(review): fixed_rate / mono / absolute are not read in this
        # class; presumably the Metric base class uses them to prepare the
        # audio before test_window is called -- confirm against Metric.
        self.fixed_rate = 16000
        self.mono = True
        self.absolute = True

        # STFT configuration used by test_window
        self.FFT_SIZE = 512
        self.SGRAM_DIM = self.FFT_SIZE // 2 + 1
        self.HOP_LENGTH = 256
        self.WIN_LENGTH = 512

        self.model = self._build_model()

        # weights are in the directory of this file
        pre_trained_dir = os.path.dirname(__file__)

        # load pre-trained weights. CNN_BLSTM is reported as best
        self.model.load_weights(os.path.join(pre_trained_dir, 'cnn_blstm.h5'))

    def _build_model(self):
        """Assemble the CNN-BLSTM graph.

        Returns:
            A ``Model`` mapping a ``(batch, frames, SGRAM_DIM)`` input to the
            list ``[average_score, frame_score]``.
        """
        spectrogram = keras.Input(shape=(None, self.SGRAM_DIM))
        # Add a trailing channel axis for Conv2D. The functional API infers
        # the incoming shape from the tensor, so no input_shape hint is needed.
        features = layers.Reshape((-1, self.SGRAM_DIM, 1))(spectrogram)

        # CNN: four groups of three 3x3 convolutions. The last convolution of
        # each group uses a stride of 3 along the frequency axis, shrinking it
        # 257 -> 86 -> 29 -> 10 -> 4 ('same' padding); the time axis is kept.
        for filters in (16, 32, 64, 128):
            for strides in ((1, 1), (1, 1), (1, 3)):
                features = Conv2D(filters, (3, 3), strides=strides,
                                  activation='relu',
                                  padding='same')(features)

        # merge the 4 remaining frequency bins with the 128 channels
        features = layers.Reshape((-1, 4 * 128))(features)

        # BLSTM
        blstm = Bidirectional(
            LSTM(128, return_sequences=True, dropout=0.3,
                 recurrent_dropout=0.3,
                 recurrent_constraint=max_norm(0.00001)),
            merge_mode='concat')(features)

        # DNN
        flatten = TimeDistributed(layers.Flatten())(blstm)
        dense = TimeDistributed(Dense(128, activation='relu'))(flatten)
        dense = Dropout(0.3)(dense)

        frame_score = TimeDistributed(Dense(1), name='frame')(dense)
        average_score = layers.GlobalAveragePooling1D(name='avg')(frame_score)

        return Model(outputs=[average_score, frame_score], inputs=spectrogram)

    def test_window(self, audios, rate):
        """Score one audio window with MOSNet.

        Args:
            audios: sequence of waveforms; only ``audios[0]`` is evaluated.
            rate: not used by this method.
                NOTE(review): presumably the audio has already been brought
                to ``self.fixed_rate`` by the caller -- confirm.

        Returns:
            dict with the single key ``'mosnet'`` holding the first model
            output (the frame-averaged ``'avg'`` score) as returned by
            ``Model.predict``.
        """
        # `scipy.signal.hamming` no longer exists in recent SciPy releases;
        # the window functions live in `scipy.signal.windows`. Imported here
        # so the submodule is guaranteed to be loaded.
        from scipy.signal.windows import hamming

        # stft. D: (1+n_fft//2, T)
        # The window is passed as a callable (not the string 'hamming') so
        # librosa evaluates it exactly as before, i.e. `hamming(WIN_LENGTH)`.
        linear = librosa.stft(y=np.asfortranarray(audios[0]),
                              n_fft=self.FFT_SIZE,
                              hop_length=self.HOP_LENGTH,
                              win_length=self.WIN_LENGTH,
                              window=hamming,
                              )

        # magnitude spectrogram
        mag = np.abs(linear)  # (1+n_fft/2, T)

        # shape in (T, 1+n_fft/2)
        mag = np.transpose(mag.astype(np.float32))

        # now call the actual MOSnet; predict returns [average, frame]
        scores = self.model.predict(mag[None, ...], verbose=0, batch_size=1)
        return {'mosnet': scores[0]}