# Hosting-page residue (not part of the module): alibabasglab's picture |
# Upload 73 files | 936f6fa verified | raw | history blame | 4.01 kB
from tensorflow import keras
from tensorflow.keras import Model, layers
from tensorflow.keras.layers import Dense, Dropout, Conv2D
from tensorflow.keras.layers import LSTM, TimeDistributed, Bidirectional
from tensorflow.keras.constraints import max_norm
import librosa
import scipy
import numpy as np
import os
from ... import Metric
# Silence TensorFlow warnings by raising its native log threshold.
# NOTE(review): this runs after the tensorflow imports above -- confirm the
# setting is still picked up at that point.
os.environ.update(TF_CPP_MIN_LOG_LEVEL='2')
class MOSNet(Metric):
    """MOSNet metric backed by a pre-trained CNN-BLSTM Keras model.

    The constructor builds the network and loads the weights stored in
    ``cnn_blstm.h5`` next to this file. ``test_window`` converts one
    waveform into a magnitude spectrogram and returns the model's
    utterance-level (averaged) output.
    """

    def __init__(self, window, hop=None):
        """Configure the metric, build the network and load its weights.

        Args:
            window: forwarded unchanged to ``Metric.__init__``.
            hop: forwarded unchanged to ``Metric.__init__``.
        """
        super().__init__(name='MOSNet', window=window, hop=hop)
        # Settings presumably consumed by the Metric base class (not visible
        # here) -- TODO confirm their exact meaning against it.
        self.fixed_rate = 16000
        self.mono = True
        self.absolute = True
        # STFT parameters used by test_window.
        self.FFT_SIZE = 512
        self.SGRAM_DIM = self.FFT_SIZE // 2 + 1
        self.HOP_LENGTH = 256
        self.WIN_LENGTH = 512

        self.model = self._build_cnn_blstm()
        # weights are in the directory of this file
        pre_trained_dir = os.path.dirname(__file__)
        # load pre-trained weights. CNN_BLSTM is reported as best
        self.model.load_weights(os.path.join(pre_trained_dir, 'cnn_blstm.h5'))

    def _build_cnn_blstm(self):
        """Return the CNN-BLSTM Keras model, without pre-trained weights.

        The model takes a ``(batch, T, SGRAM_DIM)`` spectrogram and has two
        outputs, in this order: the time-averaged score (layer ``'avg'``)
        and the per-frame scores (layer ``'frame'``).
        """
        _input = keras.Input(shape=(None, self.SGRAM_DIM))
        # append a channel axis so the spectrogram can feed Conv2D
        features = layers.Reshape((-1, self.SGRAM_DIM, 1))(_input)

        # CNN: four groups of three 3x3 convolutions. The last convolution
        # of every group uses stride 3 along the frequency axis.
        group_filters = (16, 32, 64, 128)
        group_strides = ((1, 1), (1, 1), (1, 3))
        freq_bins = self.SGRAM_DIM
        for n_filters in group_filters:
            for strides in group_strides:
                features = Conv2D(n_filters, (3, 3), strides=strides,
                                  activation='relu',
                                  padding='same')(features)
            # 'same' padding with stride 3 leaves ceil(bins / 3) bins
            freq_bins = -(-freq_bins // 3)

        # merge the remaining frequency bins with the channels; for
        # SGRAM_DIM == 257 this is 4 * 128 features per frame
        re_shape = layers.Reshape(
            (-1, freq_bins * group_filters[-1]))(features)

        # BLSTM
        blstm1 = Bidirectional(
            LSTM(128, return_sequences=True, dropout=0.3,
                 recurrent_dropout=0.3,
                 recurrent_constraint=max_norm(0.00001)),
            merge_mode='concat')(re_shape)

        # DNN
        flatten = TimeDistributed(layers.Flatten())(blstm1)
        dense1 = TimeDistributed(Dense(128, activation='relu'))(flatten)
        dense1 = Dropout(0.3)(dense1)

        frame_score = TimeDistributed(Dense(1), name='frame')(dense1)
        average_score = layers.GlobalAveragePooling1D(name='avg')(frame_score)

        return Model(outputs=[average_score, frame_score], inputs=_input)

    def test_window(self, audios, rate):
        """Score one window of audio with the MOSNet model.

        Args:
            audios: indexable collection of waveforms; only ``audios[0]``
                is scored (presumably a 1-D mono signal sampled at
                ``fixed_rate`` -- confirm against the Metric base class).
            rate: not used by this method.

        Returns:
            dict with the single key ``'mosnet'`` holding the first model
            output (the ``'avg'`` layer) for a batch of one spectrogram.
        """
        # Local import: the window functions live in scipy.signal.windows;
        # the old scipy.signal.hamming alias is gone in recent SciPy and a
        # bare ``import scipy`` does not guarantee the submodule is loaded.
        from scipy.signal.windows import hamming

        # stft. D: (1+n_fft//2, T)
        # The window is passed as a callable (not the string 'hamming') so
        # that the same symmetric window as before is used.
        linear = librosa.stft(y=np.asfortranarray(audios[0]),
                              n_fft=self.FFT_SIZE,
                              hop_length=self.HOP_LENGTH,
                              win_length=self.WIN_LENGTH,
                              window=hamming,
                              )

        # magnitude spectrogram
        mag = np.abs(linear)  # (1+n_fft/2, T)

        # shape in (T, 1+n_fft/2)
        mag = np.transpose(mag.astype(np.float32))

        # now call the actual MOSnet on a batch of one
        return {'mosnet':
                self.model.predict(mag[None, ...], verbose=0, batch_size=1)[0]}