# simple-generative-model-regressor.py
import sys
import time
import numpy as np
from keras.activations import relu
from scipy.io.wavfile import read, write
from keras.models import Model
from keras.layers import Convolution1D, AtrousConvolution1D, Flatten, Dense, \
    Input, Lambda, merge
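# A minimal WaveNet-style generative model trained as a regressor: given a
# window of raw audio samples, it predicts the next sample directly with an
# MSE loss, rather than a softmax over quantized amplitudes as in the
# WaveNet paper. NOTE: this is written against the Keras 1.x API
# (AtrousConvolution1D, merge, border_mode) and will not run unchanged on
# Keras 2+.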
def wavenetBlock(n_atrous_filters, atrous_filter_size, atrous_rate):
    def f(input_):
        residual = input_
        # Gated activation unit: tanh and sigmoid branches of the same
        # dilated (atrous) convolution, multiplied elementwise.
        tanh_out = AtrousConvolution1D(n_atrous_filters, atrous_filter_size,
                                       atrous_rate=atrous_rate,
                                       border_mode='same',
                                       activation='tanh')(input_)
        sigmoid_out = AtrousConvolution1D(n_atrous_filters, atrous_filter_size,
                                          atrous_rate=atrous_rate,
                                          border_mode='same',
                                          activation='sigmoid')(input_)
        merged = merge([tanh_out, sigmoid_out], mode='mul')
        # A 1x1 convolution produces the skip connection; adding the block's
        # input back gives the residual output fed to the next block.
        skip_out = Convolution1D(1, 1, activation='relu',
                                 border_mode='same')(merged)
        out = merge([skip_out, residual], mode='sum')
        return out, skip_out
    return f
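# Stack five wavenet blocks with dilation rates doubling from 2 to 32, sum
# their skip connections, and map the result through ReLU, two 1x1
# convolutions, and a dense tanh layer to a single predicted sample.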
def get_basic_generative_model(input_size):
    input_ = Input(shape=(input_size, 1))
    l1a, l1b = wavenetBlock(10, 5, 2)(input_)
    l2a, l2b = wavenetBlock(1, 2, 4)(l1a)
    l3a, l3b = wavenetBlock(1, 2, 8)(l2a)
    l4a, l4b = wavenetBlock(1, 2, 16)(l3a)
    l5a, l5b = wavenetBlock(1, 2, 32)(l4a)
    l6 = merge([l1b, l2b, l3b, l4b, l5b], mode='sum')
    l7 = Lambda(relu)(l6)
    l8 = Convolution1D(1, 1, activation='relu')(l7)
    l9 = Convolution1D(1, 1)(l8)
    l10 = Flatten()(l9)
    l11 = Dense(1, activation='tanh')(l10)
    model = Model(input=input_, output=l11)
    model.compile(loss='mse', optimizer='rmsprop', metrics=['accuracy'])
    model.summary()
    return model
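# Load a wav file and normalise its samples to [-1, 1], matching the range
# of the model's tanh output.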
def get_audio(filename):
sr, audio = read(filename)
audio = audio.astype(float)
audio = audio - audio.min()
audio = audio / (audio.max() - audio.min())
audio = (audio - 0.5) * 2
return sr, audio
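# Endless generator of (frame, next sample) training pairs, sliding a
# frame_size window over the audio in steps of frame_shift.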
def frame_generator(sr, audio, frame_size, frame_shift):
    audio_len = len(audio)
    while 1:
        for i in range(0, audio_len - frame_size - 1, frame_shift):
            frame = audio[i:i+frame_size]
            # The target is the single sample immediately after the frame.
            temp = audio[i + frame_size]
            yield frame.reshape(1, frame_size, 1), temp.reshape(1, 1)
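# Train on train.wav (first 240 s), validate on validate.wav (first 30 s),
# save the model, then generate three seconds of audio autoregressively.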
if __name__ == '__main__':
n_epochs = 20
frame_size = 2048
frame_shift = 512
sr_training, training_audio = get_audio('train.wav')
training_audio = training_audio[:sr_training*240]
sr_valid, valid_audio = get_audio('validate.wav')
valid_audio = valid_audio[:sr_valid*30]
assert sr_training == sr_valid, "Training, validation samplerate mismatch"
n_training_examples = int((len(training_audio)-frame_size-1) / float(
frame_shift))
n_validation_examples = int((len(valid_audio)-frame_size-1) / float(
frame_shift))
model = get_basic_generative_model(frame_size)
    print('Total training examples: %d' % n_training_examples)
    print('Total validation examples: %d' % n_validation_examples)
    model.fit_generator(frame_generator(sr_training, training_audio,
                                        frame_size, frame_shift),
                        samples_per_epoch=n_training_examples,
                        nb_epoch=n_epochs,
                        validation_data=frame_generator(sr_valid, valid_audio,
                                                        frame_size,
                                                        frame_shift),
                        nb_val_samples=n_validation_examples,
                        verbose=1)
    print('Saving model...')
    str_timestamp = str(int(time.time()))
    model.save('models/model_' + str_timestamp + '_' + str(n_epochs) + '.h5')
    print('Generating audio...')
new_audio = np.zeros((sr_training * 3))
curr_sample_idx = 0
    # Seed the context window with the first frame of the validation audio;
    # copy so the in-place updates below do not mutate valid_audio.
    audio_context = valid_audio[:frame_size].copy()
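    # Autoregressive generation: predict one sample, shift it into the
    # context window, and repeat until the output buffer is full.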
    while curr_sample_idx < new_audio.shape[0]:
        predicted_val = model.predict(audio_context.reshape(1, frame_size,
                                                            1))[0, 0]
        # Scale the [-1, 1] prediction to 16-bit amplitude for the output
        # buffer, but feed the normalised value back into the context.
        new_audio[curr_sample_idx] = predicted_val * 2**15
        audio_context[:-1] = audio_context[1:]
        audio_context[-1] = predicted_val
        pc_str = str(round(100*curr_sample_idx/float(new_audio.shape[0]), 2))
        sys.stdout.write('Percent complete: ' + pc_str + '\r')
        sys.stdout.flush()
        curr_sample_idx += 1
outfilepath = 'output/reg_generated_'+str_timestamp+'.wav'
    print('Writing generated audio to: ' + outfilepath)
    write(outfilepath, sr_training, new_audio.astype(np.int16))
    print('\nDone!')