Spaces:
Runtime error
Runtime error
upgraded using tf2 upgrade notebook
Browse files
audio_style_transfer/models/timedomain.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1 |
"""NIPS2017 "Time Domain Neural Audio Style Transfer" code repository
|
2 |
Parag K. Mital
|
3 |
"""
|
4 |
-
import tensorflow
|
5 |
-
tf.disable_v2_behavior()
|
6 |
import librosa
|
7 |
import numpy as np
|
8 |
from scipy.signal import hann
|
@@ -75,17 +74,17 @@ def instance_norm(x, epsilon=1e-5):
|
|
75 |
epsilon : float, optional
|
76 |
Description
|
77 |
"""
|
78 |
-
with tf.variable_scope('instance_norm'):
|
79 |
-
mean, var = tf.nn.moments(x, [1, 2],
|
80 |
-
scale = tf.get_variable(
|
81 |
name='scale',
|
82 |
shape=[x.get_shape()[-1]],
|
83 |
-
initializer=tf.truncated_normal_initializer(mean=1.0, stddev=0.02))
|
84 |
-
offset = tf.get_variable(
|
85 |
name='offset',
|
86 |
shape=[x.get_shape()[-1]],
|
87 |
-
initializer=tf.constant_initializer(0.0))
|
88 |
-
out = scale * tf.div(x - mean, tf.sqrt(var + epsilon)) + offset
|
89 |
return out
|
90 |
|
91 |
|
@@ -97,23 +96,23 @@ def compute_inputs(x, freqs, n_fft, n_frames, input_features, norm=False):
|
|
97 |
return x
|
98 |
freqs_tf = tf.constant(freqs, name="freqs", dtype='float32')
|
99 |
inputs = {}
|
100 |
-
with tf.variable_scope('real'):
|
101 |
inputs['real'] = norm_fn(tf.reshape(
|
102 |
tf.matmul(x, tf.cos(freqs_tf)), [1, 1, n_frames, n_fft // 2]))
|
103 |
-
with tf.variable_scope('imag'):
|
104 |
inputs['imag'] = norm_fn(tf.reshape(
|
105 |
tf.matmul(x, tf.sin(freqs_tf)), [1, 1, n_frames, n_fft // 2]))
|
106 |
-
with tf.variable_scope('mags'):
|
107 |
inputs['mags'] = norm_fn(tf.reshape(
|
108 |
tf.sqrt(
|
109 |
tf.maximum(1e-15, inputs['real'] * inputs['real'] + inputs[
|
110 |
'imag'] * inputs['imag'])), [1, 1, n_frames, n_fft // 2]))
|
111 |
-
with tf.variable_scope('phase'):
|
112 |
inputs['phase'] = norm_fn(tf.atan2(inputs['imag'], inputs['real']))
|
113 |
-
with tf.variable_scope('unwrapped'):
|
114 |
-
inputs['unwrapped'] = tf.py_func(
|
115 |
unwrap, [inputs['phase']], tf.float32)
|
116 |
-
with tf.variable_scope('unwrapped_difference'):
|
117 |
inputs['unwrapped_difference'] = (tf.slice(
|
118 |
inputs['unwrapped'],
|
119 |
[0, 0, 0, 1], [-1, -1, -1, n_fft // 2 - 1]) -
|
@@ -147,9 +146,10 @@ def compute_features(content,
|
|
147 |
kernels = []
|
148 |
content_features = []
|
149 |
style_features = []
|
150 |
-
config_proto = tf.ConfigProto
|
151 |
-
|
152 |
-
|
|
|
153 |
p = np.reshape(
|
154 |
np.linspace(0.0, n_samples - 1, n_samples), [n_samples, 1])
|
155 |
k = np.reshape(
|
@@ -157,7 +157,7 @@ def compute_features(content,
|
|
157 |
[1, n_fft // 2])
|
158 |
freqs = np.dot(p, k)
|
159 |
inputs, net = compute_inputs(x, freqs, n_fft, n_frames, input_features, norm)
|
160 |
-
sess.run(tf.initialize_all_variables())
|
161 |
content_feature = net.eval(feed_dict={x: content_tf})
|
162 |
content_features.append(content_feature)
|
163 |
style_feature = inputs['mags'].eval(feed_dict={x: style_tf})
|
@@ -177,8 +177,8 @@ def compute_features(content,
|
|
177 |
kernel_tf = tf.constant(
|
178 |
kernel, name="kernel{}".format(layer_i), dtype='float32')
|
179 |
conv = tf.nn.conv2d(
|
180 |
-
net,
|
181 |
-
kernel_tf,
|
182 |
strides=[1, stride, stride, 1],
|
183 |
padding="VALID",
|
184 |
name="conv{}".format(layer_i))
|
@@ -215,7 +215,7 @@ def compute_stylization(kernels,
|
|
215 |
inputs, net = compute_inputs(x, freqs, n_fft, n_frames, input_features, norm)
|
216 |
content_loss = alpha * 2 * tf.nn.l2_loss(net - content_features[0])
|
217 |
feats = tf.reshape(inputs['mags'], (-1, n_fft // 2))
|
218 |
-
gram = tf.matmul(tf.transpose(feats), feats) / (n_frames)
|
219 |
style_loss = 2 * tf.nn.l2_loss(gram - style_gram[0])
|
220 |
for layer_i in range(n_layers):
|
221 |
kernel_tf = tf.constant(
|
@@ -223,8 +223,8 @@ def compute_stylization(kernels,
|
|
223 |
name="kernel{}".format(layer_i),
|
224 |
dtype='float32')
|
225 |
conv = tf.nn.conv2d(
|
226 |
-
net,
|
227 |
-
kernel_tf,
|
228 |
strides=[1, stride, stride, 1],
|
229 |
padding="VALID",
|
230 |
name="conv{}".format(layer_i))
|
@@ -233,7 +233,7 @@ def compute_stylization(kernels,
|
|
233 |
alpha * 2 * tf.nn.l2_loss(net - content_features[layer_i + 1])
|
234 |
_, height, width, number = map(lambda i: i.value, net.get_shape())
|
235 |
feats = tf.reshape(net, (-1, number))
|
236 |
-
gram = tf.matmul(tf.transpose(feats), feats) / (n_frames)
|
237 |
style_loss = style_loss + 2 * tf.nn.l2_loss(gram - style_gram[
|
238 |
layer_i + 1])
|
239 |
loss = content_loss + style_loss
|
@@ -241,17 +241,17 @@ def compute_stylization(kernels,
|
|
241 |
opt = tf.contrib.opt.ScipyOptimizerInterface(
|
242 |
loss, method='L-BFGS-B', options={'maxiter': iterations})
|
243 |
# Optimization
|
244 |
-
with tf.Session() as sess:
|
245 |
-
sess.run(tf.initialize_all_variables())
|
246 |
print('Started optimization.')
|
247 |
opt.minimize(sess)
|
248 |
result = x.eval()
|
249 |
else:
|
250 |
-
opt = tf.train.AdamOptimizer(
|
251 |
learning_rate=learning_rate).minimize(loss)
|
252 |
# Optimization
|
253 |
-
with tf.Session() as sess:
|
254 |
-
sess.run(tf.initialize_all_variables())
|
255 |
print('Started optimization.')
|
256 |
for i in range(iterations):
|
257 |
s, c, l, _ = sess.run([style_loss, content_loss, loss, opt])
|
|
|
1 |
"""NIPS2017 "Time Domain Neural Audio Style Transfer" code repository
|
2 |
Parag K. Mital
|
3 |
"""
|
4 |
+
import tensorflow as tf
|
|
|
5 |
import librosa
|
6 |
import numpy as np
|
7 |
from scipy.signal import hann
|
|
|
74 |
epsilon : float, optional
|
75 |
Description
|
76 |
"""
|
77 |
+
with tf.compat.v1.variable_scope('instance_norm'):
|
78 |
+
mean, var = tf.nn.moments(x=x, axes=[1, 2], keepdims=True)
|
79 |
+
scale = tf.compat.v1.get_variable(
|
80 |
name='scale',
|
81 |
shape=[x.get_shape()[-1]],
|
82 |
+
initializer=tf.compat.v1.truncated_normal_initializer(mean=1.0, stddev=0.02))
|
83 |
+
offset = tf.compat.v1.get_variable(
|
84 |
name='offset',
|
85 |
shape=[x.get_shape()[-1]],
|
86 |
+
initializer=tf.compat.v1.constant_initializer(0.0))
|
87 |
+
out = scale * tf.compat.v1.div(x - mean, tf.sqrt(var + epsilon)) + offset
|
88 |
return out
|
89 |
|
90 |
|
|
|
96 |
return x
|
97 |
freqs_tf = tf.constant(freqs, name="freqs", dtype='float32')
|
98 |
inputs = {}
|
99 |
+
with tf.compat.v1.variable_scope('real'):
|
100 |
inputs['real'] = norm_fn(tf.reshape(
|
101 |
tf.matmul(x, tf.cos(freqs_tf)), [1, 1, n_frames, n_fft // 2]))
|
102 |
+
with tf.compat.v1.variable_scope('imag'):
|
103 |
inputs['imag'] = norm_fn(tf.reshape(
|
104 |
tf.matmul(x, tf.sin(freqs_tf)), [1, 1, n_frames, n_fft // 2]))
|
105 |
+
with tf.compat.v1.variable_scope('mags'):
|
106 |
inputs['mags'] = norm_fn(tf.reshape(
|
107 |
tf.sqrt(
|
108 |
tf.maximum(1e-15, inputs['real'] * inputs['real'] + inputs[
|
109 |
'imag'] * inputs['imag'])), [1, 1, n_frames, n_fft // 2]))
|
110 |
+
with tf.compat.v1.variable_scope('phase'):
|
111 |
inputs['phase'] = norm_fn(tf.atan2(inputs['imag'], inputs['real']))
|
112 |
+
with tf.compat.v1.variable_scope('unwrapped'):
|
113 |
+
inputs['unwrapped'] = tf.compat.v1.py_func(
|
114 |
unwrap, [inputs['phase']], tf.float32)
|
115 |
+
with tf.compat.v1.variable_scope('unwrapped_difference'):
|
116 |
inputs['unwrapped_difference'] = (tf.slice(
|
117 |
inputs['unwrapped'],
|
118 |
[0, 0, 0, 1], [-1, -1, -1, n_fft // 2 - 1]) -
|
|
|
146 |
kernels = []
|
147 |
content_features = []
|
148 |
style_features = []
|
149 |
+
config_proto = tf.compat.v1.ConfigProto()
|
150 |
+
config_proto.gpu_options.allow_growth = True
|
151 |
+
with g.as_default(), g.device('/cpu:0'), tf.compat.v1.Session(config=config_proto) as sess:
|
152 |
+
x = tf.compat.v1.placeholder('float32', [n_frames, n_samples], name="x")
|
153 |
p = np.reshape(
|
154 |
np.linspace(0.0, n_samples - 1, n_samples), [n_samples, 1])
|
155 |
k = np.reshape(
|
|
|
157 |
[1, n_fft // 2])
|
158 |
freqs = np.dot(p, k)
|
159 |
inputs, net = compute_inputs(x, freqs, n_fft, n_frames, input_features, norm)
|
160 |
+
sess.run(tf.compat.v1.initialize_all_variables())
|
161 |
content_feature = net.eval(feed_dict={x: content_tf})
|
162 |
content_features.append(content_feature)
|
163 |
style_feature = inputs['mags'].eval(feed_dict={x: style_tf})
|
|
|
177 |
kernel_tf = tf.constant(
|
178 |
kernel, name="kernel{}".format(layer_i), dtype='float32')
|
179 |
conv = tf.nn.conv2d(
|
180 |
+
input=net,
|
181 |
+
filters=kernel_tf,
|
182 |
strides=[1, stride, stride, 1],
|
183 |
padding="VALID",
|
184 |
name="conv{}".format(layer_i))
|
|
|
215 |
inputs, net = compute_inputs(x, freqs, n_fft, n_frames, input_features, norm)
|
216 |
content_loss = alpha * 2 * tf.nn.l2_loss(net - content_features[0])
|
217 |
feats = tf.reshape(inputs['mags'], (-1, n_fft // 2))
|
218 |
+
gram = tf.matmul(tf.transpose(a=feats), feats) / (n_frames)
|
219 |
style_loss = 2 * tf.nn.l2_loss(gram - style_gram[0])
|
220 |
for layer_i in range(n_layers):
|
221 |
kernel_tf = tf.constant(
|
|
|
223 |
name="kernel{}".format(layer_i),
|
224 |
dtype='float32')
|
225 |
conv = tf.nn.conv2d(
|
226 |
+
input=net,
|
227 |
+
filters=kernel_tf,
|
228 |
strides=[1, stride, stride, 1],
|
229 |
padding="VALID",
|
230 |
name="conv{}".format(layer_i))
|
|
|
233 |
alpha * 2 * tf.nn.l2_loss(net - content_features[layer_i + 1])
|
234 |
_, height, width, number = map(lambda i: i.value, net.get_shape())
|
235 |
feats = tf.reshape(net, (-1, number))
|
236 |
+
gram = tf.matmul(tf.transpose(a=feats), feats) / (n_frames)
|
237 |
style_loss = style_loss + 2 * tf.nn.l2_loss(gram - style_gram[
|
238 |
layer_i + 1])
|
239 |
loss = content_loss + style_loss
|
|
|
241 |
opt = tf.contrib.opt.ScipyOptimizerInterface(
|
242 |
loss, method='L-BFGS-B', options={'maxiter': iterations})
|
243 |
# Optimization
|
244 |
+
with tf.compat.v1.Session() as sess:
|
245 |
+
sess.run(tf.compat.v1.initialize_all_variables())
|
246 |
print('Started optimization.')
|
247 |
opt.minimize(sess)
|
248 |
result = x.eval()
|
249 |
else:
|
250 |
+
opt = tf.compat.v1.train.AdamOptimizer(
|
251 |
learning_rate=learning_rate).minimize(loss)
|
252 |
# Optimization
|
253 |
+
with tf.compat.v1.Session() as sess:
|
254 |
+
sess.run(tf.compat.v1.initialize_all_variables())
|
255 |
print('Started optimization.')
|
256 |
for i in range(iterations):
|
257 |
s, c, l, _ = sess.run([style_loss, content_loss, loss, opt])
|