ydin0771 committed on
Commit
b2b46fa
•
1 Parent(s): ba880ef

Upload ops.py

Files changed (1)
  1. ops.py +1067 -0
ops.py ADDED
@@ -0,0 +1,1067 @@
1
+ from __future__ import division
2
+ import math
3
+ import tensorflow as tf
4
+
5
+ from mi_gru_cell import MiGRUCell
6
+ from mi_lstm_cell import MiLSTMCell
7
+ from config import config
8
+
9
+ eps = 1e-20
10
+ inf = 1e30
11
+
12
+ ####################################### variables ########################################
13
+
14
+ '''
15
+ Initializes a weight matrix variable given a shape and a name.
16
+ Uses random_normal initialization if 1d, otherwise uses xavier.
17
+ '''
18
+ def getWeight(shape, name = ""):
19
+ with tf.variable_scope("weights"):
20
+ initializer = tf.contrib.layers.xavier_initializer()
21
+ # if len(shape) == 1: # good?
22
+ # initializer = tf.random_normal_initializer()
23
+ W = tf.get_variable("weight" + name, shape = shape, initializer = initializer)
24
+ return W
25
+
26
+ '''
27
+ Initializes a convolution kernel variable given a shape and a name. Uses Xavier initialization.
28
+ '''
29
+ def getKernel(shape, name = ""):
30
+ with tf.variable_scope("kernels"):
31
+ initializer = tf.contrib.layers.xavier_initializer()
32
+ W = tf.get_variable("kernel" + name, shape = shape, initializer = initializer)
33
+ return W
34
+
35
+ '''
36
+ Initializes a bias variable given a shape and a name.
37
+ '''
38
+ def getBias(shape, name = ""):
39
+ with tf.variable_scope("biases"):
40
+ initializer = tf.zeros_initializer()
41
+ b = tf.get_variable("bias" + name, shape = shape, initializer = initializer)
42
+ return b
43
+
44
+ ######################################### basics #########################################
45
+
46
+ '''
47
+ Multiplies input inp of any depth by a 2d weight matrix.
48
+ '''
49
+ # switch with conv 1?
50
+ def multiply(inp, W):
51
+ inDim = tf.shape(W)[0]
52
+ outDim = tf.shape(W)[1]
53
+ newDims = tf.concat([tf.shape(inp)[:-1], tf.fill((1,), outDim)], axis = 0)
54
+
55
+ inp = tf.reshape(inp, (-1, inDim))
56
+ output = tf.matmul(inp, W)
57
+ output = tf.reshape(output, newDims)
58
+
59
+ return output
60
+
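+ # Usage sketch (added for illustration; not part of the original ops.py): `multiply`
+ # flattens the leading dims, applies the 2d weight to the last axis, and restores the shape.
+ # The shapes below are hypothetical.
+ # x = tf.random_normal((32, 14, 14, 512)) # [batchSize, h, w, inDim]
+ # W = getWeight((512, 256), name = "demo")
+ # y = multiply(x, W) # -> [32, 14, 14, 256]
+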
61
+ '''
62
+ Concatenates x and y. Supports broadcasting.
+ Optionally also concatenates the element-wise product x * y.
64
+ '''
65
+ def concat(x, y, dim, mul = False, extendY = False):
66
+ if extendY:
67
+ y = tf.expand_dims(y, axis = -2)
68
+ # broadcasting to have the same shape
69
+ y = tf.zeros_like(x) + y
70
+
71
+ if mul:
72
+ out = tf.concat([x, y, x * y], axis = -1)
73
+ dim *= 3
74
+ else:
75
+ out = tf.concat([x, y], axis = -1)
76
+ dim *= 2
77
+
78
+ return out, dim
79
+
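+ # Usage sketch (illustration only, hypothetical shapes): with extendY = True, y is
+ # broadcast over the item axis of x before concatenation; with mul = True the product
+ # x * y is appended as well.
+ # x = tf.random_normal((32, 20, 128)) # [batchSize, N, dim]
+ # y = tf.random_normal((32, 128)) # [batchSize, dim]
+ # out, outDim = concat(x, y, 128, mul = True, extendY = True) # out: [32, 20, 384], outDim = 384
+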
80
+ '''
81
+ Adds L2 regularization for weight and kernel variables.
82
+ '''
83
+ # add l2 in the tf way
84
+ def L2RegularizationOp(l2 = None):
85
+ if l2 is None:
86
+ l2 = config.l2
87
+ l2Loss = 0
88
+ names = ["weight", "kernel"]
89
+ for var in tf.trainable_variables():
90
+ if any((name in var.name.lower()) for name in names):
91
+ l2Loss += tf.nn.l2_loss(var)
92
+ return l2 * l2Loss
93
+
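+ # Usage sketch (illustration only): the returned op is meant to be added to the model
+ # loss; `lossOp` below is a hypothetical placeholder for the task loss.
+ # totalLoss = lossOp + L2RegularizationOp() # uses config.l2 by default
+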
94
+ ######################################### attention #########################################
95
+
96
+ '''
97
+ Transforms vectors to scalar logits.
98
+
99
+ Args:
100
+ interactions: input vectors
101
+ [batchSize, N, dim]
102
+
103
+ dim: dimension of input vectors
104
+
105
+ sumMod: LIN for linear transformation to scalars.
106
+ SUM to sum up the vector entries to get a scalar logit.
107
+
108
+ dropout: dropout value over inputs (for linear case)
109
+
110
+ Return matching scalar for each interaction.
111
+ [batchSize, N]
112
+ '''
113
+ sumMod = ["LIN", "SUM"]
114
+ def inter2logits(interactions, dim, sumMod = "LIN", dropout = 1.0, name = "", reuse = None):
115
+ with tf.variable_scope("inter2logits" + name, reuse = reuse):
116
+ if sumMod == "SUM":
117
+ logits = tf.reduce_sum(interactions, axis = -1)
118
+ else: # "LIN"
119
+ logits = linear(interactions, dim, 1, dropout = dropout, name = "logits")
120
+ return logits
121
+
122
+ '''
123
+ Transforms vectors to a probability distribution.
+ Calls inter2logits and then applies softmax over the resulting logits.
125
+
126
+ Args:
127
+ interactions: input vectors
128
+ [batchSize, N, dim]
129
+
130
+ dim: dimension of input vectors
131
+
132
+ sumMod: LIN for linear transformation to scalars.
133
+ SUM to sum up the vector entries to get a scalar logit.
134
+
135
+ dropout: dropout value over inputs (for linear case)
136
+
137
+ Return attention distribution over interactions.
138
+ [batchSize, N]
139
+ '''
140
+ def inter2att(interactions, dim, dropout = 1.0, name = "", reuse = None):
141
+ with tf.variable_scope("inter2att" + name, reuse = reuse):
142
+ logits = inter2logits(interactions, dim, dropout = dropout)
143
+ attention = tf.nn.softmax(logits)
144
+ return attention
145
+
146
+ '''
147
+ Sums up features using attention distribution to get a weighted average over them.
148
+ '''
149
+ def att2Smry(attention, features):
150
+ return tf.reduce_sum(tf.expand_dims(attention, axis = -1) * features, axis = -2)
151
+
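+ # Usage sketch (illustration only, hypothetical tensors): a typical attention read
+ # combines the helpers above: score each item, normalize, then take a weighted average.
+ # kb = tf.random_normal((32, 100, 512)) # [batchSize, N, dim]
+ # memory = tf.random_normal((32, 512)) # [batchSize, dim]
+ # interactions = kb * tf.expand_dims(memory, axis = 1) # [batchSize, N, dim]
+ # attention = inter2att(interactions, 512) # [batchSize, N]
+ # summary = att2Smry(attention, kb) # [batchSize, dim]
+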
152
+ ####################################### activations ########################################
153
+
154
+ '''
155
+ Performs a variant of ReLU based on config.relu
156
+ PRM for PReLU
157
+ ELU for ELU
158
+ LKY for Leaky ReLU
159
+ otherwise, standard ReLU
160
+ '''
161
+ def relu(inp):
162
+ if config.relu == "PRM":
163
+ with tf.variable_scope(None, default_name = "prelu"):
164
+ alpha = tf.get_variable("alpha", shape = inp.get_shape()[-1],
165
+ initializer = tf.constant_initializer(0.25))
166
+ pos = tf.nn.relu(inp)
167
+ neg = - (alpha * tf.nn.relu(-inp))
168
+ output = pos + neg
169
+ elif config.relu == "ELU":
170
+ output = tf.nn.elu(inp)
171
+ # elif config.relu == "SELU":
172
+ # output = tf.nn.selu(inp)
173
+ elif config.relu == "LKY":
174
+ # output = tf.nn.leaky_relu(inp, config.reluAlpha)
175
+ output = tf.maximum(inp, config.reluAlpha * inp)
176
+ else: # "STD", default standard ReLU
177
+ output = tf.nn.relu(inp)
178
+
179
+ return output
180
+
181
+ activations = {
182
+ "NON": tf.identity, # lambda inp: inp
183
+ "TANH": tf.tanh,
184
+ "SIGMOID": tf.sigmoid,
185
+ "RELU": relu,
186
+ "ELU": tf.nn.elu
187
+ }
188
+
189
+ # Sample from Gumbel(0, 1)
190
+ def sampleGumbel(shape):
191
+ U = tf.random_uniform(shape, minval = 0, maxval = 1)
192
+ return -tf.log(-tf.log(U + eps) + eps)
193
+
194
+ # Draw a sample from the Gumbel-Softmax distribution
195
+ def gumbelSoftmaxSample(logits, temperature):
196
+ y = logits + sampleGumbel(tf.shape(logits))
197
+ return tf.nn.softmax(y / temperature)
198
+
199
+ def gumbelSoftmax(logits, temperature, train): # hard = False
200
+ # Sample from the Gumbel-Softmax distribution and optionally discretize.
201
+ # Args:
202
+ # logits: [batch_size, n_class] unnormalized log-probs
203
+ # temperature: non-negative scalar
204
+ # hard: if True, take argmax, but differentiate w.r.t. soft sample y
+ # Returns:
+ # [batch_size, n_class] sample from the Gumbel-Softmax distribution.
+ # If hard=True, then the returned sample will be one-hot, otherwise it will
+ # be a probability distribution that sums to 1 across classes
209
+
210
+ y = gumbelSoftmaxSample(logits, temperature)
211
+
212
+ # k = tf.shape(logits)[-1]
213
+ # yHard = tf.cast(tf.one_hot(tf.argmax(y,1),k), y.dtype)
214
+ yHard = tf.cast(tf.equal(y, tf.reduce_max(y, 1, keep_dims = True)), y.dtype)
215
+ yNew = tf.stop_gradient(yHard - y) + y
216
+
217
+ if config.gumbelSoftmaxBoth:
218
+ return y
219
+ if config.gumbelArgmaxBoth:
220
+ return yNew
221
+ ret = tf.cond(train, lambda: y, lambda: yNew)
222
+
223
+ return ret
224
+
225
+ def softmaxDiscrete(logits, temperature, train):
226
+ if config.gumbelSoftmax:
227
+ return gumbelSoftmax(logits, temperature = temperature, train = train)
228
+ else:
229
+ return tf.nn.softmax(logits)
230
+
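+ # Usage sketch (illustration only): during training the relaxed (soft) sample is used,
+ # at inference the hard one-hot version via the straight-through estimator.
+ # `train` is a hypothetical boolean placeholder.
+ # logits = tf.random_normal((32, 10))
+ # train = tf.placeholder(tf.bool, shape = ())
+ # sample = softmaxDiscrete(logits, temperature = 1.0, train = train)
+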
231
+ def parametricDropout(name, train):
232
+ var = tf.get_variable("varDp" + name, shape = (), initializer = tf.constant_initializer(2),
233
+ dtype = tf.float32)
234
+ dropout = tf.cond(train, lambda: tf.sigmoid(var), lambda: 1.0)
235
+ return dropout
236
+
237
+ ###################################### sequence helpers ######################################
238
+
239
+ '''
240
+ Casts an exponential mask over a sequence, based on the sequence length.
+ Used to mask out padded positions in logits before softmax.
242
+ '''
243
+ def expMask(seq, seqLength):
244
+ maxLength = tf.shape(seq)[-1]
245
+ mask = (1 - tf.cast(tf.sequence_mask(seqLength, maxLength), tf.float32)) * (-inf)
246
+ masked = seq + mask
247
+ return masked
248
+
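+ # Usage sketch (illustration only, hypothetical shapes): push padded positions to -inf
+ # before softmax so they receive (near-)zero attention.
+ # logits = tf.random_normal((32, 30)) # [batchSize, maxLength]
+ # lengths = tf.fill((32,), 25)
+ # attention = tf.nn.softmax(expMask(logits, lengths))
+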
249
+ '''
250
+ Computes seq2seq loss between logits and target sequences, with given lengths.
251
+ '''
252
+ def seq2SeqLoss(logits, targets, lengths):
253
+ mask = tf.sequence_mask(lengths, maxlen = tf.shape(targets)[1])
254
+ loss = tf.contrib.seq2seq.sequence_loss(logits, targets, tf.to_float(mask))
255
+ return loss
256
+
257
+ '''
258
+ Computes seq2seq accuracy between predictions and target sequences, with given lengths.
259
+ acc1: accuracy per symbol
260
+ acc2: accuracy per sequence
261
+ '''
262
+ def seq2seqAcc(preds, targets, lengths):
263
+ mask = tf.sequence_mask(lengths, maxlen = tf.shape(targets)[1])
264
+ corrects = tf.logical_and(tf.equal(preds, targets), mask)
265
+ numCorrects = tf.reduce_sum(tf.to_int32(corrects), axis = 1)
266
+
267
+ acc1 = tf.to_float(numCorrects) / (tf.to_float(lengths) + eps) # add small eps instead?
268
+ acc1 = tf.reduce_mean(acc1)
269
+
270
+ acc2 = tf.to_float(tf.equal(numCorrects, lengths))
271
+ acc2 = tf.reduce_mean(acc2)
272
+
273
+ return acc1, acc2
274
+
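+ # Usage sketch (illustration only): `preds` should have the same integer dtype as
+ # `targets`; `logits`, `targets` and `lengths` here are hypothetical tensors.
+ # preds = tf.argmax(logits, axis = -1, output_type = tf.int32)
+ # perSymbolAcc, perSeqAcc = seq2seqAcc(preds, targets, lengths)
+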
275
+ ########################################### linear ###########################################
276
+
277
+ '''
278
+ Linear transformation.
279
+
280
+ Args:
281
+ inp: input to transform
282
+ inDim: input dimension
283
+ outDim: output dimension
284
+ dropout: dropout over input
285
+ batchNorm: if not None, applies batch normalization to inputs
286
+ addBias: True to add bias
287
+ bias: initial bias value
288
+ act: if not None, activation to use after linear transformation
289
+ actLayer: if True and act is not None, applies another linear transformation on top of previous
290
+ actDropout: dropout to apply in the optional second linear transformation
291
+ retVars: if True, return parameters (weight and bias)
292
+
293
+ Returns linear transformation result.
294
+ '''
295
+ # batchNorm = {"decay": float, "train": Tensor}
296
+ # actLayer: if activation is not NON, stack another linear layer
+ # maybe change naming scheme such that if name = "" then use it as default_name (--> unique?)
298
+ def linear(inp, inDim, outDim, dropout = 1.0,
299
+ batchNorm = None, addBias = True, bias = 0.0,
300
+ act = "NON", actLayer = True, actDropout = 1.0,
301
+ retVars = False, name = "", reuse = None):
302
+
303
+ with tf.variable_scope("linearLayer" + name, reuse = reuse):
304
+ W = getWeight((inDim, outDim) if outDim > 1 else (inDim, ))
305
+ b = getBias((outDim, ) if outDim > 1 else ()) + bias
306
+
307
+ if batchNorm is not None:
308
+ inp = tf.contrib.layers.batch_norm(inp, decay = batchNorm["decay"],
309
+ center = True, scale = True, is_training = batchNorm["train"], updates_collections = None)
310
+ # tf.layers.batch_normalization, axis -1 ?
311
+
312
+ inp = tf.nn.dropout(inp, dropout)
313
+
314
+ if outDim > 1:
315
+ output = multiply(inp, W)
316
+ else:
317
+ output = tf.reduce_sum(inp * W, axis = -1)
318
+
319
+ if addBias:
320
+ output += b
321
+
322
+ output = activations[act](output)
323
+
324
+ # good?
325
+ if act != "NON" and actLayer:
326
+ output = linear(output, outDim, outDim, dropout = actDropout, batchNorm = batchNorm,
327
+ addBias = addBias, act = "NON", actLayer = False,
328
+ name = name + "_2", reuse = reuse)
329
+
330
+ if retVars:
331
+ return (output, (W, b))
332
+
333
+ return output
334
+
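+ # Usage sketch (illustration only, hypothetical shapes): note that with act != "NON"
+ # and actLayer = True, a second linear layer is stacked on top of the activation.
+ # x = tf.random_normal((32, 512))
+ # h = linear(x, 512, 256, act = "RELU", name = "demo") # two stacked layers
+ # h2 = linear(x, 512, 256, act = "RELU", actLayer = False, name = "demo2") # single layer
+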
335
+ '''
336
+ Computes a multi-layer feed-forward network.
337
+
338
+ Args:
339
+ features: input features
340
+ dims: list with dimensions of network.
341
+ First dimension is of the inputs, final is of the outputs.
342
+ batchNorm: if not None, applies batchNorm
343
+ dropout: dropout value to apply for each layer
344
+ act: activation to apply between layers.
345
+ NON, TANH, SIGMOID, RELU, ELU
346
+ '''
347
+ # no activation after last layer
348
+ # batchNorm = {"decay": float, "train": Tensor}
349
+ def FCLayer(features, dims, batchNorm = None, dropout = 1.0, act = "RELU"):
350
+ layersNum = len(dims) - 1
351
+
352
+ for i in range(layersNum):
353
+ features = linear(features, dims[i], dims[i+1], name = "fc_%d" % i,
354
+ batchNorm = batchNorm, dropout = dropout)
355
+ # not the last layer
356
+ if i < layersNum - 1:
357
+ features = activations[act](features)
358
+
359
+ return features
360
+
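+ # Usage sketch (illustration only, hypothetical shapes): a 2-layer MLP with ReLU
+ # between layers and no activation after the last one.
+ # x = tf.random_normal((32, 512))
+ # out = FCLayer(x, [512, 256, 128]) # 512 -> 256 -> 128
+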
361
+ ###################################### cnns ######################################
362
+
363
+ '''
364
+ Computes convolution.
365
+
366
+ Args:
367
+ inp: input features
368
+ inDim: input dimension
369
+ outDim: output dimension
370
+ batchNorm: if not None, applies batchNorm on inputs
371
+ dropout: dropout value to apply on inputs
372
+ addBias: True to add bias
373
+ kernelSize: kernel size
374
+ stride: stride size
375
+ act: activation to apply on outputs
376
+ NON, TANH, SIGMOID, RELU, ELU
377
+ '''
378
+ # batchNorm = {"decay": float, "train": Tensor, "center": bool, "scale": bool}
379
+ # collections.namedtuple("batchNorm", ("decay", "train"))
380
+ def cnn(inp, inDim, outDim, batchNorm = None, dropout = 1.0, addBias = True,
381
+ kernelSize = None, stride = 1, act = "NON", name = "", reuse = None):
382
+
383
+ with tf.variable_scope("cnnLayer" + name, reuse = reuse):
384
+
385
+ if kernelSize is None:
386
+ kernelSize = config.stemKernelSize
387
+ kernelH = kernelW = kernelSize
388
+
389
+ kernel = getKernel((kernelH, kernelW, inDim, outDim))
390
+ b = getBias((outDim, ))
391
+
392
+ if batchNorm is not None:
393
+ inp = tf.contrib.layers.batch_norm(inp, decay = batchNorm["decay"], center = batchNorm["center"],
394
+ scale = batchNorm["scale"], is_training = batchNorm["train"], updates_collections = None)
395
+
396
+ inp = tf.nn.dropout(inp, dropout)
397
+
398
+ output = tf.nn.conv2d(inp, filter = kernel, strides = [1, stride, stride, 1], padding = "SAME")
399
+
400
+ if addBias:
401
+ output += b
402
+
403
+ output = activations[act](output)
404
+
405
+ return output
406
+
407
+ '''
408
+ Computes a multi-layer convolutional network.
409
+
410
+ Args:
411
+ features: input features
412
+ dims: list with dimensions of network.
413
+ First dimension is of the inputs. Final is of the outputs.
414
+ batchNorm: if not None, applies batchNorm
415
+ dropout: dropout value to apply for each layer
416
+ kernelSizes: list of kernel sizes for each layer. Defaults to config.stemKernelSize
+ strides: list of strides for each layer. Defaults to 1.
418
+ act: activation to apply between layers.
419
+ NON, TANH, SIGMOID, RELU, ELU
420
+ '''
421
+ # batchNorm = {"decay": float, "train": Tensor, "center": bool, "scale": bool}
422
+ # activation after last layer
423
+ def CNNLayer(features, dims, batchNorm = None, dropout = 1.0,
424
+ kernelSizes = None, strides = None, act = "RELU"):
425
+
426
+ layersNum = len(dims) - 1
427
+
428
+ if kernelSizes is None:
429
+ kernelSizes = [config.stemKernelSize for i in range(layersNum)]
430
+
431
+ if strides is None:
432
+ strides = [1 for i in range(layersNum)]
433
+
434
+ for i in range(layersNum):
435
+ features = cnn(features, dims[i], dims[i+1], name = "cnn_%d" % i, batchNorm = batchNorm,
436
+ dropout = dropout, kernelSize = kernelSizes[i], stride = strides[i], act = act)
437
+
438
+ return features
439
+
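+ # Usage sketch (illustration only, hypothetical shapes): a 2-layer conv stem over
+ # pre-extracted image features; activation is applied after every layer.
+ # images = tf.random_normal((32, 14, 14, 1024))
+ # stem = CNNLayer(images, [1024, 512, 512], kernelSizes = [3, 3])
+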
440
+ ######################################## location ########################################
441
+
442
+ '''
443
+ Computes linear positional encoding for h x w grid.
444
+ If outDim positive, casts positions to that dimension.
445
+ '''
446
+ # ignores dim
447
+ # h,w can be tensor scalars
448
+ def locationL(h, w, dim, outDim = -1, addBias = True):
449
+ dim = 2
450
+ grid = tf.stack(tf.meshgrid(tf.linspace(-config.locationBias, config.locationBias, w),
451
+ tf.linspace(-config.locationBias, config.locationBias, h)), axis = -1)
452
+
453
+ if outDim > 0:
454
+ grid = linear(grid, dim, outDim, addBias = addBias, name = "locationL")
455
+ dim = outDim
456
+
457
+ return grid, dim
458
+
459
+ '''
460
+ Computes sin/cos positional encoding for h x w x (4*dim).
461
+ If outDim positive, casts positions to that dimension.
462
+ Based on positional encoding presented in "Attention is all you need"
463
+ '''
464
+ # dim % 4 = 0
465
+ # h,w can be tensor scalars
466
+ def locationPE(h, w, dim, outDim = -1, addBias = True):
467
+ x = tf.expand_dims(tf.to_float(tf.linspace(-config.locationBias, config.locationBias, w)), axis = -1)
468
+ y = tf.expand_dims(tf.to_float(tf.linspace(-config.locationBias, config.locationBias, h)), axis = -1)
469
+ i = tf.expand_dims(tf.to_float(tf.range(dim)), axis = 0)
470
+
471
+ peSinX = tf.sin(x / (tf.pow(10000.0, i / dim)))
472
+ peCosX = tf.cos(x / (tf.pow(10000.0, i / dim)))
473
+ peSinY = tf.sin(y / (tf.pow(10000.0, i / dim)))
474
+ peCosY = tf.cos(y / (tf.pow(10000.0, i / dim)))
475
+
476
+ peSinX = tf.tile(tf.expand_dims(peSinX, axis = 0), [h, 1, 1])
477
+ peCosX = tf.tile(tf.expand_dims(peCosX, axis = 0), [h, 1, 1])
478
+ peSinY = tf.tile(tf.expand_dims(peSinY, axis = 1), [1, w, 1])
479
+ peCosY = tf.tile(tf.expand_dims(peCosY, axis = 1), [1, w, 1])
480
+
481
+ grid = tf.concat([peSinX, peCosX, peSinY, peCosY], axis = -1)
482
+ dim *= 4
483
+
484
+ if outDim > 0:
485
+ grid = linear(grid, dim, outDim, addBias = addBias, name = "locationPE")
486
+ dim = outDim
487
+
488
+ return grid, dim
489
+
490
+ locations = {
491
+ "L": locationL,
492
+ "PE": locationPE
493
+ }
494
+
495
+ '''
496
+ Adds positional encoding to features. May ease spatial reasoning
+ (although it is not used in the default model).
498
+
499
+ Args:
500
+ features: features to add position encoding to.
501
+ [batchSize, h, w, c]
502
+
503
+ inDim: number of features' channels
504
+ lDim: dimension for positional encodings
505
+ outDim: if positive, cast enhanced features (with positions) to that dimension
506
+ h: features' height
507
+ w: features' width
508
+ locType: L for linear encoding, PE for cos/sin based positional encoding
509
+ mod: way to add positional encoding: concatenation (CNCT), addition (ADD),
510
+ multiplication (MUL), linear transformation (LIN).
511
+ '''
512
+ mods = ["CNCT", "ADD", "LIN", "MUL"]
513
+ # if outDim = -1, then will be set based on inDim, lDim
514
+ def addLocation(features, inDim, lDim, outDim = -1, h = None, w = None,
515
+ locType = "L", mod = "CNCT", name = "", reuse = None): # h,w not needed
516
+
517
+ with tf.variable_scope("addLocation" + name, reuse = reuse):
518
+ batchSize = tf.shape(features)[0]
519
+ if h is None:
520
+ h = tf.shape(features)[1]
521
+ if w is None:
522
+ w = tf.shape(features)[2]
523
+ dim = inDim
524
+
525
+ if mod == "LIN":
526
+ if outDim < 0:
527
+ outDim = dim
528
+
529
+ grid, _ = locations[locType](h, w, lDim, outDim = outDim, addBias = False)
530
+ features = linear(features, dim, outDim, name = "LIN")
531
+ features += grid
532
+ return features, outDim
533
+
534
+ if mod == "CNCT":
535
+ grid, lDim = locations[locType](h, w, lDim)
536
+ # grid = tf.zeros_like(features) + grid
537
+ grid = tf.tile(tf.expand_dims(grid, axis = 0), [batchSize, 1, 1, 1])
538
+ features = tf.concat([features, grid], axis = -1)
539
+ dim += lDim
540
+
541
+ elif mod == "ADD":
542
+ grid, _ = locations[locType](h, w, lDim, outDim = dim)
543
+ features += grid
544
+
545
+ elif mod == "MUL": # MUL
546
+ grid, _ = locations[locType](h, w, lDim, outDim = dim)
547
+
548
+ if outDim < 0:
549
+ outDim = dim
550
+
551
+ grid = tf.tile(tf.expand_dims(grid, axis = 0), [batchSize, 1, 1, 1])
552
+ features = tf.concat([features, grid, features * grid], axis = -1)
553
+ dim *= 3
554
+
555
+ if outDim > 0:
556
+ features = linear(features, dim, outDim)
557
+ dim = outDim
558
+
559
+ return features, dim
560
+
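+ # Usage sketch (illustration only, hypothetical shapes): concatenate a linear
+ # positional encoding to CNN features along the channel axis.
+ # features = tf.random_normal((32, 14, 14, 512))
+ # features, dim = addLocation(features, 512, lDim = 2, locType = "L", mod = "CNCT")
+ # # dim == 514: the 2d (x, y) grid is appended to the channels
+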
561
+ # config.locationAwareEnd
562
+ # H, W, _ = config.imageDims
563
+ # projDim = config.stemProjDim
564
+ # k = config.stemProjPooling
565
+ # projDim on inDim or on out
566
+ # inDim = tf.shape(features)[3]
567
+
568
+ '''
569
+ Linearizes a 2d image into a flat vector.
570
+
571
+ Args:
572
+ features: batch of 2d images.
573
+ [batchSize, h, w, inDim]
574
+
575
+ h: image height
576
+
577
+ w: image width
578
+
579
+ inDim: number of channels
580
+
581
+ projDim: if not None, project image to that dimension before linearization
582
+
583
+ outDim: if not None, project image to that dimension after linearization
584
+
585
+ loc: if not None, add positional encoding:
586
+ locType: L for linear encoding, PE for cos/sin based positional encoding
587
+ mod: way to add positional encoding: concatenation (CNCT), addition (ADD),
588
+ multiplication (MUL), linear transformation (LIN).
589
+ pooling: number to pool image with before linearization.
590
+
591
+ Returns linearized image:
592
+ [batchSize, outDim] (or [batchSize, (h / pooling) * (w / pooling) * projDim] if outDim is not provided)
593
+ '''
594
+ # loc = {"locType": str, "mod": str}
595
+ def linearizeFeatures(features, h, w, inDim, projDim = None, outDim = None,
596
+ loc = None, pooling = None):
597
+
598
+ if pooling is None:
599
+ pooling = config.imageLinPool
600
+
601
+ dim = inDim
+
+ if loc is not None:
+ features, dim = addLocation(features, inDim, lDim = inDim, outDim = inDim,
+ locType = loc["locType"], mod = loc["mod"])
+
+ if projDim is not None:
+ features = linear(features, dim, projDim)
+ features = relu(features)
+ dim = projDim
609
+
610
+ if pooling > 1:
611
+ poolingDims = [1, pooling, pooling, 1]
612
+ features = tf.nn.max_pool(features, ksize = poolingDims, strides = poolingDims,
613
+ padding = "SAME")
614
+ h //= pooling
+ w //= pooling
616
+
617
+ dim = h * w * dim
618
+ features = tf.reshape(features, (-1, dim))
619
+
620
+ if outDim is not None:
621
+ features = linear(features, dim, outDim)
622
+ dim = outDim
623
+
624
+ return features, dim
625
+
626
+ ################################### multiplication ###################################
627
+ # specific dim / proj for x / y
628
+ '''
629
+ "Enhanced" hadamard product between x and y:
630
+ 1. Supports optional projection of x and y prior to multiplication.
+ 2. Computes a simple multiplication, or a parametrized one, using a diagonal or a complete matrix (bi-linear).
+ 3. Optionally concatenates x or y or their projection to the multiplication result.
+
+ Supports broadcasting.
635
+
636
+ Args:
637
+ x: left-hand side argument
638
+ [batchSize, dim]
639
+
640
+ y: right-hand side argument
641
+ [batchSize, dim]
642
+
643
+ dim: input dimension of x and y
644
+
645
+ dropout: dropout value to apply on x and y
646
+
647
+ proj: if not None, project x and y:
648
+ dim: projection dimension
649
+ shared: use same projection for x and y
650
+ dropout: dropout to apply to x and y if projected
651
+
652
+ interMod: multiplication type:
653
+ "MUL": x * y
654
+ "DIAG": x * W * y for a learned diagonal parameter W
655
+ "BL": x' W y for a learned matrix W
656
+
657
+ concat: if not None, concatenate x or y or their projection.
658
+
659
+ mulBias: optional bias to stabilize multiplication (x * bias) (y * bias)
660
+
661
+ Returns the multiplication result
662
+ [batchSize, outDim] where outDim depends on the use of the proj and concat arguments.
663
+ '''
664
+ # proj = {"dim": int, "shared": bool, "dropout": float} # "act": str, "actDropout": float
665
+ ## interMod = ["direct", "scalarW", "bilinear"] # "additive"
666
+ # interMod = ["MUL", "DIAG", "BL", "ADD"]
667
+ # concat = {"x": bool, "y": bool, "proj": bool}
668
+ def mul(x, y, dim, dropout = 1.0, proj = None, interMod = "MUL", concat = None, mulBias = None,
669
+ extendY = True, name = "", reuse = None):
670
+
671
+ with tf.variable_scope("mul" + name, reuse = reuse):
672
+ origVals = {"x": x, "y": y, "dim": dim}
673
+
674
+ x = tf.nn.dropout(x, dropout)
675
+ y = tf.nn.dropout(y, dropout)
676
+ # projection
677
+ if proj is not None:
678
+ x = tf.nn.dropout(x, proj.get("dropout", 1.0))
679
+ y = tf.nn.dropout(y, proj.get("dropout", 1.0))
680
+
681
+ if proj["shared"]:
682
+ xName, xReuse = "proj", None
683
+ yName, yReuse = "proj", True
684
+ else:
685
+ xName, xReuse = "projX", None
686
+ yName, yReuse = "projY", None
687
+
688
+ x = linear(x, dim, proj["dim"], name = xName, reuse = xReuse)
689
+ y = linear(y, dim, proj["dim"], name = yName, reuse = yReuse)
690
+ dim = proj["dim"]
691
+ projVals = {"x": x, "y": y, "dim": dim}
692
+ proj["x"], proj["y"] = x, y
693
+
694
+ if extendY:
695
+ y = tf.expand_dims(y, axis = -2)
696
+ # broadcasting to have the same shape
697
+ y = tf.zeros_like(x) + y
698
+
699
+ # multiplication
700
+ if interMod == "MUL":
701
+ if mulBias is None:
702
+ mulBias = config.mulBias
703
+ output = (x + mulBias) * (y + mulBias)
704
+ elif interMod == "DIAG":
705
+ W = getWeight((dim, )) # change initialization?
706
+ b = getBias((dim, ))
707
+ output = x * W * y + b
708
+ elif interMod == "BL":
709
+ W = getWeight((dim, dim))
710
+ b = getBias((dim, ))
711
+ output = multiply(x, W) * y + b
712
+ else: # "ADD"
713
+ output = tf.tanh(x + y)
714
+ # concatenation
715
+ if concat is not None:
716
+ concatVals = projVals if concat.get("proj", False) else origVals
717
+ if concat.get("x", False):
718
+ output = tf.concat([output, concatVals["x"]], axis = -1)
719
+ dim += concatVals["dim"]
720
+
721
+ if concat.get("y", False):
722
+ output, _ = concat(output, concatVals["y"], dim, extendY = extendY)
723
+ dim += concatVals["dim"]
724
+
725
+ return output, dim
726
+
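+ # Usage sketch (illustration only, hypothetical shapes): broadcasted element-wise
+ # interaction between a knowledge base and a state vector, with a shared projection.
+ # kb = tf.random_normal((32, 100, 512)) # [batchSize, N, dim]
+ # state = tf.random_normal((32, 512)) # [batchSize, dim]
+ # inter, iDim = mul(kb, state, 512, proj = {"dim": 256, "shared": True}, interMod = "MUL")
+ # # inter: [32, 100, 256], iDim == 256
+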
727
+ ######################################## rnns ########################################
728
+
729
+ '''
730
+ Creates an RNN cell.
731
+
732
+ Args:
733
+ hdim: the hidden dimension of the RNN cell.
734
+
735
+ reuse: whether the cell should reuse parameters or create new ones.
736
+
737
+ cellType: the cell type
738
+ RNN, GRU, LSTM, MiGRU, MiLSTM, ProjLSTM
739
+
740
+ act: the cell activation
741
+ NON, TANH, SIGMOID, RELU, ELU
742
+
743
+ projDim: if ProjLSTM, the dimension for the states projection
744
+
745
+ Returns the cell.
746
+ '''
747
+ # tf.nn.rnn_cell.MultiRNNCell([cell(hDim, reuse = reuse) for _ in config.encNumLayers])
748
+ # note that config.enc params not general
749
+ def createCell(hDim, reuse, cellType = None, act = None, projDim = None):
750
+ if cellType is None:
751
+ cellType = config.encType
752
+
753
+ activation = activations.get(act, None)
754
+
755
+ if cellType == "ProjLSTM":
756
+ cell = tf.nn.rnn_cell.LSTMCell
757
+ if projDim is None:
758
+ projDim = config.cellDim
759
+ cell = cell(hDim, num_proj = projDim, reuse = reuse, activation = activation)
760
+ return cell
761
+
762
+ cells = {
763
+ "RNN": tf.nn.rnn_cell.BasicRNNCell,
764
+ "GRU": tf.nn.rnn_cell.GRUCell,
765
+ "LSTM": tf.nn.rnn_cell.BasicLSTMCell,
766
+ "MiGRU": MiGRUCell,
767
+ "MiLSTM": MiLSTMCell
768
+ }
769
+
770
+ cell = cells[cellType](hDim, reuse = reuse, activation = activation)
771
+
772
+ return cell
773
+
774
+ '''
775
+ Runs a forward RNN layer.
776
+
777
+ Args:
778
+ inSeq: the input sequence to run the RNN over.
779
+ [batchSize, sequenceLength, inDim]
780
+
781
+ seqL: the sequence matching lengths.
782
+ [batchSize, 1]
783
+
784
+ hDim: hidden dimension of the RNN.
785
+
786
+ cellType: the cell type
787
+ RNN, GRU, LSTM, MiGRU, MiLSTM, ProjLSTM
788
+
789
+ dropout: value for dropout over input sequence
790
+
791
+ varDp: if not None, state and input variational dropouts to apply.
792
+ dimension of input has to be supported (inputSize).
793
+
794
+ Returns the outputs sequence and final RNN state.
795
+ '''
796
+ # varDp = {"stateDp": float, "inputDp": float, "inputSize": int}
797
+ # proj = {"output": bool, "state": bool, "dim": int, "dropout": float, "act": str}
798
+ def fwRNNLayer(inSeq, seqL, hDim, cellType = None, dropout = 1.0, varDp = None,
799
+ name = "", reuse = None): # proj = None
800
+
801
+ with tf.variable_scope("rnnLayer" + name, reuse = reuse):
802
+ batchSize = tf.shape(inSeq)[0]
803
+
804
+ cell = createCell(hDim, reuse, cellType) # passing reuse isn't mandatory
805
+
806
+ if varDp is not None:
807
+ cell = tf.contrib.rnn.DropoutWrapper(cell,
808
+ state_keep_prob = varDp["stateDp"],
809
+ input_keep_prob = varDp["inputDp"],
810
+ variational_recurrent = True, input_size = varDp["inputSize"], dtype = tf.float32)
811
+ else:
812
+ inSeq = tf.nn.dropout(inSeq, dropout)
813
+
814
+ initialState = cell.zero_state(batchSize, tf.float32)
815
+
816
+ outSeq, lastState = tf.nn.dynamic_rnn(cell, inSeq,
817
+ sequence_length = seqL,
818
+ initial_state = initialState,
819
+ swap_memory = True)
820
+
821
+ if isinstance(lastState, tf.nn.rnn_cell.LSTMStateTuple):
822
+ lastState = lastState.h
823
+
824
+ # if proj is not None:
825
+ # if proj["output"]:
826
+ # outSeq = linear(outSeq, cell.output_size, proj["dim"], act = proj["act"],
827
+ # dropout = proj["dropout"], name = "projOutput")
828
+
829
+ # if proj["state"]:
830
+ # lastState = linear(lastState, cell.state_size, proj["dim"], act = proj["act"],
831
+ # dropout = proj["dropout"], name = "projState")
832
+
833
+ return outSeq, lastState
834
+
835
+ '''
836
+ Runs a bidirectional RNN layer.
837
+
838
+ Args:
839
+ inSeq: the input sequence to run the RNN over.
840
+ [batchSize, sequenceLength, inDim]
841
+
842
+ seqL: the sequence matching lengths.
843
+ [batchSize, 1]
844
+
845
+ hDim: hidden dimension of the RNN.
846
+
847
+ cellType: the cell type
848
+ RNN, GRU, LSTM, MiGRU, MiLSTM
849
+
850
+ dropout: value for dropout over input sequence
851
+
852
+ varDp: if not None, state and input variational dropouts to apply.
853
+ dimension of input has to be supported (inputSize).
854
+
855
+ Returns the outputs sequence and final RNN state.
856
+ '''
857
+ # varDp = {"stateDp": float, "inputDp": float, "inputSize": int}
858
+ # proj = {"output": bool, "state": bool, "dim": int, "dropout": float, "act": str}
859
+ def biRNNLayer(inSeq, seqL, hDim, cellType = None, dropout = 1.0, varDp = None,
860
+ name = "", reuse = None): # proj = None,
861
+
862
+ with tf.variable_scope("birnnLayer" + name, reuse = reuse):
863
+ batchSize = tf.shape(inSeq)[0]
864
+
865
+ with tf.variable_scope("fw"):
866
+ cellFw = createCell(hDim, reuse, cellType)
867
+ with tf.variable_scope("bw"):
868
+ cellBw = createCell(hDim, reuse, cellType)
869
+
870
+ if varDp is not None:
871
+ cellFw = tf.contrib.rnn.DropoutWrapper(cellFw,
872
+ state_keep_prob = varDp["stateDp"],
873
+ input_keep_prob = varDp["inputDp"],
874
+ variational_recurrent = True, input_size = varDp["inputSize"], dtype = tf.float32)
875
+
876
+ cellBw = tf.contrib.rnn.DropoutWrapper(cellBw,
877
+ state_keep_prob = varDp["stateDp"],
878
+ input_keep_prob = varDp["inputDp"],
879
+ variational_recurrent = True, input_size = varDp["inputSize"], dtype = tf.float32)
880
+ else:
881
+ inSeq = tf.nn.dropout(inSeq, dropout)
882
+
883
+ initialStateFw = cellFw.zero_state(batchSize, tf.float32)
884
+ initialStateBw = cellBw.zero_state(batchSize, tf.float32)
885
+
886
+ (outSeqFw, outSeqBw), (lastStateFw, lastStateBw) = tf.nn.bidirectional_dynamic_rnn(
887
+ cellFw, cellBw, inSeq,
888
+ sequence_length = seqL,
889
+ initial_state_fw = initialStateFw,
890
+ initial_state_bw = initialStateBw,
891
+ swap_memory = True)
892
+
893
+ if isinstance(lastStateFw, tf.nn.rnn_cell.LSTMStateTuple):
894
+ lastStateFw = lastStateFw.h # take c?
895
+ lastStateBw = lastStateBw.h
896
+
897
+ outSeq = tf.concat([outSeqFw, outSeqBw], axis = -1)
898
+ lastState = tf.concat([lastStateFw, lastStateBw], axis = -1)
899
+
900
+ # if proj is not None:
901
+ # if proj["output"]:
902
+ # outSeq = linear(outSeq, cellFw.output_size + cellFw.output_size,
903
+ # proj["dim"], act = proj["act"], dropout = proj["dropout"],
904
+ # name = "projOutput")
905
+
906
+ # if proj["state"]:
907
+ # lastState = linear(lastState, cellFw.state_size + cellFw.state_size,
908
+ # proj["dim"], act = proj["act"], dropout = proj["dropout"],
909
+ # name = "projState")
910
+
911
+ return outSeq, lastState
912
+
913
+ # int(hDim / 2) for biRNN?
914
+ '''
915
+ Runs an RNN layer by calling biRNN or fwRNN.
916
+
917
+ Args:
918
+ inSeq: the input sequence to run the RNN over.
919
+ [batchSize, sequenceLength, inDim]
920
+
921
+ seqL: the sequence matching lengths.
922
+ [batchSize, 1]
923
+
924
+ hDim: hidden dimension of the RNN.
925
+
926
+ bi: true to run bidirectional rnn.
927
+
928
+ cellType: the cell type
929
+ RNN, GRU, LSTM, MiGRU, MiLSTM
930
+
931
+ dropout: value for dropout over input sequence
932
+
933
+ varDp: if not None, state and input variational dropouts to apply.
934
+ dimension of input has to be supported (inputSize).
935
+
936
+ Returns the outputs sequence and final RNN state.
937
+ '''
938
+ # proj = {"output": bool, "state": bool, "dim": int, "dropout": float, "act": str}
939
+ # varDp = {"stateDp": float, "inputDp": float, "inputSize": int}
940
+ def RNNLayer(inSeq, seqL, hDim, bi = None, cellType = None, dropout = 1.0, varDp = None,
941
+ name = "", reuse = None): # proj = None
942
+
943
+ with tf.variable_scope("rnnLayer" + name, reuse = reuse):
944
+ if bi is None:
945
+ bi = config.encBi
946
+
947
+ rnn = biRNNLayer if bi else fwRNNLayer
948
+
949
+ if bi:
950
+ hDim = int(hDim / 2)
951
+
952
+ return rnn(inSeq, seqL, hDim, cellType = cellType, dropout = dropout, varDp = varDp) # , proj = proj
953
+
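+ # Usage sketch (illustration only, hypothetical tensors): encode an embedded question
+ # with a bidirectional LSTM; each direction gets half of the hidden dimension.
+ # embeddings = tf.random_normal((32, 20, 300)) # [batchSize, seqLength, embDim]
+ # questionLengths = tf.fill((32,), 20)
+ # outputs, finalState = RNNLayer(embeddings, questionLengths, 512, bi = True, cellType = "LSTM")
+ # # outputs: [32, 20, 512] (256 per direction), finalState: [32, 512]
+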
954
+ # tf counterpart?
955
+ # hDim = config.moduleDim
956
+ def multigridRNNLayer(features, h, w, dim, name = "", reuse = None):
+ with tf.variable_scope("multigridRNNLayer" + name, reuse = reuse):
+ features = linear(features, dim, dim // 2, name = "i")
+
+ output0 = gridRNNLayer(features, h, w, dim, right = True, down = True, name = "rd")
+ output1 = gridRNNLayer(features, h, w, dim, right = True, down = False, name = "r")
+ output2 = gridRNNLayer(features, h, w, dim, right = False, down = True, name = "d")
+ output3 = gridRNNLayer(features, h, w, dim, right = False, down = False, name = "NON")
+
+ output = tf.concat([output0, output1, output2, output3], axis = -1)
+ output = linear(output, 2 * dim, dim, name = "o")
+
+ return output
969
+
970
+ # h,w should be constants
971
+ def gridRNNLayer(features, h, w, dim, right, down, name = "", reuse = None):
972
+ with tf.variable_scope("gridRNNLayer" + name):
973
+ batchSize = tf.shape(features)[0]
974
+
975
+ cell = createCell(dim, reuse = reuse, cellType = config.stemGridRnnMod,
976
+ act = config.stemGridAct)
977
+
978
+ initialState = cell.zero_state(batchSize, tf.float32)
979
+
980
+ inputs = [tf.unstack(row, w, axis = 1) for row in tf.unstack(features, h, axis = 1)]
981
+ states = [[None for _ in range(w)] for _ in range(h)]
982
+
983
+ iAxis = range(h) if down else (range(h)[::-1])
984
+ jAxis = range(w) if right else (range(w)[::-1])
985
+
986
+ iPrev = -1 if down else 1
987
+ jPrev = -1 if right else 1
988
+
989
+ prevState = lambda i,j: states[i][j] if (i >= 0 and i < h and j >= 0 and j < w) else initialState
990
+
991
+ for i in iAxis:
992
+ for j in jAxis:
993
+ prevs = tf.concat((prevState(i + iPrev, j), prevState(i, j + jPrev)), axis = -1)
994
+ curr = inputs[i][j]
995
+ _, states[i][j] = cell(prevs, curr)
996
+
997
+ outputs = [tf.stack(row, axis = 1) for row in states]
998
+ outputs = tf.stack(outputs, axis = 1)
999
+
1000
+ return outputs
1001
+
1002
+ # tf seq2seq?
1003
+ # def projRNNLayer(inSeq, seqL, hDim, labels, labelsNum, labelsDim, labelsEmb, name = "", reuse = None):
1004
+ # with tf.variable_scope("projRNNLayer" + name):
1005
+ # batchSize = tf.shape(features)[0]
1006
+
1007
+ # cell = createCell(hDim, reuse = reuse)
1008
+
1009
+ # projCell = ProjWrapper(cell, labelsNum, labelsDim, labelsEmb, # config.wrdEmbDim
1010
+ # feedPrev = True, dropout = 1.0, config,
1011
+ # temperature = 1.0, clevr_sample = False, reuse)
1012
+
1013
+ # initialState = projCell.zero_state(batchSize, tf.float32)
1014
+
1015
+ # if config.soft:
1016
+ # inSeq = inSeq
1017
+
1018
+ # # outputs, _ = tf.nn.static_rnn(projCell, inputs,
1019
+ # # sequence_length = seqL,
1020
+ # # initial_state = initialState)
1021
+
1022
+ # inSeq = tf.unstack(inSeq, axis = 1)
1023
+ # state = initialState
1024
+ # logitsList = []
1025
+ # chosenList = []
1026
+
1027
+ # for inp in inSeq:
1028
+ # (logits, chosen), state = projCell(inp, state)
1029
+ # logitsList.append(logits)
1030
+ # chosenList.append(chosen)
1031
+ # projCell.reuse = True
1032
+
1033
+ # logitsOut = tf.stack(logitsList, axis = 1)
1034
+ # chosenOut = tf.stack(chosenList, axis = 1)
1035
+ # outputs = (logitsOut, chosenOut)
1036
+ # else:
1037
+ # labels = tf.to_float(labels)
1038
+ # labels = tf.concat([tf.zeros((batchSize, 1)), labels], axis = 1)[:, :-1] # ,newaxis
1039
+ # inSeq = tf.concat([inSeq, tf.expand_dims(labels, axis = -1)], axis = -1)
1040
+
1041
+ # outputs, _ = tf.nn.dynamic_rnn(projCell, inSeq,
1042
+ # sequence_length = seqL,
1043
+ # initial_state = initialState,
1044
+ # swap_memory = True)
1045
+
1046
+ # return outputs #, labelsEmb
1047
+
1048
+ ############################### variational dropout ###############################
1049
+
1050
+ '''
1051
+ Generates a variational dropout mask for a given shape and a dropout
1052
+ probability value.
1053
+ '''
1054
+ def generateVarDpMask(shape, keepProb):
1055
+ randomTensor = tf.to_float(keepProb)
1056
+ randomTensor += tf.random_uniform(shape, minval = 0, maxval = 1)
1057
+ binaryTensor = tf.floor(randomTensor)
1058
+ mask = tf.to_float(binaryTensor)
1059
+ return mask
1060
+
1061
+ '''
1062
+ Applies a variational dropout mask to an input, given the dropout mask
+ and a dropout (keep) probability value.
1064
+ '''
1065
+ def applyVarDpMask(inp, mask, keepProb):
1066
+ ret = (tf.div(inp, tf.to_float(keepProb))) * mask
1067
+ return ret
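+
+ # Usage sketch (illustration only): the same mask is reused across time steps to get
+ # variational (recurrent) dropout; `state1` and `state2` are hypothetical state tensors.
+ # keepProb = 0.85
+ # mask = generateVarDpMask((32, 512), keepProb)
+ # step1 = applyVarDpMask(state1, mask, keepProb) # same mask applied
+ # step2 = applyVarDpMask(state2, mask, keepProb) # at every step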