# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layers for VatxtModel."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Dependency imports

from six.moves import xrange
import tensorflow as tf

K = tf.keras


def cl_logits_subgraph(layer_sizes, input_size, num_classes, keep_prob=1.):
  """Construct multiple ReLU layers with dropout and a linear layer."""
  subgraph = K.models.Sequential(name='cl_logits')
  for i, layer_size in enumerate(layer_sizes):
    if i == 0:
      subgraph.add(
          K.layers.Dense(layer_size, activation='relu', input_dim=input_size))
    else:
      subgraph.add(K.layers.Dense(layer_size, activation='relu'))

    if keep_prob < 1.:
      subgraph.add(K.layers.Dropout(1. - keep_prob))
  subgraph.add(K.layers.Dense(1 if num_classes == 2 else num_classes))
  return subgraph
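
# Example usage of cl_logits_subgraph (an illustrative sketch only; the layer
# sizes and the `lstm_features` tensor below are hypothetical, not taken from
# the training configuration):
#
#   cl_logits_fn = cl_logits_subgraph(
#       layer_sizes=[256, 128], input_size=1024, num_classes=2, keep_prob=0.9)
#   cl_logits = cl_logits_fn(lstm_features)
#   # The final Dense layer has a single unit because num_classes == 2.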


class Embedding(K.layers.Layer):
  """Embedding layer with frequency-based normalization and dropout."""

  def __init__(self,
               vocab_size,
               embedding_dim,
               normalize=False,
               vocab_freqs=None,
               keep_prob=1.,
               **kwargs):
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.normalized = normalize
    self.keep_prob = keep_prob

    if normalize:
      assert vocab_freqs is not None
      self.vocab_freqs = tf.constant(
          vocab_freqs, dtype=tf.float32, shape=(vocab_size, 1))

    super(Embedding, self).__init__(**kwargs)

  def build(self, input_shape):
    with tf.device('/cpu:0'):
      self.var = self.add_weight(
          shape=(self.vocab_size, self.embedding_dim),
          initializer=tf.random_uniform_initializer(-1., 1.),
          name='embedding',
          dtype=tf.float32)

    if self.normalized:
      self.var = self._normalize(self.var)

    super(Embedding, self).build(input_shape)

  def call(self, x):
    embedded = tf.nn.embedding_lookup(self.var, x)
    if self.keep_prob < 1.:
      shape = embedded.get_shape().as_list()

      # Use the same dropout mask at each timestep by specifying noise_shape.
      # This slightly improves performance.
      # Please see https://arxiv.org/abs/1512.05287 for the theoretical
      # explanation.
      embedded = tf.nn.dropout(
          embedded, self.keep_prob, noise_shape=(shape[0], 1, shape[2]))
    return embedded

  def _normalize(self, emb):
    weights = self.vocab_freqs / tf.reduce_sum(self.vocab_freqs)
    mean = tf.reduce_sum(weights * emb, 0, keep_dims=True)
    var = tf.reduce_sum(weights * tf.pow(emb - mean, 2.), 0, keep_dims=True)
    stddev = tf.sqrt(1e-6 + var)
    return (emb - mean) / stddev
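
# Example usage of Embedding (an illustrative sketch; `token_ids`,
# `vocab_freqs`, and all sizes are hypothetical):
#
#   embedding = Embedding(vocab_size=10000, embedding_dim=256, normalize=True,
#                         vocab_freqs=vocab_freqs, keep_prob=0.9)
#   embedded = embedding(token_ids)  # token_ids: int32 [batch, timesteps]
#   # embedded: float32 [batch, timesteps, embedding_dim]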


class LSTM(object):
  """LSTM layer using dynamic_rnn.

  Exposes variables in `trainable_weights` property.
  """

  def __init__(self, cell_size, num_layers=1, keep_prob=1., name='LSTM'):
    self.cell_size = cell_size
    self.num_layers = num_layers
    self.keep_prob = keep_prob
    self.reuse = None
    self.trainable_weights = None
    self.name = name

  def __call__(self, x, initial_state, seq_length):
    with tf.variable_scope(self.name, reuse=self.reuse) as vs:
      cell = tf.contrib.rnn.MultiRNNCell([
          tf.contrib.rnn.BasicLSTMCell(
              self.cell_size,
              forget_bias=0.0,
              reuse=tf.get_variable_scope().reuse)
          for _ in xrange(self.num_layers)
      ])

      # shape(x) = (batch_size, num_timesteps, embedding_dim)
      lstm_out, next_state = tf.nn.dynamic_rnn(
          cell, x, initial_state=initial_state, sequence_length=seq_length)

      # shape(lstm_out) = (batch_size, timesteps, cell_size)
      if self.keep_prob < 1.:
        lstm_out = tf.nn.dropout(lstm_out, self.keep_prob)

      if self.reuse is None:
        self.trainable_weights = vs.global_variables()

    self.reuse = True

    return lstm_out, next_state
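
# Example usage of LSTM (an illustrative sketch; `embedded`, `embedded_2`,
# `initial_state`, and `seq_length` are hypothetical tensors):
#
#   lstm = LSTM(cell_size=1024, num_layers=1, keep_prob=0.9)
#   lstm_out, next_state = lstm(embedded, initial_state, seq_length)
#   # A second call reuses the same variables (self.reuse is now True), e.g.
#   # when running the network on a perturbed copy of the inputs:
#   lstm_out_2, _ = lstm(embedded_2, initial_state, seq_length)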


class SoftmaxLoss(K.layers.Layer):
  """Softmax xentropy loss with candidate sampling."""

  def __init__(self,
               vocab_size,
               num_candidate_samples=-1,
               vocab_freqs=None,
               **kwargs):
    self.vocab_size = vocab_size
    self.num_candidate_samples = num_candidate_samples
    self.vocab_freqs = vocab_freqs
    super(SoftmaxLoss, self).__init__(**kwargs)
    self.multiclass_dense_layer = K.layers.Dense(self.vocab_size)

  def build(self, input_shape):
    input_shape = input_shape[0].as_list()
    with tf.device('/cpu:0'):
      self.lin_w = self.add_weight(
          shape=(input_shape[-1], self.vocab_size),
          name='lm_lin_w',
          initializer=K.initializers.glorot_uniform())
      self.lin_b = self.add_weight(
          shape=(self.vocab_size,),
          name='lm_lin_b',
          initializer=K.initializers.glorot_uniform())
      self.multiclass_dense_layer.build(input_shape)

    super(SoftmaxLoss, self).build(input_shape)

  def call(self, inputs):
    x, labels, weights = inputs
    if self.num_candidate_samples > -1:
      assert self.vocab_freqs is not None
      labels_reshaped = tf.reshape(labels, [-1])
      labels_reshaped = tf.expand_dims(labels_reshaped, -1)
      sampled = tf.nn.fixed_unigram_candidate_sampler(
          true_classes=labels_reshaped,
          num_true=1,
          num_sampled=self.num_candidate_samples,
          unique=True,
          range_max=self.vocab_size,
          unigrams=self.vocab_freqs)
      inputs_reshaped = tf.reshape(x, [-1, int(x.get_shape()[2])])

      lm_loss = tf.nn.sampled_softmax_loss(
          weights=tf.transpose(self.lin_w),
          biases=self.lin_b,
          labels=labels_reshaped,
          inputs=inputs_reshaped,
          num_sampled=self.num_candidate_samples,
          num_classes=self.vocab_size,
          sampled_values=sampled)
      lm_loss = tf.reshape(
          lm_loss,
          [int(x.get_shape()[0]), int(x.get_shape()[1])])
    else:
      logits = self.multiclass_dense_layer(x)
      lm_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels)

    lm_loss = tf.identity(
        tf.reduce_sum(lm_loss * weights) / _num_labels(weights),
        name='lm_xentropy_loss')
    return lm_loss
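
# Example usage of SoftmaxLoss (an illustrative sketch; the vocabulary size,
# sample count, and the input tensors are hypothetical):
#
#   lm_loss_layer = SoftmaxLoss(vocab_size=10000, num_candidate_samples=1024,
#                               vocab_freqs=vocab_freqs)
#   # Inputs: LSTM outputs, next-token targets, and per-token weights.
#   lm_loss = lm_loss_layer([lstm_out, lm_targets, lm_weights])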


def classification_loss(logits, labels, weights):
  """Computes cross entropy loss between logits and labels.

  Args:
    logits: 2-D [timesteps*batch_size, m] float tensor, where m=1 if
      num_classes=2, otherwise m=num_classes.
    labels: 1-D [timesteps*batch_size] integer tensor.
    weights: 1-D [timesteps*batch_size] float tensor.

  Returns:
    Loss scalar of type float.
  """
  inner_dim = logits.get_shape().as_list()[-1]
  with tf.name_scope('classifier_loss'):
    # Logistic loss
    if inner_dim == 1:
      loss = tf.nn.sigmoid_cross_entropy_with_logits(
          logits=tf.squeeze(logits, -1), labels=tf.cast(labels, tf.float32))
    # Softmax loss
    else:
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels)

    num_lab = _num_labels(weights)
    tf.summary.scalar('num_labels', num_lab)
    return tf.identity(
        tf.reduce_sum(weights * loss) / num_lab,
        name='classification_xentropy')
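
# Example usage of classification_loss (an illustrative sketch; the tensors
# are hypothetical). With num_classes == 2 the logits have inner dimension 1
# and the sigmoid branch is taken; otherwise the sparse softmax branch runs:
#
#   cl_loss = classification_loss(cl_logits, cl_labels, cl_weights)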


def accuracy(logits, targets, weights):
  """Computes prediction accuracy.

  Args:
    logits: 2-D classifier logits [timesteps*batch_size, num_classes].
    targets: 1-D [timesteps*batch_size] integer tensor.
    weights: 1-D [timesteps*batch_size] float tensor.

  Returns:
    Accuracy: float scalar.
  """
  with tf.name_scope('accuracy'):
    eq = tf.cast(tf.equal(predictions(logits), targets), tf.float32)
    return tf.identity(
        tf.reduce_sum(weights * eq) / _num_labels(weights), name='accuracy')


def predictions(logits):
  """Class prediction from logits."""
  inner_dim = logits.get_shape().as_list()[-1]
  with tf.name_scope('predictions'):
    # For binary classification
    if inner_dim == 1:
      pred = tf.cast(tf.greater(tf.squeeze(logits, -1), 0.), tf.int64)
    # For multi-class classification
    else:
      pred = tf.argmax(logits, 2)
    return pred
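
# Example usage of predictions/accuracy (an illustrative sketch; `cl_logits`,
# `cl_labels`, and `cl_weights` are hypothetical tensors built elsewhere in
# the graph):
#
#   pred = predictions(cl_logits)
#   acc = accuracy(cl_logits, cl_labels, cl_weights)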


def _num_labels(weights):
  """Number of 1's in weights. Returns 1. if 0."""
  num_labels = tf.reduce_sum(weights)
  num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels)
  return num_labels


def optimize(loss,
             global_step,
             max_grad_norm,
             lr,
             lr_decay,
             sync_replicas=False,
             replicas_to_aggregate=1,
             task_id=0):
  """Builds optimization graph.

  * Creates an optimizer, and optionally wraps with SyncReplicasOptimizer
  * Computes, clips, and applies gradients
  * Maintains moving averages for all trainable variables
  * Summarizes variables and gradients

  Args:
    loss: scalar loss to minimize.
    global_step: integer scalar Variable.
    max_grad_norm: float scalar. Grads will be clipped to this value.
    lr: float scalar, learning rate.
    lr_decay: float scalar, learning rate decay rate.
    sync_replicas: bool, whether to use SyncReplicasOptimizer.
    replicas_to_aggregate: int, number of replicas to aggregate when using
      SyncReplicasOptimizer.
    task_id: int, id of the current task; used to ensure proper initialization
      of SyncReplicasOptimizer.

  Returns:
    train_op
  """
  with tf.name_scope('optimization'):
    # Compute gradients.
    tvars = tf.trainable_variables()
    grads = tf.gradients(
        loss,
        tvars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)

    # Clip non-embedding grads
    non_embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                    if 'embedding' not in v.op.name]
    embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                if 'embedding' in v.op.name]

    ne_grads, ne_vars = zip(*non_embedding_grads_and_vars)
    ne_grads, _ = tf.clip_by_global_norm(ne_grads, max_grad_norm)
    non_embedding_grads_and_vars = zip(ne_grads, ne_vars)

    grads_and_vars = embedding_grads_and_vars + list(
        non_embedding_grads_and_vars)

    # Summarize
    _summarize_vars_and_grads(grads_and_vars)

    # Decaying learning rate
    lr = tf.train.exponential_decay(
        lr, global_step, 1, lr_decay, staircase=True)
    tf.summary.scalar('learning_rate', lr)
    opt = tf.train.AdamOptimizer(lr)

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(0.999, global_step)

    # Apply gradients
    if sync_replicas:
      opt = tf.train.SyncReplicasOptimizer(
          opt,
          replicas_to_aggregate,
          variable_averages=variable_averages,
          variables_to_average=tvars,
          total_num_replicas=replicas_to_aggregate)
      apply_gradient_op = opt.apply_gradients(
          grads_and_vars, global_step=global_step)
      with tf.control_dependencies([apply_gradient_op]):
        train_op = tf.no_op(name='train_op')

      # Initialization ops
      tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                           opt.get_chief_queue_runner())
      if task_id == 0:  # Chief task
        local_init_op = opt.chief_init_op
        tf.add_to_collection('chief_init_op', opt.get_init_tokens_op())
      else:
        local_init_op = opt.local_step_init_op
      tf.add_to_collection('local_init_op', local_init_op)
      tf.add_to_collection('ready_for_local_init_op',
                           opt.ready_for_local_init_op)
    else:
      # Non-sync optimizer
      apply_gradient_op = opt.apply_gradients(grads_and_vars, global_step)
      with tf.control_dependencies([apply_gradient_op]):
        train_op = variable_averages.apply(tvars)

  return train_op
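
# Example usage of optimize (an illustrative sketch; the hyperparameter values
# are hypothetical, not the model's defaults):
#
#   global_step = tf.train.get_or_create_global_step()
#   train_op = optimize(
#       total_loss,
#       global_step,
#       max_grad_norm=1.0,
#       lr=0.001,
#       lr_decay=0.9999,
#       sync_replicas=False)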


def _summarize_vars_and_grads(grads_and_vars):
  """Logs trainable variables and adds summaries of their values and grads."""
  tf.logging.info('Trainable variables:')
  tf.logging.info('-' * 60)
  for grad, var in grads_and_vars:
    tf.logging.info(var)

    def tag(name, v=var):
      return v.op.name + '_' + name

    # Variable summary
    mean = tf.reduce_mean(var)
    tf.summary.scalar(tag('mean'), mean)
    with tf.name_scope(tag('stddev')):
      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
    tf.summary.scalar(tag('stddev'), stddev)
    tf.summary.scalar(tag('max'), tf.reduce_max(var))
    tf.summary.scalar(tag('min'), tf.reduce_min(var))
    tf.summary.histogram(tag('histogram'), var)

    # Gradient summary
    if grad is not None:
      if isinstance(grad, tf.IndexedSlices):
        grad_values = grad.values
      else:
        grad_values = grad

      tf.summary.histogram(tag('gradient'), grad_values)
      tf.summary.scalar(tag('gradient_norm'), tf.global_norm([grad_values]))
    else:
      tf.logging.info('Var %s has no gradient', var.op.name)