# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transformer-based text encoder network."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

import inspect

import gin
import tensorflow as tf

from official.nlp.modeling import layers


@tf.keras.utils.register_keras_serializable(package='Text')
@gin.configurable
class EncoderScaffold(tf.keras.Model):
"""Bi-directional Transformer-based encoder network scaffold. | |
This network allows users to flexibly implement an encoder similar to the one | |
described in "BERT: Pre-training of Deep Bidirectional Transformers for | |
Language Understanding" (https://arxiv.org/abs/1810.04805). | |
In this network, users can choose to provide a custom embedding subnetwork | |
(which will replace the standard embedding logic) and/or a custom hidden layer | |
class (which will replace the Transformer instantiation in the encoder). For | |
each of these custom injection points, users can pass either a class or a | |
class instance. If a class is passed, that class will be instantiated using | |
the 'embedding_cfg' or 'hidden_cfg' argument, respectively; if an instance | |
is passed, that instance will be invoked. (In the case of hidden_cls, the | |
instance will be invoked 'num_hidden_instances' times. | |
If the hidden_cls is not overridden, a default transformer layer will be | |
instantiated. | |

  Arguments:
    pooled_output_dim: The dimension of pooled output.
    pooler_layer_initializer: The initializer for the classification layer.
    embedding_cls: The class or instance to use to embed the input data. This
      class or instance defines the inputs to this encoder and outputs (1) an
      embeddings tensor with shape [batch_size, seq_length, hidden_size] and
      (2) an attention mask with shape [batch_size, seq_length, seq_length].
      If embedding_cls is not set, a default embedding network (from the
      original BERT paper) will be created.
    embedding_cfg: A dict of kwargs to pass to the embedding_cls, if it needs
      to be instantiated. If embedding_cls is not set, a config dict must be
      passed to `embedding_cfg` with the following values:
        "vocab_size": The size of the token vocabulary.
        "type_vocab_size": The size of the type vocabulary.
        "hidden_size": The hidden size for this encoder.
        "max_seq_length": The maximum sequence length for this encoder.
        "seq_length": The sequence length for this encoder.
        "initializer": The initializer for the embedding portion of this
          encoder.
        "dropout_rate": The dropout rate to apply before the encoding layers.
    embedding_data: A reference to the embedding weights that will be used to
      train the masked language model, if necessary. This is optional, and
      only needed if (1) you are overriding embedding_cls and (2) are doing
      standard pretraining.
    num_hidden_instances: The number of times to instantiate and/or invoke the
      hidden_cls.
    hidden_cls: The class or instance to encode the input data. If hidden_cls
      is not set, a KerasBERT transformer layer will be used as the encoder
      class.
    hidden_cfg: A dict of kwargs to pass to the hidden_cls, if it needs to be
      instantiated. If hidden_cls is not set, a config dict must be passed to
      `hidden_cfg` with the following values:
        "num_attention_heads": The number of attention heads. The hidden size
          must be divisible by num_attention_heads.
        "intermediate_size": The intermediate size of the transformer.
        "intermediate_activation": The activation to apply in the transformer.
        "dropout_rate": The overall dropout rate for the transformer layers.
        "attention_dropout_rate": The dropout rate for the attention layers.
        "kernel_initializer": The initializer for the transformer layers.
    return_all_layer_outputs: Whether to output sequence embedding outputs of
      all encoder transformer layers.
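
  Example:

    A minimal construction sketch (the sizes and initializer values here are
    illustrative only, not canonical defaults), using the default embedding
    network and the default `layers.Transformer` hidden class:

      # Illustrative configuration; adjust sizes for a real model.
      encoder = EncoderScaffold(
          num_hidden_instances=2,
          pooled_output_dim=128,
          embedding_cfg={
              'vocab_size': 100,
              'type_vocab_size': 2,
              'hidden_size': 128,
              'max_seq_length': 64,
              'seq_length': 64,
              'initializer': tf.keras.initializers.TruncatedNormal(stddev=0.02),
              'dropout_rate': 0.1,
          },
          hidden_cfg={
              'num_attention_heads': 4,
              'intermediate_size': 512,
              'intermediate_activation': 'relu',
              'dropout_rate': 0.1,
              'attention_dropout_rate': 0.1,
              'kernel_initializer':
                  tf.keras.initializers.TruncatedNormal(stddev=0.02),
          })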
""" | |

  def __init__(
      self,
      pooled_output_dim,
      pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=0.02),
      embedding_cls=None,
      embedding_cfg=None,
      embedding_data=None,
      num_hidden_instances=1,
      hidden_cls=layers.Transformer,
      hidden_cfg=None,
      return_all_layer_outputs=False,
      **kwargs):
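    # Disable Keras' automatic attribute tracking; the attributes assigned
    # below are plain configuration references and should not be wrapped for
    # object tracking before the functional model is constructed.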
    self._self_setattr_tracking = False
    self._hidden_cls = hidden_cls
    self._hidden_cfg = hidden_cfg
    self._num_hidden_instances = num_hidden_instances
    self._pooled_output_dim = pooled_output_dim
    self._pooler_layer_initializer = pooler_layer_initializer
    self._embedding_cls = embedding_cls
    self._embedding_cfg = embedding_cfg
    self._embedding_data = embedding_data
    self._return_all_layer_outputs = return_all_layer_outputs
    self._kwargs = kwargs
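
    # If a custom embedding network or class was provided, it defines the
    # model inputs and must return (embeddings, attention_mask); otherwise the
    # standard BERT-style embedding stack is built in the else branch below.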
    if embedding_cls:
      if inspect.isclass(embedding_cls):
        self._embedding_network = embedding_cls(
            **embedding_cfg) if embedding_cfg else embedding_cls()
      else:
        self._embedding_network = embedding_cls
      inputs = self._embedding_network.inputs
      embeddings, attention_mask = self._embedding_network(inputs)
    else:
      self._embedding_network = None
      word_ids = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_word_ids')
      mask = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_mask')
      type_ids = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_type_ids')
      inputs = [word_ids, mask, type_ids]
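
      # Default embedding path: sum the word, position, and type embeddings,
      # then apply layer normalization and dropout, as in the original BERT.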
      self._embedding_layer = layers.OnDeviceEmbedding(
          vocab_size=embedding_cfg['vocab_size'],
          embedding_width=embedding_cfg['hidden_size'],
          initializer=embedding_cfg['initializer'],
          name='word_embeddings')
      word_embeddings = self._embedding_layer(word_ids)

      # Always uses dynamic slicing for simplicity.
      self._position_embedding_layer = layers.PositionEmbedding(
          initializer=embedding_cfg['initializer'],
          use_dynamic_slicing=True,
          max_sequence_length=embedding_cfg['max_seq_length'],
          name='position_embedding')
      position_embeddings = self._position_embedding_layer(word_embeddings)

      type_embeddings = (
          layers.OnDeviceEmbedding(
              vocab_size=embedding_cfg['type_vocab_size'],
              embedding_width=embedding_cfg['hidden_size'],
              initializer=embedding_cfg['initializer'],
              use_one_hot=True,
              name='type_embeddings')(type_ids))

      embeddings = tf.keras.layers.Add()(
          [word_embeddings, position_embeddings, type_embeddings])
      embeddings = (
          tf.keras.layers.LayerNormalization(
              name='embeddings/layer_norm',
              axis=-1,
              epsilon=1e-12,
              dtype=tf.float32)(embeddings))
      embeddings = (
          tf.keras.layers.Dropout(
              rate=embedding_cfg['dropout_rate'])(embeddings))
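
      # Build the 3D self-attention mask of shape
      # [batch_size, seq_length, seq_length] expected by the transformer
      # layers from the 2D input padding mask.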
      attention_mask = layers.SelfAttentionMask()([embeddings, mask])

    data = embeddings

    layer_output_data = []
    self._hidden_layers = []
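
    # Stack the hidden layers. If hidden_cls is a class, a fresh instance is
    # created for each layer (using hidden_cfg when provided); if it is an
    # instance, that single layer object is invoked num_hidden_instances times.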
    for _ in range(num_hidden_instances):
      if inspect.isclass(hidden_cls):
        layer = hidden_cls(**hidden_cfg) if hidden_cfg else hidden_cls()
      else:
        layer = hidden_cls
      data = layer([data, attention_mask])
      layer_output_data.append(data)
      self._hidden_layers.append(layer)
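
    # Pool by taking the hidden state of the first (classification) token of
    # the final layer and passing it through a dense layer with tanh
    # activation.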
    first_token_tensor = (
        tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
            layer_output_data[-1]))
    self._pooler_layer = tf.keras.layers.Dense(
        units=pooled_output_dim,
        activation='tanh',
        kernel_initializer=pooler_layer_initializer,
        name='cls_transform')
    cls_output = self._pooler_layer(first_token_tensor)

    if return_all_layer_outputs:
      outputs = [layer_output_data, cls_output]
    else:
      outputs = [layer_output_data[-1], cls_output]

    super(EncoderScaffold, self).__init__(
        inputs=inputs, outputs=outputs, **kwargs)

  def get_config(self):
    config_dict = {
        'num_hidden_instances': self._num_hidden_instances,
        'pooled_output_dim': self._pooled_output_dim,
        'pooler_layer_initializer': self._pooler_layer_initializer,
        'embedding_cls': self._embedding_network,
        'embedding_cfg': self._embedding_cfg,
        'hidden_cfg': self._hidden_cfg,
        'return_all_layer_outputs': self._return_all_layer_outputs,
    }
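
    # A hidden_cls passed as a class is serialized by its registered name so
    # it can be looked up again in from_config; an instance is stored directly.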
    if inspect.isclass(self._hidden_cls):
      config_dict['hidden_cls_string'] = tf.keras.utils.get_registered_name(
          self._hidden_cls)
    else:
      config_dict['hidden_cls'] = self._hidden_cls

    config_dict.update(self._kwargs)
    return config_dict

  @classmethod
  def from_config(cls, config, custom_objects=None):
    if 'hidden_cls_string' in config:
      config['hidden_cls'] = tf.keras.utils.get_registered_object(
          config['hidden_cls_string'], custom_objects=custom_objects)
      del config['hidden_cls_string']
    return cls(**config)

  def get_embedding_table(self):
    if self._embedding_network is None:
      # In this case, we don't have a custom embedding network and can return
      # the standard embedding data.
      return self._embedding_layer.embeddings

    if self._embedding_data is None:
      raise RuntimeError(('The EncoderScaffold %s does not have a reference '
                          'to the embedding data. This is required when you '
                          'pass a custom embedding network to the scaffold. '
                          'It is also possible that you are trying to get '
                          'embedding data from an embedding scaffold with a '
                          'custom embedding network where the scaffold has '
                          'been serialized and deserialized. Unfortunately, '
                          'accessing custom embedding references after '
                          'serialization is not yet supported.') % self.name)
    else:
      return self._embedding_data

  @property
  def hidden_layers(self):
    """List of hidden layers in the encoder."""
    return self._hidden_layers

  @property
  def pooler_layer(self):
    """The pooler dense layer after the transformer layers."""
    return self._pooler_layer