# coding=utf-8
# Copyright 2018 The Google AI Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for RACE dataset."""

from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

import collections
import json
import os

from albert import classifier_utils
from albert import fine_tuning_utils
from albert import modeling
from albert import optimization
from albert import tokenization
import tensorflow.compat.v1 as tf
from tensorflow.contrib import tpu as contrib_tpu


class InputExample(object):
  """A single training/test example for the RACE dataset."""

  def __init__(self,
               example_id,
               context_sentence,
               start_ending,
               endings,
               label=None):
    self.example_id = example_id
    self.context_sentence = context_sentence
    self.start_ending = start_ending
    self.endings = endings
    self.label = label

  def __str__(self):
    return self.__repr__()

  def __repr__(self):
    l = [
        "id: {}".format(self.example_id),
        "context_sentence: {}".format(self.context_sentence),
        "start_ending: {}".format(self.start_ending),
        "ending_0: {}".format(self.endings[0]),
        "ending_1: {}".format(self.endings[1]),
        "ending_2: {}".format(self.endings[2]),
        "ending_3: {}".format(self.endings[3]),
    ]

    if self.label is not None:
      l.append("label: {}".format(self.label))

    return ", ".join(l)


class RaceProcessor(object):
  """Processor for the RACE data set."""

  def __init__(self, use_spm, do_lower_case, high_only, middle_only):
    super(RaceProcessor, self).__init__()
    self.use_spm = use_spm
    self.do_lower_case = do_lower_case
    self.high_only = high_only
    self.middle_only = middle_only

  def get_train_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the train set."""
    return self.read_examples(
        os.path.join(data_dir, "RACE", "train"))

  def get_dev_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the dev set."""
    return self.read_examples(
        os.path.join(data_dir, "RACE", "dev"))

  def get_test_examples(self, data_dir):
    """Gets a collection of `InputExample`s for prediction."""
    return self.read_examples(
        os.path.join(data_dir, "RACE", "test"))

  def get_labels(self):
    """Gets the list of labels for this data set."""
    return ["A", "B", "C", "D"]

  def process_text(self, text):
    if self.use_spm:
      return tokenization.preprocess_text(text, lower=self.do_lower_case)
    else:
      return tokenization.convert_to_unicode(text)

  def read_examples(self, data_dir):
    """Read examples from RACE json files."""
    examples = []
    for level in ["middle", "high"]:
      if level == "middle" and self.high_only:
        continue
      if level == "high" and self.middle_only:
        continue
      cur_dir = os.path.join(data_dir, level)

      cur_path = os.path.join(cur_dir, "all.txt")
      with tf.gfile.Open(cur_path) as f:
        for line in f:
          cur_data = json.loads(line.strip())

          answers = cur_data["answers"]
          options = cur_data["options"]
          questions = cur_data["questions"]
          context = self.process_text(cur_data["article"])

          for i in range(len(answers)):
            label = ord(answers[i]) - ord("A")
            qa_list = []

            question = self.process_text(questions[i])
            for j in range(4):
              option = self.process_text(options[i][j])

              if "_" in question:
                qa_cat = question.replace("_", option)
              else:
                qa_cat = " ".join([question, option])

              qa_list.append(qa_cat)

            examples.append(
                InputExample(
                    example_id=cur_data["id"],
                    context_sentence=context,
                    start_ending=None,
                    endings=[qa_list[0], qa_list[1], qa_list[2], qa_list[3]],
                    label=label
                )
            )

    return examples
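

# For reference, `RaceProcessor.read_examples` above assumes each line of
# all.txt is a JSON record bundling one passage with its questions. A minimal
# illustrative record (field values invented for illustration) would look
# roughly like:
#
#   {"id": "middle1.txt",
#    "article": "Passage text ...",
#    "questions": ["What is the main idea of the passage?"],
#    "options": [["opt A", "opt B", "opt C", "opt D"]],
#    "answers": ["B"]}
#
# A question containing an underscore is treated as cloze-style: each option
# is substituted for the blank rather than appended after the question text.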


def convert_single_example(example_index, example, label_size, max_seq_length,
                           tokenizer, max_qa_length):
  """Converts a single `InputExample` into a multi-choice `InputFeatures`."""

  # RACE is a multiple choice task. To perform this task using AlBERT,
  # we will use the formatting proposed in "Improving Language
  # Understanding by Generative Pre-Training" and suggested by
  # @jacobdevlin-google in this issue
  # https://github.com/google-research/bert/issues/38.
  #
  # Each choice will correspond to a sample on which we run the
  # inference. For a given RACE example, we will create the 4
  # following inputs:
  # - [CLS] context [SEP] choice_1 [SEP]
  # - [CLS] context [SEP] choice_2 [SEP]
  # - [CLS] context [SEP] choice_3 [SEP]
  # - [CLS] context [SEP] choice_4 [SEP]
  # The model will output a single value for each input. To get the
  # final decision of the model, we will run a softmax over these 4
  # outputs.
  if isinstance(example, classifier_utils.PaddingInputExample):
    return classifier_utils.InputFeatures(
        example_id=0,
        input_ids=[[0] * max_seq_length] * label_size,
        input_mask=[[0] * max_seq_length] * label_size,
        segment_ids=[[0] * max_seq_length] * label_size,
        label_id=0,
        is_real_example=False)
  else:
    context_tokens = tokenizer.tokenize(example.context_sentence)
    if example.start_ending is not None:
      start_ending_tokens = tokenizer.tokenize(example.start_ending)

    all_input_tokens = []
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    for ending in example.endings:
      # We create a copy of the context tokens in order to be
      # able to shrink it according to ending_tokens
      context_tokens_choice = context_tokens[:]
      if example.start_ending is not None:
        ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
      else:
        ending_tokens = tokenizer.tokenize(ending)

      # Truncates `ending_tokens` and `context_tokens_choice` so that the
      # total length fits within the specified length. Account for [CLS],
      # [SEP], [SEP] with "- 3".
      ending_tokens = ending_tokens[-max_qa_length:]

      if len(context_tokens_choice) + len(ending_tokens) > max_seq_length - 3:
        context_tokens_choice = context_tokens_choice[: (
            max_seq_length - 3 - len(ending_tokens))]
      tokens = ["[CLS]"] + context_tokens_choice + (
          ["[SEP]"] + ending_tokens + ["[SEP]"])
      segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (
          len(ending_tokens) + 1)

      input_ids = tokenizer.convert_tokens_to_ids(tokens)
      input_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding = [0] * (max_seq_length - len(input_ids))
      input_ids += padding
      input_mask += padding
      segment_ids += padding

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length

      all_input_tokens.append(tokens)
      all_input_ids.append(input_ids)
      all_input_mask.append(input_mask)
      all_segment_ids.append(segment_ids)

    label = example.label
    if example_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("id: {}".format(example.example_id))
      for choice_idx, (tokens, input_ids, input_mask, segment_ids) in \
           enumerate(zip(all_input_tokens, all_input_ids, all_input_mask,
                         all_segment_ids)):
        tf.logging.info("choice: {}".format(choice_idx))
        tf.logging.info("tokens: {}".format(" ".join(tokens)))
        tf.logging.info(
            "input_ids: {}".format(" ".join(map(str, input_ids))))
        tf.logging.info(
            "input_mask: {}".format(" ".join(map(str, input_mask))))
        tf.logging.info(
            "segment_ids: {}".format(" ".join(map(str, segment_ids))))
        tf.logging.info("label: {}".format(label))

    return classifier_utils.InputFeatures(
        example_id=example.example_id,
        input_ids=all_input_ids,
        input_mask=all_input_mask,
        segment_ids=all_segment_ids,
        label_id=label
    )
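

# Shape summary for the features returned above (a sketch, not part of the
# original control flow): `input_ids`, `input_mask`, and `segment_ids` are
# each a list of 4 per-choice lists of length max_seq_length. The TFRecord
# writer below flattens them row-major before serialization, e.g.:
#
#   flat = sum(feature.input_ids, [])  # len(flat) == 4 * max_seq_length
#
# and `create_model` later undoes this flattening with a tf.reshape back to
# [batch_size * 4, max_seq_length].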


def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer,
    output_file, max_qa_length):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.python_io.TFRecordWriter(output_file)

  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, len(label_list),
                                     max_seq_length, tokenizer, max_qa_length)

    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(sum(feature.input_ids, []))
    features["input_mask"] = create_int_feature(sum(feature.input_mask, []))
    features["segment_ids"] = create_int_feature(sum(feature.segment_ids, []))
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


def create_model(albert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, use_one_hot_embeddings,
                 max_seq_length, dropout_prob, hub_module):
  """Creates a classification model."""
  bsz_per_core = tf.shape(input_ids)[0]

  input_ids = tf.reshape(input_ids, [bsz_per_core * num_labels,
                                     max_seq_length])
  input_mask = tf.reshape(input_mask, [bsz_per_core * num_labels,
                                       max_seq_length])
  token_type_ids = tf.reshape(segment_ids, [bsz_per_core * num_labels,
                                            max_seq_length])

  (output_layer, _) = fine_tuning_utils.create_albert(
      albert_config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=token_type_ids,
      use_one_hot_embeddings=use_one_hot_embeddings,
      use_einsum=True,
      hub_module=hub_module)

  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [1, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [1],
      initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(output_layer, keep_prob=1 - dropout_prob)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    logits = tf.reshape(logits, [bsz_per_core, num_labels])
    probabilities = tf.nn.softmax(logits, axis=-1)
    predictions = tf.argmax(probabilities, axis=-1, output_type=tf.int32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(
        labels, depth=tf.cast(num_labels, dtype=tf.int32), dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, probabilities, logits, predictions)
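

# Scoring sketch for `create_model` (illustrative numbers, not from the
# original file): with a per-core batch size of 2 and num_labels=4,
# `output_layer` has shape [8, hidden_size] -- one row per (example, choice)
# pair. The [1, hidden_size] weight matrix scores each row with a single
# logit, and the reshape to [2, 4] lets the softmax compare the four choices
# of each example against one another rather than against other examples.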


def model_fn_builder(albert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings, max_seq_length, dropout_prob,
                     hub_module):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    is_real_example = None
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, probabilities, logits, predictions) = \
        create_model(albert_config, is_training, input_ids, input_mask,
                     segment_ids, label_ids, num_labels,
                     use_one_hot_embeddings, max_seq_length, dropout_prob,
                     hub_module)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps,
          use_tpu)

      output_spec = contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        accuracy = tf.metrics.accuracy(
            labels=label_ids, predictions=predictions,
            weights=is_real_example)
        loss = tf.metrics.mean(
            values=per_example_loss, weights=is_real_example)
        return {
            "eval_accuracy": accuracy,
            "eval_loss": loss,
        }

      eval_metrics = (metric_fn,
                      [per_example_loss, label_ids, logits, is_real_example])
      output_spec = contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      output_spec = contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"probabilities": probabilities,
                       "predictions": predictions},
          scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn
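

# A minimal usage sketch (not part of the original file; the paths, vocab
# settings, and hyperparameters below are invented for illustration). The
# real driver script wires these pieces into a TPUEstimator training loop:
#
#   processor = RaceProcessor(
#       use_spm=True, do_lower_case=True, high_only=False, middle_only=False)
#   tokenizer = tokenization.FullTokenizer(
#       vocab_file=None, do_lower_case=True,
#       spm_model_file="/path/to/spm.model")  # hypothetical path
#   examples = processor.get_train_examples("/path/to/data")
#   file_based_convert_examples_to_features(
#       examples, processor.get_labels(), max_seq_length=512,
#       tokenizer=tokenizer, output_file="/tmp/race_train.tf_record",
#       max_qa_length=128)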