# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Config controlling hyperparameters for fine-tuning ELECTRA."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import tensorflow as tf


class FinetuningConfig(object):
  """Fine-tuning hyperparameters."""

  def __init__(self, model_name, data_dir, **kwargs):
    # general
    self.model_name = model_name
    self.debug = False  # debug mode for quickly running things
    self.log_examples = False  # print out some train examples for debugging
    self.num_trials = 1  # how many train+eval runs to perform
    self.do_train = True  # train a model
    self.do_eval = True  # evaluate the model
    self.keep_all_models = True  # if False, only keep the last trial's ckpt

    # model
    self.model_size = "base"  # one of "small", "base", or "large"
    self.task_names = ["chunk"]  # which tasks to learn
    # override the default transformer hparams for the provided model size; see
    # modeling.BertConfig for the possible hparams and util.training_utils for
    # the defaults
    self.model_hparam_overrides = (
        kwargs["model_hparam_overrides"]
        if "model_hparam_overrides" in kwargs else {})
    self.embedding_size = None  # bert hidden size by default
    self.vocab_size = 64000  # number of tokens in the vocabulary
    self.do_lower_case = True

    # training
    self.learning_rate = 1e-4
    self.weight_decay_rate = 0.01
    self.layerwise_lr_decay = 0.8  # if > 0, the learning rate for a layer is
                                   # lr * lr_decay^(max_depth - depth), i.e.,
                                   # shallower layers have lower learning rates
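    # For illustration, assuming lr = 1e-4 and lr_decay = 0.8, a layer three
    # levels below the deepest layer would train at 1e-4 * 0.8^3 = 5.12e-5
    # under this rule (the exact depth indexing is set by the optimizer code).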
    self.num_train_epochs = 3.0  # passes over the dataset during training
    self.warmup_proportion = 0.1  # how much of training to warm up the LR for
    self.save_checkpoints_steps = 1000000
    self.iterations_per_loop = 1000
    self.use_tfrecords_if_existing = True  # don't make tfrecords and write
                                           # them to disc if existing ones are
                                           # found

    # writing model outputs to disc
    self.write_test_outputs = False  # whether to write test set outputs,
                                     # currently supported for GLUE + SQuAD 2.0
    self.write_distill_outputs = False  # whether to write distillation
                                        # outputs (overridden by some of the
                                        # task defaults below)
    self.n_writes_test = 5  # write test set predictions for the first n trials

    # sizing
    self.max_seq_length = 128
    self.train_batch_size = 32
    self.eval_batch_size = 32
    self.predict_batch_size = 32
    self.double_unordered = True  # for tasks like paraphrase where sentence
                                  # order doesn't matter, train the model on
                                  # both sentence orderings for each example

    # for qa tasks
    self.max_query_length = 64  # max tokens in q as opposed to context
    self.doc_stride = 128  # stride when splitting doc into multiple examples
    self.n_best_size = 20  # number of predictions per example to save
    self.max_answer_length = 30  # filter out answers longer than this length
    self.answerable_classifier = True  # answerable classifier for SQuAD 2.0
    self.answerable_uses_start_logits = True  # more advanced answerable
                                              # classifier using predicted
                                              # start positions
    self.answerable_weight = 0.5  # weight for answerability loss
    self.joint_prediction = True  # jointly predict the start and end positions
                                  # of the answer span
    self.beam_size = 20  # beam size when doing joint predictions
    self.qa_na_threshold = -2.75  # threshold for "no answer" when writing
                                  # SQuAD 2.0 test outputs

    # TPU settings
    self.use_tpu = False
    self.num_tpu_cores = 1
    self.tpu_job_name = None
    self.tpu_name = None  # cloud TPU to use for training
    self.tpu_zone = None  # GCE zone where the Cloud TPU is located
    self.gcp_project = None  # project name for the Cloud TPU-enabled project

    # default locations of data files
    self.data_dir = data_dir
    pretrained_model_dir = os.path.join(data_dir, "models", model_name)
    self.raw_data_dir = os.path.join(data_dir, "finetuning_data", "{:}").format
    self.vocab_file = os.path.join(pretrained_model_dir, "vocab.txt")
    if not tf.io.gfile.exists(self.vocab_file):
      self.vocab_file = os.path.join(self.data_dir, "vocab.txt")
    task_names_str = ",".join(
        kwargs["task_names"] if "task_names" in kwargs else self.task_names)
    self.init_checkpoint = None if self.debug else pretrained_model_dir
    self.model_dir = os.path.join(pretrained_model_dir, "finetuning_models",
                                  task_names_str + "_model")
    results_dir = os.path.join(pretrained_model_dir, "results")
    self.results_txt = os.path.join(results_dir,
                                    task_names_str + "_results.txt")
    self.results_pkl = os.path.join(results_dir,
                                    task_names_str + "_results.pkl")
    qa_topdir = os.path.join(results_dir, task_names_str + "_qa")
    self.qa_eval_file = os.path.join(qa_topdir, "{:}_eval.json").format
    self.qa_preds_file = os.path.join(qa_topdir, "{:}_preds.json").format
    self.qa_na_file = os.path.join(qa_topdir, "{:}_null_odds.json").format
    self.preprocessed_data_dir = os.path.join(
        pretrained_model_dir, "finetuning_tfrecords",
        task_names_str + "_tfrecords" + ("-debug" if self.debug else ""))
    self.test_predictions = os.path.join(
        pretrained_model_dir, "test_predictions",
        "{:}_{:}_{:}_predictions.pkl").format

    # update defaults with passed-in hyperparameters
    self.update(kwargs)

    # default hyperparameters for single-task models
    if len(self.task_names) == 1:
      task_name = self.task_names[0]
      if task_name == "rte" or task_name == "sts":
        self.num_train_epochs = 10.0
      elif "squad" in task_name or "qa" in task_name:
        self.max_seq_length = 512
        self.num_train_epochs = 2.0
        self.write_distill_outputs = False
        self.write_test_outputs = False
      elif task_name == "chunk":
        self.max_seq_length = 256
      else:
        self.num_train_epochs = 3.0

    # default hyperparameters for different model sizes
    if self.model_size == "large":
      self.learning_rate = 5e-5
      self.layerwise_lr_decay = 0.9
    elif self.model_size == "small":
      self.embedding_size = 128

    # debug-mode settings
    if self.debug:
      self.save_checkpoints_steps = 1000000
      self.use_tfrecords_if_existing = False
      self.num_trials = 1
      self.iterations_per_loop = 1
      self.train_batch_size = 32
      self.num_train_epochs = 3.0
      self.log_examples = True

    # passed-in-arguments override (for example) debug-mode defaults
    self.update(kwargs)

  def update(self, kwargs):
    for k, v in kwargs.items():
      if k not in self.__dict__:
        raise ValueError("Unknown hparam " + k)
      self.__dict__[k] = v
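
# A minimal usage sketch: the model name and data directory below are
# hypothetical placeholders. Keyword arguments override the defaults set in
# __init__; passing an hparam name that is not defined there raises a
# ValueError via update().
if __name__ == "__main__":
  demo_config = FinetuningConfig(
      "electra_base", "/tmp/electra_data",
      task_names=["squad"], learning_rate=5e-5)
  print(demo_config.model_dir)  # .../models/electra_base/finetuning_models/
                                # squad_model
  print(demo_config.max_seq_length)  # 512, the single-task squad default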