import argparse
import logging
import os
import random
import socket

import numpy as np
import torch

logger = logging.getLogger()

def add_tokenizer_params(parser: argparse.ArgumentParser):
    """
    Common parameters for tokenizer initialization
    """
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Whether to lowercase the input text. True for uncased models, False for cased models.",
    )

def add_encoder_params(parser: argparse.ArgumentParser):
    """
    Common parameters to initialize an encoder-based model
    """
    parser.add_argument(
        "--pretrained_model_cfg",
        default=None,
        type=str,
        help="Config name for model initialization",
    )
    parser.add_argument(
        "--encoder_model_type",
        default=None,
        type=str,
        help="Model type. One of [hf_bert, pytext_bert, fairseq_roberta]",
    )
    parser.add_argument(
        "--pretrained_file",
        type=str,
        help="Some encoders need to be initialized from a file",
    )
    parser.add_argument(
        "--model_file",
        default=None,
        type=str,
        help="Saved bi-encoder checkpoint file to initialize the model",
    )
    parser.add_argument(
        "--projection_dim",
        default=0,
        type=int,
        help="Extra linear layer on top of the standard BERT/RoBERTa encoder",
    )
    parser.add_argument(
        "--sequence_length",
        type=int,
        default=512,
        help="Max length of the encoder input sequence",
    )
    parser.add_argument(
        "--do_fill_lower_case",
        action="store_true",
        help="Make all fills lowercase, e.g. for cased models such as RoBERTa",
    )
    parser.add_argument(
        "--desegment_valid_fill",
        action="store_true",
        help="Desegment model fill output for validation",
    )

def add_training_params(parser: argparse.ArgumentParser):
    """
    Common parameters for training
    """
    add_cuda_params(parser)
    parser.add_argument(
        "--train_file", default=None, type=str, help="File pattern for the train set"
    )
    parser.add_argument(
        "--dev_file", default=None, type=str, help="File pattern for the dev set"
    )
    parser.add_argument(
        "--batch_size", default=2, type=int, help="Number of questions per batch"
    )
    parser.add_argument(
        "--dev_batch_size",
        type=int,
        default=4,
        help="Number of questions per batch for dev set validation",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Random seed for initialization and dataset shuffling",
    )
    parser.add_argument(
        "--adam_eps", default=1e-8, type=float, help="Epsilon for Adam optimizer."
    )
    parser.add_argument(
        "--adam_betas",
        default="(0.9, 0.999)",
        type=str,
        help="Betas for Adam optimizer.",
    )
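    # Note (assumption, not enforced in this module): --adam_betas arrives as a
    # plain string, so the optimizer setup code is expected to parse it first,
    # e.g. betas = ast.literal_eval(args.adam_betas).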
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )
    parser.add_argument(
        "--log_batch_step",
        default=100,
        type=int,
        help="Log training progress every N batches",
    )
    parser.add_argument(
        "--train_rolling_loss_step",
        default=100,
        type=int,
        help="Report the rolling average of the train loss every N batches",
    )
    parser.add_argument(
        "--weight_decay", default=0.0, type=float, help="Weight decay for the optimizer"
    )
    parser.add_argument(
        "--learning_rate",
        default=1e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--warmup_steps", default=100, type=int, help="Linear warmup over warmup_steps."
    )
    parser.add_argument("--dropout", default=0.1, type=float, help="Dropout rate")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )

def add_cuda_params(parser: argparse.ArgumentParser):
    """
    CUDA, distributed training and mixed precision parameters
    """
    parser.add_argument(
        "--no_cuda", action="store_true", help="Do not use CUDA even when it is available"
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on GPUs",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html",
    )
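# Note: the 'O0'..'O3' levels above come from NVIDIA's apex package; newer
# PyTorch releases ship native mixed precision (torch.cuda.amp), which projects
# have since been migrating to.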

def add_reader_preprocessing_params(parser: argparse.ArgumentParser):
    """
    Parameters for reader data preprocessing
    """
    parser.add_argument(
        "--gold_passages_src",
        type=str,
        help="File with the original dataset passages (JSON format). Required for the train set",
    )
    parser.add_argument(
        "--gold_passages_src_dev",
        type=str,
        help="File with the original dataset passages (JSON format). Required for the dev set",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=16,
        help="Number of parallel processes to binarize reader data",
    )

def get_encoder_checkpoint_params_names():
    """
    Names of the encoder parameters that are persisted in a checkpoint
    """
    return [
        "do_lower_case",
        "pretrained_model_cfg",
        "encoder_model_type",
        "pretrained_file",
        "projection_dim",
        "sequence_length",
    ]

def get_encoder_params_state(args):
    """
    Selects the param values to be saved in a checkpoint, so that a trained model file can be used for
    downstream tasks without the need to specify these parameters again
    :return: Dict of params to memorize in a checkpoint
    """
    params_to_save = get_encoder_checkpoint_params_names()
    return {param: getattr(args, param) for param in params_to_save}

def set_encoder_params_from_state(state, args):
    """
    Overrides args attributes with the (non-empty) encoder param values stored in a checkpoint state
    """
    if not state:
        return args
    params_to_save = get_encoder_checkpoint_params_names()
    override_params = [
        (param, state[param])
        for param in params_to_save
        if param in state and state[param]
    ]
    for param, value in override_params:
        if hasattr(args, param):
            logger.warning(
                "Overriding args parameter value from checkpoint state. Param = %s, value = %s",
                param,
                value,
            )
        setattr(args, param, value)
    return args
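# Illustrative round trip (assumption: the surrounding training code stores
# this dict in its checkpoints, e.g. under an "encoder_params" key):
#
#     encoder_state = get_encoder_params_state(args)        # at save time
#     set_encoder_params_from_state(encoder_state, args)    # at load time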

def set_seed(args):
    """
    Fixes the seeds of all relevant RNGs for reproducibility
    """
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

def setup_args_gpu(args):
    """
    Sets up CUDA, GPU and distributed training parameters
    """
    if args.local_rank == -1 or args.no_cuda:  # single-node multi-gpu (or cpu) mode
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        )
        args.n_gpu = torch.cuda.device_count()
    else:  # distributed mode
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    ws = os.environ.get("WORLD_SIZE")
    args.distributed_world_size = int(ws) if ws else 1
    logger.info(
        "Initialized host %s as d.rank %d on device=%s, n_gpu=%d, world size=%d",
        socket.gethostname(),
        args.local_rank,
        device,
        args.n_gpu,
        args.distributed_world_size,
    )
    logger.info("16-bits training: %s ", args.fp16)

def print_args(args):
    """
    Logs all parsed arguments, one per line
    """
    logger.info(" **************** CONFIGURATION **************** ")
    for key, val in sorted(vars(args).items()):
        keystr = key.ljust(30)
        logger.info("%s --> %s", keystr, val)
    logger.info(" **************** CONFIGURATION **************** ")
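
# Illustrative usage only (not part of the original module): a minimal sketch
# of how these helpers are typically composed by a training entry point. The
# exact set of parameter groups a given script registers is an assumption.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    add_encoder_params(parser)
    add_training_params(parser)  # also pulls in the CUDA params
    add_tokenizer_params(parser)
    add_reader_preprocessing_params(parser)
    args = parser.parse_args()

    setup_args_gpu(args)  # populates args.device / args.n_gpu, needed by set_seed
    set_seed(args)
    print_args(args)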