# VISOR-GPT/train/finetune/run_regression.py
# Hosting-page residue kept for provenance: szukevin, "upload", commit 7900c16, 7.2 kB.
"""
This script provides an example to wrap TencentPretrain for regression.
"""
import sys
import os
import random
import argparse
import torch
import torch.nn as nn
tencentpretrain_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(tencentpretrain_dir)
from finetune.run_classifier import *
from scipy.stats import spearmanr
class Regression(nn.Module):
    """Sequence regressor: embedding -> encoder -> pooling -> two-layer head.

    The embedding(s), encoder and pooling strategy are selected by ``args``.
    The head is a ``hidden_size -> hidden_size`` linear layer followed by
    tanh and a ``hidden_size -> 1`` linear projection.
    """

    def __init__(self, args):
        super().__init__()

        # Composite embedding: one sub-embedding per name in args.embedding.
        self.embedding = Embedding(args)
        for name in args.embedding:
            sub_embedding = str2embedding[name](args, len(args.tokenizer.vocab))
            self.embedding.update(sub_embedding, name)

        self.encoder = str2encoder[args.encoder](args)
        self.pooling_type = args.pooling

        # Regression head.
        self.output_layer_1 = nn.Linear(args.hidden_size, args.hidden_size)
        self.output_layer_2 = nn.Linear(args.hidden_size, 1)

    def forward(self, src, tgt, seg, soft_tgt=None):
        """
        Args:
            src: [batch_size x seq_length]
            tgt: [batch_size]
            seg: [batch_size x seq_length]
            soft_tgt: accepted for signature compatibility; not used here.

        Returns:
            A ``(loss, logits)`` pair. ``loss`` is the mean squared error
            between the flattened logits and ``tgt``, or None when ``tgt``
            is None.
        """
        hidden = self.encoder(self.embedding(src, seg), seg)
        pooled = pooling(hidden, seg, self.pooling_type)
        logits = self.output_layer_2(torch.tanh(self.output_layer_1(pooled)))

        if tgt is None:
            return None, logits

        criterion = nn.MSELoss()
        return criterion(logits.view(-1), tgt.view(-1)), logits
def read_dataset(args, path):
    """Read a tab-separated regression dataset and encode it for the model.

    The first line of the file is a header naming the columns. Each following
    non-blank line must provide a ``label`` column (parsed as float) and a
    ``text_a`` column; if the header also names ``text_b`` the two texts are
    encoded as a pair. Blank lines are skipped.

    Args:
        args: Namespace providing ``tokenizer`` (with ``tokenize`` and
            ``convert_tokens_to_ids``) and ``seq_length``.
        path: Path of the UTF-8 encoded TSV file.

    Returns:
        A list of ``(src, tgt, seg)`` tuples. ``src`` is a list of token ids
        and ``seg`` a list of segment ids (1 for text_a, 2 for text_b, 0 for
        padding), both truncated or padded to ``args.seq_length``; ``tgt`` is
        the float label.
    """
    dataset, columns = [], {}
    # Resolved on the first data row and reused, instead of once per row.
    pad_id = None
    with open(path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            line = line.rstrip("\r\n")
            if line_id == 0:
                for i, column_name in enumerate(line.split("\t")):
                    columns[column_name] = i
                continue
            if not line:
                # A blank line (commonly at the end of the file) holds no
                # example; parsing it would fail on the label conversion.
                continue

            fields = line.split("\t")
            tgt = float(fields[columns["label"]])
            tokenizer = args.tokenizer

            tokens_a = [CLS_TOKEN] + tokenizer.tokenize(fields[columns["text_a"]]) + [SEP_TOKEN]
            src = tokenizer.convert_tokens_to_ids(tokens_a)
            seg = [1] * len(src)
            if "text_b" in columns:
                tokens_b = tokenizer.tokenize(fields[columns["text_b"]]) + [SEP_TOKEN]
                src_b = tokenizer.convert_tokens_to_ids(tokens_b)
                src = src + src_b
                seg = seg + [2] * len(src_b)

            # Clip to the maximum length (slicing also gives us private
            # copies), then pad up to exactly seq_length.
            src = src[: args.seq_length]
            seg = seg[: args.seq_length]
            if pad_id is None:
                pad_id = tokenizer.convert_tokens_to_ids([PAD_TOKEN])[0]
            pad_len = args.seq_length - len(src)
            src.extend([pad_id] * pad_len)
            seg.extend([0] * pad_len)

            dataset.append((src, tgt, seg))

    return dataset
def evaluate(args, dataset):
    """Run ``args.model`` over ``dataset`` and log the Spearman correlation.

    Args:
        args: Namespace providing ``model``, ``device``, ``batch_size`` and
            ``logger``.
        dataset: List of ``(src, tgt, seg)`` examples, as built by
            ``read_dataset``.

    Returns:
        The Spearman correlation between gold targets and model predictions.
    """
    all_src = torch.LongTensor([example[0] for example in dataset])
    all_tgt = torch.FloatTensor([example[1] for example in dataset])
    all_seg = torch.LongTensor([example[2] for example in dataset])

    predictions, references = [], []

    args.model.eval()

    batches = batch_loader(args.batch_size, all_src, all_tgt, all_seg)
    for src_batch, tgt_batch, seg_batch, _ in batches:
        src_batch = src_batch.to(args.device)
        tgt_batch = tgt_batch.to(args.device)
        seg_batch = seg_batch.to(args.device)
        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            _, pred = args.model(src_batch, tgt_batch, seg_batch)
        predictions.extend(pred.tolist())
        references.extend(tgt_batch.tolist())

    spearman_corr, _ = spearmanr(references, predictions)
    args.logger.info("Spearman corr: {:.4f}".format(spearman_corr))
    return spearman_corr
def main():
    """Fine-tune a ``Regression`` model and evaluate it.

    Parses the command line, builds the tokenizer and model, trains for
    ``args.epochs_num`` epochs on ``args.train_path``, evaluates on
    ``args.dev_path`` after every epoch and saves the checkpoint with the
    best dev Spearman correlation to ``args.output_model_path``. When
    ``args.test_path`` is given, the saved checkpoint is reloaded and
    evaluated on the test set.

    Raises:
        ImportError: if ``--fp16`` is requested but apex is not installed.
    """
    import math  # Local import: only needed for the NaN check on the dev score.

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)
    tokenizer_opts(parser)
    adv_opts(parser)

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)
    set_seed(args.seed)

    model = Regression(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    # The dev set does not change between epochs, so read and tokenize it once.
    devset = read_dataset(args, args.dev_path)
    instances_num = len(trainset)
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info("The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            ) from err
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    if args.use_adv:
        args.adv_method = str2adv[args.adv_type](model)

    # best_result stays None until the first checkpoint is written. Spearman
    # correlation can be negative, or NaN for constant predictions, so a 0.0
    # baseline could leave no checkpoint for the test phase to load.
    total_loss, best_result = 0.0, None

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src = torch.LongTensor([example[0] for example in trainset])
        tgt = torch.FloatTensor([example[1] for example in trainset])
        seg = torch.LongTensor([example[2] for example in trainset])

        model.train()
        for i, (src_batch, tgt_batch, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg, None)):
            loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, None)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, devset)
        # Always keep the first epoch; afterwards keep strict improvements.
        # A NaN best score is replaced by whatever comes next, while a NaN
        # result never displaces a real best score.
        if best_result is None or math.isnan(best_result) or result > best_result:
            best_result = result
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        args.logger.info("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path))
# Run fine-tuning only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()