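# Evaluation script: loads a fine-tuned sequence-classification checkpoint for one of
# the Finnish benchmark datasets, runs Trainer.evaluate() on its test split, and dumps
# the raw logits and labels to pickle files for offline metric computation.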
import argparse
import pickle

import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
parser = argparse.ArgumentParser()
parser.add_argument("test", type=int, help="index into ds_names selecting the dataset to evaluate")
parser.add_argument("length", type=int, help="dataset size used for the fine-tuned subset (only printed here)")
# parser.add_argument('--input_file', type=int)
args = parser.parse_args()
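# Example invocation (script file name assumed; index 2 selects the "xed" dataset):
#   python evaluate.py 2 10000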
def compute_metrics(eval_pred):
    # Dump raw logits and labels to disk; the actual metrics are computed offline.
    # Note: the file name suffix is hard-coded to "xed" regardless of the dataset.
    logits, labels = eval_pred
    with open("logits_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open("labels_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Continue in a Jupyter notebook from here.
    # Return an empty dict so Trainer.evaluate() can finish without errors.
    return {}
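# A minimal sketch of the follow-up notebook step (assuming a multi-label setup with a
# 0.5 sigmoid threshold and scikit-learn available; file names match the dump above):
#   import pickle, torch
#   from sklearn.metrics import f1_score
#   logits = pickle.load(open("logits_xed.pickle", "rb"))
#   labels = pickle.load(open("labels_xed.pickle", "rb"))
#   preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
#   print(f1_score(labels, preds, average="micro"))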
class MultilabelTrainer(Trainer):
    # Override the default loss with binary cross-entropy over the logits,
    # treating each label as an independent binary decision (multi-label setup).
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss
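# Even though no training step runs below, the Trainer's prediction step calls
# compute_loss when labels are present, so this override also determines the
# reported eval_loss.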
def main():
    ds_names = ["yle", "online_review", "xed", "ylilauta"]
    # ds_sizes = [1000, 3000, 10000, 32000, 9999999]
    print("test:", args.test)
    ds_name = ds_names[args.test]
    ds_size = args.length
    print(ds_name, ds_size)
    metric = compute_metrics
    # print("cuda_avail:", torch.cuda.is_available())
    # Earlier checkpoint locations, kept for reference:
    # checkpoint_loc = "/media/volume/output/checkpoint-275000"
    # output_dir = "/media/volume/fi_nlp/output/finetune"
    # checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
    output_dir = "/data/loc/" + ds_name
    # Most of these arguments are unused because no training step runs; they are
    # passed only to construct a valid TrainingArguments for the Trainer.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=4,
        max_steps=10000,
        num_train_epochs=20000,
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True,
    )
    print(training_args)
    # Evaluate on the held-out "test" split of the tokenized dataset.
    dataset = load_from_disk(r"/data_loc/" + ds_name)["test"]
    # dataset = load_from_disk(r"C:\Users\vin\Documents\Projects\dippa\tests\ylilauta\tokenized_set").train_test_split(test_size=0.1)
    trainer_class = MultilabelTrainer
    # print("num_labels", num_labels)
    model = AutoModelForSequenceClassification.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    tokenizer = AutoTokenizer.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    # GPT-style tokenizers ship without a pad token; reuse the EOS token for padding.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    print("init trainer")
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=tokenizer,
        compute_metrics=metric,
        data_collator=default_data_collator,
    )
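    # train_dataset is passed but unused here, since only trainer.evaluate() is called below.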
    # Training is disabled; only evaluation runs below.
    # checkpoint = None
    # checkpoint = get_last_checkpoint(output_dir)
    # train_result = trainer.train()
    # trainer.save_state()
    metrics = trainer.evaluate()
    print(metrics)
    # trainer.save_model()  # Saves the tokenizer too for easy upload
if __name__ == "__main__":
    main()