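# Evaluation script: loads a fine-tuned sequence-classification checkpoint for the
# selected dataset, runs Trainer.evaluate() on its test split, and pickles the raw
# logits/labels for offline metric computation.
# Assumed invocation (not fixed anywhere in the script): python <this script> <test> <length>,
# where <test> indexes ds_names in main() and <length> is the dataset size to log.
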
import argparse
import pickle

import torch
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Positional arguments: 'test' indexes ds_names in main(), 'length' is the dataset size (logging only).
parser = argparse.ArgumentParser()
parser.add_argument('test', type=int)
parser.add_argument('length', type=int)

args = parser.parse_args()

def compute_metrics(eval_pred):
    # Dump the raw evaluation logits and labels to disk; the actual metrics are
    # computed offline from these pickles.
    logits, labels = eval_pred
    with open("logits_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open("labels_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Trainer expects a dict of metrics, so return an empty one.
    return {}

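# The pickled arrays can later be inspected offline, e.g. (a minimal sketch using
# plain pickle loading; the file name follows the pattern hard-coded above):
#
#     with open("logits_xed.pickle", "rb") as handle:
#         logits = pickle.load(handle)
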
class MultilabelTrainer(Trainer):
    """Trainer variant using BCE-with-logits loss for multi-label classification."""

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # One independent sigmoid/BCE term per label, instead of softmax cross-entropy.
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

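# Note: BCEWithLogitsLoss expects multi-hot float targets, e.g. [1, 0, 1, 0] for a
# four-label example; the dataset on disk is assumed to already store labels in
# this format.
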
def main():
    ds_names = ["yle", "online_review", "xed", "ylilauta"]

    print("test:", args.test)
    ds_name = ds_names[args.test]

    ds_size = args.length
    print(ds_name, ds_size)

    metric = compute_metrics

    output_dir = "/data/loc/" + ds_name

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=4,
        max_steps=10000,               # max_steps takes precedence over num_train_epochs
        num_train_epochs=20000,
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",           # only takes effect if fp16 training is enabled
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True,
    )

    print(training_args)

    # Load the test split of the dataset from disk (assumed to be already tokenized,
    # since default_data_collator is used below).
    dataset = load_from_disk(r"/data_loc/" + ds_name)["test"]

    trainer_class = MultilabelTrainer

    # Load the fine-tuned classification checkpoint and its tokenizer.
    model = AutoModelForSequenceClassification.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    tokenizer = AutoTokenizer.from_pretrained("/fine_tuning_checkpoint/" + ds_name)
    # GPT-style tokenizers have no pad token by default; reuse the EOS token for padding.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

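    # Both train_dataset and eval_dataset point at the same test split: the script
    # only calls trainer.evaluate(), so no training step is ever taken.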
    print("init trainer")
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=tokenizer,
        compute_metrics=metric,
        data_collator=default_data_collator,
    )

    metrics = trainer.evaluate()
    print(metrics)


if __name__ == "__main__":
    main()