# NOTE(review): removed stray "Spaces: / Runtime error" banner lines that were
# pasted in from a hosting log — they were not valid Python and broke the file.
import math
import time

import evaluate
import numpy as np
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
from transformers import Trainer, TrainingArguments

from utils import (
    get_dataset,
    get_tok_and_model,
    get_open_prompt_data,
    get_dict_dataset,
    get_advance_dataset,
)
# --- Model / data setup ---------------------------------------------------
base_model = "distilgpt2"

# Load tokenizer + model from a local checkout (helper defined in utils.py).
tokenizer, model = get_tok_and_model(f"./models/{base_model}")
# GPT-2 family has no pad token; reuse EOS so batch padding works.
tokenizer.pad_token = tokenizer.eos_token

# ROUGE metric, consumed by compute_metrics during evaluation.
rouge = evaluate.load("rouge")

# Build the dataset via the dict-style loader and hold out 20% for eval.
# (An earlier open-prompt loading path was left here as commented-out dead
# code; it has been removed.)
dict_data = get_dict_dataset("./data")
dataset = get_advance_dataset(dict_data)
dataset = dataset.train_test_split(test_size=0.2)
def preprocess_function(examples):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    Tokenizes the prompt column ``x`` (truncated to 128 tokens) and sets
    ``labels`` to a copy of the input ids — the standard causal-LM setup,
    since the model shifts labels internally.

    NOTE(review): the original also tokenized the ``y`` column via
    ``tokenizer(text_target=...)`` but never used the result; that dead
    code is removed here.  If seq2seq-style targets were actually intended,
    assign that tokenization's ``input_ids`` to ``model_inputs["labels"]``
    instead.
    """
    model_inputs = tokenizer(examples["x"], max_length=128, truncation=True)
    # Causal LM: labels mirror the inputs.
    model_inputs["labels"] = model_inputs["input_ids"]
    return model_inputs
def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generation length for an eval batch."""
    preds, refs = eval_pred
    # -100 marks ignored label positions; swap in the pad id so they decode.
    refs = np.where(refs != -100, refs, tokenizer.pad_token_id)
    pred_text = tokenizer.batch_decode(preds, skip_special_tokens=True)
    ref_text = tokenizer.batch_decode(refs, skip_special_tokens=True)
    scores = rouge.compute(
        predictions=pred_text, references=ref_text, use_stemmer=True
    )
    # Average count of non-pad tokens per prediction.
    lengths = [np.count_nonzero(p != tokenizer.pad_token_id) for p in preds]
    scores["gen_len"] = np.mean(lengths)
    return {key: round(val, 4) for key, val in scores.items()}
# Pads both inputs and labels dynamically to the longest sequence per batch.
# (A DataCollatorForLanguageModeling(mlm=False) alternative was considered.)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

print("tokenize data...")
start = time.time()
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["x", "y"],
)
elapsed = time.time() - start
print(f"data tokenize done. process time : {elapsed}")
# Training configuration.  NOTE(review): fixed the typo in the original
# output path ("openprpmpt" -> "openprompt").
training_args = TrainingArguments(
    output_dir=f"./output/{base_model}_openprompt",
    evaluation_strategy="steps",
    eval_steps=20000,
    learning_rate=2e-5,
    lr_scheduler_type="constant",  # no LR decay over training
    report_to="tensorboard",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    adam_beta1=0.9,
    adam_beta2=0.98,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    num_train_epochs=100,
    fp16=True,  # mixed precision; requires a CUDA-capable GPU
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Report perplexity from the final eval loss.  The original imported `math`
# mid-script; it is now imported at the top of the file with the other
# stdlib imports.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")