|
|
|
|
|
|
|
import torch |
|
|
|
if torch.cuda.is_available():
    print('GPU is available')
else:
    raise RuntimeError('GPU is NOT available')
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
device |
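# Note: the Trainer moves the model and batches to the GPU on its own; this device object is only kept for reference.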
|
|
|
|
|
from datasets import load_dataset, DatasetDict |
|
from transformers import AutoTokenizer |
|
from transformers import AutoModelForSequenceClassification |
|
from transformers import TrainingArguments |
|
from transformers import Trainer |
|
from sklearn.metrics import accuracy_score, f1_score |
|
import numpy as np |
|
import pandas as pd |
|
import torch |
|
import random |
|
|
|
|
|
from transformers.trainer_utils import set_seed |
|
|
|
|
|
# Fix the Python, NumPy, and PyTorch random seeds so the run is reproducible.
set_seed(42)
|
|
|
|
|
|
|
|
|
|
|
from pprint import pprint |
|
from datasets import load_dataset |
|
|
|
|
|
|
|
# Load the Japanese train and validation splits of the multilingual sentiment dataset.
train_dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese", split="train")
valid_dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese", split="validation")
|
|
|
pprint(train_dataset) |
|
pprint(valid_dataset) |
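
# A quick look at one record to confirm the schema; each example should contain at least 'text' and 'label' fields.
pprint(train_dataset[0])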
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Tohoku University's Japanese BERT, pretrained with whole-word masking, and its matching tokenizer.
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_name)
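
# Quick sanity check (the example sentence is arbitrary): the tokenizer should split Japanese text into subword tokens.
print(tokenizer.tokenize("今日は良い天気です。"))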
|
|
|
|
|
|
|
def preprocess_text(batch):
    # truncation=True is required so that max_length=512 is actually enforced;
    # padding is left to the data collator, which pads per batch.
    encoded_batch = tokenizer(batch['text'], max_length=512, truncation=True)
    encoded_batch['labels'] = batch['label']
    return encoded_batch
|
|
|
|
|
|
|
encoded_train_dataset = train_dataset.map(
    preprocess_text,
    remove_columns=train_dataset.column_names,
)
encoded_valid_dataset = valid_dataset.map(
    preprocess_text,
    remove_columns=valid_dataset.column_names,
)
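
# map() runs preprocess_text over every example and drops the raw columns, so only the
# tokenizer outputs (input_ids, token_type_ids, attention_mask) and labels remain.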
|
|
|
|
|
|
|
from transformers import DataCollatorWithPadding |
|
|
|
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) |
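
# DataCollatorWithPadding pads each batch dynamically to the length of its longest sequence,
# which is why no padding was requested in preprocess_text above.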
|
|
|
|
|
|
|
|
|
|
|
# Optimizer under test; it is passed to TrainingArguments via optim= and used to tag the output directory.
OPTIMIZER_NAME = "adafactor"
|
|
|
|
|
def optuna_hp_space(trial):
    # Search space explored by Optuna: scheduler type, learning rate, gradient accumulation, and weight decay.
    return {
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["constant", "linear", "cosine"]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "gradient_accumulation_steps": trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4, 8, 16]),
        "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-1, log=True),
    }
|
|
|
|
|
|
|
from transformers import AutoModelForSequenceClassification |
|
|
|
def model_init(trial):
    # A fresh classification model is instantiated for every Optuna trial,
    # with label names taken straight from the dataset's ClassLabel feature.
    class_label = train_dataset.features["label"]
    label2id = {label: id for id, label in enumerate(class_label.names)}
    id2label = {id: label for id, label in enumerate(class_label.names)}
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=class_label.num_classes,
        label2id=label2id,
        id2label=id2label,
    )
    return model
|
|
|
|
|
|
|
from transformers import TrainingArguments |
|
|
|
training_args = TrainingArguments(
    optim=OPTIMIZER_NAME,
    output_dir="output_multilingual",
    per_device_train_batch_size=16,
    warmup_ratio=0.1,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
)
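
# learning_rate, lr_scheduler_type, gradient_accumulation_steps, and weight_decay are deliberately
# left unset here: hyperparameter_search overrides them per trial with the values from optuna_hp_space.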
|
|
|
|
|
|
|
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
|
|
|
|
|
|
|
|
|
|
|
from transformers import Trainer |
|
|
|
trainer = Trainer(
    model=None,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    model_init=model_init,
)
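
# model=None together with model_init is what hyperparameter_search expects:
# each trial re-instantiates the model instead of reusing weights from the previous trial.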
|
|
|
|
|
def compute_objective(metrics):
    # The hyperparameter search maximizes the weighted F1 score on the validation set.
    return metrics["eval_f1"]
|
|
|
|
|
|
|
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=50,
    compute_objective=compute_objective,
)
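
# best_trial is a BestRun holding the run id, the best objective value, and the winning hyperparameters.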
|
|
|
|
|
|
|
print('optimizer:', OPTIMIZER_NAME)
print('best trial:', best_trial)
|
|
|
|
|
|
|
|
|
|
|
from transformers import AutoModelForSequenceClassification |
|
|
|
# Rebuild a fresh model (same label mappings as before) for the final training run with the best hyperparameters.
class_label = train_dataset.features["label"]
label2id = {label: id for id, label in enumerate(class_label.names)}
id2label = {id: label for id, label in enumerate(class_label.names)}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=class_label.num_classes,
    label2id=label2id,
    id2label=id2label,
)
print(type(model).__name__)
|
|
|
|
|
|
|
from transformers import TrainingArguments |
|
|
|
|
|
best_lr_type = best_trial.hyperparameters['lr_scheduler_type'] |
|
best_lr = best_trial.hyperparameters['learning_rate'] |
|
best_grad_acc_steps = best_trial.hyperparameters['gradient_accumulation_steps'] |
|
best_weight_decay = best_trial.hyperparameters['weight_decay'] |
|
|
|
save_dir = f'bert-finetuned-multilingual-sentiments-{OPTIMIZER_NAME}' |
|
|
|
training_args = TrainingArguments(
    output_dir=save_dir,
    optim=OPTIMIZER_NAME,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=best_grad_acc_steps,
    learning_rate=best_lr,
    lr_scheduler_type=best_lr_type,
    weight_decay=best_weight_decay,
    warmup_ratio=0.1,
    num_train_epochs=100,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
)
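
# num_train_epochs is intentionally high; the EarlyStoppingCallback added below stops training
# once the validation accuracy (metric_for_best_model) has not improved for 3 consecutive evaluations.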
|
|
|
|
|
|
|
from transformers import Trainer |
|
from transformers import EarlyStoppingCallback |
|
|
|
trainer = Trainer(
    model=model,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
|
trainer.train() |
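
# Because load_best_model_at_end=True, the weights from the best-accuracy checkpoint are restored
# after training, so the save_model call below writes the best model rather than the last epoch.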
|
|
|
|
|
|
|
trainer.save_model(save_dir) |
|
tokenizer.save_pretrained(save_dir) |
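
# A minimal inference sketch with the saved model; the example sentence is arbitrary and
# device=0 assumes the GPU that was checked for at the top of the script.
from transformers import pipeline

classifier = pipeline("text-classification", model=save_dir, tokenizer=save_dir, device=0)
print(classifier("この映画は本当に素晴らしかった。"))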
|
|
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
def show_graph(df, suptitle, output='output.png'):
    suptitle_size = 23
    graph_title_size = 20
    legend_size = 18
    ticks_size = 13

    fig = plt.figure(figsize=(20, 5))
    plt.suptitle(suptitle, fontsize=suptitle_size)

    plt.subplot(131)
    plt.title('Train Loss', fontsize=graph_title_size)
    plt.plot(df['loss'].dropna(), label='train')
    plt.legend(fontsize=legend_size)
    plt.yticks(fontsize=ticks_size)

    plt.subplot(132)
    plt.title('Val Loss', fontsize=graph_title_size)
    plt.plot(df['eval_loss'].dropna().values, color='tab:orange', label='val')
    plt.legend(fontsize=legend_size)
    plt.yticks(fontsize=ticks_size)

    plt.subplot(133)
    plt.title('Eval Accuracy/F1', fontsize=graph_title_size)
    plt.plot(df['eval_accuracy'].dropna(), label='accuracy')
    plt.plot(df['eval_f1'].dropna(), label='F1')
    plt.legend(fontsize=legend_size)
    plt.yticks(fontsize=ticks_size)

    plt.tight_layout()
    plt.savefig(output)
|
|
|
|
|
# Persist the Trainer's log history and plot the loss / metric curves.
history_df = pd.DataFrame(trainer.state.log_history)
history_df.to_csv(f'{save_dir}/history.csv')
|
|
|
suptitle = f'batch:16, lr:{best_lr}, gradient_accumulation: {best_grad_acc_steps}, type:{best_lr_type}, weight_decay:{best_weight_decay}' |
|
show_graph(history_df, suptitle, f'{save_dir}/output.png') |
|
|