# %% [markdown]
# ## Fine-tuning a pretrained model for Japanese sentiment analysis with Hugging Face
# A transcription of the code introduced in the article below:
# https://dev.classmethod.jp/articles/huggingface-jp-text-classification/

# %%
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import torch
import matplotlib.pyplot as plt
import numpy as np

# %%
print('gpu available:', torch.cuda.is_available())

# %% [markdown]
# ## Dataset

# %%
dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese")

# %%
# Work with the training split as a pandas DataFrame
dataset.set_format(type='pandas')
train_df = dataset['train'][:]

# %%
def label_int2str(x):
    """Convert an integer label to its string name."""
    return dataset["train"].features["label"].int2str(x)

train_df["label_name"] = train_df["label"].apply(label_int2str)

# %%
dataset.reset_format()

# %%
from transformers import AutoTokenizer

model_ckpt = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# %%
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

# %%
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

# %% [markdown]
# ## Model

# %%
import torch
from transformers import AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

num_labels = 3
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

# %%
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# %%
from transformers import TrainingArguments

batch_size = 16
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = "sample-text-classification-bert"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error"
)

# %%
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=tokenizer
)
print('start training...')
trainer.train()

# %%
# Attach label metadata to the model config
id2label = {}
for i in range(dataset["train"].features["label"].num_classes):
    id2label[i] = dataset["train"].features["label"].int2str(i)

label2id = {}
for i in range(dataset["train"].features["label"].num_classes):
    label2id[dataset["train"].features["label"].int2str(i)] = i

trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id

# %%
# Save the fine-tuned model
print('save model.')
trainer.save_model('sample-text-classification-bert')
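
# %% [markdown]
# ## Validation confusion matrix (supplementary)
# The cell below is a supplementary sketch, not part of the transcribed article. The unused
# `ConfusionMatrixDisplay` / `confusion_matrix` / `matplotlib` imports at the top suggest a
# plot like this was intended; this version uses `trainer.predict` on the validation split.

# %%
# Predict on the validation split and compare against the true labels
preds_output = trainer.predict(dataset_encoded["validation"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_valid = np.array(dataset_encoded["validation"]["label"])
label_names = [id2label[i] for i in range(len(id2label))]

# Row-normalize so each cell shows per-class recall
cm = confusion_matrix(y_valid, y_preds, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap="Blues", values_format=".2f")
plt.title("Normalized confusion matrix (validation)")
plt.show()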
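
# %% [markdown]
# ## Inference with the fine-tuned model (supplementary)
# A minimal usage sketch, also not part of the transcribed article: wrap the fine-tuned model
# in a `text-classification` pipeline and classify a sample sentence. The example sentence is
# an illustrative placeholder.

# %%
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
# Example: "This movie was really wonderful."
print(classifier("この映画は本当に素晴らしかった。"))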