A-Funakoshi committed on
Commit
5e1810a
1 Parent(s): aa4e7f4

Upload 8 files

ReadMe.md ADDED
@@ -0,0 +1,3 @@
+ The result of running the training procedure described at the site below:
+
+ - https://dev.classmethod.jp/articles/huggingface-jp-text-classification/#toc-17
SentimentAnalysis.py ADDED
@@ -0,0 +1,129 @@
+ # %% [markdown]
+ # ## Fine-tuning a pretrained model for Japanese sentiment analysis with Hugging Face
+ # Code transcribed from the article below:
+ # https://dev.classmethod.jp/articles/huggingface-jp-text-classification/
+
+ # %%
+ from datasets import load_dataset
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ from transformers import TrainingArguments
+ from transformers import Trainer
+ from sklearn.metrics import accuracy_score, f1_score
+ from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
+ import torch
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ # %%
+ print('gpu available:', torch.cuda.is_available())
+
+ # %% [markdown]
+ # ## Dataset
+
+ # %%
+ dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese")
+
+ # %%
+ # Work with the training split as a pandas DataFrame
+ dataset.set_format(type='pandas')
+ train_df = dataset['train'][:]
+
+ # %%
+ def label_int2str(x):
+     return dataset["train"].features["label"].int2str(x)
+
+ train_df["label_name"] = train_df["label"].apply(label_int2str)
+
+ # %%
+ dataset.reset_format()
+
+ # %%
+ from transformers import AutoTokenizer
+
+ model_ckpt = "cl-tohoku/bert-base-japanese-whole-word-masking"
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+
+ # %%
+ def tokenize(batch):
+     return tokenizer(batch["text"], padding=True, truncation=True)
+
+ # %%
+ dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
+
+ # %% [markdown]
+ # ## Model
+
+ # %%
+ import torch
+ from transformers import AutoModelForSequenceClassification
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(device)
+ num_labels = 3
+
+ model = (AutoModelForSequenceClassification
+          .from_pretrained(model_ckpt, num_labels=num_labels)
+          .to(device))
+
+ # %%
+ from sklearn.metrics import accuracy_score, f1_score
+
+ def compute_metrics(pred):
+     labels = pred.label_ids
+     preds = pred.predictions.argmax(-1)
+     f1 = f1_score(labels, preds, average="weighted")
+     acc = accuracy_score(labels, preds)
+     return {"accuracy": acc, "f1": f1}
+
+ # %%
+ from transformers import TrainingArguments
+
+ batch_size = 16
+ logging_steps = len(dataset_encoded["train"]) // batch_size
+ model_name = "sample-text-classification-bert"
+
+ training_args = TrainingArguments(
+     output_dir=model_name,
+     num_train_epochs=10,
+     learning_rate=2e-5,
+     per_device_train_batch_size=batch_size,
+     per_device_eval_batch_size=batch_size,
+     weight_decay=0.01,
+     evaluation_strategy="epoch",
+     disable_tqdm=False,
+     logging_steps=logging_steps,
+     push_to_hub=False,
+     log_level="error"
+ )
+
+ # %%
+ from transformers import Trainer
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     compute_metrics=compute_metrics,
+     train_dataset=dataset_encoded["train"],
+     eval_dataset=dataset_encoded["validation"],
+     tokenizer=tokenizer
+ )
+ print('start training..')
+ trainer.train()
+
+ # %%
+ # Attach label metadata to the model config
+ id2label = {}
+ for i in range(dataset["train"].features["label"].num_classes):
+     id2label[i] = dataset["train"].features["label"].int2str(i)
+
+ label2id = {}
+ for i in range(dataset["train"].features["label"].num_classes):
+     label2id[dataset["train"].features["label"].int2str(i)] = i
+
+ trainer.model.config.id2label = id2label
+ trainer.model.config.label2id = label2id
+
+ # %%
+ # Save the fine-tuned model
+ print('save model.')
+ trainer.save_model('sample-text-classification-bert')
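Not part of the uploaded files, but a minimal usage sketch: loading the directory written by trainer.save_model() above and running inference through the transformers pipeline API. The local directory name mirrors the save call; fugashi and ipadic (needed by the MeCab-based tokenizer) are assumed to be installed, and the example sentence is illustrative only.

# Sketch only: classify a sentence with the fine-tuned checkpoint saved above.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="sample-text-classification-bert",
    tokenizer="sample-text-classification-bert",
)

print(classifier("この映画は本当に素晴らしかった。"))
# Expected shape of the output: [{'label': 'positive', 'score': ...}]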
config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "positive",
+     "1": "neutral",
+     "2": "negative"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "negative": 2,
+     "neutral": 1,
+     "positive": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "tokenizer_class": "BertJapaneseTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.32.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
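The id2label / label2id tables above are what map the classifier's three output logits to sentiment names. A minimal sketch of that mapping, assuming the model has been saved locally under the directory used in SentimentAnalysis.py; the sentence and variable names are illustrative.

# Sketch: map logits to a label name via the id2label table from config.json.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_dir = "sample-text-classification-bert"  # assumed local save directory
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

inputs = tokenizer("サービスがとても良かった", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits       # shape (1, 3): positive / neutral / negative
pred_id = int(logits.argmax(dim=-1))      # 0, 1, or 2
print(model.config.id2label[pred_id])     # "positive", "neutral", or "negative"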
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efaeeb76dc6ec51e0d40e3fadf0538b5b83825be30faf9c74f8bc0b525c9a146
+ size 442545135
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
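These special tokens are resolved to fixed vocabulary ids and wrapped around every encoded example. A small sketch, assuming the same base checkpoint used in the training script:

# Sketch: the special tokens map onto fixed ids in vocab.txt.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
print(tok.cls_token, tok.cls_token_id)   # "[CLS]" and its vocabulary id
print(tok.pad_token, tok.pad_token_id)   # "[PAD]" is what tokenize() above pads with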
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "do_subword_tokenize": true,
+   "do_word_tokenize": true,
+   "jumanpp_kwargs": null,
+   "mask_token": "[MASK]",
+   "mecab_kwargs": null,
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "subword_tokenizer_type": "wordpiece",
+   "sudachi_kwargs": null,
+   "tokenizer_class": "BertJapaneseTokenizer",
+   "unk_token": "[UNK]",
+   "word_tokenizer_type": "mecab"
+ }
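This config selects MeCab word tokenization followed by WordPiece subword splitting. A hedged sketch of what that produces; the fugashi and ipadic packages are assumed for the MeCab backend and are not recorded in this commit:

# Sketch: MeCab word tokenization + WordPiece, as configured in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
print(tok.tokenize("自然言語処理は楽しい"))
# MeCab first splits the sentence into words, then WordPiece splits rare words
# into subwords (continuation pieces are prefixed with '##').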
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90919965fa8d550dd2517104570f80e2ff56984cbd40408d9c74c012ffed307d
+ size 4015
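training_args.bin is the TrainingArguments object pickled by the Trainer. A sketch of inspecting it, assuming the file has been downloaded locally, comes from a trusted source (torch.load unpickles arbitrary objects), and is read with a transformers version compatible with 4.32:

# Sketch: inspect the serialized TrainingArguments written alongside the model.
import torch

args = torch.load("training_args.bin")
print(args.num_train_epochs, args.learning_rate, args.per_device_train_batch_size)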
vocab.txt ADDED
The diff for this file is too large to render.