# -*- coding: utf-8 -*-
"""kpmg (2).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1h7M0B8Uvu4c7u6iZK1VT-mAS4YydvyA3
# **Import Module**
"""
import pandas as pd
import numpy as np
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
# Use the GPU when available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load the pretrained KoBERT model and vocabulary
bertmodel, vocab = get_pytorch_kobert_model()
import os
"""# **Load Data**"""
from google.colab import drive
drive.mount('/content/drive')
data = pd.read_csv(r'/content/drive/MyDrive/kpmg/concat.csv')
data
# Encode the four categories as integer labels: 중립 (neutral) -> 0, e -> 1, s -> 2, g -> 3
data.loc[(data['category'] == "중립"), 'category'] = 0
data.loc[(data['category'] == "e"), 'category'] = 1
data.loc[(data['category'] == "s"), 'category'] = 2
data.loc[(data['category'] == "g"), 'category'] = 3
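# A quick sanity check of the encoding (a sketch; assumes the 'category' column now holds
# only the integers 0-3): the class counts show how balanced the four categories are.
print(data['category'].value_counts())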
# Pair each document with its (stringified) label; used only for the spot checks below.
data_list = []
for q, label in zip(data['contents'], data['category']):
    data1 = []
    data1.append(q)
    data1.append(str(label))
    data_list.append(data1)
print(data_list[0])
print(data_list[100])
print(data_list[250])
print(data_list[1000])
print(data_list[2500])
print(data_list[3300])
# Split the data into train & test sets
from sklearn.model_selection import train_test_split
dataset_train, dataset_test = train_test_split(data, test_size=0.25, random_state=0)
print(len(dataset_train))
print(len(dataset_test))
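# Optional variant (not in the original notebook): stratifying on the label keeps the four
# category proportions similar across the train and test splits.
# dataset_train, dataset_test = train_test_split(
#     data, test_size=0.25, random_state=0, stratify=data['category'])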
class BERTDataset(Dataset):
    """Wraps a DataFrame so each item is (token_ids, valid_length, segment_ids, label)."""
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = [transform([dataset.iloc[i][sent_idx]]) for i in range(len(dataset))]
        self.labels = [np.int32(dataset.iloc[i][label_idx]) for i in range(len(dataset))]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return len(self.labels)
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
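# A quick look at a single encoded example (a sketch): BERTSentenceTransform yields
# (token_ids, valid_length, segment_ids), and BERTDataset appends the integer label.
example_ids, example_len, example_seg, example_label = data_train[0]
print(example_ids.shape, example_len, example_seg.shape, example_label)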
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5, shuffle=True)
"""# **KOBERT νμ΅μν€κΈ°**"""
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=4,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        # 1 for real tokens, 0 for padding, based on each sequence's unpadded length
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device),
                              return_dict=False)
        # Apply dropout only when a rate was given, then classify the pooled [CLS] representation
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)
# Build the classifier on top of the pretrained KoBERT model
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
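# A single-batch smoke test of the untrained classifier (a sketch, reusing the dataloader above):
# the logits should come back with shape (batch_size, 4), one column per category.
smoke_ids, smoke_len, smoke_seg, smoke_label = next(iter(train_dataloader))
with torch.no_grad():
    smoke_out = model(smoke_ids.long().to(device), smoke_len, smoke_seg.long().to(device))
print(smoke_out.shape)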
# Set up the optimizer and learning-rate schedule
# Exclude biases and LayerNorm weights from weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
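# The cosine schedule warms the learning rate up over the first 10% of updates (warmup_ratio)
# and then decays it toward zero; the exact step counts depend on the dataset and batch size.
print("total training steps:", t_total, "| warmup steps:", warmup_step)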
def calc_accuracy(X, Y):
    # Fraction of rows whose argmax over the class logits matches the label
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc
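# A minimal, self-contained check of calc_accuracy on hypothetical logits (not model output):
# two of the three argmax predictions match the labels, so the result should be about 0.667.
_logits = torch.tensor([[2.0, 0.1, 0.1, 0.1],
                        [0.1, 2.0, 0.1, 0.1],
                        [0.1, 0.1, 0.1, 2.0]])
_labels = torch.tensor([0, 1, 2])
print(calc_accuracy(_logits, _labels))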
"""Train"""
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    # Training pass
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        train_acc += calc_accuracy(out, label)
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    # Evaluation pass
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
"""TEST"""
def softmax(vals, idx):
    # Softmax probability of class `idx`, returned as a percentage
    valscpu = vals.cpu().detach().squeeze(0)
    a = 0
    for i in valscpu:
        a += np.exp(i)
    return ((np.exp(valscpu[idx])) / a).item() * 100
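# An equivalent, more idiomatic sketch using the already-imported torch.nn.functional:
# a single F.softmax call over the class dimension gives the same percentage.
def softmax_torch(vals, idx):
    probs = F.softmax(vals.detach().squeeze(0), dim=-1)
    return probs[idx].item() * 100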
def testModel(model, seq):
    # Tokenize a single document, run the classifier, and report the predicted category
    cate = ["중립", "e", "s", "g"]
    tmp = [seq]
    transform = nlp.data.BERTSentenceTransform(tok, max_len, pad=True, pair=False)
    tokenized = transform(tmp)
    model.eval()
    result = model(torch.tensor([tokenized[0]]).to(device), [tokenized[1]], torch.tensor(tokenized[2]).to(device))
    idx = result.argmax().cpu().item()
    print("Predicted category of the report:", cate[idx])
    print("Confidence:", "{:.2f}%".format(softmax(result, idx)))
testModel(model, "μ΄μ¬ν κΈνΈμμ ννμ μ§μκ°λ₯ν κΈ°μ
μ λ§λ€κΈ° μν΄ κ±΄μ ν μ§λ°°κ΅¬μ‘°λ₯Ό ꡬμΆνκ³ μμ΅λλ€. μ΄μ¬νλ μ΄ν΄κ΄κ³μμ μ΄μ΅μ λλ³νκ³ , κ²½μμ§μ λν κ°λ
μν μ νλ©°, μ₯κΈ°μ μΈ κ΄μ μ μμ¬κ²°μ μ νκΈ° μν΄ λ
Έλ ₯ν©λλ€.")
testModel(model, "κΈνΈμμ ννμ μμ₯μ λ³νμ μ μ ν λμνκ³ μΉνκ²½ ν¬νΈν΄λ¦¬μ€ μ νμ μν΄ κ³ λΆκ°/μΉνκ²½ μ ν μμ°, μΉνκ²½ μλμ°¨ κ΄λ ¨ μ루μ
, λ°μ΄μ€/μΉνκ²½μμ¬ λ° κ³ λΆκ° μ€νμ
ν° μ ν μ°κ΅¬κ°λ° λ±μ κ³ν μ€μ
λλ€.")
testModel(model, "λΉμ¬λ κΈμ΅μνκ³Ό κ΄λ ¨νμ¬ μ μ©μν, μ λμ±μν λ° μμ₯μνμ λ
ΈμΆλμ΄ μμ΅λλ€. λ³Έ μ£Όμμ λΉμ¬κ° λ
ΈμΆλμ΄ μλ μμ μνμ λν μ 보μ λΉμ¬μ μνκ΄λ¦¬ λͺ©ν,μ μ±
, μν νκ° λ° κ΄λ¦¬ μ μ°¨, κ·Έλ¦¬κ³ μλ³Έκ΄λ¦¬μ λν΄ κ³΅μνκ³ μμ΅λλ€. μΆκ°μ μΈκ³λμ μ 보μ λν΄μλ λ³Έ μ¬λ¬΄μ ν μ λ°μ κ±Έμ³μ 곡μλμ΄ μμ΅λλ€.")
testModel(model, "μ£Όκ΄νλ β2021λ
μλ°μ μλμ§ν¨μ¨λͺ©νμ μλ²μ¬μ
β νμ½μ ν΅ν΄ μλμ§ μλ¨μ λͺ©ν κ°μ μ μν΄ λ
Έλ ₯νκ³ μμΌλ©°, μ§μμ¬ν λ° μλμ§μλ―Όμ°λμμ μ£Όκ΄νλ νκ²½ κ΄λ ¨ νλμ μ°Έμ¬νλ©° κΈ°νλ³ν λμ μ€μμ±μ λν 곡κ°κ³Ό μν΅μ μ€μ²νκ³ μμ΅λλ€. ")
testModel(model, "μλ¬Όλ€μμ± μ μ§")
testModel(model, "μλ¬Όλ€μμ± μ μ§ λ° μ§μκ°λ₯μ±μ μΆμ§νλ κ΅μ λΉμ리 ν경보νΈλ¨μ²΄")
testModel(model, "μμΈλ¬ μ ν μ μ‘°, ν맀 μ λ¨κ³μ μμ΄μμ νμλ°°μΆμ κ°μ μν 곡κΈλ§ κ΄λ¦¬ 체κ³λ₯Ό λ³΄λ€ κ°νν΄ λμκ° κ²μ
λλ€.")
testModel(model, "κ°λ°μμ μ ν΅κΉμ§, μλ£λΆν° μ νκΉμ§, λͺ¨λ λ¨κ³λ₯Ό μμ°λ₯΄λ νμ§μμ μ ν보λ νμμ μ
λλ€.")
testModel(model, "λ‘―λ°μ κ³Όλ λλ°μ±μ₯μμΉ΄λ°λ―Έλ₯Ό μ¨λΌμΈμΌλ‘ μ°μ€ μ΄μνλ©° νλ ₯μ
체μ μΈμ μμ κ°λ°μ μ§μνκ³ μμ΅λλ€. ")
testModel(model, "") |