# -*- coding: utf-8 -*-
"""kpmg (2).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1h7M0B8Uvu4c7u6iZK1VT-mAS4YydvyA3

# **Import Module**
"""

import pandas as pd
import numpy as np

!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch

!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
from tqdm import tqdm, tqdm_notebook


from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# Use the GPU
device = torch.device("cuda:0")

# Load the pretrained KoBERT model and vocabulary
bertmodel, vocab = get_pytorch_kobert_model()

import os

"""# **Load Data**"""

from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv(r'/content/drive/MyDrive/kpmg/concat.csv')

data

# Map the category labels to integer class ids (0 = neutral, 1 = e, 2 = s, 3 = g)
data.loc[(data['category'] == "쀑립"), 'category'] = 0
data.loc[(data['category'] == "e"), 'category'] = 1
data.loc[(data['category'] == "s"), 'category'] = 2
data.loc[(data['category'] == "g"), 'category'] = 3

# Collect (text, label) pairs for spot-checking a few samples below;
# the train/test split further down operates on the DataFrame itself.
data_list = [[q, str(label)] for q, label in zip(data['contents'], data['category'])]

print(data_list[0])
print(data_list[100])
print(data_list[250])
print(data_list[1000])
print(data_list[2500])
print(data_list[3300])

#train & test λ°μ΄ν„°λ‘œ λ‚˜λˆ„κΈ°
from sklearn.model_selection import train_test_split
                                                         
dataset_train, dataset_test = train_test_split(data, test_size=0.25, random_state=0)
print(len(dataset_train))
print(len(dataset_test))

class BERTDataset(Dataset):
    # Tokenizes each row of the DataFrame with BERTSentenceTransform and pairs
    # it with its integer label; sent_idx / label_idx are column positions in each row.
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([dataset.iloc[i][sent_idx]]) for i in range(len(dataset))]
        self.labels = [np.int32(dataset.iloc[i][label_idx]) for i in range(len(dataset))]

    def __getitem__(self, i):
        # (token_ids, valid_length, segment_ids, label)
        return self.sentences[i] + (self.labels[i],)

    def __len__(self):
        return len(self.labels)

# Training hyperparameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5, shuffle=True)
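
# Optional: peek at one preprocessed example. Each item is the
# (token_ids, valid_length, segment_ids) tuple produced by BERTSentenceTransform
# plus the integer label, so with max_len=64 both id arrays have length 64.
sample_ids, sample_len, sample_seg, sample_label = data_train[0]
print(sample_ids.shape, sample_len, sample_seg.shape, sample_label)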

"""# **KOBERT ν•™μŠ΅μ‹œν‚€κΈ°**"""

class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=4,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        # 1 for real tokens, 0 for padding beyond each sequence's valid length
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        # Pooled [CLS] representation from KoBERT
        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device),
                              return_dict=False)

        # Apply dropout only when a rate was given
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

# Instantiate the classifier and move it to the GPU
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Optimizer and schedule settings: no weight decay for bias / LayerNorm weights
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
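
# Tiny illustrative check of calc_accuracy: the argmax of each row matches the
# label, so this should print 1.0 (the logit values are made up for the example).
print(calc_accuracy(torch.tensor([[0.1, 0.9], [0.8, 0.2]]), torch.tensor([1, 0])))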

"""Train"""

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # Training
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        train_acc += calc_accuracy(out, label)
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # Evaluation (no gradients needed)
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

"""TEST"""

def softmax(vals, idx):
    # Returns the softmax probability of class `idx` as a percentage
    valscpu = vals.cpu().detach().squeeze(0)
    a = 0
    for i in valscpu:
        a += np.exp(i)
    return ((np.exp(valscpu[idx])) / a).item() * 100

def testModel(model, seq):
    cate = ["쀑립", "e", "s", "g"]  # index 0 is the neutral class label from the source data
    tmp = [seq]
    transform = nlp.data.BERTSentenceTransform(tok, max_len, pad=True, pair=False)
    tokenized = transform(tmp)

    model.eval()
    result = model(torch.tensor([tokenized[0]]).to(device),
                   [tokenized[1]],
                   torch.tensor([tokenized[2]]).to(device))
    idx = result.argmax().cpu().item()
    print("Predicted category:", cate[idx])
    print("Confidence:", "{:.2f}%".format(softmax(result, idx)))

testModel(model, "μ΄μ‚¬νšŒ κΈˆν˜Έμ„μœ ν™”ν•™μ€ 지속가λŠ₯ν•œ 기업을 λ§Œλ“€κΈ° μœ„ν•΄ κ±΄μ „ν•œ 지배ꡬ쑰λ₯Ό κ΅¬μΆ•ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. μ΄μ‚¬νšŒλŠ” μ΄ν•΄κ΄€κ³„μžμ˜ 이읡을 λŒ€λ³€ν•˜κ³ , κ²½μ˜μ§„μ— λŒ€ν•œ 감독 역할을 ν•˜λ©°, μž₯기적인 κ΄€μ μ˜ μ˜μ‚¬κ²°μ •μ„ ν•˜κΈ° μœ„ν•΄ λ…Έλ ₯ν•©λ‹ˆλ‹€.")

testModel(model, "κΈˆν˜Έμ„μœ ν™”ν•™μ€ μ‹œμž₯의 변화에 적절히 λŒ€μ‘ν•˜κ³  μΉœν™˜κ²½ 포트폴리였 μ „ν™˜μ„ μœ„ν•΄ κ³ λΆ€κ°€/μΉœν™˜κ²½ μ œν’ˆ 생산, μΉœν™˜κ²½ μžλ™μ°¨ κ΄€λ ¨ μ†”λ£¨μ…˜, λ°”μ΄μ˜€/μΉœν™˜κ²½μ†Œμž¬ 및 κ³ λΆ€κ°€ μŠ€νŽ˜μ…œν‹° μ œν’ˆ μ—°κ΅¬κ°œλ°œ 등을 κ³„νš μ€‘μž…λ‹ˆλ‹€.")

testModel(model, "λ‹Ήμ‚¬λŠ” κΈˆμœ΅μƒν’ˆκ³Ό κ΄€λ ¨ν•˜μ—¬ μ‹ μš©μœ„ν—˜, μœ λ™μ„±μœ„ν—˜ 및 μ‹œμž₯μœ„ν—˜μ— λ…ΈμΆœλ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€. λ³Έ 주석은 당사가 λ…ΈμΆœλ˜μ–΄ μžˆλŠ” μœ„μ˜ μœ„ν—˜μ— λŒ€ν•œ 정보와 λ‹Ήμ‚¬μ˜ μœ„ν—˜κ΄€λ¦¬ λͺ©ν‘œ,μ •μ±…, μœ„ν—˜ 평가 및 관리 절차, 그리고 μžλ³Έκ΄€λ¦¬μ— λŒ€ν•΄ κ³΅μ‹œν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. μΆ”κ°€μ μΈκ³„λŸ‰μ  정보에 λŒ€ν•΄μ„œλŠ” λ³Έ μž¬λ¬΄μ œν‘œ μ „λ°˜μ— κ±Έμ³μ„œ κ³΅μ‹œλ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€.")

testModel(model, "μ£Όκ΄€ν•˜λŠ” β€˜2021λ…„ μžλ°œμ μ—λ„ˆμ§€νš¨μœ¨λͺ©ν‘œμ œ μ‹œλ²”μ‚¬μ—…β€™ ν˜‘μ•½μ„ 톡해 μ—λ„ˆμ§€ μ›λ‹¨μœ„ λͺ©ν‘œ κ°œμ„ μ„ μœ„ν•΄ λ…Έλ ₯ν•˜κ³  있으며, μ§€μ—­μ‚¬νšŒ 및 μ—λ„ˆμ§€μ‹œλ―Όμ—°λŒ€μ—μ„œ μ£Όκ΄€ν•˜λŠ” ν™˜κ²½ κ΄€λ ¨ ν™œλ™μ— μ°Έμ—¬ν•˜λ©° κΈ°ν›„λ³€ν™” λŒ€μ‘ μ€‘μš”μ„±μ— λŒ€ν•œ 곡감과 μ†Œν†΅μ„ μ‹€μ²œν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. ")

testModel(model, "생물닀양성 μœ μ§€")

testModel(model, "생물닀양성 μœ μ§€ 및 지속가λŠ₯성을 μΆ”μ§„ν•˜λŠ” ꡭ제 λΉ„μ˜λ¦¬ ν™˜κ²½λ³΄ν˜Έλ‹¨μ²΄")

testModel(model, "μ•„μšΈλŸ¬ μ œν’ˆ 제쑰, 판맀 전단계에 μžˆμ–΄μ„œμ˜ νƒ„μ†Œλ°°μΆœμ ˆκ°μ„ μœ„ν•œ 곡급망 관리 체계λ₯Ό 보닀 κ°•ν™”ν•΄ λ‚˜μ•„κ°ˆ κ²ƒμž…λ‹ˆλ‹€.")

testModel(model, "κ°œλ°œμ—μ„œ μœ ν†΅κΉŒμ§€, μ›λ£ŒλΆ€ν„° μ œν’ˆκΉŒμ§€, λͺ¨λ“  단계λ₯Ό μ•„μš°λ₯΄λŠ” ν’ˆμ§ˆμ•ˆμ „μ˜ ν™•λ³΄λŠ” ν•„μˆ˜μ μž…λ‹ˆλ‹€.")

testModel(model, "λ‘―λ°μ œκ³ΌλŠ” λ™λ°˜μ„±μž₯아카데미λ₯Ό 온라인으둜 연쀑 μš΄μ˜ν•˜λ©° ν˜‘λ ₯μ—…μ²΄μ˜ μΈμ μžμ› κ°œλ°œμ„ μ§€μ›ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. ")

testModel(model, "")