winninglife committed on
Commit c344d9f · 1 Parent(s): a028f63

initial commit

TODO ADDED
@@ -0,0 +1,15 @@
+ 1. Build 4 sentiment models
+ 2. Add the model-loading part (see the sketch after this section)
+ 3. Add the sentiment-classification part
+ 4. Show a facial expression
+ - generate the expression based on the percentage?
+ 5. Show a response
+ - generate the response based on the percentage?
+ 6. Add an API
+ - add CSV download?
+ 7. Add translated data
+ - add 4 more models
+
+ 8. Add a feature that feeds user input back as training data
+
+ extra. Let's try the second assignment.
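
For TODO items 2 and 3, here is a minimal sketch of what the loading and prediction wiring could look like, assuming checkpoints saved under the directory names used elsewhere in this commit. Only `bert_model_layered` and `bert_tokenizer_layered` exist yet; the other paths in `MODEL_DIRS` are hypothetical placeholders:

```python
import torch
import torch.nn.functional as F
from transformers import BertTokenizer
from mymodel import CustomBertForSequenceClassification

# Hypothetical mapping from UI keys to checkpoint directories;
# only "bert_layered" is actually present in this commit.
MODEL_DIRS = {
    "bert_layered": "bert_model_layered",
    # "bert": "bert_model",
    # "roberta_layered": "roberta_model_layered",
    # "roberta": "roberta_model",
}

def load_models(tokenizer_dir="bert_tokenizer_layered"):
    """Load the tokenizer once and every available checkpoint."""
    tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
    models = {key: CustomBertForSequenceClassification.from_pretrained(path).eval()
              for key, path in MODEL_DIRS.items()}
    return tokenizer, models

def predict(model, tokenizer, text):
    """Return (label_id, confidence) for one sentence."""
    inputs = tokenizer(text, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = F.softmax(logits, dim=-1)
    return probs.argmax(-1).item(), probs.max().item()
```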
__pycache__/mymodel.cpython-311.pyc ADDED
Binary file (19.5 kB).
 
bert_model_layered/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_name_or_path": "klue/bert-base",
+   "architectures": [
+     "CustomBertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
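
Since `architectures` points at the custom class, the checkpoint should be loaded through `CustomBertForSequenceClassification` (as `homework.py` does); loading it with the stock `BertForSequenceClassification` would drop the extra head weights with a warning. A quick sketch of inspecting and loading it, assuming the directory name used in this commit:

```python
from transformers import AutoConfig
from mymodel import CustomBertForSequenceClassification

config = AutoConfig.from_pretrained("bert_model_layered")
print(config.id2label)    # {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}
print(config.num_labels)  # 3

model = CustomBertForSequenceClassification.from_pretrained("bert_model_layered")
model.eval()
```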
bert_model_layered/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:237aa1d1bdf2ea320fc791c9aa2fcafc43e6e25c49e591717b2d365c3c2bb459
+ size 443688288
bert_model_layered/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e9d495db71dff2f0270e8ac6514421d7beccffabe349925d36d3ea841c00989
+ size 4536
bert_tokenizer_layered/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
bert_tokenizer_layered/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
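
The tokenizer is a stock `BertTokenizer` over the KLUE vocabulary (32,000 pieces, matching `vocab_size` in the model config), with [CLS]=2 and [SEP]=3 per `added_tokens_decoder`. A quick round-trip sketch, assuming the directory name used in `homework.py`:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert_tokenizer_layered")

enc = tokenizer("좋은 하루였어요")  # "It was a good day"
print(enc["input_ids"])  # begins with 2 ([CLS]) and ends with 3 ([SEP])
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
```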
bert_tokenizer_layered/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
bert_trainer_layered/bert_model_layered/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_name_or_path": "klue/bert-base",
+   "architectures": [
+     "CustomBertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
bert_trainer_layered/bert_model_layered/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:237aa1d1bdf2ea320fc791c9aa2fcafc43e6e25c49e591717b2d365c3c2bb459
+ size 443688288
bert_trainer_layered/bert_model_layered/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e9d495db71dff2f0270e8ac6514421d7beccffabe349925d36d3ea841c00989
+ size 4536
bert_trainer_layered/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_name_or_path": "klue/bert-base",
+   "architectures": [
+     "CustomBertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
bert_trainer_layered/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:237aa1d1bdf2ea320fc791c9aa2fcafc43e6e25c49e591717b2d365c3c2bb459
+ size 443688288
bert_trainer_layered/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e9d495db71dff2f0270e8ac6514421d7beccffabe349925d36d3ea841c00989
+ size 4536
homework.py ADDED
@@ -0,0 +1,115 @@
+ import streamlit as st
+ from pyparsing import empty  # used as a no-op placeholder for the spacer column
+ from mymodel import CustomBertForSequenceClassification
+ from transformers import BertTokenizer, Trainer
+
+ POSITIVE = 0
+ NEGATIVE = 1
+ NEUTRAL = 2
+
+ idx_target = {POSITIVE: 'positive', NEGATIVE: 'negative', NEUTRAL: 'neutral'}
+
+ g_selected_model_type = None
+ g_input_text = ' '
+
+ def get_model_type(select_model, add_layers):
+     if select_model == "BERT" and add_layers:
+         return "klue/bert-base", "bert_layered"
+     elif select_model == "BERT" and not add_layers:
+         return "klue/bert-base", "bert"
+     elif select_model == "RoBERTa" and add_layers:
+         return "klue/roberta-base", "roberta_layered"
+     elif select_model == "RoBERTa" and not add_layers:
+         return "klue/roberta-base", "roberta"
+
+ def show_predict_result(model_type, input_text):
+     target = POSITIVE  # placeholder until real prediction is wired in (TODO item 3)
+     show_response_img(target)
+     show_response_text(target)
+
+ def show_response_img(target=None):
+     if target is None:
+         st.con3.write("")
+     elif target == POSITIVE:
+         st.con3.write("smiling face")
+     elif target == NEGATIVE:
+         st.con3.write("crying face")
+     elif target == NEUTRAL:
+         st.con3.write("neutral face")
+
+ def show_response_text(target=None):
+     if target is None:
+         st.con4.write("")
+     elif target == POSITIVE:
+         st.con4.write("positive response")
+     elif target == NEGATIVE:
+         st.con4.write("negative response")
+     elif target == NEUTRAL:
+         st.con4.write("neutral response")
+
+
+ def show_data():
+     st.write("data")
+
+ def show_api_usage():
+     with st.container(border=True):
+         st.write("API usage")
+
+ def tab1_page():
+     global g_selected_model_type
+     global g_input_text
+     st.con1, st.con2 = st.columns([0.3, 0.7])
+     st.con3, st.con4 = st.columns([0.3, 0.7])
+     st.con5, empty1 = st.columns([0.9999, 0.0001])
+
+     with st.container():
+         with st.con1:
+             with st.con1.container(border=True):
+                 selected_model = st.selectbox("Select a prediction model.", ["BERT", "RoBERTa"])
+                 add_layers = st.checkbox('Add layers')
+                 g_selected_model_type = get_model_type(selected_model, add_layers)
+                 show_predict_result(g_selected_model_type, g_input_text)
+         with st.con2:
+             with st.con2.container(border=True):
+                 input_text = st.text_area("input_text")
+                 submit_button = st.button('Submit')
+                 if submit_button:
+                     g_input_text = input_text
+                     show_predict_result(g_selected_model_type, g_input_text)
+         with st.con3:
+             with st.con3.container(border=True):
+                 show_response_img()
+         with st.con4:
+             with st.con4.container(border=True):
+                 show_response_text()
+         with st.con5:
+             with st.con5.container(border=True):
+                 show_data()
+         with empty1:
+             empty()
+
+
+ def tab2_page():
+     show_api_usage()
+
+
+ #tokenizer = BertTokenizer()
+ #model = CustomBertForSequenceClassification()
+ #trainer = Trainer()
+ tokenizer = BertTokenizer.from_pretrained("bert_tokenizer_layered")
+ model = CustomBertForSequenceClassification.from_pretrained("bert_model_layered")
+ trainer = Trainer(model=model)  # Trainer has no load_model(); wrap the already-loaded model
+
+
+
+ st.title("Semi Project - Sentiment analysis")
+ #st.subheader("Type an everyday sentence and the app shows the feeling it conveys.")
+
+
+ tab1, tab2 = st.tabs(['Demo', 'API'])
+
+ with tab1:
+     tab1_page()
+
+ with tab2:
+     tab2_page()
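
`show_predict_result` above is still a stub that always reports POSITIVE. One way it could be completed using the `tokenizer` and `model` loaded at the bottom of the file — a sketch that ignores `model_type` and always uses the single layered-BERT checkpoint available in this commit:

```python
import torch
import torch.nn.functional as F

def show_predict_result(model_type, input_text):
    # Tokenize, run the classifier, and display the argmax class.
    inputs = tokenizer(input_text, truncation=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    target = int(F.softmax(logits, dim=-1).argmax(-1))
    show_response_img(target)
    show_response_text(target)
```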
mymodel.py ADDED
@@ -0,0 +1,346 @@
+ import time
+ from typing import Optional, Union, Tuple
+
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from torch.nn import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss
+ from transformers import BertTokenizer, BertModel
+ from transformers import BertForSequenceClassification, Trainer, TrainingArguments
+ from transformers.modeling_outputs import SequenceClassifierOutput
+ from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+
+ NUM_CLASSES = 3  # number of target classes
+ DROP_OUT = 0.3   # desired dropout probability
+
+ class SentimentDataset(torch.utils.data.Dataset):
+     def __init__(self, encodings, labels=None):
+         self.encodings = encodings
+         self.labels = labels
+
+     def __getitem__(self, idx):
+         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+         if self.labels:
+             item['labels'] = torch.tensor(self.labels[idx])
+         return item
+
+     def __len__(self):
+         return len(self.encodings["input_ids"])
+
+ class CustomBertForSequenceClassification(BertForSequenceClassification):
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.config = config
+
+         self.bert = BertModel(config)
+         classifier_dropout = (
+             config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+         )
+         self.dropout = nn.Dropout(classifier_dropout)
+
+         # Replaced by the two-layer head below.
+         #self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+         # https://github.com/KisuYang/EmotionX-KU/blob/master/models.py
+         self.linear_h = nn.Linear(config.hidden_size, 384)
+         self.linear_o = nn.Linear(384, config.num_labels)
+         self.selu = nn.SELU()
+
+         print("hidden_size:", config.hidden_size, "num_labels:", config.num_labels)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         token_type_ids: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         head_mask: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         # outputs[0]: batch_size(16), sequence_length(38), hidden_size(768)
+         # outputs[1]: batch_size(16), hidden_size(768)
+
+         # Take the pooled output from the BertModel outputs.
+         pooled_output = outputs[1]
+
+         # Before dropout, add a hidden layer in the style of https://github.com/KisuYang/EmotionX-KU/blob/master/models.py.
+         pooled_output = self.selu(self.linear_h(pooled_output))
+
+         # Apply dropout.
+         pooled_output = self.dropout(pooled_output)
+
+         # Pass through the output linear layer to produce num_labels logits.
+         #logits = self.classifier(pooled_output)
+         logits = self.linear_o(pooled_output)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 if self.num_labels == 1:
+                     loss = loss_fct(logits.squeeze(), labels.squeeze())
+                 else:
+                     loss = loss_fct(logits, labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+ def train_model(model_name, X_train, X_test, y_train, y_test, epochs=2, train_batch_size=8, eval_batch_size=16, use_emotion_x=False):
+
+     tokenizer = BertTokenizer.from_pretrained(model_name)
+
+     train_encodings = tokenizer(X_train, truncation=True, padding=True)
+     train_dataset = SentimentDataset(train_encodings, y_train)
+
+     test_encodings = tokenizer(X_test, truncation=True, padding=True)
+     test_dataset = SentimentDataset(test_encodings, y_test)
+
+     print(train_dataset[1]['input_ids'].shape)
+     print(train_dataset[1]['attention_mask'].shape)
+
+     training_args = TrainingArguments(
+         output_dir='./results',                        # directory for checkpoints and outputs
+         num_train_epochs=epochs,                       # total number of training epochs
+         per_device_train_batch_size=train_batch_size,  # batch size per device during training
+         per_device_eval_batch_size=eval_batch_size,    # batch size per device during evaluation
+         warmup_steps=500,                              # number of warmup steps for learning rate scheduler
+         weight_decay=0.01,                             # weight decay strength
+         logging_dir='./logs',                          # directory for logs
+         logging_steps=10,
+         do_eval=True
+     )
+
+     if use_emotion_x:
+         model = CustomBertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_CLASSES).to('cuda')
+     else:
+         model = BertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_CLASSES).to('cuda')
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=test_dataset
+     )
+
+     s = time.time()
+
+     trainer.train()
+
+     trainer.evaluate(test_dataset)
+
+     prediction = trainer.predict(test_dataset)
+
+     y_logit = torch.tensor(prediction[0])
+
+     y_pred = F.softmax(y_logit, dim=-1).argmax(axis=1).numpy()
+
+     print(classification_report(y_test, y_pred))
+     print(confusion_matrix(y_test, y_pred))
+     print(accuracy_score(y_test, y_pred))
+
+     return trainer, tokenizer
+
+
+ def test_trainer(trainer, tokenizer):
+     POSITIVE = 0
+     NEGATIVE = 1
+     NEUTRAL = 2
+
+     idx_to_label = {POSITIVE: 'positive', NEGATIVE: 'negative', NEUTRAL: 'neutral'}
+
+
+     test_dict = {
+         "Today is seriously annoying": NEGATIVE,
+         "My bank account is completely empty": NEGATIVE,
+         "The economy is getting a bit better, which is nice": POSITIVE,
+         "Relations between the countries keep getting worse": NEGATIVE,
+         "Korea and Japan are not on good terms.": NEGATIVE,
+         "Failure is the mother of success.": POSITIVE,
+         "The warm weather puts my mind at ease.": POSITIVE,
+         "My wallet is on the verge of bankruptcy": NEGATIVE,
+         "Don't worry too much, cheer up!": POSITIVE,
+         "Seriously! Stop being annoying and go away!": NEGATIVE,
+         "Life is exhausting.": NEGATIVE,
+         "Thank you for the kind words.": POSITIVE,
+         "What a pathetic bunch of fools": NEGATIVE,
+         "Those words really hurt me": NEGATIVE,
+         "Don't cry, cheer up": POSITIVE,
+         "The tears just won't stop": NEGATIVE,
+         "The new boss is forward-thinking, so I have high hopes": POSITIVE,
+         "I have a mountain of work to do today": NEUTRAL,
+         "There's too much to do, but I'm grinding through it": NEUTRAL,
+         "I'm hungry": NEUTRAL,
+         "I want to go home": NEUTRAL,
+         "Would you like a cup of cocoa?": NEUTRAL,
+         "Please replace my computer.": NEUTRAL,
+         "Do you want to get hit?": NEGATIVE,
+         "Thinking about the exciting trip puts me in a good mood": POSITIVE,
+         "I'm hungry but there's no food.": NEGATIVE,
+         "The national economy is collapsing.": NEGATIVE,
+         "You're making my life so hard": NEGATIVE,
+         "Still, I'm glad I have you": POSITIVE,
+         "Thank you for working hard despite the gloomy economy": POSITIVE,
+         "I feel awesome today": POSITIVE,
+         "What on earth can you actually do?": NEGATIVE,
+         "This homework is so hard it's driving me crazy": NEGATIVE,
+         "I'm proud of our team members for working so hard": POSITIVE,
+         "Wow! The movie was really fun": POSITIVE,
+         "I'm so worn out I could die :(": NEGATIVE,
+         "This travel itinerary is truly fantastic": POSITIVE,
+         "It's a frustrating situation, but you can overcome it": POSITIVE,
+         "It's a frustrating situation, but you'll do just fine": POSITIVE,
+         "Having you by my side always gives me strength.": POSITIVE,
+         "I'm in so much pain I can't focus on work": NEGATIVE,
+         "You're really good at this, respect!": POSITIVE,
+         "I'm sad, but it's okay": POSITIVE,
+         "I'm seriously pissed off": NEGATIVE,
+         "It rained so much the house washed away": NEGATIVE,
+         "The sun is blazing, so the laundry dries well": POSITIVE,
+         "Studying AI is hard but fun": POSITIVE,
+         "What am I going to do with you? All I can do is sigh": NEGATIVE,
+         "What on earth were you thinking when you did this?": NEGATIVE,
+         "Even if I hate you, once more": POSITIVE,
+         "Thank you for the helpful advice": POSITIVE,
+         "Stop talking nonsense and go away": NEGATIVE,
+         "Great vibe at the coffee chat today": POSITIVE,
+         "I'm upset and don't want to talk to you": NEGATIVE,
+         "I really love this picture": POSITIVE,
+         "I'm so dumbfounded I have nothing to say": NEGATIVE,
+         "A coworker said goodbye before leaving the company, and it feels bittersweet": NEGATIVE,
+         "A teammate asked me to review an idea and it seems really good. Thanks for asking my opinion": POSITIVE,
+         "I'm lucky to work with such good-natured teammates": POSITIVE,
+         "My mood lifts every Friday": POSITIVE,
+         "It's already Sunday; thinking about going to work brings me right down.": NEGATIVE,
+         "I'm annoyed, so don't talk to me!": NEGATIVE,
+         "I'm so bored.": NEUTRAL,
+         "Talking with smart people is a pleasure": POSITIVE,
+         "You always have a smile on your face, so meeting you lifts my mood": POSITIVE,
+         "The speech is so dull it makes me yawn": NEGATIVE,
+         "I'm excited thinking about going to a tasty restaurant": POSITIVE,
+         "It's an honor to attend such an excellent lecture.": POSITIVE,
+         "It's a pleasure to meet you.": POSITIVE,
+         "That person annoys me every time we meet; I hate seeing them": NEGATIVE,
+         "It's lovely to watch the children running around full of energy": POSITIVE,
+         "Can't you stop saying such pathetic things?": NEGATIVE,
+         "Don't make me laugh!": NEGATIVE,
+         "Phew! That took ten years off my life!": NEUTRAL,
+         "You're talking utter nonsense, damn it!": NEGATIVE,
+         "Curses come out of my mouth automatically...": NEGATIVE,
+         "Every time you open your mouth, lies come out automatically!": NEGATIVE,
+         "Isn't that guy an idiot?": NEGATIVE,
+         "Thanks for staying by my side when things were hard": POSITIVE,
+         "Your IQ seems to be below the decimal point": NEGATIVE,
+         "What a halfwit": NEGATIVE,
+         "What a hopeless loser": NEGATIVE,
+         "That person is going to be the death of me": NEGATIVE,
+         "Kill that bastard": NEGATIVE,
+         "You're truly like an angel": POSITIVE,
+         "I like you; please stay by my side always": POSITIVE,
+         "I can't stand the sight of you, get lost": NEGATIVE,
+         "Ugh, I'm seriously going to lose my mind": NEGATIVE,
+         "Disgusting people": NEGATIVE,
+         "Seeing such a beauty feels like a cleanse for the eyes": POSITIVE,
+         "Ugh, damn, my eyes are rotting": NEGATIVE,
+         "Stop acting up, do you want to die?": NEGATIVE,
+         "You're welcome anytime": POSITIVE,
+         "I seriously want to beat him up": NEGATIVE,
+         "Seeing the baby always makes me smile": POSITIVE,
+         "The way he acts, he seems like a moron": NEGATIVE,
+         "Ching chang chong": NEGATIVE,
+         "What the f***": NEGATIVE,
+         "You numbskull": NEGATIVE,
+         "You blockheaded fool": NEGATIVE,
+         "I pity the mother who raised a kid like you": NEGATIVE,
+         "It's a beautiful baby girl, congratulations": POSITIVE,
+         "It's a strapping baby boy. You must be delighted.": POSITIVE,
+         "Hooray, how wonderful": POSITIVE,
+         "It feels like the sky is falling": NEGATIVE,
+         "I feel like I'm flying": POSITIVE,
+         "Let's go, you can do it!": POSITIVE,
+         "Son of a bitch": NEGATIVE,
+         "Very nice": POSITIVE,
+         "Hopeless people": NEGATIVE,
+         "This place really seems like a bunch of morons": NEGATIVE,
+         "Nice to meet you. You're truly beautiful": POSITIVE,
+         "I miss you. I long to see you.": POSITIVE,
+         "Just looking at you makes me smile": POSITIVE,
+         "I'm so damn pissed": NEGATIVE,
+         "For real, amazing lol": POSITIVE,
+         "I missed you so much. So glad we could meet like this.": POSITIVE,
+         "I love you, my friend": POSITIVE,
+         "You stupid fool": NEGATIVE,
+         "The weather is lovely today. I feel refreshed.": POSITIVE,
+         "I'm so upset I can't eat.": NEGATIVE,
+         "Feeling gloomy, I set out for a walk": NEGATIVE,
+         "Today is the best day of my life": POSITIVE,
+         "It's an honor to take part in this great work.": POSITIVE,
+         "I'm sleeping at home": NEUTRAL,
+     }
+
+     hit_cnt = 0
+     tot_cnt = len(test_dict)
+
+     for x, y in test_dict.items():
+         tokenized = tokenizer([x], truncation=True, padding=True)
+         pred = trainer.predict(SentimentDataset(tokenized))
+
+         logit = torch.tensor(pred[0])
+         result = F.softmax(logit, dim=-1).argmax(1).numpy()
+
+         if result[0] != y:
+             print(f"ERROR: {x} expected:{idx_to_label[y]} result:{idx_to_label[result[0]]}")
+         else:
+             hit_cnt += 1
+
+     print()
+     print(f"hit/total: {hit_cnt}/{tot_cnt}, rate: {hit_cnt/tot_cnt}")