seriouspark committed on
Commit ba79e72 • Parent(s): 80ae5a7

put inference model

Files changed (1): app.py (+273 -7)
app.py CHANGED
@@ -1,15 +1,281 @@
- import gradio as gr
-
-
-
- def greet(name):
-     return "Hello" + name + '!!'
-
- title = "moogeul-moogeul"
-
+ import datetime
+ import numpy as np
+ import pandas as pd
+ import re
+ import json
+ import os
+ import glob
+
+ import torch
+ import torch.nn.functional as F
+ from torch.optim import Adam
+ from tqdm import tqdm  # plain tqdm; tqdm.notebook needs a Jupyter widget frontend
+ from torch import nn
+ from transformers import BertModel
+
+ from transformers import AutoTokenizer
+
+ import argparse
+
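+ # split_essay_to_sentence: naive splitter that breaks on newlines and periods
+ # and drops empty strings; e.g. '์ข‹์€ ๋‚ . ์Šฌํ”ˆ ๋‚ \n๊ธฐ์œ ๋‚ ' -> ['์ข‹์€ ๋‚ ', '์Šฌํ”ˆ ๋‚ ', '๊ธฐ์œ ๋‚ ']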
+ def split_essay_to_sentence(origin_essay):
+     origin_essay_sentence = sum([[a.strip() for a in i.split('.')] for i in origin_essay.split('\n')], [])
+     essay_sent = [a for a in origin_essay_sentence if len(a) > 0]
+     return essay_sent
+
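+ # get_first_extraction: asks the QA pipeline (question_answerer, defined below)
+ # 'what is the feeling?' for every sentence; each value is the pipeline's answer
+ # dict with 'answer', 'score', 'start', 'end'. Note: not called by all_process below.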
+ def get_first_extraction(text_sentence):
+     row_dict = {}
+     for row in tqdm(text_sentence):
+         question = 'what is the feeling?'
+         answer = question_answerer(question=question, context=row)
+         row_dict[row] = answer
+     return row_dict
+
+
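+ # get_sent_labeldata: reads the raw sentiment label CSV (cp949-encoded), keeps
+ # every 10th row, and builds the lookups; emo2idx keys are (id, label) pairs,
+ # so idx2emo maps a class index back to its Korean emotion name.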
+ def get_sent_labeldata():
+     label = pd.read_csv('./rawdata/sentimental_label.csv', encoding='cp949', header=None)
+     label[1] = label[1].apply(lambda x: re.findall(r'[๊ฐ€-ํžฃ]+', x)[0])
+     label_dict = label[label.index % 10 == 0].set_index(0).to_dict()[1]
+     emo2idx = {v: k for k, v in enumerate(label_dict.items())}
+     idx2emo = {v: k[1] for k, v in emo2idx.items()}
+     return emo2idx, idx2emo
+
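+ # load_model: multilingual BERT with a 6-unit linear head, one unit per emotion
+ # class; torch.load restores the whole pickled model, and the commented
+ # load_state_dict line is the weights-only alternative.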
+ def load_model():
+
+     class BertClassifier(nn.Module):
+
+         def __init__(self, dropout=0.3):
+             super(BertClassifier, self).__init__()
+             self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
+             self.dropout = nn.Dropout(dropout)
+             self.linear = nn.Linear(768, 6)
+             self.relu = nn.ReLU()
+
+         def forward(self, input_id, mask):
+             _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
+             dropout_output = self.dropout(pooled_output)
+             linear_output = self.linear(dropout_output)
+             final_layer = self.relu(linear_output)
+             return final_layer
+
+     tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     cls_model = BertClassifier()
+     criterion = nn.CrossEntropyLoss()
+     model_name = 'bert-base-multilingual-cased'
+     PATH = './model' + '/' + model_name + '_' + '2023102410'
+     print(PATH)
+     cls_model = torch.load(PATH)
+     # cls_model.load_state_dict(torch.load(PATH))
+     return tokenizer, cls_model
+
+
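+ # myDataset_for_infer: one tokenized sentence per item, padded/truncated to 128
+ # tokens; note it reads the module-level `tokenizer` bound in all_process().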
+ class myDataset_for_infer(torch.utils.data.Dataset):
+     def __init__(self, X):
+         self.X = X
+
+     def __len__(self):
+         return len(self.X)
+
+     def __getitem__(self, idx):
+         sentences = tokenizer(self.X[idx], return_tensors='pt', padding='max_length', max_length=128, truncation=True)
+         return sentences
+
+
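+ # infer_data: batches the sentences (batch_size 16), runs the classifier without
+ # gradients, and takes the per-sentence argmax over the six emotion logits;
+ # e.g. infer_data(cls_model, ['์˜ค๋Š˜์€ ๊ธฐ๋ปค๋‹ค']) returns a list of class indices 0..5.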
+ def infer_data(model, main_feeling_keyword):
+     # ds = myDataset_for_infer()
+     df_infer = myDataset_for_infer(main_feeling_keyword)
+
+     infer_dataloader = torch.utils.data.DataLoader(df_infer, batch_size=16)
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+     if device == 'cuda':
+         model = model.cuda()
+
+     result_list = []
+     with torch.no_grad():
+         for idx, infer_input in tqdm(enumerate(infer_dataloader)):
+             mask = infer_input['attention_mask'].to(device)
+             input_id = infer_input['input_ids'].squeeze(1).to(device)
+
+             output = model(input_id, mask)
+             prob = F.softmax(output, dim=1)  # softmax over the class dimension (was dim=0, the batch axis)
+             result = torch.argmax(prob, dim=1).cpu().numpy()
+             result_list.extend(result)
+     return result_list
+
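+ # get_word_emotion_pair: classifies each sentence, then uses KoNLPy's Okt tagger
+ # to pull nouns/adjectives/verbs longer than one character, and writes the
+ # per-sentence table to ./result/<timestamp>/essay_result.csv.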
+ def get_word_emotion_pair(cls_model, origin_essay_sentence):
+
+     from konlpy.tag import Okt
+
+     okt = Okt()
+     # example input: text = '๋‚˜๋Š” ์™œ ์—„๋งˆ๋งŒ ๋ฏธ์›Œํ–ˆ์„๊นŒ'
+     def get_noun(text):
+         noun_list = [k for k, v in okt.pos(text) if (v == 'Noun') and (len(k) > 1)]
+         return noun_list
+     def get_adj(text):
+         adj_list = [k for k, v in okt.pos(text) if (v == 'Adjective') and (len(k) > 1)]
+         return adj_list
+     def get_verb(text):
+         verb_list = [k for k, v in okt.pos(text) if (v == 'Verb') and (len(k) > 1)]
+         return verb_list
+
+     result_list = infer_data(cls_model, origin_essay_sentence)
+     final_result = pd.DataFrame(data={'text': origin_essay_sentence, 'label': result_list})
+     final_result['emotion'] = final_result['label'].map(idx2emo)
+     final_result['noun_list'] = final_result['text'].map(get_noun)
+     final_result['adj_list'] = final_result['text'].map(get_adj)
+     final_result['verb_list'] = final_result['text'].map(get_verb)
+     final_result['title'] = 'none'
+     file_made_dt = datetime.datetime.now()
+     file_made_dt_str = datetime.datetime.strftime(file_made_dt, '%Y%m%d_%H%M%S')  # was '%H%M%d': %d repeats the day
+     os.makedirs(f'./result/{file_made_dt_str}/', exist_ok=True)
+     final_result.to_csv(f"./result/{file_made_dt_str}/essay_result.csv", index=False)
+
+     return final_result, file_made_dt_str
+
+
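+ # get_essay_base_analysis: reloads the per-sentence CSV and builds
+ # (emotion, POS, word) and (POS, word) -> emotion count tables, plus a
+ # per-title emotion frequency summary.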
+ def get_essay_base_analysis(file_made_dt_str):
+     essay1 = pd.read_csv(f"./result/{file_made_dt_str}/essay_result.csv")  # was file_name_dt, undefined here
+     essay1['noun_list_len'] = essay1['noun_list'].apply(lambda x: len(x))
+     essay1['noun_list_uniqlen'] = essay1['noun_list'].apply(lambda x: len(set(x)))
+     essay1['adj_list_len'] = essay1['adj_list'].apply(lambda x: len(x))
+     essay1['adj_list_uniqlen'] = essay1['adj_list'].apply(lambda x: len(set(x)))
+     # the list columns were stringified by to_csv, so eval() turns them back into lists
+     essay1['vocab_all'] = essay1[['noun_list', 'adj_list']].apply(lambda x: sum((eval(x[0]), eval(x[1])), []), axis=1)
+     essay1['vocab_cnt'] = essay1['vocab_all'].apply(lambda x: len(x))
+     essay1['vocab_unique_cnt'] = essay1['vocab_all'].apply(lambda x: len(set(x)))
+     essay1['noun_list'] = essay1['noun_list'].apply(lambda x: eval(x))
+     essay1['adj_list'] = essay1['adj_list'].apply(lambda x: eval(x))
+     d = essay1.groupby('title')[['noun_list', 'adj_list']].sum(numeric_only=False).reset_index()  # keep list columns (was .sum([]))
+     d['noun_cnt'] = d['noun_list'].apply(lambda x: len(set(x)))
+     d['adj_cnt'] = d['adj_list'].apply(lambda x: len(set(x)))
+
+     # emotion counts across sentences, per title
+     essay_summary = essay1.groupby(['title'])['emotion'].value_counts().unstack(level=1)
+
+     emo_vocab_dict = {}
+     for k, v in essay1[['emotion', 'noun_list']].values:
+         for vocab in v:
+             if (k, 'noun', vocab) not in emo_vocab_dict:
+                 emo_vocab_dict[(k, 'noun', vocab)] = 0
+             emo_vocab_dict[(k, 'noun', vocab)] += 1
+
+     for k, v in essay1[['emotion', 'adj_list']].values:
+         for vocab in v:
+             if (k, 'adj', vocab) not in emo_vocab_dict:
+                 emo_vocab_dict[(k, 'adj', vocab)] = 0
+             emo_vocab_dict[(k, 'adj', vocab)] += 1
+
+     vocab_emo_cnt_dict = {}
+     for k, v in essay1[['emotion', 'noun_list']].values:
+         for vocab in v:
+             if ('noun', vocab) not in vocab_emo_cnt_dict:  # key order was reversed in the membership test
+                 vocab_emo_cnt_dict[('noun', vocab)] = {}
+             if k not in vocab_emo_cnt_dict[('noun', vocab)]:
+                 vocab_emo_cnt_dict[('noun', vocab)][k] = 0
+             vocab_emo_cnt_dict[('noun', vocab)][k] += 1
+
+     for k, v in essay1[['emotion', 'adj_list']].values:
+         for vocab in v:
+             if ('adj', vocab) not in vocab_emo_cnt_dict:
+                 vocab_emo_cnt_dict[('adj', vocab)] = {}
+             if k not in vocab_emo_cnt_dict[('adj', vocab)]:
+                 vocab_emo_cnt_dict[('adj', vocab)][k] = 0
+             vocab_emo_cnt_dict[('adj', vocab)][k] += 1
+
+     vocab_emo_cnt_df = pd.DataFrame(vocab_emo_cnt_dict).T
+     vocab_emo_cnt_df['total'] = vocab_emo_cnt_df.sum(axis=1)
+     # top emotion and emotion counts per word
+     all_result = vocab_emo_cnt_df.sort_values(by='total', ascending=False)
+
+     # per word, including adjectives
+     adj_result = vocab_emo_cnt_df.sort_values(by='total', ascending=False)
+
+     # nouns only
+     noun_result = vocab_emo_cnt_df[vocab_emo_cnt_df.index.get_level_values(0) == 'noun'].sort_values(by='total', ascending=False)
+
+     final_file_name = "essay_all_vocab_result.csv"
+     adj_file_name = "essay_adj_vocab_result.csv"
+     noun_file_name = "essay_noun_vocab_result.csv"
+
+     os.makedirs(f'./result/{file_made_dt_str}/', exist_ok=True)
+
+     all_result.to_csv(f"./result/{file_made_dt_str}/essay_all_vocab_result.csv", index=False)  # was final_result, undefined here
+     adj_result.to_csv(f"./result/{file_made_dt_str}/essay_adj_vocab_result.csv", index=False)
+     noun_result.to_csv(f"./result/{file_made_dt_str}/essay_noun_vocab_result.csv", index=False)
+
+     return all_result, adj_result, noun_result, essay_summary, file_made_dt_str
+
+
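+ # Multilingual QA pipeline used by the per-sentence 'what is the feeling?' pass;
+ # the 'finedtuned' spelling is part of the published model id.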
+ from transformers import pipeline
+ model_name = 'AlexKay/xlm-roberta-large-qa-multilingual-finedtuned-ru'
+ question_answerer = pipeline("question-answering", model=model_name)
+
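+ # Module-level duplicate of the classifier defined inside load_model(),
+ # presumably so torch.load() can resolve BertClassifier when unpickling the
+ # saved model.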
+ class BertClassifier(nn.Module):
+
+     def __init__(self, dropout=0.3):
+         super(BertClassifier, self).__init__()
+         self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
+         self.dropout = nn.Dropout(dropout)
+         self.linear = nn.Linear(768, 6)
+         self.relu = nn.ReLU()
+
+     def forward(self, input_id, mask):
+         _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
+         dropout_output = self.dropout(pooled_output)
+         linear_output = self.linear(dropout_output)
+         final_layer = self.relu(linear_output)
+         return final_layer
+
+
+ def all_process(origin_essay):
+     # tokenizer and idx2emo are read as module globals by the helpers above
+     global tokenizer, idx2emo
+     essay_sent = split_essay_to_sentence(origin_essay)
+     row_dict = {}
+     for row in tqdm(essay_sent):
+         question = 'what is the feeling?'
+         answer = question_answerer(question=question, context=row)
+         row_dict[row] = answer
+     emo2idx, idx2emo = get_sent_labeldata()
+     tokenizer, cls_model = load_model()
+     final_result, file_name_dt = get_word_emotion_pair(cls_model, essay_sent)
+     all_result, adj_result, noun_result, essay_summary, file_made_dt_str = get_essay_base_analysis(file_name_dt)
+
+     summary_result = pd.concat([adj_result, noun_result]).fillna(0).sort_values(by='total', ascending=False).reset_index()[:30]
+     with open(f'./result/{file_name_dt}/summary.json', 'w') as f:
+         json.dump(essay_summary.to_json(), f)
+     with open(f'./result/{file_made_dt_str}/all_result.json', 'w') as f:
+         json.dump(all_result.to_json(), f)
+     with open(f'./result/{file_made_dt_str}/adj_result.json', 'w') as f:
+         json.dump(adj_result.to_json(), f)
+     with open(f'./result/{file_made_dt_str}/noun_result.json', 'w') as f:
+         json.dump(noun_result.to_json(), f)
+     return essay_summary
+
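+ # Gradio wiring: all_process maps the raw essay text to essay_summary, a
+ # DataFrame of per-emotion sentence counts, rendered by the Dataframe output below.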
261
+ import gradio as gr
262
+ outputs = [gr.Dataframe(row_count = (6, "dynamic"),
263
+ col_count=(2, "dynamic"),
264
+ label="Essay Summary based on Words")
265
+ #headers=['type','word','์Šฌํ””', '๋ถ„๋…ธ', '๊ธฐ์จ', '๋ถˆ์•ˆ', '์ƒ์ฒ˜', '๋‹นํ™ฉ', 'total'])
266
+
267
+ ]
268
+
269
+
270
+ #row_count = (10, "dynamic"),
271
+ #col_count=(9, "dynamic"),
272
+ #label="Results",
273
+ #headers=['type','word','์Šฌํ””', '๋ถ„๋…ธ', '๊ธฐ์จ', '๋ถˆ์•ˆ', '์ƒ์ฒ˜', '๋‹นํ™ฉ', 'total'])
274
+ #]
275
+
  iface = gr.Interface(
-     fn=greet,
+     fn=all_process,
      inputs=gr.Textbox(lines=2, placeholder='๋‹น์‹ ์˜ ๊ธ€์„ ๋„ฃ์–ด๋ณด์„ธ์š”'),
-     outputs=gr.Textbox(lines=10, placeholder='๊ธ€ ์†์— ์žˆ๋Š” ๋ฌด์˜์‹์„ ์ฐพ์•„๋“œ๋ฆฝ๋‹ˆ๋‹ค.')
+     outputs=outputs,
  )
  iface.launch(share=True)