seriouspark committed on
Commit
3cde0b0
•
1 Parent(s): 1c0ad46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -64
app.py CHANGED
@@ -13,7 +13,7 @@ from tqdm import tqdm
13
  from torch import nn
14
  from transformers import BertModel
15
 
16
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
17
 
18
  import argparse
19
 
@@ -39,37 +39,37 @@ def get_sent_labeldata():
39
  idx2emo = {v : k[1] for k, v in emo2idx.items()}
40
  return emo2idx, idx2emo
41
 
42
- def load_model():
43
 
44
- class BertClassifier(nn.Module):
45
 
46
- def __init__(self, dropout = 0.3):
47
- super(BertClassifier, self).__init__()
48
 
49
- self.bert= BertModel.from_pretrained('bert-base-multilingual-cased')
50
- self.dropout = nn.Dropout(dropout)
51
- self.linear = nn.Linear(768, 6)
52
- self.relu = nn.ReLU()
53
 
54
- def forward(self, input_id, mask):
55
- _, pooled_output = self.bert(input_ids = input_id, attention_mask = mask, return_dict = False)
56
- dropout_output = self.dropout(pooled_output)
57
- linear_output = self.linear(dropout_output)
58
- final_layer= self.relu(linear_output)
59
 
60
- return final_layer
61
 
62
 
63
- tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
64
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
65
- cls_model = BertClassifier()
66
- criterion = nn.CrossEntropyLoss()
67
- model_name = 'bert-base-multilingual-cased'
68
- PATH = './model' + '/' + model_name + '_' + '2023102410'
69
- print(PATH)
70
- cls_model = torch.load(PATH)
71
- #cls_model.load_state_dict(torch.load(PATH))
72
- return tokenizer, cls_model
73
 
74
 
75
  class myDataset_for_infer(torch.utils.data.Dataset):
@@ -80,7 +80,7 @@ class myDataset_for_infer(torch.utils.data.Dataset):
80
  return len(self.X)
81
 
82
  def __getitem__(self,idx):
83
- sentences = tokenizer(self.X[idx], return_tensors = 'pt', padding = 'max_length', max_length = 128, truncation = True)
84
  return sentences
85
 
86
 
@@ -100,33 +100,32 @@ def infer_data(model, main_feeling_keyword):
100
  mask = infer_input['attention_mask'].to(device)
101
  input_id = infer_input['input_ids'].squeeze(1).to(device)
102
 
103
- output = model(input_id, mask)
104
- result = np.argmax(F.softmax(output, dim=0).cpu(), axis=1).numpy()
105
  result_list.extend(result)
106
  return result_list
107
 
108
- def get_word_emotion_pair(cls_model, origin_essay_sentence):
109
-
110
- from konlpy.tag import Okt
111
 
112
- okt = Okt()
113
- #text = '나는 왜 엄마만 미워했을까'
114
- def get_noun(text):
115
- noun_list = [k for k, v in okt.pos(text) if (v == 'Noun' and len(k) > 1)]
116
- return noun_list
117
- def get_adj(text):
118
- adj_list = [k for k, v in okt.pos(text) if (v == 'Adjective') and (len(k) > 1)]
119
- return adj_list
120
- def get_verb(text):
121
- verb_list = [k for k, v in okt.pos(text) if (v == 'Verb') and (len(k) > 1)]
122
- return verb_list
123
 
124
  result_list = infer_data(cls_model, origin_essay_sentence)
125
  final_result = pd.DataFrame(data = {'text': origin_essay_sentence , 'label' : result_list})
126
  final_result['emotion'] = final_result['label'].map(idx2emo)
 
 
 
 
127
  final_result['noun_list'] = final_result['text'].map(get_noun)
128
  final_result['adj_list'] = final_result['text'].map(get_adj)
129
  final_result['verb_list'] = final_result['text'].map(get_verb)
 
130
  final_result['title'] = 'none'
131
  file_made_dt = datetime.datetime.now()
132
  file_made_dt_str = datetime.datetime.strftime(file_made_dt, '%Y%m%d_%H%M%d')
@@ -136,7 +135,6 @@ def get_word_emotion_pair(cls_model, origin_essay_sentence):
136
  return final_result, file_made_dt_str
137
 
138
 
139
-
140
  def get_essay_base_analysis(file_made_dt_str):
141
  essay1 = pd.read_csv(f"./result/{file_name_dt}/essay_result.csv")
142
  essay1['noun_list_len'] = essay1['noun_list'].apply(lambda x : len(x))
@@ -213,28 +211,21 @@ def get_essay_base_analysis(file_made_dt_str):
213
 
214
 
215
  from transformers import pipeline
216
- model_name = 'AlexKay/xlm-roberta-large-qa-multilingual-finedtuned-ru'
 
217
  question_answerer = pipeline("question-answering", model=model_name)
218
 
219
- class BertClassifier(nn.Module):
220
-
221
- def __init__(self, dropout = 0.3):
222
- super(BertClassifier, self).__init__()
223
-
224
- self.bert= BertModel.from_pretrained('bert-base-multilingual-cased')
225
- self.dropout = nn.Dropout(dropout)
226
- self.linear = nn.Linear(768, 6)
227
- self.relu = nn.ReLU()
228
 
229
- def forward(self, input_id, mask):
230
- _, pooled_output = self.bert(input_ids = input_id, attention_mask = mask, return_dict = False)
231
- dropout_output = self.dropout(pooled_output)
232
- linear_output = self.linear(dropout_output)
233
- final_layer= self.relu(linear_output)
234
 
235
- return final_layer
 
 
236
 
237
-
238
  def all_process(origin_essay):
239
  essay_sent =split_essay_to_sentence(origin_essay)
240
  row_dict = {}
@@ -243,11 +234,12 @@ def all_process(origin_essay):
243
  answer = question_answerer(question=question, context=row)
244
  row_dict[row] = answer
245
  emo2idx, idx2emo = get_sent_labeldata()
246
- #tokenizer, cls_model = load_model()
247
  tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
248
- cls_model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
 
 
249
  final_result, file_name_dt = get_word_emotion_pair(cls_model, essay_sent)
250
- all_result, adj_result, noun_result, essay_summary, file_made_dt_str = get_essay_base_analysis(file_name_dt)
251
 
252
  summary_result = pd.concat([adj_result, noun_result]).fillna(0).sort_values(by = 'total', ascending = False).fillna(0).reset_index()[:30]
253
  with open(f'./result/{file_name_dt}/summary.json','w') as f:
@@ -280,4 +272,4 @@ iface = gr.Interface(
280
  inputs = gr.Textbox(lines=2, placeholder= '당신의 글을 넣어보세요'),
281
  outputs = outputs,
282
  )
283
- iface.launch(share =True)
 
13
  from torch import nn
14
  from transformers import BertModel
15
 
16
+ from transformers import AutoTokenizer
17
 
18
  import argparse
19
 
 
39
  idx2emo = {v : k[1] for k, v in emo2idx.items()}
40
  return emo2idx, idx2emo
41
 
42
+ # def load_model():
43
 
44
+ # class BertClassifier(nn.Module):
45
 
46
+ # def __init__(self, dropout = 0.3):
47
+ # super(BertClassifier, self).__init__()
48
 
49
+ # self.bert= BertModel.from_pretrained('bert-base-multilingual-cased')
50
+ # self.dropout = nn.Dropout(dropout)
51
+ # self.linear = nn.Linear(768, 6)
52
+ # self.relu = nn.ReLU()
53
 
54
+ # def forward(self, input_id, mask):
55
+ # _, pooled_output = self.bert(input_ids = input_id, attention_mask = mask, return_dict = False)
56
+ # dropout_output = self.dropout(pooled_output)
57
+ # linear_output = self.linear(dropout_output)
58
+ # final_layer= self.relu(linear_output)
59
 
60
+ # return final_layer
61
 
62
 
63
+ # tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
64
+ # device = 'cuda' if torch.cuda.is_available() else 'cpu'
65
+ # cls_model = BertClassifier()
66
+ # criterion = nn.CrossEntropyLoss()
67
+ # model_name = 'bert-base-multilingual-cased'
68
+ # PATH = './model' + '/' + model_name + '_' + '2023102410'
69
+ # print(PATH)
70
+ # cls_model = torch.load(PATH)
71
+ # #cls_model.load_state_dict(torch.load(PATH))
72
+ # return tokenizer, cls_model
73
 
74
 
75
  class myDataset_for_infer(torch.utils.data.Dataset):
 
80
  return len(self.X)
81
 
82
  def __getitem__(self,idx):
83
+ sentences = tokenizer(self.X[idx], return_tensors = 'pt', padding = 'max_length', max_length = 96, truncation = True)
84
  return sentences
85
 
86
 
 
100
  mask = infer_input['attention_mask'].to(device)
101
  input_id = infer_input['input_ids'].squeeze(1).to(device)
102
 
103
+ output = clsmodel(input_id, mask)
104
+ result = np.argmax(output.logits, axis=1).numpy()
105
  result_list.extend(result)
106
  return result_list
107
 
108
+ def get_word_emotion_pair(cls_model, origin_essay_sentence, idx2emo):
 
 
109
 
110
+ import re
111
+ def get_noun(sent):
112
+ return [re.sub(r'[을를]+', '', vocab) for (vocab, pos) in nlp(sent) if len(vocab) > 1 and pos == 'NOUN']
113
+ def get_adj(sent):
114
+ return [re.sub(r'[을를]+', '', vocab) for (vocab, pos) in nlp(sent) if len(vocab) > 1 and pos == 'ADJ']
115
+ def get_verb(sent):
116
+ return [re.sub(r'[을를]+', '', vocab) for (vocab, pos) in nlp(sent) if len(vocab) > 1 and pos == 'VERB']
 
 
 
 
117
 
118
  result_list = infer_data(cls_model, origin_essay_sentence)
119
  final_result = pd.DataFrame(data = {'text': origin_essay_sentence , 'label' : result_list})
120
  final_result['emotion'] = final_result['label'].map(idx2emo)
121
+
122
+ nlp=lambda x:[(x[t["start"]:t["end"]],t["entity_group"]) for t in pipeline(x)]
123
+ #essay_sent_pos = [nlp(i) for i in tqdm(essay_sent)]
124
+ #final_result['text_pos'] = essay_sent_pos
125
  final_result['noun_list'] = final_result['text'].map(get_noun)
126
  final_result['adj_list'] = final_result['text'].map(get_adj)
127
  final_result['verb_list'] = final_result['text'].map(get_verb)
128
+
129
  final_result['title'] = 'none'
130
  file_made_dt = datetime.datetime.now()
131
  file_made_dt_str = datetime.datetime.strftime(file_made_dt, '%Y%m%d_%H%M%d')
 
135
  return final_result, file_made_dt_str
136
 
137
 
 
138
  def get_essay_base_analysis(file_made_dt_str):
139
  essay1 = pd.read_csv(f"./result/{file_name_dt}/essay_result.csv")
140
  essay1['noun_list_len'] = essay1['noun_list'].apply(lambda x : len(x))
 
211
 
212
 
213
  from transformers import pipeline
214
+ #model_name = 'AlexKay/xlm-roberta-large-qa-multilingual-finedtuned-ru'
215
+ model_name = 'monologg/koelectra-base-v2-finetuned-korquad'
216
  question_answerer = pipeline("question-answering", model=model_name)
217
 
218
+ from transformers import AutoTokenizer,AutoModelForTokenClassification,TokenClassificationPipeline
219
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-large-korean-upos")
220
+ posmodel=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/roberta-large-korean-upos")
 
 
 
 
 
 
221
 
222
+ pipeline=TokenClassificationPipeline(tokenizer=tokenizer,model=posmodel,aggregation_strategy="simple")
223
+ nlp=lambda x:[(x[t["start"]:t["end"]],t["entity_group"]) for t in pipeline(x)]
 
 
 
224
 
225
+ from transformers import AutoModelForSequenceClassification
226
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
227
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
228
 
 
229
  def all_process(origin_essay):
230
  essay_sent =split_essay_to_sentence(origin_essay)
231
  row_dict = {}
 
234
  answer = question_answerer(question=question, context=row)
235
  row_dict[row] = answer
236
  emo2idx, idx2emo = get_sent_labeldata()
 
237
  tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
238
+ #cls_model = AutoModelForSequenceClassification.from_pretrain ed('seriouspark/bert-base-multilingual-cased-finetuning-sentimental-6label')
239
+ cls_model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels = 6)
240
+
241
  final_result, file_name_dt = get_word_emotion_pair(cls_model, essay_sent)
242
+ all_result, adj_\result, noun_result, essay_summary, file_made_dt_str = get_essay_base_analysis(file_name_dt)
243
 
244
  summary_result = pd.concat([adj_result, noun_result]).fillna(0).sort_values(by = 'total', ascending = False).fillna(0).reset_index()[:30]
245
  with open(f'./result/{file_name_dt}/summary.json','w') as f:
 
272
  inputs = gr.Textbox(lines=2, placeholder= '당신의 글을 넣어보세요'),
273
  outputs = outputs,
274
  )
275
+ iface.launch(share=True)