HMPhuoc committed on
Commit
2205ed4
1 Parent(s): a65057f

add BERT model

Files changed (4)
  1. app.py +5 -14
  2. phoBERT.py +79 -0
  3. phoBertModel.pth +3 -0
  4. requirements.txt +3 -1
app.py CHANGED
@@ -11,9 +11,10 @@ import pandas as pd
 import plotly.express as px
 import keras
 
-
 from underthesea import word_tokenize
 
+from phoBERT import BERT_predict
+
 #Load tokenizer
 fp = Path(__file__).with_name('tokenizer.pkl')
 with open(fp,mode="rb") as f:
@@ -81,26 +82,16 @@ def judge(x):
 
     lstm_pred = LSTM_predict(x)
     gru_pred = GRU_predict(x)
-
+    bert_pred = BERT_predict(x)
     #print(result)
 
     return_result = 'Result'
     result_lstm = np.round(lstm_pred, 2)
     result_gru = np.round(gru_pred, 2)
+    result_bert = np.round(bert_pred, 2)
     for i in range(6):
-        result.append((result_lstm[i]+result_gru[i])/2)
-
-
-
-    #print(final_result)
-    return_result += '\nMô hình LSTM\n'
-    return_result += f"{result_lstm}\n"
-
-
-    return_result += '\nMô hình GRU\n'
-    return_result += f"{result_gru}\n"
-
-
+        result.append((result_lstm[i]+result_gru[i]+result_bert[i])/3)
+
     return (result)
 
 
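For context, a minimal sketch of what judge(x) computes after this change, assuming LSTM_predict, GRU_predict, and the new BERT_predict each return six class scores in [0, 1] (tokenizer loading and the rest of app.py are omitted; this is not the full file):

    # Sketch only: in the real app.py, LSTM_predict and GRU_predict are defined above.
    import numpy as np

    def judge(x):
        lstm_pred = LSTM_predict(x)   # existing LSTM model
        gru_pred = GRU_predict(x)     # existing GRU model
        bert_pred = BERT_predict(x)   # PhoBERT model added by this commit

        result_lstm = np.round(lstm_pred, 2)
        result_gru = np.round(gru_pred, 2)
        result_bert = np.round(bert_pred, 2)

        # Ensemble by averaging the three models' scores per class.
        result = []
        for i in range(6):
            result.append((result_lstm[i] + result_gru[i] + result_bert[i]) / 3)
        return result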
phoBERT.py ADDED
@@ -0,0 +1,79 @@
+import torch
+from transformers import AutoModel, AutoTokenizer
+from underthesea import word_tokenize
+
+phobert = AutoModel.from_pretrained("vinai/phobert-base")
+tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+
+class PhoBertModel(torch.nn.Module):
+    def __init__(self):
+        super(PhoBertModel, self).__init__()
+        self.bert = phobert
+        self.pre_classifier = torch.nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
+        self.dropout = torch.nn.Dropout(0.1)
+        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, 6)
+
+    def forward(self, input_ids, attention_mask, token_type_ids):
+        hidden_state, output_1 = self.bert(
+            input_ids = input_ids,
+            attention_mask=attention_mask,
+            return_dict = False
+        )
+        pooler = self.pre_classifier(output_1)
+        activation_1 = torch.nn.Tanh()(pooler)
+
+        drop = self.dropout(activation_1)
+
+        output_2 = self.classifier(drop)
+        # activation_2 = torch.nn.Tanh()(output_2)
+
+        output = torch.nn.Sigmoid()(output_2)
+        return output
+
+def getModel():
+    model = torch.load('phoBertModel.pth', map_location=torch.device('cpu'))
+    model.eval()
+    return model
+
+model = getModel()
+
+def tokenize(data):
+
+    max_length = 200
+
+    for line in data:
+
+        token = tokenizer.encode_plus(
+            line,
+            max_length=200,
+            add_special_tokens=False,
+            pad_to_max_length=True
+        )
+
+        ids = torch.tensor([token['input_ids']])
+        mask = torch.tensor([token['attention_mask']])
+        token_type_ids = torch.tensor([token['token_type_ids']])
+
+
+        output = {
+            'ids': ids,
+            'mask': mask,
+            'token_type_ids': token_type_ids,
+        }
+        #outputs.append(output)
+
+    return output
+
+def BERT_predict(text):
+    text = word_tokenize(text)
+    text = [text]
+    token = tokenize(text)
+
+    ids = token['ids']
+    mask = token['mask']
+    token_type_ids = token['token_type_ids']
+
+    result = model(ids, mask, token_type_ids)
+    print(result)
+    return result.tolist()[0]
+
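The new module exposes BERT_predict, which word-segments the input with underthesea, encodes it with the PhoBERT tokenizer, and runs the fine-tuned head loaded from phoBertModel.pth. A hedged usage sketch (the example sentence is illustrative, not from the repo):

    # Assumes phoBertModel.pth sits next to phoBERT.py, as added in this commit.
    from phoBERT import BERT_predict

    # Hypothetical input: "Sản phẩm này rất tốt" ("this product is very good").
    scores = BERT_predict("Sản phẩm này rất tốt")
    print(scores)  # six sigmoid scores in [0, 1], one per label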
phoBertModel.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5fca9d837d05b1e8330798e32a59b5200bf677d5cf2f178727dcd131c86230b
+size 542499629
requirements.txt CHANGED
@@ -7,4 +7,6 @@ pathlib
 plotly
 pandas
 keras==2.15.0
-underthesea
+underthesea
+torch
+transformers