kevin-yang committed on
Commit
b1944b2
1 Parent(s): cf3d4e2

initial commit

NanumGothicCoding-Bold.ttf ADDED
Binary file (1.8 MB).
NanumGothicCoding.ttf ADDED
Binary file (2.78 MB).
__pycache__/bertviz.cpython-38.pyc ADDED
Binary file (533 Bytes).
__pycache__/util.cpython-36.pyc ADDED
Binary file (6.01 kB).
attention.py ADDED
@@ -0,0 +1,97 @@
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
+ import gradio as gr
+ from torch.nn import functional as F
+ import seaborn
+
+ import matplotlib
+ import platform
+
+ if platform.system() == "Darwin":
+     print("MacOS")
+     matplotlib.use('Agg')
+ import matplotlib.pyplot as plt
+ import io
+ from PIL import Image
+
+ import matplotlib.font_manager as fm
+
+ import util
+
+ # Bundled Korean font for Hangul tick labels in the heatmaps.
+ font_path = r'NanumGothicCoding.ttf'
+ fontprop = fm.FontProperties(fname=font_path, size=18)
+
+ plt.rcParams["font.family"] = 'NanumGothic'
+
+
+ def visualize_attention(sent, attention_matrix, n_words=10):
+     def draw(data, x, y, ax):
+         seaborn.heatmap(data,
+                         xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
+                         cbar=False, ax=ax)
+
+     # Make a figure with a 2x3 grid of attention heatmaps.
+     fig = plt.figure(figsize=(16, 8))
+     # fig.subplots_adjust(hspace=0.7, wspace=0.2)
+     for i, layer in enumerate(range(1, 12, 2)):
+         ax = fig.add_subplot(2, 3, i + 1)
+         ax.set_title("Layer {}".format(layer))
+         draw(attention_matrix[layer], sent if layer > 6 else [], sent if layer in [1, 7] else [], ax=ax)
+
+     fig.tight_layout()
+     plt.close()
+
+     return fig
+
+
+ def predict(model_name, text):
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSequenceClassification.from_pretrained(model_name)
+     config = AutoConfig.from_pretrained(model_name)
+     print(config.id2label)
+
+     tokenized_text = tokenizer([text], return_tensors='pt')
+
+     input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
+     print(input_tokens)
+     # Byte-level BPE tokens (RoBERTa/GPT-style vocabularies) are mapped back to readable unicode.
+     input_tokens = util.bytetokens_to_unicode(input_tokens) if config.model_type in ['roberta', 'gpt', 'gpt2'] else input_tokens
+
+     model.eval()
+     output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
+     output = F.softmax(output, dim=-1)
+     result = {}
+
+     for idx, label in enumerate(output[0].detach().numpy()):
+         result[config.id2label[idx]] = float(label)
+
+     fig = visualize_attention(input_tokens, attention[0][0].detach().numpy())
+     return result, fig
+
+
+ if __name__ == '__main__':
+     model_name = 'jason9693/SoongsilBERT-beep-base'
+     text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'
+     # output = predict(model_name, text)
+     # print(output)
+
+     model_name_list = [
+         'jason9693/SoongsilBERT-beep-base'
+     ]
+
+     # Create a gradio app with a button that calls predict()
+     app = gr.Interface(
+         fn=predict,
+         server_port=26899,
+         server_name='0.0.0.0',
+         inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'],
+         outputs=['label', 'plot'],
+         examples=[[model_name, text]],
+         title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
+         description="Korean Hate Speech Classifier with Several Pretrained LM\nCurrent Supported Model:\n1. SoongsilBERT"
+     )
+     app.launch(inline=False)
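For reference, predict also works outside of the Gradio UI; a minimal sketch, assuming the jason9693/SoongsilBERT-beep-base checkpoint is reachable and NanumGothicCoding.ttf sits next to the script (the output filename is illustrative):

    result, fig = predict('jason9693/SoongsilBERT-beep-base', '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩')
    print(result)                          # maps each config.id2label name to its softmax probability
    fig.savefig('attention.png', dpi=150)  # the 2x3 grid of attention heatmaps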
bvz.py ADDED
@@ -0,0 +1,10 @@
+ from transformers import AutoTokenizer, AutoModel
+ from bertviz import model_view
+
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+ model = AutoModel.from_pretrained("distilbert-base-uncased", output_attentions=True)
+ inputs = tokenizer.encode("The cat sat on the mat", return_tensors='pt')
+ outputs = model(inputs)
+ attention = outputs[-1]  # Output includes attention weights when output_attentions=True
+ tokens = tokenizer.convert_ids_to_tokens(inputs[0])
+ model_view(attention, tokens)
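A quick sketch of what model_view receives here (shapes are those of distilbert-base-uncased, which has 6 layers and 12 heads; model_view renders an interactive view and is intended for notebook environments):

    print(len(attention))      # 6: one attention tensor per transformer layer
    print(attention[0].shape)  # torch.Size([1, 12, seq_len, seq_len]): batch, heads, tokens, tokens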
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers==4.3.0
+ torch==1.6.0
+ matplotlib
+ seaborn
+ numpy
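Note that the scripts in this commit also import gradio, bertviz and Pillow (PIL), which are not listed here; a fuller (unpinned, untested) list would additionally contain:

    gradio
    bertviz
    Pillow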
test_demp.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+
+ def stock_forecast(final_year, companies, noise, show_legend, point_style):
+     start_year = 2020
+     x = np.arange(start_year, final_year + 1)
+     year_count = x.shape[0]
+     plt_format = ({"cross": "X", "line": "-", "circle": "o--"})[point_style]
+     fig = plt.figure()
+     ax = fig.add_subplot(111)
+     for i, company in enumerate(companies):
+         series = np.arange(0, year_count, dtype=float)
+         series = series ** 2 * (i + 1)
+         series += np.random.rand(year_count) * noise
+         ax.plot(x, series, plt_format)
+     if show_legend:
+         plt.legend(companies)
+     plt.close()
+
+     return fig
+
+
+ iface = gr.Interface(
+     stock_forecast,
+     [
+         gr.inputs.Radio([2025, 2030, 2035, 2040], label="Project to:"),
+         gr.inputs.CheckboxGroup(["Google", "Microsoft", "Gradio"]),
+         gr.inputs.Slider(1, 100),
+         "checkbox",
+         gr.inputs.Dropdown(["cross", "line", "circle"], label="Style")],
+     gr.outputs.Image(plot=True, label="forecast"))
+
+ iface.test_launch()
+ if __name__ == "__main__":
+     iface.launch(inline=False)
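stock_forecast can also be exercised without the Gradio wrapper; a minimal sketch with illustrative arguments:

    fig = stock_forecast(2030, ["Google", "Gradio"], noise=25, show_legend=True, point_style="line")
    fig.savefig("forecast.png")  # quadratic dummy series per company, 2020 through 2030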
util.py ADDED
@@ -0,0 +1,384 @@
+ from functools import lru_cache
+
+
+ @lru_cache()
+ def bytes_to_unicode_dict():
+     """
+     Returns the mapping from the unicode characters used by byte-level BPE vocabularies back to raw utf-8 byte
+     values. We specifically avoid mapping to whitespace/control characters the bpe code barfs on.
+
+     The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+     if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+     decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+     tables between utf-8 bytes and unicode strings.
+     """
+     bs = (
+         list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+     )
+     cs = bs[:]
+     n = 0
+     for b in range(2 ** 8):
+         if b not in bs:
+             bs.append(b)
+             cs.append(2 ** 8 + n)
+             n += 1
+     cs = [chr(n) for n in cs]
+     return dict(zip(cs, bs))
+
+
+ ORD_UNICODE_MAP = bytes_to_unicode_dict()
+
+
+ @lru_cache()
+ def byte_to_char(bytestr):
+     # Decode one byte-level BPE token back to a readable utf-8 string.
+     return bytearray([ORD_UNICODE_MAP[c] for c in bytestr]).decode("utf-8", errors="replace")
+
+
+ # @lru_cache()
+ def bytetokens_to_unicode(byte_tokens: list):
+     return [byte_to_char(token) for token in byte_tokens]
+
+
+ if __name__ == '__main__':
+
+     tokens = ['<s>', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī', '(', 'ëĮĢíijľ',
+               'Ġë°±', 'ìĥģ', 'ìĹ½', ')', 'ê°Ģ', 'Ġìĺ¬íķ´', 'Ġ8', 'ìĽĶ',
+               'Ġ기ì¤Ģ', 'Ġëĭ¤ìĪĺ', 'Ġê¶Į', 'ìľĦ', 'ĠìŀĪëĬĶ', 'Ġê¸Ģë¡ľë²Į', 'ĠíķĻ', 'íļĮìĹIJìĦľ',
+               'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠìĿ¸ê³µ', 'ì§Ģ', 'ëĬ¥', '(',
+               'A', 'I', ')', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł',
+               'Ġ9', 'ìĿ¼', 'Ġë°ĿíĺĶ', 'ëĭ¤', '.', 'Ġì§ĢëĤľíķ´', 'Ġëĵ±', 'ìŀ¬',
+               'íķľ', 'Ġ13', 'ê±´ë', '³´ëĭ¤', 'Ġ3', 'ê±´', 'Ġë§İìĿĢ', 'Ġëħ¼ë¬¸',
+               'ìĿ´', 'Ġë°ĺ', 'ëħĦ', 'ìŬ', 'Ġë§ĮìĹIJ', 'Ġì±Ħ', 'íĥĿ', 'ëIJIJëĭ¤',
+               '.', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī', '(', 'ìĿ´',
+               'íķĺ', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', ')', 'ëĬĶ', 'ĠA', 'I',
+               'ĠìĹ°êµ¬', 'ĠìĦ±', '과를', 'ĠìĿ´', 'ìĸ´ê°Ģ', '기', 'ĠìľĦíķ´', 'ĠìĿ¸ìŀ¬',
+               'ĠíĻķë³´', 'ìĹIJ', 'ĠìĨį', 'ëıĦ를', 'ĠëĨĴìĿ´', 'ê²łëĭ¤ëĬĶ', 'Ġë°©', '침',
+               'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'ëĬĶ',
+               'Ġ8', 'ìĽĶ', 'ĠìŀIJìĹ°', 'ìĸ´', 'ì²ĺ리', 'Ġë¶Ħìķ¼', 'ìĿĺ', 'Ġê¸Ģë¡ľë²Į',
+               'Ġíĥij', 'ĠíķĻ', 'íļĮ', 'ìĿ¸', "Ġ'", 'A', 'C', 'L',
+               '-', 'I', 'J', 'C', 'N', 'L', 'P', "'",
+               'ìĹIJ', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġë°ľíijľ', 'íķľ', 'ĠìĤ¬ë¡Ģ', 'ê¹Įì§Ģ', 'Ġíķ©',
+               'íķ´', 'Ġìĺ¬íķ´', 'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠA', 'I',
+               'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł', 'Ġë°ĿíĺĶ', 'ëĭ¤', '.',
+               'ĠìĿ´', 'Ġëħ¼ë¬¸', 'ìĿĢ', 'ĠìĿ¸ëıĦ', 'ë©Ķ', 'ìĿ¸', '(', 'in',
+               '-', 'd', 'om', 'a', 'in', ')', 'Ġìĥĺ', 'íĶĮ',
+               'ìĿĦ', 'ĠìĤ¬ìļ©', 'íķ´', 'ĠìŀIJìĹ°', 'ìĸ´', 'Ġ공격', 'Ġë°©ìĭĿìľ¼ë¡ľ', 'ĠìķĦìĽĥ',
+               'ìĺ¤', 'ë¸Į', 'ëıĦ', 'ë©Ķ', 'ìĿ¸', '(', 'out', '-',
+               'of', '-', 'd', 'om', 'a', 'in', ')', 'Ġìĥĺ',
+               'íĶĮ', 'ìĿĦ', 'ĠìŀIJëıĻ', 'ìľ¼ë¡ľ', 'ĠìĥĿ', 'ìĦ±', ',', 'Ġë¶Ħ',
+               'ë¥ĺ', 'Ġ모ëį¸', 'ìĿĺ', 'Ġê°IJ', 'ì§Ģ', 'ĠëĬ¥ëł¥ìĿĦ', 'Ġíĸ¥', 'ìĥģ',
+               'ìĭľíĤ¤ëĬĶ', 'ĠëĤ´ìļ©', 'ìĿĺ', 'Ġëħ¼ë¬¸', 'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ',
+               '7', 'ìĽĶ', 'ìĹIJëĬĶ', 'Ġ머', 'ìĭł', '룬', 'ëĭĿ', 'ĠíķĻ',
+               'íļĮ', "Ġ'", 'I', 'C', 'M', 'L', "'", 'ìĹIJ',
+               'Ġíļ¨ìľ¨', 'ìłģìĿ¸', 'Ġê³ł', 'íĴĪ', 'ì§Ī', 'ĠìĿĮ', 'ìĦ±', 'íķ©',
+               'ìĦ±ìĿ´', 'Ġê°ĢëĬ¥íķľ', "Ġ'", 'ìĹĶ', 'ëĵľ', 'ĠíĪ¬', 'ĠìĹĶ', 'ëĵľ',
+               '(', 'en', 'd', '-', 't', 'o', '-', 'en',
+               'd', ')', "'", 'Ġ모ëį¸', 'ìĿĦ', 'ĠìłľìķĪ', 'íķĺëĬĶ', 'Ġëħ¼ë¬¸',
+               'ìĿĦ', 'Ġë°ľíijľ', 'íĸĪëĭ¤', '.', 'Ġ6', 'ìĽĶ', 'ìĹIJëĬĶ', 'ĠìĿĮ',
+               'íĸ¥', '·', 'ìĿĮ', 'ìĦ±', 'Ġìĭł', 'íĺ¸', 'ì²ĺ리', 'Ġë¶Ħìķ¼',
+               'ĠíķĻ', 'ìĪł', 'ëĮĢíļĮ', "Ġ'", 'I', 'C', 'A', 'S',
+               'S', 'P', "'", 'ìĹIJ', 'ĠëĮĢ', 'ê·ľëª¨', 'Ġíħ', 'į',
+               'ìĬ¤íĬ¸', 'Ġì½Ķ', 'íį¼ìĬ¤', '(', 'ìĸ¸', 'ìĸ´', 'ĠìĹ°', '구를',
+               'ĠìľĦíķ´', 'Ġíħ', 'į', 'ìĬ¤íĬ¸ë¥¼', 'Ġì»´íĵ¨íĦ°', 'ê°Ģ', 'ĠìĿ½ìĿĦ', 'ĠìĪĺ',
+               'ĠìŀĪëĬĶ', 'Ġíĺķíĥľë¡ľ', 'Ġ모ìķĦ', 'ĠëĨĵìĿĢ', 'Ġìĸ¸ìĸ´', 'ĠìŀIJë£Į', ')', 'Ġìłķë³´',
+               'ĠíķĻìĬµ', 'ìĹIJ', 'ĠëĮĢíķľ', 'Ġëħ¼ë¬¸', 'Ġ1', 'ê±´ìĿĦ', 'Ġìĭ¤', 'ìĹĪëĭ¤',
+               '.', 'Ċ', '</s>']
+
+     import time
+
+     start = time.time()
+     for i in range(1000):
+         result = bytetokens_to_unicode(tokens)
+     end = time.time()
+
+     print(result)
+
+     print(f'time: {end-start}')
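The byte-token round trip is easiest to see on a short hypothetical example: in byte-level BPE vocabularies 'Ġ' stands for the space byte (0x20), so the decoder turns such tokens back into plain text. Because byte_to_char is wrapped in lru_cache, repeated tokens in the benchmark above are decoded only once per distinct token.

    print(bytetokens_to_unicode(['Ġhello', 'Ġworld']))  # -> [' hello', ' world']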