Commit 85e396d (0 parents), committed by magic3910 and jason9693

Duplicate from jason9693/KoreanHateSpeechClassifier

Co-authored-by: Yang-Kichang <jason9693@users.noreply.huggingface.co>
.gitattributes ADDED
@@ -0,0 +1,27 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
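These are the stock Hugging Face LFS rules: any file whose name matches one of the globs above is stored through Git LFS instead of as a regular git blob. A rough illustration only (`fnmatch` merely approximates gitattributes glob semantics and cannot handle `saved_model/**/*`; the file names below are hypothetical):

```python
# Sketch: which hypothetical files the simple globs above would route to Git LFS.
from fnmatch import fnmatch

lfs_globs = ["*.7z", "*.bin", "*.h5", "*.onnx", "*.pt", "*.pth", "*.zip"]

for name in ["pytorch_model.bin", "model.onnx", "NanumGothicCoding.ttf", "app.py"]:
    stored_in_lfs = any(fnmatch(name, g) for g in lfs_globs)
    print(f"{name}: {'Git LFS' if stored_in_lfs else 'regular git object'}")
```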
NanumGothicCoding-Bold.ttf ADDED
Binary file (1.8 MB).
 
NanumGothicCoding.ttf ADDED
Binary file (2.78 MB).
 
README.md ADDED
@@ -0,0 +1,38 @@
+ ---
+ title: KoreanHateSpeechClassifier
+ emoji: ⚡
+ colorFrom: red
+ colorTo: purple
+ sdk: gradio
+ app_file: app.py
+ pinned: false
+ duplicated_from: jason9693/KoreanHateSpeechClassifier
+ ---
+ 
+ # Configuration
+ 
+ `title`: _string_
+ Display title for the Space
+ 
+ `emoji`: _string_
+ Space emoji (emoji-only character allowed)
+ 
+ `colorFrom`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+ 
+ `colorTo`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+ 
+ `sdk`: _string_
+ Can be either `gradio` or `streamlit`
+ 
+ `sdk_version`: _string_
+ Only applicable for `streamlit` SDK.
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+ 
+ `app_file`: _string_
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+ Path is relative to the root of the repository.
+ 
+ `pinned`: _boolean_
+ Whether the Space stays on top of your list.
__pycache__/bertviz.cpython-38.pyc ADDED
Binary file (533 Bytes).
 
__pycache__/util.cpython-36.pyc ADDED
Binary file (6.01 kB).
 
app.py ADDED
@@ -0,0 +1,116 @@
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
+ import gradio as gr
+ from torch.nn import functional as F
+ import seaborn
+ 
+ import matplotlib
+ import platform
+ 
+ from transformers.file_utils import ModelOutput
+ 
+ if platform.system() == "Darwin":
+     print("MacOS")
+     matplotlib.use('Agg')
+ import matplotlib.pyplot as plt
+ import io
+ from PIL import Image
+ 
+ import matplotlib.font_manager as fm
+ import util
+ 
+ 
+ # global model state, swapped out when the user picks another checkpoint
+ MODEL_NAME = 'jason9693/SoongsilBERT-base-beep'
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
+ config = AutoConfig.from_pretrained(MODEL_NAME)
+ 
+ MODEL_BUF = {
+     "name": MODEL_NAME,
+     "tokenizer": tokenizer,
+     "model": model,
+     "config": config
+ }
+ 
+ 
+ font_dir = ['./']
+ for font in fm.findSystemFonts(font_dir):
+     print(font)
+     fm.fontManager.addfont(font)
+ plt.rcParams["font.family"] = 'NanumGothicCoding'
+ 
+ 
+ def visualize_attention(sent, attention_matrix, n_words=10):
+     def draw(data, x, y, ax):
+ 
+         seaborn.heatmap(data,
+                         xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
+                         cbar=False, ax=ax)
+ 
+     # make a plt figure with 2x3 subplots, one per odd-numbered layer
+     fig = plt.figure(figsize=(16, 8))
+     # fig.subplots_adjust(hspace=0.7, wspace=0.2)
+     for i, layer in enumerate(range(1, 12, 2)):
+         ax = fig.add_subplot(2, 3, i + 1)
+         ax.set_title("Layer {}".format(layer))
+         draw(attention_matrix[layer], sent if layer > 6 else [], sent if layer in [1, 7] else [], ax=ax)
+ 
+     fig.tight_layout()
+     plt.close()
+     return fig
+ 
+ 
+ def change_model_name(name):
+     MODEL_BUF["name"] = name
+     MODEL_BUF["tokenizer"] = AutoTokenizer.from_pretrained(name)
+     MODEL_BUF["model"] = AutoModelForSequenceClassification.from_pretrained(name)
+     MODEL_BUF["config"] = AutoConfig.from_pretrained(name)
+ 
+ 
+ def predict(model_name, text):
+     if model_name != MODEL_BUF["name"]:
+         change_model_name(model_name)
+ 
+     tokenizer = MODEL_BUF["tokenizer"]
+     model = MODEL_BUF["model"]
+     config = MODEL_BUF["config"]
+ 
+     tokenized_text = tokenizer([text], return_tensors='pt')
+ 
+     input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
+     try:
+         input_tokens = util.bytetokens_to_unicode(input_tokens) if config.model_type in ['roberta', 'gpt', 'gpt2'] else input_tokens
+     except KeyError:
+         pass  # keep the raw tokens if they are not byte-level BPE
+ 
+     model.eval()
+     output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
+     output = F.softmax(output, dim=-1)
+     result = {}
+ 
+     for idx, label in enumerate(output[0].detach().numpy()):
+         result[config.id2label[idx]] = float(label)
+ 
+     fig = visualize_attention(input_tokens, attention[0][0].detach().numpy())
+     return result, fig
+ 
+ 
+ if __name__ == '__main__':
+     text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'
+ 
+     model_name_list = [
+         'jason9693/SoongsilBERT-base-beep',
+         "beomi/beep-klue-roberta-base-hate",
+         "beomi/beep-koelectra-base-v3-discriminator-hate",
+         "beomi/beep-KcELECTRA-base-hate"
+     ]
+ 
+     # create a gradio app that calls predict()
+     app = gr.Interface(
+         fn=predict,
+         inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'], outputs=['label', 'plot'],
+         examples=[[MODEL_BUF["name"], text], [MODEL_BUF["name"], "4=🦀 4≠🦀"]],
+         title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
+         description="Korean Hate Speech Classifier with Several Pretrained LMs\nCurrently Supported Models:\n1. SoongsilBERT\n2. KcBERT(+KLUE)\n3. KcELECTRA\n4. KoELECTRA"
+     )
+     app.launch(inline=False)
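Because `predict` is a plain function, the Space's logic can be exercised without the Gradio UI. A minimal sketch, assuming the default checkpoint downloads successfully (the label names in the returned dict come from the checkpoint's `config.id2label`):

```python
# Sketch: drive predict() directly, bypassing the Gradio interface.
from app import predict, MODEL_BUF

scores, attention_fig = predict(MODEL_BUF["name"], '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩')
print(scores)                           # {label: probability} per config.id2label
attention_fig.savefig("attention.png")  # the 2x3 grid of layer heatmaps
```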
attention.py ADDED
@@ -0,0 +1,97 @@
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
+ import gradio as gr
+ from torch.nn import functional as F
+ import seaborn
+ 
+ import matplotlib
+ import platform
+ 
+ if platform.system() == "Darwin":
+     print("MacOS")
+     matplotlib.use('Agg')
+ import matplotlib.pyplot as plt
+ import io
+ from PIL import Image
+ 
+ import matplotlib.font_manager as fm
+ 
+ 
+ 
+ 
+ import util
+ 
+ font_path = r'NanumGothicCoding.ttf'
+ fontprop = fm.FontProperties(fname=font_path, size=18)
+ 
+ plt.rcParams["font.family"] = 'NanumGothic'
+ 
+ 
+ def visualize_attention(sent, attention_matrix, n_words=10):
+     def draw(data, x, y, ax):
+         seaborn.heatmap(data,
+                         xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
+                         cbar=False, ax=ax)
+ 
+     # make a plt figure with 2x3 subplots, one per odd-numbered layer
+     fig = plt.figure(figsize=(16, 8))
+     # fig.subplots_adjust(hspace=0.7, wspace=0.2)
+     for i, layer in enumerate(range(1, 12, 2)):
+         ax = fig.add_subplot(2, 3, i + 1)
+         ax.set_title("Layer {}".format(layer))
+         draw(attention_matrix[layer], sent if layer > 6 else [], sent if layer in [1, 7] else [], ax=ax)
+ 
+     fig.tight_layout()
+     plt.close()
+ 
+     return fig
+ 
+ 
+ 
+ def predict(model_name, text):
+ 
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSequenceClassification.from_pretrained(model_name)
+     config = AutoConfig.from_pretrained(model_name)
+     print(config.id2label)
+ 
+     tokenized_text = tokenizer([text], return_tensors='pt')
+ 
+     input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
+     print(input_tokens)
+     input_tokens = util.bytetokens_to_unicode(input_tokens) if config.model_type in ['roberta', 'gpt', 'gpt2'] else input_tokens
+ 
+     model.eval()
+     output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
+     output = F.softmax(output, dim=-1)
+     result = {}
+ 
+     for idx, label in enumerate(output[0].detach().numpy()):
+         result[config.id2label[idx]] = float(label)
+ 
+     fig = visualize_attention(input_tokens, attention[0][0].detach().numpy())
+     return result, fig
+ 
+ 
+ if __name__ == '__main__':
+ 
+     model_name = 'jason9693/SoongsilBERT-beep-base'
+     text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'
+     # output = predict(model_name, text)
+ 
+     # print(output)
+ 
+     model_name_list = [
+         'jason9693/SoongsilBERT-beep-base'
+     ]
+ 
+     # create a gradio app that calls predict()
+     app = gr.Interface(
+         fn=predict,
+         # server_name/server_port are launch() options, not gr.Interface()
+         # arguments; they are passed to launch() below instead
+         inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'], outputs=['label', 'plot'],
+         examples=[[model_name, text]],
+         title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
+         description="Korean Hate Speech Classifier with Several Pretrained LMs\nCurrently Supported Model:\n1. SoongsilBERT"
+     )
+     app.launch(inline=False, server_name='0.0.0.0', server_port=26899)
bvz.py ADDED
@@ -0,0 +1,10 @@
+ from transformers import AutoTokenizer, AutoModel
+ from bertviz import model_view
+ 
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+ model = AutoModel.from_pretrained("distilbert-base-uncased", output_attentions=True)
+ inputs = tokenizer.encode("The cat sat on the mat", return_tensors='pt')
+ outputs = model(inputs)
+ attention = outputs[-1]  # output includes attention weights when output_attentions=True
+ tokens = tokenizer.convert_ids_to_tokens(inputs[0])
+ model_view(attention, tokens)
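For reference, the `attention` object that `model_view` consumes is a tuple with one tensor per layer, each shaped `(batch, num_heads, seq_len, seq_len)`. A quick shape sanity check, appended to the script above (the expected values assume `distilbert-base-uncased`, which has 6 layers and 12 heads):

```python
# Appended to bvz.py: sanity-check the attention tuple before rendering.
print(len(attention))      # 6 tensors, one per distilbert layer
print(attention[0].shape)  # torch.Size([1, 12, seq_len, seq_len])
```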
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers==4.3.0
+ torch==1.6.0
+ matplotlib
+ seaborn
+ numpy
test_demp.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ import matplotlib.pyplot as plt
+ import numpy as np
+ 
+ 
+ def stock_forecast(final_year, companies, noise, show_legend, point_style):
+     start_year = 2020
+     x = np.arange(start_year, final_year + 1)
+     year_count = x.shape[0]
+     plt_format = ({"cross": "X", "line": "-", "circle": "o--"})[point_style]
+     fig = plt.figure()
+     ax = fig.add_subplot(111)
+     for i, company in enumerate(companies):
+         series = np.arange(0, year_count, dtype=float)
+         series = series ** 2 * (i + 1)
+         series += np.random.rand(year_count) * noise
+         ax.plot(x, series, plt_format)
+     if show_legend:
+         plt.legend(companies)
+     plt.close()
+ 
+ 
+     return fig
+ 
+ 
+ iface = gr.Interface(
+     stock_forecast,
+     [
+         gr.inputs.Radio([2025, 2030, 2035, 2040], label="Project to:"),
+         gr.inputs.CheckboxGroup(["Google", "Microsoft", "Gradio"]),
+         gr.inputs.Slider(1, 100),
+         "checkbox",
+         gr.inputs.Dropdown(["cross", "line", "circle"], label="Style")],
+     gr.outputs.Image(plot=True, label="forecast"))
+ 
+ iface.test_launch()
+ if __name__ == "__main__":
+     iface.launch(inline=False)
util.py ADDED
@@ -0,0 +1,384 @@
+ from functools import lru_cache
+ 
+ 
+ 
+ @lru_cache()
+ def bytes_to_unicode_dict():
+     """
+     Returns a lookup table from the byte-level BPE unicode characters back to their utf-8 byte values. The table
+     specifically avoids mapping to whitespace/control characters the bpe code barfs on.
+ 
+     The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your
+     vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around
+     5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that,
+     we want lookup tables between utf-8 bytes and unicode strings.
+     """
+     bs = (
+         list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+     )
+     cs = bs[:]
+     n = 0
+     for b in range(2 ** 8):
+         if b not in bs:
+             bs.append(b)
+             cs.append(2 ** 8 + n)
+             n += 1
+     cs = [chr(n) for n in cs]
+     return dict(zip(cs, bs))
+ 
+ ORD_UNICODE_MAP = bytes_to_unicode_dict()
+ 
+ 
+ @lru_cache()
+ def byte_to_char(bytestr):
+     return bytearray([ORD_UNICODE_MAP[c] for c in bytestr]).decode("utf-8", errors="replace")
+ 
+ # @lru_cache()  # lists are unhashable, so this helper cannot be cached directly
+ def bytetokens_to_unicode(byte_tokens: list):
+     return [byte_to_char(token) for token in byte_tokens]
+ 
+ 
+ if __name__ == '__main__':
+ 
+     tokens = ['<s>',
+               'ì¹´ì¹´ìĺ¤',
+               'ìĹĶ',
+               'íĦ°',
+               'íĶĦëĿ¼ìĿ´',
+               'ì¦Ī',
+               '(',
+               'ëĮĢíijľ',
+               'Ġë°±',
+               'ìĥģ',
+               'ìĹ½',
+               ')',
+               'ê°Ģ',
+               'Ġìĺ¬íķ´',
+               'Ġ8',
+               'ìĽĶ',
+               'Ġ기ì¤Ģ',
+               'Ġëĭ¤ìĪĺ',
+               'Ġê¶Į',
+               'ìľĦ',
+               'ĠìŀĪëĬĶ',
+               'Ġê¸Ģë¡ľë²Į',
+               'ĠíķĻ',
+               'íļĮìĹIJìĦľ',
+               'Ġì´Ŀ',
+               'Ġ16',
+               'ê±´',
+               'ìĿĺ',
+               'ĠìĿ¸ê³µ',
+               'ì§Ģ',
+               'ëĬ¥',
+               '(',
+               'A',
+               'I',
+               ')',
+               'Ġëħ¼ë¬¸',
+               'ìĿĦ',
+               'Ġëĵ±',
+               'ìŀ¬',
+               'íĸĪëĭ¤ê³ł',
+               'Ġ9',
+               'ìĿ¼',
+               'Ġë°ĿíĺĶ',
+               'ëĭ¤',
+               '.',
+               'Ġì§ĢëĤľíķ´',
+               'Ġëĵ±',
+               'ìŀ¬',
+               'íķľ',
+               'Ġ13',
+               'ê±´ë',
+               '³´ëĭ¤',
+               'Ġ3',
+               'ê±´',
+               'Ġë§İìĿĢ',
+               'Ġëħ¼ë¬¸',
+               'ìĿ´',
+               'Ġë°ĺ',
+               'ëħĦ',
+               'ìŬ',
+               'Ġë§ĮìĹIJ',
+               'Ġì±Ħ',
+               'íĥĿ',
+               'ëIJIJëĭ¤',
+               '.',
+               'Ġì¹´ì¹´ìĺ¤',
+               'ìĹĶ',
+               'íĦ°',
+               'íĶĦëĿ¼ìĿ´',
+               'ì¦Ī',
+               '(',
+               'ìĿ´',
+               'íķĺ',
+               'Ġì¹´ì¹´ìĺ¤',
+               'ìĹĶ',
+               'íĦ°',
+               ')',
+               'ëĬĶ',
+               'ĠA',
+               'I',
+               'ĠìĹ°êµ¬',
+               'ĠìĦ±',
+               '과를',
+               'ĠìĿ´',
+               'ìĸ´ê°Ģ',
+               '기',
+               'ĠìľĦíķ´',
+               'ĠìĿ¸ìŀ¬',
+               'ĠíĻķë³´',
+               'ìĹIJ',
+               'ĠìĨį',
+               'ëıĦ를',
+               'ĠëĨĴìĿ´',
+               'ê²łëĭ¤ëĬĶ',
+               'Ġë°©',
+               '침',
+               'ìĿ´ëĭ¤',
+               '.',
+               'Ċ',
+               'Ċ',
+               'ì¹´ì¹´ìĺ¤',
+               'ìĹĶ',
+               'íĦ°',
+               'ëĬĶ',
+               'Ġ8',
+               'ìĽĶ',
+               'ĠìŀIJìĹ°',
+               'ìĸ´',
+               'ì²ĺ리',
+               'Ġë¶Ħìķ¼',
+               'ìĿĺ',
+               'Ġê¸Ģë¡ľë²Į',
+               'Ġíĥij',
+               'ĠíķĻ',
+               'íļĮ',
+               'ìĿ¸',
+               "Ġ'",
+               'A',
+               'C',
+               'L',
+               '-',
+               'I',
+               'J',
+               'C',
+               'N',
+               'L',
+               'P',
+               "'",
+               'ìĹIJ',
+               'Ġëħ¼ë¬¸',
+               'ìĿĦ',
+               'Ġë°ľíijľ',
+               'íķľ',
+               'ĠìĤ¬ë¡Ģ',
+               'ê¹Įì§Ģ',
+               'Ġíķ©',
+               'íķ´',
+               'Ġìĺ¬íķ´',
+               'Ġì´Ŀ',
+               'Ġ16',
+               'ê±´',
+               'ìĿĺ',
+               'ĠA',
+               'I',
+               'Ġëħ¼ë¬¸',
+               'ìĿĦ',
+               'Ġëĵ±',
+               'ìŀ¬',
+               'íĸĪëĭ¤ê³ł',
+               'Ġë°ĿíĺĶ',
+               'ëĭ¤',
+               '.',
+               'ĠìĿ´',
+               'Ġëħ¼ë¬¸',
+               'ìĿĢ',
+               'ĠìĿ¸ëıĦ',
+               'ë©Ķ',
+               'ìĿ¸',
+               '(',
+               'in',
+               '-',
+               'd',
+               'om',
+               'a',
+               'in',
+               ')',
+               'Ġìĥĺ',
+               'íĶĮ',
+               'ìĿĦ',
+               'ĠìĤ¬ìļ©',
+               'íķ´',
+               'ĠìŀIJìĹ°',
+               'ìĸ´',
+               'Ġ공격',
+               'Ġë°©ìĭĿìľ¼ë¡ľ',
+               'ĠìķĦìĽĥ',
+               'ìĺ¤',
+               'ë¸Į',
+               'ëıĦ',
+               'ë©Ķ',
+               'ìĿ¸',
+               '(',
+               'out',
+               '-',
+               'of',
+               '-',
+               'd',
+               'om',
+               'a',
+               'in',
+               ')',
+               'Ġìĥĺ',
+               'íĶĮ',
+               'ìĿĦ',
+               'ĠìŀIJëıĻ',
+               'ìľ¼ë¡ľ',
+               'ĠìĥĿ',
+               'ìĦ±',
+               ',',
+               'Ġë¶Ħ',
+               'ë¥ĺ',
+               'Ġ모ëį¸',
+               'ìĿĺ',
+               'Ġê°IJ',
+               'ì§Ģ',
+               'ĠëĬ¥ëł¥ìĿĦ',
+               'Ġíĸ¥',
+               'ìĥģ',
+               'ìĭľíĤ¤ëĬĶ',
+               'ĠëĤ´ìļ©',
+               'ìĿĺ',
+               'Ġëħ¼ë¬¸',
+               'ìĿ´ëĭ¤',
+               '.',
+               'Ċ',
+               'Ċ',
+               '7',
+               'ìĽĶ',
+               'ìĹIJëĬĶ',
+               'Ġ머',
+               'ìĭł',
+               '룬',
+               'ëĭĿ',
+               'ĠíķĻ',
+               'íļĮ',
+               "Ġ'",
+               'I',
+               'C',
+               'M',
+               'L',
+               "'",
+               'ìĹIJ',
+               'Ġíļ¨ìľ¨',
+               'ìłģìĿ¸',
+               'Ġê³ł',
+               'íĴĪ',
+               'ì§Ī',
+               'ĠìĿĮ',
+               'ìĦ±',
+               'íķ©',
+               'ìĦ±ìĿ´',
+               'Ġê°ĢëĬ¥íķľ',
+               "Ġ'",
+               'ìĹĶ',
+               'ëĵľ',
+               'ĠíĪ¬',
+               'ĠìĹĶ',
+               'ëĵľ',
+               '(',
+               'en',
+               'd',
+               '-',
+               't',
+               'o',
+               '-',
+               'en',
+               'd',
+               ')',
+               "'",
+               'Ġ모ëį¸',
+               'ìĿĦ',
+               'ĠìłľìķĪ',
+               'íķĺëĬĶ',
+               'Ġëħ¼ë¬¸',
+               'ìĿĦ',
+               'Ġë°ľíijľ',
+               'íĸĪëĭ¤',
+               '.',
+               'Ġ6',
+               'ìĽĶ',
+               'ìĹIJëĬĶ',
+               'ĠìĿĮ',
+               'íĸ¥',
+               '·',
+               'ìĿĮ',
+               'ìĦ±',
+               'Ġìĭł',
+               'íĺ¸',
+               'ì²ĺ리',
+               'Ġë¶Ħìķ¼',
+               'ĠíķĻ',
+               'ìĪł',
+               'ëĮĢíļĮ',
+               "Ġ'",
+               'I',
+               'C',
+               'A',
+               'S',
+               'S',
+               'P',
+               "'",
+               'ìĹIJ',
+               'ĠëĮĢ',
+               'ê·ľëª¨',
+               'Ġíħ',
+               'į',
+               'ìĬ¤íĬ¸',
+               'Ġì½Ķ',
+               'íį¼ìĬ¤',
+               '(',
+               'ìĸ¸',
+               'ìĸ´',
+               'ĠìĹ°',
+               '구를',
+               'ĠìľĦíķ´',
+               'Ġíħ',
+               'į',
+               'ìĬ¤íĬ¸ë¥¼',
+               'Ġì»´íĵ¨íĦ°',
+               'ê°Ģ',
+               'ĠìĿ½ìĿĦ',
+               'ĠìĪĺ',
+               'ĠìŀĪëĬĶ',
+               'Ġíĺķíĥľë¡ľ',
+               'Ġ모ìķĦ',
+               'ĠëĨĵìĿĢ',
+               'Ġìĸ¸ìĸ´',
+               'ĠìŀIJë£Į',
+               ')',
+               'Ġìłķë³´',
+               'ĠíķĻìĬµ',
+               'ìĹIJ',
+               'ĠëĮĢíķľ',
+               'Ġëħ¼ë¬¸',
+               'Ġ1',
+               'ê±´ìĿĦ',
+               'Ġìĭ¤',
+               'ìĹĪëĭ¤',
+               '.',
+               'Ċ',
+               '</s>']
+ 
+     import time
+ 
+     start = time.time()
+     for i in range(1000):
+         result = bytetokens_to_unicode(tokens)
+     end = time.time()
+ 
+     print(result)
+ 
+     print(f'time: {end-start}')
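The `ORD_UNICODE_MAP` table is invertible, which gives an easy round-trip check for `byte_to_char`: encode a string's UTF-8 bytes into the byte-token alphabet with the inverted map, then decode it back. A small sketch (the helper `to_byte_token` is ours, not part of util.py):

```python
# Round-trip sketch: plain string -> byte-level BPE alphabet -> plain string.
from util import ORD_UNICODE_MAP, byte_to_char

BYTE_TO_UNICODE = {b: c for c, b in ORD_UNICODE_MAP.items()}  # invert char -> byte

def to_byte_token(s: str) -> str:
    # Map each UTF-8 byte of s to its byte-level BPE surrogate character.
    return "".join(BYTE_TO_UNICODE[b] for b in s.encode("utf-8"))

token = to_byte_token("카카오")
print(token)                # mojibake-looking surrogate form, as in the list above
print(byte_to_char(token))  # 카카오
```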