Duplicate from jason9693/KoreanHateSpeechClassifier
Co-authored-by: Yang-Kichang <jason9693@users.noreply.huggingface.co>
- .gitattributes +27 -0
- NanumGothicCoding-Bold.ttf +0 -0
- NanumGothicCoding.ttf +0 -0
- README.md +38 -0
- __pycache__/bertviz.cpython-38.pyc +0 -0
- __pycache__/util.cpython-36.pyc +0 -0
- app.py +116 -0
- attention.py +97 -0
- bvz.py +10 -0
- requirements.txt +5 -0
- test_demp.py +38 -0
- util.py +384 -0
.gitattributes
ADDED
@@ -0,0 +1,27 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
NanumGothicCoding-Bold.ttf
ADDED
Binary file (1.8 MB)
NanumGothicCoding.ttf
ADDED
Binary file (2.78 MB)
README.md
ADDED
@@ -0,0 +1,38 @@
---
title: KoreanHateSpeechClassifier
emoji: ⚡
colorFrom: red
colorTo: purple
sdk: gradio
app_file: app.py
pinned: false
duplicated_from: jason9693/KoreanHateSpeechClassifier
---

# Configuration

`title`: _string_
Display title for the Space

`emoji`: _string_
Space emoji (emoji-only character allowed)

`colorFrom`: _string_
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)

`colorTo`: _string_
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)

`sdk`: _string_
Can be either `gradio` or `streamlit`

`sdk_version`: _string_
Only applicable for `streamlit` SDK.
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.

`app_file`: _string_
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
Path is relative to the root of the repository.

`pinned`: _boolean_
Whether the Space stays on top of your list.
__pycache__/bertviz.cpython-38.pyc
ADDED
Binary file (533 Bytes)
__pycache__/util.cpython-36.pyc
ADDED
Binary file (6.01 kB)
app.py
ADDED
@@ -0,0 +1,116 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import gradio as gr
from torch.nn import functional as F
import seaborn

import matplotlib
import platform

from transformers.file_utils import ModelOutput

if platform.system() == "Darwin":
    print("MacOS")
    matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io
from PIL import Image

import matplotlib.font_manager as fm
import util


# Global model buffer: holds the currently loaded checkpoint so the app
# only reloads when the user picks a different model from the dropdown.
MODEL_NAME = 'jason9693/SoongsilBERT-base-beep'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)

MODEL_BUF = {
    "name": MODEL_NAME,
    "tokenizer": tokenizer,
    "model": model,
    "config": config
}


# Register the bundled NanumGothicCoding fonts so Hangul renders in plots.
font_dir = ['./']
for font in fm.findSystemFonts(font_dir):
    print(font)
    fm.fontManager.addfont(font)
plt.rcParams["font.family"] = 'NanumGothicCoding'


def visualize_attention(sent, attention_matrix, n_words=10):
    def draw(data, x, y, ax):
        seaborn.heatmap(data,
                        xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
                        cbar=False, ax=ax)

    # Make a figure with 2x3 subplots, one per odd-numbered layer.
    fig = plt.figure(figsize=(16, 8))
    # fig.subplots_adjust(hspace=0.7, wspace=0.2)
    for i, layer in enumerate(range(1, 12, 2)):
        ax = fig.add_subplot(2, 3, i + 1)
        ax.set_title("Layer {}".format(layer))
        draw(attention_matrix[layer], sent if layer > 6 else [], sent if layer in [1, 7] else [], ax=ax)

    fig.tight_layout()
    plt.close()
    return fig


def change_model_name(name):
    MODEL_BUF["name"] = name
    MODEL_BUF["tokenizer"] = AutoTokenizer.from_pretrained(name)
    MODEL_BUF["model"] = AutoModelForSequenceClassification.from_pretrained(name)
    MODEL_BUF["config"] = AutoConfig.from_pretrained(name)


def predict(model_name, text):
    if model_name != MODEL_BUF["name"]:
        change_model_name(model_name)

    tokenizer = MODEL_BUF["tokenizer"]
    model = MODEL_BUF["model"]
    config = MODEL_BUF["config"]

    tokenized_text = tokenizer([text], return_tensors='pt')

    input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
    try:
        # Byte-level BPE models need their tokens mapped back to readable unicode.
        input_tokens = util.bytetokens_to_unicdode(input_tokens) if config.model_type in ['roberta', 'gpt', 'gpt2'] else input_tokens
    except KeyError:
        input_tokens = input_tokens

    model.eval()
    output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
    output = F.softmax(output, dim=-1)
    result = {}

    for idx, label in enumerate(output[0].detach().numpy()):
        result[config.id2label[idx]] = float(label)

    fig = visualize_attention(input_tokens, attention[0][0].detach().numpy())
    return result, fig


if __name__ == '__main__':
    text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'

    model_name_list = [
        'jason9693/SoongsilBERT-base-beep',
        "beomi/beep-klue-roberta-base-hate",
        "beomi/beep-koelectra-base-v3-discriminator-hate",
        "beomi/beep-KcELECTRA-base-hate"
    ]

    # Create a gradio app with a button that calls predict()
    app = gr.Interface(
        fn=predict,
        inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'], outputs=['label', 'plot'],
        examples=[[MODEL_BUF["name"], text], [MODEL_BUF["name"], "4=🦀 4≠🦀"]],
        title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
        description="Korean Hate Speech Classifier with Several Pretrained LM\nCurrent Supported Models:\n1. SoongsilBERT\n2. KcBERT(+KLUE)\n3. KcELECTRA\n4. KoELECTRA"
    )
    app.launch(inline=False)
attention.py
ADDED
@@ -0,0 +1,97 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import gradio as gr
from torch.nn import functional as F
import seaborn

import matplotlib
import platform

if platform.system() == "Darwin":
    print("MacOS")
    matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io
from PIL import Image

import matplotlib.font_manager as fm


import util

font_path = r'NanumGothicCoding.ttf'
fontprop = fm.FontProperties(fname=font_path, size=18)

plt.rcParams["font.family"] = 'NanumGothic'


def visualize_attention(sent, attention_matrix, n_words=10):
    def draw(data, x, y, ax):
        seaborn.heatmap(data,
                        xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
                        cbar=False, ax=ax)

    # Make a figure with 2x3 subplots, one per odd-numbered layer.
    fig = plt.figure(figsize=(16, 8))
    # fig.subplots_adjust(hspace=0.7, wspace=0.2)
    for i, layer in enumerate(range(1, 12, 2)):
        ax = fig.add_subplot(2, 3, i + 1)
        ax.set_title("Layer {}".format(layer))
        draw(attention_matrix[layer], sent if layer > 6 else [], sent if layer in [1, 7] else [], ax=ax)

    fig.tight_layout()
    plt.close()

    return fig


def predict(model_name, text):
    # Unlike app.py, this variant reloads the checkpoint on every call.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)
    print(config.id2label)

    tokenized_text = tokenizer([text], return_tensors='pt')

    input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
    print(input_tokens)
    input_tokens = util.bytetokens_to_unicdode(input_tokens) if config.model_type in ['roberta', 'gpt', 'gpt2'] else input_tokens

    model.eval()
    output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
    output = F.softmax(output, dim=-1)
    result = {}

    for idx, label in enumerate(output[0].detach().numpy()):
        result[config.id2label[idx]] = float(label)

    fig = visualize_attention(input_tokens, attention[0][0].detach().numpy())
    return result, fig


if __name__ == '__main__':
    model_name = 'jason9693/SoongsilBERT-beep-base'
    text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'
    # output = predict(model_name, text)
    # print(output)

    model_name_list = [
        'jason9693/SoongsilBERT-beep-base'
    ]

    # Create a gradio app with a button that calls predict()
    app = gr.Interface(
        fn=predict,
        inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'], outputs=['label', 'plot'],
        examples=[[model_name, text]],
        title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
        description="Korean Hate Speech Classifier with Several Pretrained LM\nCurrent Supported Model:\n1. SoongsilBERT"
    )
    # server_name/server_port belong to launch(), not the Interface constructor.
    app.launch(inline=False, server_name='0.0.0.0', server_port=26899)
bvz.py
ADDED
@@ -0,0 +1,10 @@
from transformers import AutoTokenizer, AutoModel
from bertviz import model_view

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased", output_attentions=True)
inputs = tokenizer.encode("The cat sat on the mat", return_tensors='pt')
outputs = model(inputs)
attention = outputs[-1]  # Output includes attention weights when output_attentions=True
tokens = tokenizer.convert_ids_to_tokens(inputs[0])
model_view(attention, tokens)
requirements.txt
ADDED
@@ -0,0 +1,5 @@
transformers==4.3.0
torch==1.6.0
matplotlib
seaborn
numpy
test_demp.py
ADDED
@@ -0,0 +1,38 @@
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np


def stock_forecast(final_year, companies, noise, show_legend, point_style):
    start_year = 2020
    x = np.arange(start_year, final_year + 1)
    year_count = x.shape[0]
    plt_format = ({"cross": "X", "line": "-", "circle": "o--"})[point_style]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for i, company in enumerate(companies):
        series = np.arange(0, year_count, dtype=float)
        series = series ** 2 * (i + 1)
        series += np.random.rand(year_count) * noise
        ax.plot(x, series, plt_format)
    if show_legend:
        plt.legend(companies)
    plt.close()

    return fig


iface = gr.Interface(
    stock_forecast,
    [
        gr.inputs.Radio([2025, 2030, 2035, 2040], label="Project to:"),
        gr.inputs.CheckboxGroup(["Google", "Microsoft", "Gradio"]),
        gr.inputs.Slider(1, 100),
        "checkbox",
        gr.inputs.Dropdown(["cross", "line", "circle"], label="Style")],
    gr.outputs.Image(plot=True, label="forecast"))

iface.test_launch()
if __name__ == "__main__":
    iface.launch(inline=False)
util.py
ADDED
@@ -0,0 +1,384 @@
from functools import lru_cache


@lru_cache()
def bytes_to_unicode_dict():
    """
    Returns a lookup table between utf-8 bytes and the printable unicode strings
    used by byte-level BPE. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    # Keys are the stand-in unicode characters, values the raw byte values.
    return dict(zip(cs, bs))

ORD_UNICODE_MAP = bytes_to_unicode_dict()


@lru_cache()
def byte_to_char(bytestr):
    # Map each stand-in character back to its byte, then decode the bytes as utf-8.
    return bytearray([ORD_UNICODE_MAP[c] for c in bytestr]).decode("utf-8", errors="replace")

# @lru_cache()
def bytetokens_to_unicdode(byte_tokens: list):
    return [byte_to_char(token) for token in byte_tokens]


if __name__ == '__main__':
    tokens = [
        '<s>', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī',
        '(', 'ëĮĢíijľ', 'Ġë°±', 'ìĥģ', 'ìĹ½', ')',
        'ê°Ģ', 'Ġìĺ¬íķ´', 'Ġ8', 'ìĽĶ', 'Ġ기ì¤Ģ', 'Ġëĭ¤ìĪĺ',
        'Ġê¶Į', 'ìľĦ', 'ĠìŀĪëĬĶ', 'Ġê¸Ģë¡ľë²Į', 'ĠíķĻ', 'íļĮìĹIJìĦľ',
        'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠìĿ¸ê³µ', 'ì§Ģ',
        'ëĬ¥', '(', 'A', 'I', ')', 'Ġëħ¼ë¬¸',
        'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł', 'Ġ9', 'ìĿ¼',
        'Ġë°ĿíĺĶ', 'ëĭ¤', '.', 'Ġì§ĢëĤľíķ´', 'Ġëĵ±', 'ìŀ¬',
        'íķľ', 'Ġ13', 'ê±´ë', '³´ëĭ¤', 'Ġ3', 'ê±´',
        'Ġë§İìĿĢ', 'Ġëħ¼ë¬¸', 'ìĿ´', 'Ġë°ĺ', 'ëħĦ', 'ìŬ',
        'Ġë§ĮìĹIJ', 'Ġì±Ħ', 'íĥĿ', 'ëIJIJëĭ¤', '.', 'Ġì¹´ì¹´ìĺ¤',
        'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī', '(', 'ìĿ´',
        'íķĺ', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', ')', 'ëĬĶ',
        'ĠA', 'I', 'ĠìĹ°êµ¬', 'ĠìĦ±', '과를', 'ĠìĿ´',
        'ìĸ´ê°Ģ', '기', 'ĠìľĦíķ´', 'ĠìĿ¸ìŀ¬', 'ĠíĻķë³´', 'ìĹIJ',
        'ĠìĨį', 'ëıĦ를', 'ĠëĨĴìĿ´', 'ê²łëĭ¤ëĬĶ', 'Ġë°©', '침',
        'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ',
        'íĦ°', 'ëĬĶ', 'Ġ8', 'ìĽĶ', 'ĠìŀIJìĹ°', 'ìĸ´',
        'ì²ĺ리', 'Ġë¶Ħìķ¼', 'ìĿĺ', 'Ġê¸Ģë¡ľë²Į', 'Ġíĥij', 'ĠíķĻ',
        'íļĮ', 'ìĿ¸', "Ġ'", 'A', 'C', 'L',
        '-', 'I', 'J', 'C', 'N', 'L',
        'P', "'", 'ìĹIJ', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġë°ľíijľ',
        'íķľ', 'ĠìĤ¬ë¡Ģ', 'ê¹Įì§Ģ', 'Ġíķ©', 'íķ´', 'Ġìĺ¬íķ´',
        'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠA', 'I',
        'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł', 'Ġë°ĿíĺĶ',
        'ëĭ¤', '.', 'ĠìĿ´', 'Ġëħ¼ë¬¸', 'ìĿĢ', 'ĠìĿ¸ëıĦ',
        'ë©Ķ', 'ìĿ¸', '(', 'in', '-', 'd',
        'om', 'a', 'in', ')', 'Ġìĥĺ', 'íĶĮ',
        'ìĿĦ', 'ĠìĤ¬ìļ©', 'íķ´', 'ĠìŀIJìĹ°', 'ìĸ´', 'Ġ공격',
        'Ġë°©ìĭĿìľ¼ë¡ľ', 'ĠìķĦìĽĥ', 'ìĺ¤', 'ë¸Į', 'ëıĦ', 'ë©Ķ',
        'ìĿ¸', '(', 'out', '-', 'of', '-',
        'd', 'om', 'a', 'in', ')', 'Ġìĥĺ',
        'íĶĮ', 'ìĿĦ', 'ĠìŀIJëıĻ', 'ìľ¼ë¡ľ', 'ĠìĥĿ', 'ìĦ±',
        ',', 'Ġë¶Ħ', 'ë¥ĺ', 'Ġ모ëį¸', 'ìĿĺ', 'Ġê°IJ',
        'ì§Ģ', 'ĠëĬ¥ëł¥ìĿĦ', 'Ġíĸ¥', 'ìĥģ', 'ìĭľíĤ¤ëĬĶ', 'ĠëĤ´ìļ©',
        'ìĿĺ', 'Ġëħ¼ë¬¸', 'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ',
        '7', 'ìĽĶ', 'ìĹIJëĬĶ', 'Ġ머', 'ìĭł', '룬',
        'ëĭĿ', 'ĠíķĻ', 'íļĮ', "Ġ'", 'I', 'C',
        'M', 'L', "'", 'ìĹIJ', 'Ġíļ¨ìľ¨', 'ìłģìĿ¸',
        'Ġê³ł', 'íĴĪ', 'ì§Ī', 'ĠìĿĮ', 'ìĦ±', 'íķ©',
        'ìĦ±ìĿ´', 'Ġê°ĢëĬ¥íķľ', "Ġ'", 'ìĹĶ', 'ëĵľ', 'ĠíĪ¬',
        'ĠìĹĶ', 'ëĵľ', '(', 'en', 'd', '-',
        't', 'o', '-', 'en', 'd', ')',
        "'", 'Ġ모ëį¸', 'ìĿĦ', 'ĠìłľìķĪ', 'íķĺëĬĶ', 'Ġëħ¼ë¬¸',
        'ìĿĦ', 'Ġë°ľíijľ', 'íĸĪëĭ¤', '.', 'Ġ6', 'ìĽĶ',
        'ìĹIJëĬĶ', 'ĠìĿĮ', 'íĸ¥', '·', 'ìĿĮ', 'ìĦ±',
        'Ġìĭł', 'íĺ¸', 'ì²ĺ리', 'Ġë¶Ħìķ¼', 'ĠíķĻ', 'ìĪł',
        'ëĮĢíļĮ', "Ġ'", 'I', 'C', 'A', 'S',
        'S', 'P', "'", 'ìĹIJ', 'ĠëĮĢ', 'ê·ľëª¨',
        'Ġíħ', 'į', 'ìĬ¤íĬ¸', 'Ġì½Ķ', 'íį¼ìĬ¤', '(',
        'ìĸ¸', 'ìĸ´', 'ĠìĹ°', '구를', 'ĠìľĦíķ´', 'Ġíħ',
        'į', 'ìĬ¤íĬ¸ë¥¼', 'Ġì»´íĵ¨íĦ°', 'ê°Ģ', 'ĠìĿ½ìĿĦ', 'ĠìĪĺ',
        'ĠìŀĪëĬĶ', 'Ġíĺķíĥľë¡ľ', 'Ġ모ìķĦ', 'ĠëĨĵìĿĢ', 'Ġìĸ¸ìĸ´', 'ĠìŀIJë£Į',
        ')', 'Ġìłķë³´', 'ĠíķĻìĬµ', 'ìĹIJ', 'ĠëĮĢíķľ', 'Ġëħ¼ë¬¸',
        'Ġ1', 'ê±´ìĿĦ', 'Ġìĭ¤', 'ìĹĪëĭ¤', '.', 'Ċ',
        '</s>'
    ]

    import time

    start = time.time()
    for i in range(1000):
        result = bytetokens_to_unicdode(tokens)
    end = time.time()

    print(result)

    print(f'time: {end-start}')
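
As a usage note (hypothetical example, not part of the commit): since `bytetokens_to_unicdode` reverses the GPT-2-style byte-to-unicode table, byte-level BPE tokens like the ones in the test list decode back to readable Korean:

    # Hypothetical round-trip check for util.bytetokens_to_unicdode.
    import util

    pieces = util.bytetokens_to_unicdode(['ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°'])
    print(''.join(pieces))  # expected to print the readable string: 카카오엔터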