Duplicate from jason9693/KoreanHateSpeechClassifier
Co-authored-by: Yang-Kichang <jason9693@users.noreply.huggingface.co>
- .gitattributes +27 -0
- NanumGothicCoding-Bold.ttf +0 -0
- NanumGothicCoding.ttf +0 -0
- README.md +38 -0
- __pycache__/bertviz.cpython-38.pyc +0 -0
- __pycache__/util.cpython-36.pyc +0 -0
- app.py +116 -0
- attention.py +97 -0
- bvz.py +10 -0
- requirements.txt +5 -0
- test_demp.py +38 -0
- util.py +384 -0
.gitattributes
ADDED
@@ -0,0 +1,27 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
NanumGothicCoding-Bold.ttf
ADDED
Binary file (1.8 MB)
NanumGothicCoding.ttf
ADDED
Binary file (2.78 MB)
README.md
ADDED
@@ -0,0 +1,38 @@
---
title: KoreanHateSpeechClassifier
emoji: ⚡
colorFrom: red
colorTo: purple
sdk: gradio
app_file: app.py
pinned: false
duplicated_from: jason9693/KoreanHateSpeechClassifier
---

# Configuration

`title`: _string_
Display title for the Space

`emoji`: _string_
Space emoji (emoji-only character allowed)

`colorFrom`: _string_
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)

`colorTo`: _string_
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)

`sdk`: _string_
Can be either `gradio` or `streamlit`

`sdk_version`: _string_
Only applicable for `streamlit` SDK.
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.

`app_file`: _string_
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
Path is relative to the root of the repository.

`pinned`: _boolean_
Whether the Space stays on top of your list.
__pycache__/bertviz.cpython-38.pyc
ADDED
Binary file (533 Bytes)
__pycache__/util.cpython-36.pyc
ADDED
Binary file (6.01 kB)
app.py
ADDED
@@ -0,0 +1,116 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import gradio as gr
from torch.nn import functional as F
import seaborn

import matplotlib
import platform

from transformers.file_utils import ModelOutput

if platform.system() == "Darwin":
    print("MacOS")
    matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io
from PIL import Image

import matplotlib.font_manager as fm
import util


# Global model buffer: holds the currently loaded checkpoint so the app
# only reloads when the user picks a different model from the dropdown.
MODEL_NAME = 'jason9693/SoongsilBERT-base-beep'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)

MODEL_BUF = {
    "name": MODEL_NAME,
    "tokenizer": tokenizer,
    "model": model,
    "config": config
}


# Register the bundled NanumGothicCoding fonts so Hangul renders in plots.
font_dir = ['./']
for font in fm.findSystemFonts(font_dir):
    print(font)
    fm.fontManager.addfont(font)
plt.rcParams["font.family"] = 'NanumGothicCoding'


def visualize_attention(sent, attention_matrix, n_words=10):
    def draw(data, x, y, ax):
        seaborn.heatmap(data,
                        xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
                        cbar=False, ax=ax)

    # Make a figure with 2x3 subplots, one per odd-numbered layer.
    fig = plt.figure(figsize=(16, 8))
    # fig.subplots_adjust(hspace=0.7, wspace=0.2)
    for i, layer in enumerate(range(1, 12, 2)):
        ax = fig.add_subplot(2, 3, i + 1)
        ax.set_title("Layer {}".format(layer))
        draw(attention_matrix[layer], sent if layer > 6 else [], sent if layer in [1, 7] else [], ax=ax)

    fig.tight_layout()
    plt.close()
    return fig


def change_model_name(name):
    MODEL_BUF["name"] = name
    MODEL_BUF["tokenizer"] = AutoTokenizer.from_pretrained(name)
    MODEL_BUF["model"] = AutoModelForSequenceClassification.from_pretrained(name)
    MODEL_BUF["config"] = AutoConfig.from_pretrained(name)


def predict(model_name, text):
    if model_name != MODEL_BUF["name"]:
        change_model_name(model_name)

    tokenizer = MODEL_BUF["tokenizer"]
    model = MODEL_BUF["model"]
    config = MODEL_BUF["config"]

    tokenized_text = tokenizer([text], return_tensors='pt')

    input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
    try:
        # Byte-level BPE models need their tokens mapped back to readable unicode.
        input_tokens = util.bytetokens_to_unicdode(input_tokens) if config.model_type in ['roberta', 'gpt', 'gpt2'] else input_tokens
    except KeyError:
        input_tokens = input_tokens

    model.eval()
    output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
    output = F.softmax(output, dim=-1)
    result = {}

    for idx, label in enumerate(output[0].detach().numpy()):
        result[config.id2label[idx]] = float(label)

    fig = visualize_attention(input_tokens, attention[0][0].detach().numpy())
    return result, fig


if __name__ == '__main__':
    text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'

    model_name_list = [
        'jason9693/SoongsilBERT-base-beep',
        "beomi/beep-klue-roberta-base-hate",
        "beomi/beep-koelectra-base-v3-discriminator-hate",
        "beomi/beep-KcELECTRA-base-hate"
    ]

    # Create a gradio app with a button that calls predict()
    app = gr.Interface(
        fn=predict,
        inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'], outputs=['label', 'plot'],
        examples=[[MODEL_BUF["name"], text], [MODEL_BUF["name"], "4=🦀 4≠🦀"]],
        title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
        description="Korean Hate Speech Classifier with Several Pretrained LM\nCurrent Supported Models:\n1. SoongsilBERT\n2. KcBERT(+KLUE)\n3. KcELECTRA\n4. KoELECTRA"
    )
    app.launch(inline=False)
attention.py
ADDED
@@ -0,0 +1,97 @@
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import gradio as gr
from torch.nn import functional as F
import seaborn

import matplotlib
import platform

if platform.system() == "Darwin":
    print("MacOS")
    matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io
from PIL import Image

import matplotlib.font_manager as fm


import util

font_path = r'NanumGothicCoding.ttf'
fontprop = fm.FontProperties(fname=font_path, size=18)

plt.rcParams["font.family"] = 'NanumGothic'


def visualize_attention(sent, attention_matrix, n_words=10):
    def draw(data, x, y, ax):
        seaborn.heatmap(data,
                        xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
                        cbar=False, ax=ax)

    # Make a figure with 2x3 subplots, one per odd-numbered layer.
    fig = plt.figure(figsize=(16, 8))
    # fig.subplots_adjust(hspace=0.7, wspace=0.2)
    for i, layer in enumerate(range(1, 12, 2)):
        ax = fig.add_subplot(2, 3, i + 1)
        ax.set_title("Layer {}".format(layer))
        draw(attention_matrix[layer], sent if layer > 6 else [], sent if layer in [1, 7] else [], ax=ax)

    fig.tight_layout()
    plt.close()

    return fig


def predict(model_name, text):
    # Unlike app.py, this variant reloads the checkpoint on every call.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)
    print(config.id2label)

    tokenized_text = tokenizer([text], return_tensors='pt')

    input_tokens = tokenizer.convert_ids_to_tokens(tokenized_text.input_ids[0])
    print(input_tokens)
    input_tokens = util.bytetokens_to_unicdode(input_tokens) if config.model_type in ['roberta', 'gpt', 'gpt2'] else input_tokens

    model.eval()
    output, attention = model(**tokenized_text, output_attentions=True, return_dict=False)
    output = F.softmax(output, dim=-1)
    result = {}

    for idx, label in enumerate(output[0].detach().numpy()):
        result[config.id2label[idx]] = float(label)

    fig = visualize_attention(input_tokens, attention[0][0].detach().numpy())
    return result, fig


if __name__ == '__main__':
    model_name = 'jason9693/SoongsilBERT-beep-base'
    text = '읿딴걸 홍볿글 읿랉곭 쌑젩낄고 앉앟있냩'
    # output = predict(model_name, text)
    # print(output)

    model_name_list = [
        'jason9693/SoongsilBERT-beep-base'
    ]

    # Create a gradio app with a button that calls predict()
    app = gr.Interface(
        fn=predict,
        inputs=[gr.inputs.Dropdown(model_name_list, label="Model Name"), 'text'], outputs=['label', 'plot'],
        examples=[[model_name, text]],
        title="한국어 혐오성 발화 분류기 (Korean Hate Speech Classifier)",
        description="Korean Hate Speech Classifier with Several Pretrained LM\nCurrent Supported Model:\n1. SoongsilBERT"
    )
    # server_name/server_port belong to launch(), not the Interface constructor.
    app.launch(inline=False, server_name='0.0.0.0', server_port=26899)
bvz.py
ADDED
@@ -0,0 +1,10 @@
from transformers import AutoTokenizer, AutoModel
from bertviz import model_view

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased", output_attentions=True)
inputs = tokenizer.encode("The cat sat on the mat", return_tensors='pt')
outputs = model(inputs)
attention = outputs[-1]  # Output includes attention weights when output_attentions=True
tokens = tokenizer.convert_ids_to_tokens(inputs[0])
model_view(attention, tokens)
requirements.txt
ADDED
@@ -0,0 +1,5 @@
transformers==4.3.0
torch==1.6.0
matplotlib
seaborn
numpy
test_demp.py
ADDED
@@ -0,0 +1,38 @@
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np


def stock_forecast(final_year, companies, noise, show_legend, point_style):
    start_year = 2020
    x = np.arange(start_year, final_year + 1)
    year_count = x.shape[0]
    plt_format = ({"cross": "X", "line": "-", "circle": "o--"})[point_style]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for i, company in enumerate(companies):
        series = np.arange(0, year_count, dtype=float)
        series = series ** 2 * (i + 1)
        series += np.random.rand(year_count) * noise
        ax.plot(x, series, plt_format)
    if show_legend:
        plt.legend(companies)
    plt.close()

    return fig


iface = gr.Interface(
    stock_forecast,
    [
        gr.inputs.Radio([2025, 2030, 2035, 2040], label="Project to:"),
        gr.inputs.CheckboxGroup(["Google", "Microsoft", "Gradio"]),
        gr.inputs.Slider(1, 100),
        "checkbox",
        gr.inputs.Dropdown(["cross", "line", "circle"], label="Style")],
    gr.outputs.Image(plot=True, label="forecast"))

iface.test_launch()
if __name__ == "__main__":
    iface.launch(inline=False)
util.py
ADDED
@@ -0,0 +1,384 @@
from functools import lru_cache


@lru_cache()
def bytes_to_unicode_dict():
    """
    Returns a lookup table between utf-8 bytes and the printable unicode strings
    used by byte-level BPE. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    # Keys are the stand-in unicode characters, values the raw byte values.
    return dict(zip(cs, bs))

ORD_UNICODE_MAP = bytes_to_unicode_dict()


@lru_cache()
def byte_to_char(bytestr):
    # Map each stand-in character back to its byte, then decode the bytes as utf-8.
    return bytearray([ORD_UNICODE_MAP[c] for c in bytestr]).decode("utf-8", errors="replace")

# @lru_cache()
def bytetokens_to_unicdode(byte_tokens: list):
    return [byte_to_char(token) for token in byte_tokens]


if __name__ == '__main__':
    tokens = [
        '<s>', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī',
        '(', 'ëĮĢíijľ', 'Ġë°±', 'ìĥģ', 'ìĹ½', ')',
        'ê°Ģ', 'Ġìĺ¬íķ´', 'Ġ8', 'ìĽĶ', 'Ġ기ì¤Ģ', 'Ġëĭ¤ìĪĺ',
        'Ġê¶Į', 'ìľĦ', 'ĠìŀĪëĬĶ', 'Ġê¸Ģë¡ľë²Į', 'ĠíķĻ', 'íļĮìĹIJìĦľ',
        'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠìĿ¸ê³µ', 'ì§Ģ',
        'ëĬ¥', '(', 'A', 'I', ')', 'Ġëħ¼ë¬¸',
        'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł', 'Ġ9', 'ìĿ¼',
        'Ġë°ĿíĺĶ', 'ëĭ¤', '.', 'Ġì§ĢëĤľíķ´', 'Ġëĵ±', 'ìŀ¬',
        'íķľ', 'Ġ13', 'ê±´ë', '³´ëĭ¤', 'Ġ3', 'ê±´',
        'Ġë§İìĿĢ', 'Ġëħ¼ë¬¸', 'ìĿ´', 'Ġë°ĺ', 'ëħĦ', 'ìŬ',
        'Ġë§ĮìĹIJ', 'Ġì±Ħ', 'íĥĿ', 'ëIJIJëĭ¤', '.', 'Ġì¹´ì¹´ìĺ¤',
        'ìĹĶ', 'íĦ°', 'íĶĦëĿ¼ìĿ´', 'ì¦Ī', '(', 'ìĿ´',
        'íķĺ', 'Ġì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°', ')', 'ëĬĶ',
        'ĠA', 'I', 'ĠìĹ°êµ¬', 'ĠìĦ±', '과를', 'ĠìĿ´',
        'ìĸ´ê°Ģ', '기', 'ĠìľĦíķ´', 'ĠìĿ¸ìŀ¬', 'ĠíĻķë³´', 'ìĹIJ',
        'ĠìĨį', 'ëıĦ를', 'ĠëĨĴìĿ´', 'ê²łëĭ¤ëĬĶ', 'Ġë°©', '침',
        'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ', 'ì¹´ì¹´ìĺ¤', 'ìĹĶ',
        'íĦ°', 'ëĬĶ', 'Ġ8', 'ìĽĶ', 'ĠìŀIJìĹ°', 'ìĸ´',
        'ì²ĺ리', 'Ġë¶Ħìķ¼', 'ìĿĺ', 'Ġê¸Ģë¡ľë²Į', 'Ġíĥij', 'ĠíķĻ',
        'íļĮ', 'ìĿ¸', "Ġ'", 'A', 'C', 'L',
        '-', 'I', 'J', 'C', 'N', 'L',
        'P', "'", 'ìĹIJ', 'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġë°ľíijľ',
        'íķľ', 'ĠìĤ¬ë¡Ģ', 'ê¹Įì§Ģ', 'Ġíķ©', 'íķ´', 'Ġìĺ¬íķ´',
        'Ġì´Ŀ', 'Ġ16', 'ê±´', 'ìĿĺ', 'ĠA', 'I',
        'Ġëħ¼ë¬¸', 'ìĿĦ', 'Ġëĵ±', 'ìŀ¬', 'íĸĪëĭ¤ê³ł', 'Ġë°ĿíĺĶ',
        'ëĭ¤', '.', 'ĠìĿ´', 'Ġëħ¼ë¬¸', 'ìĿĢ', 'ĠìĿ¸ëıĦ',
        'ë©Ķ', 'ìĿ¸', '(', 'in', '-', 'd',
        'om', 'a', 'in', ')', 'Ġìĥĺ', 'íĶĮ',
        'ìĿĦ', 'ĠìĤ¬ìļ©', 'íķ´', 'ĠìŀIJìĹ°', 'ìĸ´', 'Ġ공격',
        'Ġë°©ìĭĿìľ¼ë¡ľ', 'ĠìķĦìĽĥ', 'ìĺ¤', 'ë¸Į', 'ëıĦ', 'ë©Ķ',
        'ìĿ¸', '(', 'out', '-', 'of', '-',
        'd', 'om', 'a', 'in', ')', 'Ġìĥĺ',
        'íĶĮ', 'ìĿĦ', 'ĠìŀIJëıĻ', 'ìľ¼ë¡ľ', 'ĠìĥĿ', 'ìĦ±',
        ',', 'Ġë¶Ħ', 'ë¥ĺ', 'Ġ모ëį¸', 'ìĿĺ', 'Ġê°IJ',
        'ì§Ģ', 'ĠëĬ¥ëł¥ìĿĦ', 'Ġíĸ¥', 'ìĥģ', 'ìĭľíĤ¤ëĬĶ', 'ĠëĤ´ìļ©',
        'ìĿĺ', 'Ġëħ¼ë¬¸', 'ìĿ´ëĭ¤', '.', 'Ċ', 'Ċ',
        '7', 'ìĽĶ', 'ìĹIJëĬĶ', 'Ġ머', 'ìĭł', '룬',
        'ëĭĿ', 'ĠíķĻ', 'íļĮ', "Ġ'", 'I', 'C',
        'M', 'L', "'", 'ìĹIJ', 'Ġíļ¨ìľ¨', 'ìłģìĿ¸',
        'Ġê³ł', 'íĴĪ', 'ì§Ī', 'ĠìĿĮ', 'ìĦ±', 'íķ©',
        'ìĦ±ìĿ´', 'Ġê°ĢëĬ¥íķľ', "Ġ'", 'ìĹĶ', 'ëĵľ', 'ĠíĪ¬',
        'ĠìĹĶ', 'ëĵľ', '(', 'en', 'd', '-',
        't', 'o', '-', 'en', 'd', ')',
        "'", 'Ġ모ëį¸', 'ìĿĦ', 'ĠìłľìķĪ', 'íķĺëĬĶ', 'Ġëħ¼ë¬¸',
        'ìĿĦ', 'Ġë°ľíijľ', 'íĸĪëĭ¤', '.', 'Ġ6', 'ìĽĶ',
        'ìĹIJëĬĶ', 'ĠìĿĮ', 'íĸ¥', '·', 'ìĿĮ', 'ìĦ±',
        'Ġìĭł', 'íĺ¸', 'ì²ĺ리', 'Ġë¶Ħìķ¼', 'ĠíķĻ', 'ìĪł',
        'ëĮĢíļĮ', "Ġ'", 'I', 'C', 'A', 'S',
        'S', 'P', "'", 'ìĹIJ', 'ĠëĮĢ', 'ê·ľëª¨',
        'Ġíħ', 'į', 'ìĬ¤íĬ¸', 'Ġì½Ķ', 'íį¼ìĬ¤', '(',
        'ìĸ¸', 'ìĸ´', 'ĠìĹ°', '구를', 'ĠìľĦíķ´', 'Ġíħ',
        'į', 'ìĬ¤íĬ¸ë¥¼', 'Ġì»´íĵ¨íĦ°', 'ê°Ģ', 'ĠìĿ½ìĿĦ', 'ĠìĪĺ',
        'ĠìŀĪëĬĶ', 'Ġíĺķíĥľë¡ľ', 'Ġ모ìķĦ', 'ĠëĨĵìĿĢ', 'Ġìĸ¸ìĸ´', 'ĠìŀIJë£Į',
        ')', 'Ġìłķë³´', 'ĠíķĻìĬµ', 'ìĹIJ', 'ĠëĮĢíķľ', 'Ġëħ¼ë¬¸',
        'Ġ1', 'ê±´ìĿĦ', 'Ġìĭ¤', 'ìĹĪëĭ¤', '.', 'Ċ',
        '</s>'
    ]

    import time

    start = time.time()
    for i in range(1000):
        result = bytetokens_to_unicdode(tokens)
    end = time.time()

    print(result)

    print(f'time: {end-start}')
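
As a usage note (hypothetical example, not part of the commit): since `bytetokens_to_unicdode` reverses the GPT-2-style byte-to-unicode table, byte-level BPE tokens like the ones in the test list decode back to readable Korean:

    # Hypothetical round-trip check for util.bytetokens_to_unicdode.
    import util

    pieces = util.bytetokens_to_unicdode(['ì¹´ì¹´ìĺ¤', 'ìĹĶ', 'íĦ°'])
    print(''.join(pieces))  # expected to print the readable string: 카카오엔터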