File size: 3,126 Bytes
a6c67ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9495a4f
 
24b4aa5
 
9495a4f
24b4aa5
d27a756
480ae5d
a6c67ec
9495a4f
a37f943
9495a4f
 
 
 
 
 
 
f4973d4
 
480ae5d
 
 
f4973d4
 
 
 
 
480ae5d
f4973d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f833af
 
 
 
9495a4f
 
1f833af
f4973d4
 
 
 
 
480ae5d
f4973d4
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""

## characters

- alphabetic characters
- numeric characters
- special characters: A special character is a character that is not an alphabetic or numeric character.
    - ASCII control characters
    - punctuation marks
    - accent marks
    - mathematical symbols
    - whitespace:
        - https://en.wikipedia.org/wiki/Whitespace_character
        - https://emptycharacter.com/


https://www.computerhope.com/jargon/s/specchar.htm
"""

# Sample inputs keyed by language code ("en", "zh").
# Each row is [text, name, name]; the two names are presumably the tokenizers to
# compare on that text (verify against the caller). The part of the text before
# the first ":" is what `example_types` extracts as the row's label.
examples = {
    "en": [
        ["number: (10086 + 98) = 100184", "llama", "bloom"],
        ["whitespace:  2spaces        8spaces\t1tab\t\t2tab\n1newline", "llama", "bert_base_cased"],  # chatglm has blank_n tokens; bert drops the spaces
        # Further punctuation characters (not part of the active example below):
        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"],  # llama's vocabulary is rather small
        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
        # Disabled example for special tokens:
        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|endoftext|>", "", ""],
    ],
    "zh": [
        ["空格测试:  2个空格        8个空格", "llama", "chatglm2_6b"],  # chatglm has blank_n tokens
        ["标点测试:,。!?;", "baichuan_7b", "llama"],
        ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
    ]
}

# Extra tokenizer pairs to compare. Every row has the same four fields,
# (tokenizer1, tokenizer2, text, comment), which is the order in which
# get_more_example() reads them. Rows without a remark carry an empty comment
# so that all rows can be unpacked uniformly.
more_examples = [
    # BERT family
    ("bert_base_cased", "bert_base_uncased", "", ""),  # clue VS kplug, bert VS clue
    ("bert_base_cased", "clue", "", "增加了[]()"),
    ("clue", "kplug", "", ""),

    # LLaMA family (sentencepiece-based)
    ("baichuan", "baichuan2", "baichuan2支持多空格   ,多个换行\n\n\n,do not add dummy prefix as Baichuan1", ""),
    ("llama", "baichuan2", "baichuan2支持多空格   ,多个换行\n\n", ""),
    ("llama", "chinese_llama2", "", ""),
    # NOTE(review): the third field below reads like a remark rather than sample
    # text; it is kept in the text position to preserve the generated URL — confirm.
    ("llama", "llama3", "扩充词典", ""),
    ("chinese_llama", "chinese_llama2", "", ""),

    # GLM family (sentencepiece-based)
    ("glm", "chatglm1", "", ""),
    ("chatglm1", "chatglm2", "", ""),

    # GPT-2 family
    ("gpt2", "moss", "", ""),
    ("", "", "", ""),  # empty placeholder row

    # OpenAI family (tiktoken)
    ("qwen", "gpt_35_turbo", "", ""),

]

# Key into `examples` selecting which language's rows are active.
lang = "en"

# Label of each active example: the part of its text before the first ":".
example_types = [entry[0].partition(":")[0] for entry in examples[lang]]


def example_fn(example_idx):
    """Return the example row at ``example_idx`` for the active language.

    The row is looked up in ``examples[lang]``; an out-of-range index raises
    ``IndexError`` as with any list lookup.
    """
    active_rows = examples[lang]
    return active_rows[example_idx]


def get_more_example(rows=None):
    """Print one tokenizer-arena comparison URL per example row.

    Args:
        rows: Iterable of ``(tokenizer1, tokenizer2, text[, comment])`` tuples.
            Defaults to the module-level ``more_examples``. Fields after
            ``text`` are ignored, so rows with or without a trailing comment
            are both accepted.

    Returns:
        None. Each URL is written to standard output on its own line.

    Raises:
        ValueError: If a row has fewer than three fields.
    """
    import urllib.parse

    if rows is None:
        rows = more_examples

    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
    # Starred unpacking tolerates rows that omit the optional comment field;
    # a strict four-name unpack would raise ValueError on three-element rows.
    for tokenizer1, tokenizer2, text, *_comment in rows:
        # The text may contain spaces, newlines and non-ASCII characters, so it
        # is percent-encoded before being placed in the query string.
        full_url = f'{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}'
        print(full_url)


# Script entry point: when this file is run directly, print the comparison URLs.
if __name__ == "__main__":
    get_more_example()