|
""" |
|
|
|
## characters |
|
|
|
- alphanumeric characters |
|
- numeric characters |
|
- special characters: A special character is a character that is not an alphabetic or numeric character. |
|
- ASCII control characters |
|
- punctuation marks |
|
- accent marks |
|
- mathematical symbols
|
- whitespace: |
|
- https://en.wikipedia.org/wiki/Whitespace_character |
|
- https://emptycharacter.com/ |
|
|
|
|
|
https://www.computerhope.com/jargon/s/specchar.htm |
|
""" |
|
|
|
# Example inputs keyed by UI language ("en" / "zh").
# Each row: [example_text, *tokenizer_names].
# NOTE(review): the example text starts with a "<type>: " label (consumed by
# `example_types` below); the trailing names presumably select tokenizers to
# compare on that text — confirm against the code that consumes `examples`.
examples = {

    "en": [

        ["number: (10086 + 98) = 100184", "llama", "bloom"],

        ["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "bert_base_cased"],



        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "gemma_7b", "llama"],

        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],

        # special-token probe: no tokenizer names attached to this row
        ["special: [PAD] [UNK] [CLS] [SEP] [MASK] "],

    ],

    "zh": [

        ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],

        ["标点测试:,。!?;", "baichuan_7b", "llama"],

        ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],

        ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],

        ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],

    ]

}
|
|
|
# (tokenizer1, tokenizer2, text) triples fed to get_more_example() below to
# build pairwise-comparison URLs.  An empty text produces a URL with an empty
# `text` query parameter.  Groups (by blank lines) follow tokenizer families.
more_examples = [

    # BERT family

    ("bert_base_cased", "bert_base_uncased", ""),

    ("bert_base_cased", "clue", ""),

    # LLaMA / Baichuan family

    ("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),

    ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),

    ("llama", "chinese_llama2", ""),

    ("chinese_llama", "chinese_llama2", ""),

    # GLM family

    ("glm", "chatglm1", ""),

    ("chatglm1", "chatglm2", ""),

    # GPT-2 family

    ("gpt2", "moss", ""),

    # NOTE(review): looks like an unfinished placeholder — it yields a URL
    # with all-empty parameters; confirm whether it should be removed.
    ("", "", ""),

    # Qwen / GPT-3.5

    ("qwen", "gpt_35_turbo", ""),



]
|
|
|
# Active UI language; selects which list in `examples` drives the UI.
lang = "en"


# Leading "<type>" label of every example text for the active language,
# e.g. "number", "whitespace" (everything before the first ASCII colon).
example_types = [text.partition(":")[0] for text, *_ in examples[lang]]
|
|
|
|
|
def example_fn(example_idx):
    """Return the example row at *example_idx* for the active language `lang`."""
    rows = examples[lang]
    return rows[example_idx]
|
|
|
|
|
def get_more_example(pairs=None):
    """Print and return tokenizer-comparison URLs for the arena space.

    Parameters
    ----------
    pairs : iterable of (tokenizer1, tokenizer2, text) triples, optional
        Defaults to the module-level ``more_examples``.

    Returns
    -------
    list[str]
        The URLs, in input order (also printed, one per line).
    """
    import urllib.parse

    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
    if pairs is None:
        pairs = more_examples
    urls = []
    for tokenizer1, tokenizer2, text in pairs:
        # quote() (not quote_plus) so spaces become %20 and newlines %0A;
        # tokenizer names are passed through unquoted, as before.
        full_url = f'{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}'
        print(full_url)
        urls.append(full_url)
    return urls
|
|
|
|
|
# Script entry point: print every preconfigured comparison URL.
if __name__ == "__main__":

    get_more_example()
|
|