# tokenizer-arena / examples.py
# (Hugging Face Spaces page header captured by the scrape — "xu-song's picture /
#  update / 9495a4f / raw / history / blame / 1.3 kB" — kept as a comment so the
#  file is valid Python.)
examples = {
"en": [
["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm_6b"], # chatglm 有blank_n,
# !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
["digits: (10086 + 98) = 100184", "baichuan", "llama"]
]
,
"zh": [
["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
["标点测试:,。!?;", "baichuan_7b", "llama"],
["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
]
}
def example_fn(example_idx, lang="en"):
    """Return the example at *example_idx* for the given language.

    Generalized: the original always read the ``"en"`` list even though
    ``examples`` also has a ``"zh"`` key; ``lang`` defaults to ``"en"`` so
    existing callers are unaffected.

    Args:
        example_idx: Index into the language's example list.
        lang: Key into the module-level ``examples`` dict (``"en"`` or ``"zh"``).

    Returns:
        The ``[text, tokenizer_a, tokenizer_b]`` list at that index.

    Raises:
        KeyError: If *lang* is not a known language key.
        IndexError: If *example_idx* is out of range for that language.
    """
    return examples[lang][example_idx]