Spaces:

yhavinga
/

dutch-tokenizer-arena

Runtime error

App Files Files Community

dutch-tokenizer-arena / examples.py

xu-song

add compression leaderboard

1b7fc74 6 months ago

raw

history blame contribute delete

3.16 kB

	"""

	## characters

	- alphanumeric characters
	- numeric characters
	- special characters: A special character is a character that is not an alphabetic or numeric character.
	- ASCII control characters
	- punctuation marks
	- accent marks
	- mathematical symbols
	- whitespace:
	- https://en.wikipedia.org/wiki/Whitespace_character
	- https://emptycharacter.com/


	https://www.computerhope.com/jargon/s/specchar.htm
	"""

	# Example inputs per language. Each row is [text, tokenizer_a, tokenizer_b];
	# the text starts with a short label followed by a colon.
	examples = {
	    "en": [
	        ["number: (10086 + 98) = 100184", "llama", "bloom"],
	        # Each whitespace label is preceded by the whitespace it names
	        # (2 spaces, 8 spaces, 1 tab, 2 tabs, 1 newline).
	        # NOTE(review): the runs of 2 and 8 spaces were restored to match the
	        # labels; they had been collapsed to single spaces - confirm.
	        # chatglm has blank_n tokens; bert drops the spaces.
	        ["whitespace:  2spaces        8spaces\t1tab\t\t2tab\n1newline", "llama", "bert_base_cased"],
	        # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
	        ["punctuation: ,.:/?+=\"，。！？；【】〔〕〖〗", "gemma_7b", "llama"],  # llama's vocabulary is rather small
	        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
	        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <\|system\|> <\|user\|> <\|assistant\|> <\|endoftext\|>", "", ""],
	    ],
	    "zh": [
	        # Same layout as the English whitespace example: 2 spaces, then 8 spaces.
	        # chatglm has blank_n tokens.
	        ["空格测试：  2个空格        8个空格", "llama", "chatglm2_6b"],
	        ["标点测试：，。！？；", "baichuan_7b", "llama"],
	        ["符号测试：🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
	        ["数字测试：(10086 + 98) = 100184", "baichuan_7b", "llama"],
	        ["中文简体：宽带，繁体：樂來", "baichuan_7b", "llama"],
	    ],
	}

	# Tokenizer pairs worth comparing side by side.
	# Every entry is (tokenizer1, tokenizer2, text, comment). All entries carry
	# four fields so they can be unpacked uniformly; entries that previously had
	# only three fields keep their third value in the text position and get an
	# empty comment.
	more_examples = [
	    # BERT family
	    ("bert_base_cased", "bert_base_uncased", "", ""),  # clue vs. kplug, bert vs. clue
	    ("bert_base_cased", "clue", "", "增加了[]()"),
	    ("clue", "kplug", "", ""),

	    # LLaMA family (sentencepiece-based)
	    ("baichuan", "baichuan2", "baichuan2支持多空格，多个换行\n\n\n，do not add dummy prefix as Baichuan1", ""),
	    ("llama", "baichuan2", "baichuan2支持多空格，多个换行\n\n", ""),
	    ("llama", "chinese_llama2", "", ""),
	    ("llama", "llama3", "扩充词典", ""),
	    ("chinese_llama", "chinese_llama2", "", ""),

	    # GLM family (sentencepiece-based)
	    ("glm", "chatglm1", "", ""),
	    ("chatglm1", "chatglm2", "", ""),

	    # GPT-2 family
	    ("gpt2", "moss", "", ""),
	    ("", "", "", ""),

	    # OpenAI family (tiktoken)
	    ("qwen", "gpt_35_turbo", "", ""),
	]

	# Language key used to pick the active example list out of ``examples``.
	lang = "en"

	# Label of each active example: the text before the first colon. The
	# full-width colon is normalised to an ASCII colon first, so labels written
	# with either kind of colon are cut at the right place.
	example_types = [t[0].replace("：", ":").split(":")[0] for t in examples[lang]]


	def example_fn(example_idx):
	    """Return the example row at index ``example_idx`` for the active ``lang``."""
	    rows_for_lang = examples[lang]
	    return rows_for_lang[example_idx]


	def get_more_example():
	    """Print one tokenizer-arena comparison URL per entry of ``more_examples``.

	    Entries are read positionally as ``(tokenizer1, tokenizer2, text[, comment])``.
	    The trailing comment is optional: it is never part of the URL, and
	    accepting its absence keeps a three-field entry from aborting the loop
	    with a ``ValueError``.

	    Returns:
	        None. The URLs are written to standard output, one per line.
	    """
	    import urllib.parse

	    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
	    # The starred target absorbs the optional comment field (zero or more items).
	    for tokenizer1, tokenizer2, text, *_comment in more_examples:
	        # Only the free text is percent-encoded; tokenizer names are inserted as-is.
	        full_url = f'{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}'
	        print(full_url)


	# Script entry point: when the file is run directly, print the comparison URLs.
	if __name__ == "__main__":
	    get_more_example()