Spaces:

eson
/

tokenizer-arena

Running

App Files Files Community

tokenizer-arena / vocab /chatyuan_large_v2 /test.py

eson

update

751936e 10 months ago

raw history blame

No virus

2.09 kB

	"""
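	https://huggingface.co/ClueAI/ChatYuan-large-v2

	Supports \n and \t.

	- English text is encoded poorly.

	Why not encode \n and \t directly, instead of going through a separate pre-processing and post-processing step?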

	"""
	import json

	from transformers import AutoTokenizer



	def preprocess(text: str) -> str:
	    """Escape newline and tab characters before tokenization.

	    Each newline character is replaced by the two-character sequence
	    backslash + "n", and each tab character by backslash + "t". The text
	    is printed before and after the replacement so the change can be
	    inspected on the console.

	    NOTE(review): the original note here only said "in the vocabulary";
	    presumably the escaped forms exist as vocabulary entries while the raw
	    control characters do not -- confirm against the model card.

	    Args:
	        text: Raw input string.

	    Returns:
	        The input with every newline and tab replaced by its escaped,
	        two-character form.
	    """
	    print("原文本: ", text)
	    # Newlines first, then tabs; neither replacement produces characters
	    # that the other one matches, so the order does not affect the result.
	    text = text.replace("\n", "\\n").replace("\t", "\\t")
	    print("预处理后文本: ", text)
	    return text


	def postprocess(text: str) -> str:
	    """Convert escaped newline/tab sequences back to real characters.

	    Reverses the escaping done by ``preprocess``: the two-character
	    sequence backslash + "n" becomes a newline and backslash + "t" becomes
	    a tab. In addition, every literal "%20" is replaced by a single space.

	    NOTE(review): ``preprocess`` never emits "%20"; presumably it shows up
	    in decoded model output -- confirm against the model card.

	    Args:
	        text: String containing escaped sequences.

	    Returns:
	        The string with the escaped sequences and "%20" expanded.
	    """
	    return text.replace("\\n", "\n").replace("\\t", "\t").replace('%20', ' ')


	# Load the tokenizer from the directory (or model id) named below.
	model_dir = "ChatYuan-large-v2"
	tokenizer = AutoTokenizer.from_pretrained(model_dir)

	# Sample text containing one real newline and one real tab.
	text = "中国\nabcde jump \tnice"

	# Tokenize the raw text. Output noted by the original author (the newline
	# and the tab do not show up as tokens of their own):
	# ['▁中国', '▁', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁n', 'ice']
	tokens = tokenizer.tokenize(text)
	print(tokens)

	# Tokenize again after escaping the newline and tab. Output noted by the
	# original author (the escaped forms now appear as separate tokens):
	# ['▁中国', '\\n', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁', '\\t', 'n', 'ice']
	escaped_text = preprocess(text)
	print(tokenizer.tokenize(escaped_text))

	# Two sample token-id sequences. The original script assigned both to the
	# same name, so the first was overwritten before it was ever used; they are
	# kept under separate names here so neither is silently lost.
	sample_ids_first = [
	    12, 623, 5, 13409, 7, 51, 158, 5, 864, 93,
	    3, 1329, 14965, 3402, 188, 4, 7, 623, 5, 56,
	    4464, 4, 7, 51, 158, 5, 1526, 158, 617, 1456,
	    84, 1607, 10, 11442, 1456, 9938, 9, 12, 14, 38,
	    6582, 2945, 2861, 3, 11779, 1074, 712, 1036, 167, 6,
	    7, 623, 5, 9898, 513, 79, 26455, 489, 3, 34,
	    12029, 22, 7, 51, 158, 5, 1,
	]

	sample_ids_second = [
	    0, 12, 14381, 10, 19849, 3, 7, 7, 34, 313,
	    1344, 9017, 3, 276, 26455, 2258, 3, 578, 864, 529,
	    2771, 874, 26455, 1442, 6, 7, 7, 26455, 9220, 19849,
	    937, 16, 11726, 33, 11726, 52, 6, 7, 12, 7,
	    7, 8353, 1036, 8093, 67, 276, 1036, 3338, 3, 480,
	    4490, 30, 34, 1325, 6, 7, 2200, 53, 7321, 2187,
	    648, 78, 7321, 2899, 25823, 6, 7, 2964, 3402, 1203,
	    13, 537, 6, 7, 1660, 2795, 3402, 1203, 6, 7,
	    407, 1802, 7, 7, 3095, 1477, 37, 7, 7, 19849,
	    7, 7, 11726, 16, 11726, 7893, 42, 1,
	]

	# Only the second sample is decoded, which matches the output of the
	# original script. Point `tokens` at `sample_ids_first` to inspect the other.
	tokens = sample_ids_second

	print(tokenizer.decode(tokens))