Spaces:

yhavinga
/

dutch-tokenizer-arena

Running

add more tokenizers

f4973d4 7 months ago

No virus

797 Bytes


	"""
	This bug has been resolved; see https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/commit/e03d54f05b9d42740c43a191c5d2914fcfb4c6e5
	"""

	import os
	from transformers import AutoTokenizer
	from vocab import TokenizerType


	# Directory containing this script; the tokenizer files are looked up
	# in the "Baichuan2-7B-Chat" sub-directory next to it.
	CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]
	TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Baichuan2-7B-Chat")

	# Load from the local directory. The same call with the hub id
	# "baichuan-inc/Baichuan2-7B-Chat" in place of TOKENIZER_DIR was the
	# alternative left disabled here.
	tokenizer = AutoTokenizer.from_pretrained(
		TOKENIZER_DIR,
		trust_remote_code=True,
	)

	# Encode the literal "<pad>" string. The ids are not used afterwards
	# (token_ids is reassigned below) -- presumably kept for inspection.
	# NOTE: decoding id 125696 was recorded here as being out of range.
	token_ids = tokenizer.encode("<pad>")

	# Round trip for a single CJK character: text -> ids -> tokens -> text.
	token_ids = tokenizer.encode("中")
	filtered_tokens = tokenizer.convert_ids_to_tokens(
		token_ids,
	)
	decoded_text = tokenizer.convert_tokens_to_string(
		filtered_tokens,
	)
	print(decoded_text)