# HF_tokenizer
# The fast-tokenizer route via the `tokenizers` library, kept for reference:
# from tokenizers import Tokenizer
# tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")

import sentencepiece as spm

text = "nice job 华为手机"  # "华为手机" = "Huawei phone"
# This reassignment overrides the first test string; only the line below gets encoded.
text = "世界上最高的山是哪座山?"  # "Which mountain is the highest in the world?"
# Expected IDs: 29871, 41334, 30528, 30210, 30329, 41894, 31780, 30329, 30882

# Raw SentencePiece model: plain IDs, no special tokens added.
tokenizer = spm.SentencePieceProcessor(model_file="tokenizer/tokenizer.model")
tokens = tokenizer.encode(text)
# [7575, 4982, 29871, 31266, 30573, 30880, 31429]  <- appears to match "nice job 华为手机"
print(tokens)

# The same model loaded through transformers prepends the BOS token (id 1).
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("tokenizer")
tokens = tokenizer.encode(text)
# [1, 7575, 4982, 29871, 31266, 30573, 30880, 31429]
print(tokens)
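
# A minimal sketch (assumption: the same "tokenizer" directory as above) showing
# that both loading paths share one SentencePiece model: encoding without special
# tokens should yield identical IDs, and decode() recovers the original text.
sp = spm.SentencePieceProcessor(model_file="tokenizer/tokenizer.model")
hf = LlamaTokenizer.from_pretrained("tokenizer")

sp_ids = sp.encode(text)
hf_ids = hf.encode(text, add_special_tokens=False)  # suppress the leading BOS (id 1)
print(sp_ids == hf_ids)   # expected True: same underlying model
print(sp.decode(sp_ids))  # round trip back to the original text
print(hf.decode(hf_ids, skip_special_tokens=True))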