File size: 674 Bytes
f4973d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

# Alternative loader using the Hugging Face `tokenizers` library (disabled):
# from tokenizers import Tokenizer
# tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")


import sentencepiece as spm
from transformers import LlamaTokenizer

# Scratch comparison: encode one string with the raw SentencePiece model and
# with the Hugging Face LlamaTokenizer wrapper, printing both id lists.

# Earlier sample input, kept for reference only (it is not encoded here):
#   "nice job 华为手机"
# NOTE(review): the ids recorded beside the two encode() calls below appear to
# belong to that earlier sample, not to the current `text` — re-run to confirm.
#
# Ids noted alongside the current input in the original script (partial list;
# which tokenizer produced them is not recorded):
#   29871, 41334, 30528, 30210, 30329, 41894, 31780, 30329, 30882,
text = "<s>世界上最高的山是哪座山?</s><pad>"

# 1) SentencePiece model loaded directly from the .model file.
tokenizer = spm.SentencePieceProcessor(model_file="tokenizer/tokenizer.model")
tokens = tokenizer.encode(text)
# Previously recorded output: [7575, 4982, 29871, 31266, 30573, 30880, 31429]
print(tokens)

# 2) Hugging Face wrapper loaded from the "tokenizer" directory.
tokenizer = LlamaTokenizer.from_pretrained("tokenizer")
tokens = tokenizer.encode(text)
# Previously recorded output: [1, 7575, 4982, 29871, 31266, 30573, 30880, 31429]
print(tokens)