samleeasus committed on
Commit
5bc2a52
1 Parent(s): 43a614c

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +27 -0
README.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```python
2
+ from transformers import LlamaTokenizer
3
+
4
+ tokenizer = LlamaTokenizer.from_pretrained(
5
+ 'ocisd4/openllama_tokenizer_v2',
6
+ add_bos_token=False,
7
+ add_eos_token=True,
8
+ force_download=False,
9
+ use_auth_token=True,
10
+ # additional_special_tokens=['<|spcout|>', '<|sep|>', '<|eot|>', '<|output|>']
11
+ )
12
+
13
+ print('vocab size:',tokenizer.vocab_size)
14
+ #vocab size: 51456
15
+
16
+ text = '今天天氣真好!'
17
+
18
+
19
+ print(tokenizer.tokenize(text))
20
+ #['▁', '今天', '天氣', '真', '好', '!']
21
+
22
+ print(tokenizer.encode(text))
23
+ #[29500, 32097, 32916, 30615, 30192, 30042, 2]
24
+
25
+ print(tokenizer.decode(tokenizer.encode(text)))
26
+ # 今天天氣真好!</s>
27
+ ```