versae committed on
Commit
04b8476
1 Parent(s): 384dd2d

Add HF tokenizer converted from SentencePiece

convert.sh ADDED
@@ -0,0 +1,18 @@
+ # Convert a SentencePiece BPE model trained with byte_fallback (for autoregressive models) to a Hugging Face tokenizer
+ # ./spm_train --vocab_size 32000 --character_coverage 1.0 --hard_vocab_limit --model_type bpe --pad_id 3 --shuffle_input_sentence true --model_prefix ./sentencepiece.model --byte_fallback=true --input text.txt --input_sentence_size=100000 --num_threads 8
+ wget -O sentencepiece_extractor.py https://raw.githubusercontent.com/huggingface/tokenizers/master/bindings/python/scripts/sentencepiece_extractor.py
+ python sentencepiece_extractor.py --provider sentencepiece --model sentencepiece.model --merges-output-path ./merges.txt --vocab-output-path ./vocab.json
+ 
+ python <<EOF
+ from transformers import AutoTokenizer
+ from tokenizers import SentencePieceBPETokenizer
+ # Rebuild a BPE tokenizer from the extracted vocab/merges and save it as tokenizer.json
+ tokenizer = SentencePieceBPETokenizer.from_file("./vocab.json", "./merges.txt")
+ tokenizer.model.byte_fallback = True
+ tokenizer.model.fuse_unk = True
+ tokenizer.save("./tokenizer.json")
+ htok = AutoTokenizer.from_pretrained("./")
+ htok.padding_side = "right"
+ htok.save_pretrained("./")
+ EOF
+ 
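As a quick sanity check of what convert.sh produces (a sketch, not part of the commit): the snippet below loads the generated tokenizer.json with the tokenizers library and encodes a sample string; characters missing from the BPE vocabulary should surface as <0x..> byte tokens thanks to byte_fallback. The file path and test string are assumptions for illustration.

from tokenizers import Tokenizer

# Load the tokenizer.json written by convert.sh (assumed to exist in the working directory)
tok = Tokenizer.from_file("./tokenizer.json")
# Arbitrary sample containing characters unlikely to be whole tokens in a 32k BPE vocab
enc = tok.encode("byte fallback check: é 中")
print(enc.tokens)  # out-of-vocabulary characters should appear as <0x..> byte tokens
print(enc.ids)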
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
sentencepiece.vocab → sentencepiece.vocab.bak RENAMED
File without changes
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "unk_token": "<unk>"
+ }
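A minimal sketch (not part of the commit) of how the special tokens declared above are exposed once the tokenizer is loaded through transformers; it assumes the files from this commit are in the current directory.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./")
# The strings come straight from special_tokens_map.json
print(tok.bos_token, tok.eos_token, tok.unk_token)  # expected: <s> </s> <unk>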
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
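The config above implies that encodings get a leading <s> (add_bos_token: true) but no trailing </s> (add_eos_token: false), and that ids 0/1/2 map to <unk>/<s>/</s>. A hedged sketch of checking this, assuming the committed files sit in the current directory and using an arbitrary sample sentence:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./")
ids = tok("hello world")["input_ids"]
print(ids[0] == tok.bos_token_id)   # expected True: BOS is prepended
print(ids[-1] == tok.eos_token_id)  # expected False: no EOS appended
print(tok.convert_ids_to_tokens([0, 1, 2]))  # expected ['<unk>', '<s>', '</s>'] per added_tokens_decoder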
vocab.json ADDED
The diff for this file is too large to render. See raw diff