Commit 08838e9 by versae (1 parent: 470a654)

Upload 5 files

README.md CHANGED
@@ -1,3 +1,17 @@
  ---
  license: apache-2.0
  ---
+
+ ### How to use
+
+ ```python
+ from transformers import LlamaTokenizerFast
+ tokenizer = LlamaTokenizerFast.from_pretrained("mimir-project/tokenizer", token=True)
+ ```
+ or
+ ```python
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained("mimir-project/tokenizer", token=True)
+ ```
+
+ Copied from https://github.com/SmartmediaAI/MIMIR-project/tree/main
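
For orientation on the README snippet above: `token=True` tells `from_pretrained` to use the credentials stored by `huggingface-cli login`, which matters if the repo is gated. A minimal sketch of exercising the loaded tokenizer (the sample sentence is an arbitrary example, not from the repo):

```python
from transformers import AutoTokenizer

# token=True picks up the token saved by `huggingface-cli login`.
tokenizer = AutoTokenizer.from_pretrained("mimir-project/tokenizer", token=True)

ids = tokenizer("Hei, verden!").input_ids  # arbitrary example sentence
print(ids)                                 # BOS (id 1) is prepended by default
print(tokenizer.decode(ids, skip_special_tokens=True))
```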
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
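
The four special tokens declared here are pinned to fixed IDs in `tokenizer_config.json` (unk=0, bos=1, eos=2, pad=3). A minimal sketch to confirm that mapping once the tokenizer is loaded (assumes access to the `mimir-project/tokenizer` repo):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mimir-project/tokenizer", token=True)

# Each special token should resolve to the ID pinned in tokenizer_config.json.
for name in ("unk_token", "bos_token", "eos_token", "pad_token"):
    token = getattr(tokenizer, name)
    print(name, token, tokenizer.convert_tokens_to_ids(token))
# Expected per the config below: <unk> 0, <s> 1, </s> 2, <pad> 3
```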
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "bos_token_id": 1,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "eos_token_id": 2,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "pad_token_id": 3,
+   "padding_side": "right",
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "unk_token_id": 0,
+   "use_default_system_prompt": false
+ }
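
A few settings in this config are worth calling out: `add_bos_token` is true while `add_eos_token` is false, padding uses `<pad>` (ID 3) on the right, and `model_max_length` is effectively unbounded. A minimal sketch of what that means when encoding (assumes access to the `mimir-project/tokenizer` repo; the input strings are arbitrary examples):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mimir-project/tokenizer", token=True)

enc = tokenizer("Hei på deg")
# add_bos_token=True, add_eos_token=False: BOS is prepended, no EOS appended.
assert enc.input_ids[0] == tokenizer.bos_token_id
assert enc.input_ids[-1] != tokenizer.eos_token_id

# padding_side="right", pad_token_id=3: shorter sequences are padded on the right.
batch = tokenizer(["Hei", "Hei på deg"], padding=True)
print(batch.input_ids)
```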
tokenizer_train.py ADDED
@@ -0,0 +1,140 @@
+ import json
+ import argparse
+ from tqdm import tqdm
+ import os
+
+ from datasets import load_dataset
+ from tokenizers import SentencePieceBPETokenizer
+ from transformers import LlamaTokenizerFast, TrainingArguments, AutoTokenizer
+
+ def main(args):
+
+     # Load the dataset from the Hugging Face Hub and prepare it for training
+     if args.dataset_name is not None:
+         data_files = os.listdir(args.dataset_name)
+         data_files = [args.dataset_name + f for f in data_files]
+         print(len(data_files))
+         dataset = load_dataset("json",
+             data_files=data_files,
+             split=args.dataset_split,
+             token=args.hub_token if args.hub_token else None
+         )
+         print(dataset)
+
+     else:
+         raise ValueError("No dataset name provided or dataset is already tokenized")
+
+     # Remove non-text columns
+     dataset = dataset.remove_columns([col for col in dataset.column_names if col != "text"])
+
+     # Select `num_samples` from the dataset
+     dataset = dataset.shuffle(seed=args.seed).select(range(args.num_samples))
+
+     # Create a SentencePieceBPETokenizer
+     tokenizer = SentencePieceBPETokenizer()
+
+     # Train the SentencePieceBPETokenizer on the dataset
+     tokenizer.train_from_iterator(
+         iterator=dataset['text'],
+         vocab_size=args.vocab_size,
+         show_progress=True,
+         special_tokens=["<unk>", "<s>", "</s>", "<pad>"],
+     )
+
+     # Save the tokenizer
+     tokenizer.save("new-sentencepiece-tokenizer.json", pretty=True)
+
+     # Load reference tokenizer
+     if args.reference_tokenizer is not None and args.hub_token is not None:
+         reference_tokenizer = AutoTokenizer.from_pretrained(args.reference_tokenizer, token=args.hub_token if args.hub_token else None)
+         reference_tokenizer.save_pretrained("reference-tokenizer")
+     else:
+         raise ValueError("No tokenizer name provided or no hub token provided. Try using `--reference_tokenizer 'meta-llama/Llama-2-7b-hf'`")
+
+     # Read and dump the json file for the new tokenizer and the reference tokenizer
+     with open("new-sentencepiece-tokenizer.json") as f:
+         new_llama_tokenizer_json = json.load(f)
+
+     with open("reference-tokenizer/tokenizer.json") as f:
+         reference_tokenizer_json = json.load(f)
+
+     # Add the reference tokenizer's config to the new tokenizer's config
+     new_llama_tokenizer_json["normalizer"] = reference_tokenizer_json["normalizer"]
+     new_llama_tokenizer_json["pre_tokenizer"] = reference_tokenizer_json["pre_tokenizer"]
+     new_llama_tokenizer_json["post_processor"] = reference_tokenizer_json["post_processor"]
+     new_llama_tokenizer_json["decoder"] = reference_tokenizer_json["decoder"]
+     new_llama_tokenizer_json["model"]['fuse_unk'] = reference_tokenizer_json["model"]['fuse_unk']
+     new_llama_tokenizer_json["model"]['byte_fallback'] = reference_tokenizer_json["model"]['byte_fallback']
+
+     # Dump the new tokenizer's config
+     with open("new-sentencepiece-tokenizer.json", "w") as f:
+         json.dump(new_llama_tokenizer_json, f, indent=2, ensure_ascii=False)
+
+     # Load the new tokenizer as a LlamaTokenizerFast
+     new_llama_tokenizer = LlamaTokenizerFast(
+         tokenizer_file="new-sentencepiece-tokenizer.json",
+         name_or_path=args.reference_tokenizer + "-tokenizer",
+         unk_token="<unk>",
+         unk_token_id=0,
+         bos_token="<s>",
+         bos_token_id=1,
+         eos_token="</s>",
+         eos_token_id=2,
+         pad_token="<pad>",
+         pad_token_id=3,
+         padding_side="right",
+     )
+
+     # Save the new tokenizer
+     new_llama_tokenizer.save_pretrained("new-llama-tokenizer")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Train a new Llama tokenizer")
+     parser.add_argument(
+         "--dataset_name",
+         type=str,
+         default=None,
+         help="The name of the dataset to be tokenized",
+     )
+     parser.add_argument(
+         "--dataset_split",
+         type=str,
+         default=None,
+         help="The split of the dataset to be tokenized",
+     )
+     parser.add_argument(
+         "--hub_token",
+         type=str,
+         default=None,
+         help="The token to access the dataset on the hub",
+     )
+     parser.add_argument(
+         "--reference_tokenizer",
+         type=str,
+         default=None,
+         help="The name of the reference tokenizer to use",
+     )
+     parser.add_argument(
+         "--seed",
+         type=int,
+         default=123,
+         help="Set random seed",
+     )
+     parser.add_argument(
+         "--num_samples",
+         type=int,
+         default=None,
+         help="Number of samples to use from the dataset",
+     )
+     parser.add_argument(
+         "--vocab_size",
+         type=int,
+         default=None,
+         help="Vocabulary size to use for the tokenizer",
+     )
+     args = parser.parse_args()
+     main(args)
+
+ # How to run:
+ # python tokenizer_train.py --dataset_name /mimir/dataset/delivery/mimir_base/data/ --dataset_split train --reference_tokenizer meta-llama/Llama-2-7b-hf --vocab_size 32768 --hub_token hf_IIbKlx.... --num_samples 6000000
+
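
The script above trains a SentencePieceBPE vocabulary, grafts the reference Llama tokenizer's normalizer, pre-tokenizer, post-processor, and decoder onto it, and saves the result to a local `new-llama-tokenizer/` directory. A minimal sanity-check sketch for that output, assuming the script has completed in the current working directory (the test sentence is an arbitrary example):

```python
from transformers import LlamaTokenizerFast

# Load the tokenizer the script saved locally.
tokenizer = LlamaTokenizerFast.from_pretrained("new-llama-tokenizer")

# The special tokens were trained first, so they should sit at
# <unk>=0, <s>=1, </s>=2, <pad>=3.
print(tokenizer.unk_token_id, tokenizer.bos_token_id,
      tokenizer.eos_token_id, tokenizer.pad_token_id)

# Tokenize an example sentence and inspect the pieces.
ids = tokenizer("Dette er en liten test.").input_ids
print(tokenizer.convert_ids_to_tokens(ids))
```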