AlumiK committed
Commit 93e9af8 (1 parent: 99485f7)

update tokenizer

special_tokens_map.json ADDED
@@ -0,0 +1,123 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<unused1>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused2>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused3>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused4>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused5>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused6>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused7>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused8>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused9>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<unused10>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<cls>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "<sep>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
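
The map above only declares the special tokens; the easiest way to see how they surface is on a loaded tokenizer. A minimal sketch (not part of this commit), where './' is a placeholder for a local checkout containing the files added here:

# A minimal sketch (not part of this commit); './' is a placeholder for a local
# checkout containing the files added in this commit.
from transformers import AutoTokenizer

# trust_remote_code=True is needed because tokenizer_config.json (below) maps
# AutoTokenizer to the custom LingLongTokenizerFast class shipped with the repo.
tokenizer = AutoTokenizer.from_pretrained('./', trust_remote_code=True)

print(tokenizer.special_tokens_map['bos_token'])      # <|startoftext|>
print(tokenizer.additional_special_tokens)            # ['<unused1>', ..., '<unused10>']
print(tokenizer.convert_tokens_to_ids('<unused1>'))   # 13224, per tokenizer_config.json below
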
tokenization_linglong_fast.py ADDED
@@ -0,0 +1,106 @@
+ import string
+
+ from tokenizers import (
+     Tokenizer as HFTokenizer,
+     normalizers,
+     pre_tokenizers,
+     models,
+     decoders,
+ )
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+
+
+ class LingLongTokenizerFast(PreTrainedTokenizerFast):
+     vocab_files_names = {'vocab_file': 'tokenizer.txt', 'tokenizer_file': 'tokenizer.json'}
+     model_input_names = ['input_ids', 'attention_mask']
+
+     class CustomDecoder:
+
+         @staticmethod
+         def decode_chain(tokens: list[str]) -> list[str]:
+             new_tokens = []
+             for token in tokens:
+                 if token.startswith('##'):
+                     new_tokens.append(token[2:])
+                 else:
+                     new_tokens.append(' ' + token)
+
+             # Remove whitespaces between Chinese characters.
+             # TODO: This will remove whitespaces between some English words as well. Need fix.
+             alphabet_set = set(list(string.ascii_letters))
+             for i in range(len(new_tokens)):
+                 if new_tokens[i][0] == ' ':
+                     if new_tokens[i][1] not in alphabet_set or i == 0:
+                         new_tokens[i] = new_tokens[i][1:]
+             return new_tokens
+
+     def __init__(
+         self,
+         vocab_file: str | None = None,
+         tokenizer_file: str | None = None,
+         do_lower_case: bool = True,
+         do_basic_tokenize: bool = True,
+         unk_token: str = '<unk>',
+         sep_token: str = '<sep>',
+         pad_token: str = '<pad>',
+         cls_token: str = '<cls>',
+         mask_token: str = '<mask>',
+         bos_token: str = '<|startoftext|>',
+         eos_token: str = '<|endoftext|>',
+         tokenize_chinese_chars: bool = True,
+         strip_accents: bool | None = None,
+         **kwargs,
+     ):
+         backend_tokenizer = None
+         if tokenizer_file is None:
+             backend_tokenizer = HFTokenizer(
+                 models.WordPiece.from_file(
+                     vocab=vocab_file,
+                     unk_token=unk_token,
+                     max_input_chars_per_word=100,
+                 ),
+             )
+             backend_tokenizer.add_special_tokens(
+                 [unk_token, sep_token, pad_token, cls_token, mask_token, bos_token, eos_token],
+             )
+             normalizer_sequence = [normalizers.Replace('\n', sep_token)]
+             if do_basic_tokenize:
+                 normalizer_sequence.append(
+                     normalizers.BertNormalizer(
+                         handle_chinese_chars=tokenize_chinese_chars,
+                         strip_accents=strip_accents,
+                         lowercase=do_lower_case,
+                     ),
+                 )
+             backend_tokenizer.normalizer = normalizers.Sequence(normalizer_sequence)
+             backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
+                 pre_tokenizers.WhitespaceSplit(),
+                 pre_tokenizers.Digits(individual_digits=True),
+             ])
+         super().__init__(
+             tokenizer_file=tokenizer_file,
+             tokenizer_object=backend_tokenizer,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             do_lower_case=do_lower_case,
+             do_basic_tokenize=do_basic_tokenize,
+             tokenize_chinese_chars=tokenize_chinese_chars,
+             strip_accents=strip_accents,
+             **kwargs,
+         )
+         self._tokenizer.decoder = decoders.Decoder.custom(self.CustomDecoder())
+         self.add_special_tokens({'additional_special_tokens': [f'<unused{i}>' for i in range(1, 11)]})
+         self.chat_template = '{{ bos_token }}{{ "问题:" }}{{ messages[-1]["content"] }}{{ "<unused1>答案:" }}'
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
+         files = self.backend_tokenizer.model.save(save_directory, name=filename_prefix)
+         return tuple(files)
+
+     def save_pretrained(self, *args, **kwargs) -> tuple[str]:
+         self._tokenizer.decoder = decoders.WordPiece()
+         return super().save_pretrained(*args, **kwargs)
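
A minimal sketch (not part of this commit) of driving the class above directly, assuming it is run from a checkout of this repo so that tokenization_linglong_fast and the committed tokenizer.json are available in the working directory. The decoder swap in save_pretrained presumably exists because a decoders.Decoder.custom(...) object cannot be serialized into tokenizer.json, so a plain WordPiece decoder is written out instead.

# A minimal sketch (not part of this commit), run from a checkout of this repo.
from tokenization_linglong_fast import LingLongTokenizerFast

tokenizer = LingLongTokenizerFast(tokenizer_file='tokenizer.json')

ids = tokenizer('你好,世界 hello world')['input_ids']
# CustomDecoder.decode_chain merges '##' WordPiece continuations and drops the
# space it would otherwise insert before non-ASCII tokens, so Chinese text is
# rejoined without gaps (see the TODO above for a known English edge case).
print(tokenizer.decode(ids))

# save_pretrained() swaps in a plain WordPiece decoder first, presumably because
# the custom Python decoder cannot be serialized; 'exported-tokenizer' is a
# placeholder output path.
tokenizer.save_pretrained('exported-tokenizer')
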
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,173 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<sep>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<|startoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13224": {
+       "content": "<unused1>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13225": {
+       "content": "<unused2>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13226": {
+       "content": "<unused3>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13227": {
+       "content": "<unused4>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13228": {
+       "content": "<unused5>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13229": {
+       "content": "<unused6>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13230": {
+       "content": "<unused7>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13231": {
+       "content": "<unused8>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13232": {
+       "content": "<unused9>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13233": {
+       "content": "<unused10>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13310": {
+       "content": "<cls>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13311": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<unused1>",
+     "<unused2>",
+     "<unused3>",
+     "<unused4>",
+     "<unused5>",
+     "<unused6>",
+     "<unused7>",
+     "<unused8>",
+     "<unused9>",
+     "<unused10>"
+   ],
+   "auto_map": {
+     "AutoTokenizer": [
+       null,
+       "tokenization_linglong_fast.LingLongTokenizerFast"
+     ]
+   },
+   "bos_token": "<|startoftext|>",
+   "chat_template": "{{ bos_token }}{{ \"问题:\" }}{{ messages[-1][\"content\"] }}{{ \"<unused1>答案:\" }}",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<cls>",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "eos_token": "<|endoftext|>",
+   "mask_token": "<mask>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sep_token": "<sep>",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "LingLongTokenizer",
+   "unk_token": "<unk>"
+ }
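
With auto_map pointing at tokenization_linglong_fast.LingLongTokenizerFast, the tokenizer is meant to be loaded with trust_remote_code=True, and the chat_template turns the last user message into a "问题:…<unused1>答案:" prompt. A minimal sketch (not part of this commit); 'your-namespace/linglong' is a placeholder repository id:

# A minimal sketch (not part of this commit); 'your-namespace/linglong' is a
# placeholder repository id.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('your-namespace/linglong', trust_remote_code=True)

messages = [{'role': 'user', 'content': '你好'}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)
# Expected per the chat_template above: <|startoftext|>问题:你好<unused1>答案:
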