Davidsamuel101 committed on
Commit 677d0c8
1 Parent(s): 1360301

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +51 -0
  2. tokenizer.json +245 -0
  3. tokenizer_config.json +58 -0
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
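These definitions follow the convention also used by RoBERTa-style tokenizers: <s> doubles as BOS and CLS, </s> as EOS and SEP. A minimal sanity-check sketch in Python (the file path is illustrative and assumes a local clone of this repo):

```python
import json

# Path is illustrative; assumes the uploaded file sits in the working directory.
with open("special_tokens_map.json", encoding="utf-8") as f:
    smap = json.load(f)

# <s> serves as both BOS and CLS; </s> as both EOS and SEP.
assert smap["bos_token"]["content"] == smap["cls_token"]["content"] == "<s>"
assert smap["eos_token"]["content"] == smap["sep_token"]["content"] == "</s>"
```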
tokenizer.json ADDED
@@ -0,0 +1,245 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 86,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 87,
+       "content": "<pad>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 88,
+       "content": "<s>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 89,
+       "content": "</s>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 90,
+       "content": "<mask>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": {
+     "type": "Sequence",
+     "normalizers": [
+       {
+         "type": "Lowercase"
+       },
+       {
+         "type": "Strip",
+         "strip_left": true,
+         "strip_right": true
+       }
+     ]
+   },
+   "pre_tokenizer": {
+     "type": "Whitespace"
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "<s>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "</s>",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "SpecialToken": {
+           "id": "<s>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "</s>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "</s>",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "</s>": {
+         "id": "</s>",
+         "ids": [
+           89
+         ],
+         "tokens": [
+           "</s>"
+         ]
+       },
+       "<s>": {
+         "id": "<s>",
+         "ids": [
+           88
+         ],
+         "tokens": [
+           "<s>"
+         ]
+       }
+     }
+   },
+   "decoder": {
+     "type": "WordPiece",
+     "prefix": "",
+     "cleanup": true
+   },
+   "model": {
+     "type": "WordLevel",
+     "vocab": {
+       "aɪ": 0,
+       "aʊ": 1,
+       "b": 2,
+       "d": 3,
+       "d͡ʒ": 4,
+       "eɪ": 5,
+       "f": 6,
+       "h": 7,
+       "i": 8,
+       "j": 9,
+       "k": 10,
+       "l": 11,
+       "m": 12,
+       "n": 13,
+       "oʊ": 14,
+       "p": 15,
+       "s": 16,
+       "t": 17,
+       "t͡ʃ": 18,
+       "u": 19,
+       "v": 20,
+       "w": 21,
+       "z": 22,
+       "æ": 23,
+       "ð": 24,
+       "ŋ": 25,
+       "ɑ": 26,
+       "ɔ": 27,
+       "ɔɪ": 28,
+       "ə": 29,
+       "ɚ": 30,
+       "ɛ": 31,
+       "ɡ": 32,
+       "ɪ": 33,
+       "ɹ": 34,
+       "ʃ": 35,
+       "ʊ": 36,
+       "ʌ": 37,
+       "ʒ": 38,
+       "ˈaɪ": 39,
+       "ˈaʊ": 40,
+       "ˈeɪ": 41,
+       "ˈi": 42,
+       "ˈoʊ": 43,
+       "ˈu": 44,
+       "ˈæ": 45,
+       "ˈɑ": 46,
+       "ˈɔ": 47,
+       "ˈɔɪ": 48,
+       "ˈɚ": 49,
+       "ˈɛ": 50,
+       "ˈɪ": 51,
+       "ˈʊ": 52,
+       "ˈʌ": 53,
+       "ˌaɪ": 54,
+       "ˌaʊ": 55,
+       "ˌeɪ": 56,
+       "ˌi": 57,
+       "ˌoʊ": 58,
+       "ˌu": 59,
+       "ˌæ": 60,
+       "ˌɑ": 61,
+       "ˌɔ": 62,
+       "ˌɔɪ": 63,
+       "ˌɚ": 64,
+       "ˌɛ": 65,
+       "ˌɪ": 66,
+       "ˌʊ": 67,
+       "ˌʌ": 68,
+       "θ": 69,
+       "\"": 70,
+       " ": 71,
+       "_": 72,
+       "^": 73,
+       "$": 74,
+       "!": 75,
+       "#": 76,
+       "'": 77,
+       "(": 78,
+       ")": 79,
+       ",": 80,
+       "-": 81,
+       ".": 82,
+       ":": 83,
+       ";": 84,
+       "?": 85,
+       "<unk>": 86,
+       "<pad>": 87,
+       "<s>": 88,
+       "</s>": 89,
+       "<mask>": 90
+     },
+     "unk_token": "<unk>"
+   }
+ }
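This file defines a WordLevel tokenizer over an IPA phoneme vocabulary: ids 0-69 are phonemes (including ˈ/ˌ stress-marked vowel variants), ids 70-85 punctuation and spacing characters, and ids 86-90 the special tokens. Input is lowercased and stripped, split on whitespace, and a single sequence is wrapped as <s> ... </s> by the template post-processor. A minimal usage sketch with the tokenizers library (the file path is illustrative):

```python
from tokenizers import Tokenizer

# Path is illustrative; assumes a local copy of the uploaded file.
tok = Tokenizer.from_file("tokenizer.json")

# Whitespace-separated IPA phonemes; anything outside the vocab maps to <unk> (id 86).
enc = tok.encode("h ə l oʊ")
print(enc.tokens)  # ['<s>', 'h', 'ə', 'l', 'oʊ', '</s>']
print(enc.ids)     # [88, 7, 29, 11, 14, 89]
```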
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "86": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "87": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "88": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "89": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "90": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": "<mask>",
+   "max_length": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_to_multiple_of": null,
+   "pad_token": "<pad>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "</s>",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "<unk>"
+ }
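Since tokenizer_class is PreTrainedTokenizerFast, the three uploaded files load directly through transformers with no custom code; the very large model_max_length value is the transformers sentinel for "no length limit". A loading sketch (the local path is illustrative; a Hub repo id would work the same way):

```python
from transformers import AutoTokenizer

# Path is illustrative; assumes the three uploaded files sit in ./
tok = AutoTokenizer.from_pretrained("./")

ids = tok("h ə l oʊ")["input_ids"]
print(ids)                             # [88, 7, 29, 11, 14, 89]
print(tok.convert_ids_to_tokens(ids))  # ['<s>', 'h', 'ə', 'l', 'oʊ', '</s>']
```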