shanearora committed
Commit: 48f893d
Parent: 5c82221

Upload tokenizer
Files changed (5)
  1. merges.txt +0 -0
  2. special_tokens_map.json +14 -0
  3. tokenizer.json +1 -26
  4. tokenizer_config.json +3 -5
  5. vocab.json +0 -0
merges.txt ADDED (diff too large to render; see the raw file)
 
special_tokens_map.json CHANGED
@@ -1,4 +1,11 @@
 {
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "eos_token": {
     "content": "<|endoftext|>",
     "lstrip": false,
@@ -12,5 +19,12 @@
     "normalized": false,
     "rstrip": false,
     "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
   }
 }
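Note: after this change, bos_token, eos_token, and unk_token all resolve to the same "<|endoftext|>" string. A minimal sketch of checking that, assuming the file is read from a local checkout of this repo (the path is illustrative):

import json

# Read the updated special tokens map (path is an assumption for illustration).
with open("special_tokens_map.json") as f:
    special_tokens = json.load(f)

# bos_token, eos_token, and unk_token now all map to the same content string.
for name in ("bos_token", "eos_token", "unk_token"):
    assert special_tokens[name]["content"] == "<|endoftext|>"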
tokenizer.json CHANGED
@@ -222,32 +222,7 @@
       }
     ]
   },
-  "post_processor": {
-    "type": "TemplateProcessing",
-    "single": [
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      }
-    ],
-    "pair": [
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "B",
-          "type_id": 1
-        }
-      }
-    ],
-    "special_tokens": {}
-  },
+  "post_processor": null,
   "decoder": {
     "type": "ByteLevel",
     "add_prefix_space": true,
tokenizer_config.json CHANGED
@@ -1,6 +1,4 @@
 {
-  "add_bos_token": false,
-  "add_eos_token": false,
   "add_prefix_space": false,
   "added_tokens_decoder": {
     "100256": {
@@ -180,12 +178,12 @@
       "special": true
     }
   },
-  "bos_token": null,
+  "bos_token": "<|endoftext|>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<|pad|>",
-  "tokenizer_class": "GPTNeoXTokenizer",
-  "unk_token": null
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
 }
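Note: with tokenizer_class switched to GPT2Tokenizer and bos/unk tokens filled in, the special-token attributes are populated when the tokenizer is loaded with transformers. A minimal sketch, assuming the repo is checked out locally (the "." path is illustrative):

from transformers import AutoTokenizer

# Load the updated tokenizer from a local checkout (path is an assumption).
tokenizer = AutoTokenizer.from_pretrained(".")

# These values come straight from the updated config above.
print(tokenizer.bos_token)  # <|endoftext|>
print(tokenizer.eos_token)  # <|endoftext|>
print(tokenizer.unk_token)  # <|endoftext|>
print(tokenizer.pad_token)  # <|pad|>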
vocab.json ADDED (diff too large to render; see the raw file)
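Note: vocab.json and merges.txt are the byte-level BPE files the slow GPT2Tokenizer class reads, alongside the bundled tokenizer.json used by the fast tokenizer. A minimal sketch of assembling a bare tokenizer from the two added files with the tokenizers library (paths and pre-tokenizer settings are assumptions for illustration):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel

# Build a byte-level BPE tokenizer from the newly added files.
model = BPE.from_file("vocab.json", "merges.txt")
tok = Tokenizer(model)
# add_prefix_space=False mirrors the "add_prefix_space": false setting
# in tokenizer_config.json above.
tok.pre_tokenizer = ByteLevel(add_prefix_space=False)

print(tok.encode("Hello world").tokens)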