voidful committed on
Commit
5433102
1 Parent(s): 30177e5

Upload tokenizer

Browse files
Files changed (3)
  1. special_tokens_map.json +6 -0
  2. tokenizer.json +39 -21
  3. tokenizer_config.json +9 -0
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "<|endoftext|>"
6
+ }
tokenizer.json CHANGED
@@ -2,27 +2,6 @@
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
5
- "normalizer": {
6
- "type": "NFC"
7
- },
8
- "pre_tokenizer": {
9
- "type": "ByteLevel",
10
- "add_prefix_space": false,
11
- "trim_offsets": true,
12
- "use_regex": true
13
- },
14
- "post_processor": {
15
- "type": "ByteLevel",
16
- "add_prefix_space": false,
17
- "trim_offsets": true,
18
- "use_regex": true
19
- },
20
- "decoder": {
21
- "type": "ByteLevel",
22
- "add_prefix_space": false,
23
- "trim_offsets": true,
24
- "use_regex": true
25
- },
26
  "added_tokens": [
27
  {
28
  "id": 0,
@@ -9239,8 +9218,47 @@
9239
  "rstrip": false,
9240
  "normalized": true,
9241
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9242
  }
9243
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9244
  "model": {
9245
  "type": "BPE",
9246
  "dropout": null,
 
2
  "version": "1.0",
3
  "truncation": null,
4
  "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
9218
  "rstrip": false,
9219
  "normalized": true,
9220
  "special": false
9221
+ },
9222
+ {
9223
+ "id": 1024,
9224
+ "content": "<|endoftext|>",
9225
+ "single_word": false,
9226
+ "lstrip": false,
9227
+ "rstrip": false,
9228
+ "normalized": false,
9229
+ "special": true
9230
+ },
9231
+ {
9232
+ "id": 1025,
9233
+ "content": "[PAD]",
9234
+ "single_word": false,
9235
+ "lstrip": false,
9236
+ "rstrip": false,
9237
+ "normalized": false,
9238
+ "special": true
9239
  }
9240
  ],
9241
+ "normalizer": {
9242
+ "type": "NFC"
9243
+ },
9244
+ "pre_tokenizer": {
9245
+ "type": "ByteLevel",
9246
+ "add_prefix_space": false,
9247
+ "trim_offsets": true,
9248
+ "use_regex": true
9249
+ },
9250
+ "post_processor": {
9251
+ "type": "ByteLevel",
9252
+ "add_prefix_space": false,
9253
+ "trim_offsets": true,
9254
+ "use_regex": true
9255
+ },
9256
+ "decoder": {
9257
+ "type": "ByteLevel",
9258
+ "add_prefix_space": false,
9259
+ "trim_offsets": true,
9260
+ "use_regex": true
9261
+ },
9262
  "model": {
9263
  "type": "BPE",
9264
  "dropout": null,
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "eos_token": "<|endoftext|>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "tokenizer_class": "GPTNeoXTokenizer",
8
+ "unk_token": "<|endoftext|>"
9
+ }