liyuesen committed on
Commit
9e8254f
1 Parent(s): 4388c98

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "<|startoftext|>": 53082,
3
- "[PAD]": 53081
4
  }
 
1
  {
2
+ "<|startoftext|>": 50258,
3
+ "[PAD]": 50257
4
  }
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -2,11 +2,5 @@
2
  "bos_token": "<|startoftext|>",
3
  "eos_token": "<|endoftext|>",
4
  "pad_token": "[PAD]",
5
- "unk_token": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- }
12
  }
 
2
  "bos_token": "<|startoftext|>",
3
  "eos_token": "<|endoftext|>",
4
  "pad_token": "[PAD]",
5
+ "unk_token": "<|endoftext|>"
 
 
 
 
 
 
6
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,33 +1,9 @@
1
  {
2
- "add_bos_token": false,
3
  "add_prefix_space": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "eos_token": {
13
- "__type": "AddedToken",
14
- "content": "<|endoftext|>",
15
- "lstrip": false,
16
- "normalized": true,
17
- "rstrip": false,
18
- "single_word": false
19
- },
20
- "errors": "replace",
21
- "model_max_length": 1000000000000000019884624838656,
22
- "pad_token": null,
23
- "special_tokens_map_file": "tokenizer_folder/new_tokenizer_gpt\\special_tokens_map.json",
24
  "tokenizer_class": "GPT2Tokenizer",
25
- "unk_token": {
26
- "__type": "AddedToken",
27
- "content": "<|endoftext|>",
28
- "lstrip": false,
29
- "normalized": true,
30
- "rstrip": false,
31
- "single_word": false
32
- }
33
  }
 
1
  {
 
2
  "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "special_tokens_map_file": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  "tokenizer_class": "GPT2Tokenizer",
8
+ "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
9
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff