AlexWortega committed on
Commit 1f2433e
1 Parent(s): 5801fd6

Upload tokenizer

added_tokens.json CHANGED
@@ -1,8 +1,6 @@
 {
   "<endoftext>": 50257,
-  "<|endoftext|>": 50258,
-  "<|pad|>": 50259,
-  "q:": 50260,
-  "style": 50262,
-  "summarize": 50261
+  "<|pad|>": 50258,
+  "q:": 50259,
+  "summarize": 50260
 }
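A minimal sketch, assuming the updated files in this commit are saved together in a local directory named ./tokenizer (the path is a placeholder, not part of the commit), to confirm how the remapped added tokens resolve to IDs once the tokenizer is loaded:

# Check the re-assigned added-token IDs after this commit.
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("./tokenizer")  # placeholder local path
for t in ["<endoftext>", "<|pad|>", "q:", "summarize"]:
    # convert_tokens_to_ids looks each token up in vocab.json / added_tokens.json
    print(t, tok.convert_tokens_to_ids(t))
# Expected per the new added_tokens.json: 50257, 50258, 50259, 50260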
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -5,32 +5,8 @@
     "a",
     "style"
   ],
-  "bos_token": {
-    "content": "<endoftext>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<endoftext>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<|pad|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
+  "bos_token": "<endoftext>",
+  "eos_token": "<endoftext>",
+  "pad_token": "<|pad|>",
+  "unk_token": "<|endoftext|>"
 }
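A sketch, again assuming the placeholder ./tokenizer directory, showing that the plain-string special tokens in the new special_tokens_map.json load the same way as the earlier AddedToken dictionaries did:

# Inspect the special tokens after loading the simplified map.
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("./tokenizer")  # placeholder local path
# special_tokens_map should report the plain strings from this file,
# including the additional_special_tokens shown in the unchanged context above.
print(tok.special_tokens_map)
print(tok.pad_token, tok.pad_token_id)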
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,32 +1,12 @@
 {
   "add_bos_token": false,
   "add_prefix_space": false,
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<endoftext>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "<endoftext>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
+  "bos_token": "<endoftext>",
+  "eos_token": "<endoftext>",
   "errors": "replace",
-  "name_or_path": "sberbank-ai/rugpt3large_based_on_gpt2",
-  "pad_token": {
-    "__type": "AddedToken",
-    "content": "<|pad|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
+  "model_max_length": 2048,
+  "name_or_path": "ruGPTNEO1.3b_2_orig_tokens/checkpoint-10000/",
+  "pad_token": "<|pad|>",
   "special_tokens_map_file": null,
   "tokenizer_class": "GPT2Tokenizer",
   "unk_token": {
vocab.json CHANGED
The diff for this file is too large to render. See raw diff