AlexWortega committed on
Commit 642ef91
1 Parent(s): 4167854

Upload tokenizer

added_tokens.json ADDED
@@ -0,0 +1,8 @@
+{
+  "</code>": 50259,
+  "<code>": 50258,
+  "<instructionE>": 50261,
+  "<instructionS>": 50260,
+  "<next>": 50262,
+  "<|endoftext|>": 50257
+}
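The added_tokens.json above registers five new markers on top of the base vocabulary, alongside <|endoftext|>. Below is a minimal sketch of how such files could be produced with the Hugging Face transformers library; the base checkpoint name is taken from "name_or_path" in tokenizer_config.json further down, the output path "./tokenizer_out" is a hypothetical placeholder, and the actual script behind this commit is not part of the upload.

# Sketch only: assumes the tokenizer files in this commit were produced with
# Hugging Face `transformers`; "./tokenizer_out" is a placeholder path.
from transformers import AutoTokenizer

# Base checkpoint named in tokenizer_config.json ("name_or_path").
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")

# Register the five markers as additional special tokens so BPE never splits them.
tokenizer.add_special_tokens(
    {
        "additional_special_tokens": [
            "<code>",
            "</code>",
            "<instructionS>",
            "<instructionE>",
            "<next>",
        ]
    }
)

# save_pretrained() writes added_tokens.json, special_tokens_map.json,
# tokenizer_config.json, vocab.json and merges.txt, i.e. the files in this commit.
tokenizer.save_pretrained("./tokenizer_out")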
merges.txt ADDED
The diff for this file is too large to render. See raw diff
special_tokens_map.json CHANGED
@@ -1,17 +1,30 @@
 {
   "additional_special_tokens": [
-    "<madeupword0>",
-    "<madeupword1>",
-    "<madeupword2>",
-    "<madeupword3>",
-    "<madeupword4>",
-    "<madeupword5>",
-    "<madeupword6>"
+    "<code>",
+    "</code>",
+    "<instructionS>",
+    "<instructionE>",
+    "<next>"
   ],
-  "bos_token": "<s>",
-  "cls_token": "<s>",
-  "eos_token": "</s>",
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "unk_token": "<unk>"
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
 }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:00d163342a36b3ad1ea2f5f608e6bb2b2ff29bd453a41c4f52525a7ebc7c4b6a
-size 17210041
+oid sha256:72e35983ee232774e27e39f244f5eae96a3f52ca053c7c15d0e207f4cd346b43
+size 3737405
tokenizer_config.json CHANGED
@@ -1,21 +1,34 @@
 {
-  "additional_special_tokens": [
-    "<madeupword0>",
-    "<madeupword1>",
-    "<madeupword2>",
-    "<madeupword3>",
-    "<madeupword4>",
-    "<madeupword5>",
-    "<madeupword6>"
-  ],
-  "bos_token": "<s>",
-  "cls_token": "<s>",
-  "eos_token": "</s>",
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
-  "sp_model_kwargs": {},
-  "special_tokens_map_file": "/home/jovyan/.cache/huggingface/hub/models--facebook--xglm-1.7B/snapshots/d23a5e8e2164af31a84a26756b9b17f925143050/special_tokens_map.json",
-  "tokenizer_class": "XGLMTokenizer",
-  "unk_token": "<unk>"
+  "name_or_path": "sberbank-ai/rugpt3small_based_on_gpt2",
+  "pad_token": null,
+  "special_tokens_map_file": null,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
 }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
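For completeness, a short usage sketch of the uploaded tokenizer. The local path is a placeholder for wherever this commit's files are downloaded (the repo id is not stated on this page), and the expected IDs come from added_tokens.json above.

# Usage sketch (assumption: the files from this commit are in "./tokenizer_out";
# substitute the actual repo id when loading from the Hub).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer_out")

# The special tokens should map to the IDs listed in added_tokens.json.
print(tokenizer.convert_tokens_to_ids("<code>"))   # expected 50258
print(tokenizer.convert_tokens_to_ids("</code>"))  # expected 50259
print(tokenizer.convert_tokens_to_ids("<next>"))   # expected 50262

# The markers are kept as single tokens rather than being split by BPE.
ids = tokenizer.encode("<instructionS>write hello world<instructionE><code>print('hi')</code>")
print(tokenizer.convert_ids_to_tokens(ids))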