cakiki committed on
Commit
2e704d2
1 Parent(s): 935d1c4

Upload tokenizer

Browse files
Files changed (5) hide show
  1. merges.txt +0 -0
  2. special_tokens_map.json +25 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +29 -0
  5. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim-prefix>",
5
+ "<fim-middle>",
6
+ "<fim-suffix>",
7
+ "<fim-pad>",
8
+ "<filename>",
9
+ "<gh-stars>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<empty_output>",
18
+ "<commit_before>",
19
+ "<commit_msg>",
20
+ "<commit_after>"
21
+ ],
22
+ "bos_token": "<|endoftext|>",
23
+ "eos_token": "<|endoftext|>",
24
+ "unk_token": "<|endoftext|>"
25
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "additional_special_tokens": [
4
+ "<|endoftext|>",
5
+ "<fim-prefix>",
6
+ "<fim-middle>",
7
+ "<fim-suffix>",
8
+ "<fim-pad>",
9
+ "<filename>",
10
+ "<gh-stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<empty_output>",
19
+ "<commit_before>",
20
+ "<commit_msg>",
21
+ "<commit_after>"
22
+ ],
23
+ "bos_token": "<|endoftext|>",
24
+ "eos_token": "<|endoftext|>",
25
+ "model_max_length": 1000000000000000019884624838656,
26
+ "tokenizer_class": "GPT2Tokenizer",
27
+ "unk_token": "<|endoftext|>",
28
+ "vocab_size": 49152
29
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff