pablo-rf committed
Commit 9a094f3
1 Parent(s): deea6c7

[ADD] Tokenizer files

Files changed (4)
  1. merges.txt +0 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +50 -0
  4. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
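
Together, these four files form a complete GPT-2-style byte-level BPE tokenizer: vocab.json and merges.txt hold the BPE vocabulary and merge rules, tokenizer.json is the fast-tokenizer serialization of the same data, and tokenizer_config.json (above) wires <|endoftext|> (id 50256) into all four special-token slots (bos/eos/pad/unk), the standard GPT-2 convention. The very large model_max_length is the transformers sentinel value for "no explicit length limit recorded". A minimal sketch of loading and using the tokenizer, assuming the files sit at the root of this model repo (the repo id below is a placeholder, not the actual repository name):

from transformers import AutoTokenizer

# Placeholder repo id for illustration; substitute the repository this commit belongs to.
tokenizer = AutoTokenizer.from_pretrained("pablo-rf/some-model")

# All four special-token slots point at <|endoftext|> (id 50256).
print(tokenizer.eos_token, tokenizer.eos_token_id)        # <|endoftext|> 50256
print(tokenizer.pad_token_id == tokenizer.eos_token_id)   # True

# Round-trip a string through the byte-level BPE.
ids = tokenizer("Hello world")["input_ids"]
print(ids)                    # token ids
print(tokenizer.decode(ids))  # "Hello world"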