codebyzeb committed on
Commit
d23ef38
1 Parent(s): cf78632

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +2 -2
  2. tokenizer.json +0 -18
  3. tokenizer_config.json +2 -18
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "bos_token": "BOS",
3
- "eos_token": "EOS",
4
  "pad_token": "PAD",
5
  "unk_token": "UNK"
6
  }
 
1
  {
2
+ "bos_token": "UTT_BOUNDARY",
3
+ "eos_token": "UTT_BOUNDARY",
4
  "pad_token": "PAD",
5
  "unk_token": "UNK"
6
  }
tokenizer.json CHANGED
@@ -21,24 +21,6 @@
21
  "normalized": false,
22
  "special": true
23
  },
24
- {
25
- "id": 2,
26
- "content": "BOS",
27
- "single_word": false,
28
- "lstrip": false,
29
- "rstrip": false,
30
- "normalized": false,
31
- "special": true
32
- },
33
- {
34
- "id": 3,
35
- "content": "EOS",
36
- "single_word": false,
37
- "lstrip": false,
38
- "rstrip": false,
39
- "normalized": false,
40
- "special": true
41
- },
42
  {
43
  "id": 54,
44
  "content": "<|endoftext|>",
 
21
  "normalized": false,
22
  "special": true
23
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  {
25
  "id": 54,
26
  "content": "<|endoftext|>",
tokenizer_config.json CHANGED
@@ -17,22 +17,6 @@
17
  "single_word": false,
18
  "special": true
19
  },
20
- "2": {
21
- "content": "BOS",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "3": {
29
- "content": "EOS",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
  "54": {
37
  "content": "<|endoftext|>",
38
  "lstrip": false,
@@ -42,9 +26,9 @@
42
  "special": true
43
  }
44
  },
45
- "bos_token": "BOS",
46
  "clean_up_tokenization_spaces": true,
47
- "eos_token": "EOS",
48
  "model_max_length": 1000000000000000019884624838656,
49
  "pad_token": "PAD",
50
  "tokenizer_class": "GPT2Tokenizer",
 
17
  "single_word": false,
18
  "special": true
19
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  "54": {
21
  "content": "<|endoftext|>",
22
  "lstrip": false,
 
26
  "special": true
27
  }
28
  },
29
+ "bos_token": "UTT_BOUNDARY",
30
  "clean_up_tokenization_spaces": true,
31
+ "eos_token": "UTT_BOUNDARY",
32
  "model_max_length": 1000000000000000019884624838656,
33
  "pad_token": "PAD",
34
  "tokenizer_class": "GPT2Tokenizer",