Fangbing Liu committed
Commit 9b0e556
1 Parent(s): 7fb349a

modify eos

special_tokens_map.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "unk_token": "<|endoftext|>"
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "unk_token": "<unk>"
 }
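
As a quick sanity check, the new map can be read back directly. A minimal sketch, assuming the updated file sits in a local checkout of this repo (the path is illustrative, not part of the commit):

import json

# Read the updated special_tokens_map.json from a local checkout (path assumed).
with open("special_tokens_map.json") as f:
    special_tokens = json.load(f)

# Values taken from the + side of the diff above.
assert special_tokens["bos_token"] == "<s>"
assert special_tokens["eos_token"] == "</s>"
assert special_tokens["unk_token"] == "<unk>"
print(special_tokens)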
tokenizer.json CHANGED
@@ -5,13 +5,32 @@
   "added_tokens": [
     {
       "id": 50256,
-      "content": "<|endoftext|>",
+      "content": "</s>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 50257,
+      "content": "<s>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 50258,
+      "content": "<unk>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
       "normalized": false,
       "special": true
     }
+
   ],
   "normalizer": null,
   "pre_tokenizer": {
@@ -50296,7 +50315,9 @@
       "ĠCollider": 50253,
       "Ġinformants": 50254,
       "Ġgazed": 50255,
-      "<|endoftext|>": 50256
+      "</s>": 50256,
+      "<s>": 50257,
+      "<unk>": 50258
     },
     "merges": [
       "Ġ t",
@@ -100301,4 +100322,4 @@
       "Ġg azed"
     ]
   }
-}
+}
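
The effect of these hunks can be checked with the Hugging Face tokenizers library: "</s>" takes over the former "<|endoftext|>" slot at id 50256, while "<s>" and "<unk>" are appended at 50257 and 50258. A minimal sketch, assuming a local copy of the updated tokenizer.json (the path is illustrative):

from tokenizers import Tokenizer

# Load the rebuilt tokenizer.json from a local checkout (path assumed).
tok = Tokenizer.from_file("tokenizer.json")

print(tok.token_to_id("</s>"))           # 50256, per the vocab hunk above
print(tok.token_to_id("<s>"))            # 50257
print(tok.token_to_id("<unk>"))          # 50258
print(tok.token_to_id("<|endoftext|>"))  # None; the old token string is gone
print(tok.get_vocab_size())              # 50259 if these are the only additions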
tokenizer_config.json CHANGED
@@ -1,10 +1,10 @@
 {
   "add_prefix_space": false,
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
   "model_max_length": 1024,
   "name_or_path": "gpt2",
   "special_tokens_map_file": null,
   "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
+  "unk_token": "<unk>"
 }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff
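
Taken together with the vocab.json change (too large to render here), the commit retargets the GPT-2 tokenizer's special tokens from "<|endoftext|>" to "<s>", "</s>" and "<unk>". A rough end-to-end check with transformers; the local path is a placeholder for wherever this repo is checked out, not the actual repo id:

from transformers import GPT2Tokenizer

# Load the updated tokenizer; "path/to/this-repo" is a placeholder path.
tokenizer = GPT2Tokenizer.from_pretrained("path/to/this-repo")

print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.unk_token)
# expected: <s> </s> <unk>, per tokenizer_config.json and special_tokens_map.json

# If vocab.json mirrors the tokenizer.json hunks, the ids line up as
# </s> -> 50256, <s> -> 50257, <unk> -> 50258.
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.unk_token_id)
print(len(tokenizer))  # vocabulary size including the remapped special tokens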