tuanio committed on
Commit
a97e27f
1 Parent(s): a27c8be

Upload tokenizer

Browse files
merges.txt CHANGED
@@ -19884,6 +19884,3 @@ pr it
19884
  Ġtor rents
19885
  Ġchim neys
19886
  Ġer a
19887
- Ġsugg ests
19888
- Ġund is
19889
- Ġbá»ī n
 
19884
  Ġtor rents
19885
  Ġchim neys
19886
  Ġer a
 
 
 
special_tokens_map.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "unk_token": "<|endoftext|>"
5
  }
 
1
  {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>"
5
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "add_prefix_space": false,
3
- "bos_token": "<|endoftext|>",
4
  "clean_up_tokenization_spaces": true,
5
- "eos_token": "<|endoftext|>",
6
  "model_max_length": 1000000000000000019884624838656,
7
  "tokenizer_class": "GPT2Tokenizer",
8
- "unk_token": "<|endoftext|>"
9
  }
 
1
  {
2
  "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
  "clean_up_tokenization_spaces": true,
5
+ "eos_token": "</s>",
6
  "model_max_length": 1000000000000000019884624838656,
7
  "tokenizer_class": "GPT2Tokenizer",
8
+ "unk_token": "<unk>"
9
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff