ArmelR committed on
Commit
5ead9d8
1 Parent(s): 7b00df5

tokenizer stuff

Browse files
Files changed (4) hide show
  1. merges.txt +0 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +11 -0
  4. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "errors": "replace",
6
+ "model_max_length": 2048,
7
+ "name_or_path": "bigcode/santacoder",
8
+ "special_tokens_map_file": "/home/armelzebaze/.cache/huggingface/hub/models--bigcode--santacoder/snapshots/6a4fb77ff71c32c34dc8c61af500c7a7ca17c1a6/special_tokens_map.json",
9
+ "tokenizer_class": "GPT2Tokenizer",
10
+ "unk_token": "<|endoftext|>"
11
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff