elliotthwangmsa committed on
Commit
028fc01
1 Parent(s): 4ac25f1

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -37,6 +37,6 @@
37
  " ": 50259,
38
  " ": 50258,
39
  " ": 50257,
40
- "<|im_start|>": 50295,
41
- "[PAD]": 50296
42
  }
 
37
  " ": 50259,
38
  " ": 50258,
39
  " ": 50257,
40
+ "<|im_end|>": 50296,
41
+ "<|im_start|>": 50295
42
  }
special_tokens_map.json CHANGED
@@ -7,19 +7,13 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "[PAD]",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
21
  "single_word": false
22
  },
 
23
  "unk_token": {
24
  "content": "<|endoftext|>",
25
  "lstrip": false,
 
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|im_end|>",
 
 
 
 
 
 
 
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": "</s>",
17
  "unk_token": {
18
  "content": "<|endoftext|>",
19
  "lstrip": false,
tokenizer.json CHANGED
@@ -365,7 +365,7 @@
365
  },
366
  {
367
  "id": 50296,
368
- "content": "[PAD]",
369
  "single_word": false,
370
  "lstrip": false,
371
  "rstrip": false,
 
365
  },
366
  {
367
  "id": 50296,
368
+ "content": "<|im_end|>",
369
  "single_word": false,
370
  "lstrip": false,
371
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -322,7 +322,7 @@
322
  "special": false
323
  },
324
  "50296": {
325
- "content": "[PAD]",
326
  "lstrip": false,
327
  "normalized": false,
328
  "rstrip": false,
@@ -332,9 +332,9 @@
332
  },
333
  "bos_token": "<|endoftext|>",
334
  "clean_up_tokenization_spaces": true,
335
- "eos_token": "<|endoftext|>",
336
  "model_max_length": 2048,
337
- "pad_token": "[PAD]",
338
  "tokenizer_class": "CodeGenTokenizer",
339
  "unk_token": "<|endoftext|>"
340
  }
 
322
  "special": false
323
  },
324
  "50296": {
325
+ "content": "<|im_end|>",
326
  "lstrip": false,
327
  "normalized": false,
328
  "rstrip": false,
 
332
  },
333
  "bos_token": "<|endoftext|>",
334
  "clean_up_tokenization_spaces": true,
335
+ "eos_token": "<|im_end|>",
336
  "model_max_length": 2048,
337
+ "pad_token": "</s>",
338
  "tokenizer_class": "CodeGenTokenizer",
339
  "unk_token": "<|endoftext|>"
340
  }