loubnabnl and RaymondLi committed
Commit cd86b97
Parent(s): 3482bf4

Update tokenizer (#11)


- add special tokens (1354bdd629179fb09a56394c65253a0748c68258)


Co-authored-by: Raymond Li <RaymondLi@users.noreply.huggingface.co>

special_tokens_map.json CHANGED
@@ -1 +1,9 @@
-{}
+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim-prefix>",
+    "<fim-middle>",
+    "<fim-suffix>",
+    "<fim-pad>"
+  ]
+}
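
For reference, a minimal sketch of how the new special_tokens_map.json entries surface in transformers once this commit is applied. The repo id below is a placeholder, not something named in this commit.

from transformers import AutoTokenizer

# Placeholder repo id; point this at the repository that contains this tokenizer.
tok = AutoTokenizer.from_pretrained("bigcode/your-tokenizer-repo")

# With the updated special_tokens_map.json, the five sentinels should show up here.
print(tok.additional_special_tokens)
# ['<|endoftext|>', '<fim-prefix>', '<fim-middle>', '<fim-suffix>', '<fim-pad>']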
tokenizer.json CHANGED
@@ -2,7 +2,53 @@
   "version": "1.0",
   "truncation": null,
   "padding": null,
-  "added_tokens": [],
+  "added_tokens": [
+    {
+      "id": 49152,
+      "content": "<|endoftext|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49153,
+      "content": "<fim-prefix>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49154,
+      "content": "<fim-middle>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49155,
+      "content": "<fim-suffix>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49156,
+      "content": "<fim-pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
   "normalizer": null,
   "pre_tokenizer": {
     "type": "Sequence",
tokenizer_config.json CHANGED
@@ -1,7 +1,5 @@
 {
-  "name_or_path": "bigcode/digit-bytelevel-bpe-jss-v1.1-49152",
-  "special_tokens_map_file": "/Users/leandro/.cache/huggingface/hub/models--bigcode--digit-bytelevel-bpe-jss-v1.1-49152/snapshots/fa09b77949689a484afafc5f89534e6b6ba2c151/special_tokens_map.json",
+  "errors": "replace",
   "tokenizer_class": "PreTrainedTokenizerFast",
-  "vocab_size": 49152,
   "model_max_length": 2048
 }
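
The fim-* sentinels added in this commit are typically used to build fill-in-the-middle prompts. Below is a sketch of one common prefix-suffix-middle layout; the exact ordering a downstream model expects is an assumption here, so check the model card before relying on it.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bigcode/your-tokenizer-repo")  # placeholder id

prefix = "def hello():\n    greeting = "
suffix = "\n    return greeting\n"

# One common FIM layout: prefix, then suffix, then the model generates the middle.
prompt = "<fim-prefix>" + prefix + "<fim-suffix>" + suffix + "<fim-middle>"
print(tok(prompt).input_ids)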