Update tokenizer (#11)

- add special tokens (1354bdd629179fb09a56394c65253a0748c68258)

Co-authored-by: Raymond Li <RaymondLi@users.noreply.huggingface.co>

Files changed (3)

special_tokens_map.json CHANGED

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<fim-prefix>",
+    "<fim-middle>",
+    "<fim-suffix>",
+    "<fim-pad>"
+  ]
+}

tokenizer.json CHANGED

@@ -2,7 +2,53 @@
   "version": "1.0",
   "truncation": null,
   "padding": null,
-  "added_tokens": [],
   "normalizer": null,
   "pre_tokenizer": {
     "type": "Sequence",

   "version": "1.0",
   "truncation": null,
   "padding": null,
+  "added_tokens": [
+    {
+      "id": 49152,
+      "content": "<|endoftext|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49153,
+      "content": "<fim-prefix>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49154,
+      "content": "<fim-middle>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49155,
+      "content": "<fim-suffix>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 49156,
+      "content": "<fim-pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
   "normalizer": null,
   "pre_tokenizer": {
     "type": "Sequence",

tokenizer_config.json CHANGED

@@ -1,7 +1,5 @@
 {
-  "name_or_path": "bigcode/digit-bytelevel-bpe-jss-v1.1-49152",
-  "special_tokens_map_file": "/Users/leandro/.cache/huggingface/hub/models--bigcode--digit-bytelevel-bpe-jss-v1.1-49152/snapshots/fa09b77949689a484afafc5f89534e6b6ba2c151/special_tokens_map.json",
   "tokenizer_class": "PreTrainedTokenizerFast",
-  "vocab_size": 49152,
   "model_max_length": 2048
 }

 {
+  "errors": "replace",
   "tokenizer_class": "PreTrainedTokenizerFast",
   "model_max_length": 2048
 }