Alexandru Gherghescu committed on
Commit
15c1815
1 Parent(s): bbb5d39

Fix tokenizer

Browse files

The tokenizer is the original tokenizer used by the GPT1 model, created
from the vocab and merges contained in their repo.

Files changed (2) hide show
  1. configuration_gpt1.py +1 -1
  2. tokenizer.json +6 -63
configuration_gpt1.py CHANGED
@@ -8,7 +8,7 @@ class GPT1Config(PretrainedConfig):
8
 
9
  def __init__(
10
  self,
11
- vocab_size=40000,
12
  hidden_size=768,
13
  intermediate_size=3072,
14
  num_hidden_layers=12,
 
8
 
9
  def __init__(
10
  self,
11
+ vocab_size=40478,
12
  hidden_size=768,
13
  intermediate_size=3072,
14
  num_hidden_layers=12,
tokenizer.json CHANGED
@@ -17,68 +17,11 @@
17
  "type": "Sequence",
18
  "normalizers": [
19
  {
20
- "type": "Replace",
21
- "pattern": {
22
- "String": "—"
23
- },
24
- "content": "-"
25
- },
26
- {
27
- "type": "Replace",
28
- "pattern": {
29
- "String": "–"
30
- },
31
- "content": "-"
32
- },
33
- {
34
- "type": "Replace",
35
- "pattern": {
36
- "String": "―"
37
- },
38
- "content": "-"
39
- },
40
- {
41
- "type": "Replace",
42
- "pattern": {
43
- "String": "…"
44
- },
45
- "content": "..."
46
- },
47
- {
48
- "type": "Replace",
49
- "pattern": {
50
- "String": "´"
51
- },
52
- "content": "'"
53
- },
54
- {
55
- "type": "Replace",
56
- "pattern": {
57
- "Regex": "(-+|~+|!+|\"+|;+|\\?+|\\++|,+|\\)+|\\(+|\\+|\\/+|\\*+|\\[+|\\]+|}+|{+|\\|+|_+)"
58
- },
59
- "content": " \\1 "
60
- },
61
- {
62
- "type": "Replace",
63
- "pattern": {
64
- "Regex": "\\s*\n\\s*"
65
- },
66
- "content": " \n "
67
- },
68
- {
69
- "type": "Replace",
70
- "pattern": {
71
- "Regex": "[^\\S\n]+"
72
- },
73
- "content": " "
74
- },
75
- {
76
- "type": "Strip",
77
- "strip_left": true,
78
- "strip_right": true
79
- },
80
- {
81
- "type": "Lowercase"
82
  }
83
  ]
84
  },
@@ -80586,4 +80529,4 @@
80586
  "bachel orette</w>"
80587
  ]
80588
  }
80589
- }
 
17
  "type": "Sequence",
18
  "normalizers": [
19
  {
20
+ "type": "BertNormalizer",
21
+ "clean_text": true,
22
+ "handle_chinese_chars": true,
23
+ "strip_accents": null,
24
+ "lowercase": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  ]
27
  },
 
80529
  "bachel orette</w>"
80530
  ]
80531
  }
80532
+ }