Alexandru Gherghescu committed
Commit: 8b5602c
Parent: fe8246f

Fix tokenizer


Instead of having a trained tokenizer from scratch, replace it with the
actual tokenizer used by the original model.

Note that while the vocabulary and merges come from the original GPT-1
model, the pre- and post-processing might differ slightly, since the two
implementations use different tokenization methods (spaCy vs.
HuggingFace's tokenizers).
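
To make the possible mismatch concrete, here is a minimal sketch (not part of the commit) that compares how the original spaCy-based GPT-1 tokenizer and the replacement fast tokenizer split the same text. The Hub id "openai-community/openai-gpt" and the local "tokenizer.json" path are assumptions for illustration.

from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Original GPT-1 tokenizer; with use_fast=False it relies on spaCy + ftfy when
# installed, otherwise it falls back to a BERT-style basic tokenizer.
reference = AutoTokenizer.from_pretrained("openai-community/openai-gpt", use_fast=False)

# The fast tokenizer shipped by this commit (vocabulary and merges from GPT-1).
replacement = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")

text = "Tokenizers don't always agree on pre-processing!"
print(reference.tokenize(text))
print(replacement.tokenize(text))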

special_tokens_map.json CHANGED
@@ -1,3 +1,3 @@
 {
-  "eos_token": "<|endoftext|>"
+  "unk_token": "<unk>"
 }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "<|endoftext|>",
+      "content": "<unk>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -10,7 +10,7 @@
     }
   },
   "clean_up_tokenization_spaces": true,
-  "eos_token": "<|endoftext|>",
   "model_max_length": 1000000000000000019884624838656,
-  "tokenizer_class": "PreTrainedTokenizerFast"
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>"
 }
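
After this change, loading the repository with HuggingFace's transformers should report <unk> as the unknown token and no EOS token, matching the updated config. A minimal sketch, assuming the files from this commit sit in the current directory:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # directory with the updated tokenizer files
print(tok.unk_token)  # expected: <unk>
print(tok.eos_token)  # expected: None, since eos_token was removed from the config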