revert tokenizer

Browse files

Files changed (5) hide show

README.md +5 -4
added_tokens.json +0 -3
special_tokens_map.json +1 -2
tokenizer.json +3 -15
tokenizer_config.json +11 -70

README.md CHANGED Viewed

@@ -1,18 +1,19 @@
 ---
 language:
 - fr
-license: mit
-library_name: transformers
 tags:
 - biomedical
 - clinical
 - life sciences
 datasets:
 - rntc/biomed-fr
-pipeline_tag: fill-mask
 widget:
-- text: Les médicaments <mask> typiques sont largement utilisés dans le traitement
     de première intention des patients schizophrènes.
 ---
 <a href=https://camembert-bio-model.fr/>

 ---
+license: mit
 language:
 - fr
+pipeline_tag: fill-mask
 tags:
 - biomedical
 - clinical
 - life sciences
 datasets:
 - rntc/biomed-fr
 widget:
+- text: >-
+    Les médicaments <mask> typiques sont largement utilisés dans le traitement
     de première intention des patients schizophrènes.
+library_name: transformers
 ---
 <a href=https://camembert-bio-model.fr/>

added_tokens.json DELETED Viewed

@@ -1,3 +0,0 @@
-{
-  "<unk>NOTUSED": 32005
-}

special_tokens_map.json CHANGED Viewed

@@ -1,8 +1,7 @@
 {
   "additional_special_tokens": [
     "<s>NOTUSED",
-    "</s>NOTUSED",
-    "<unk>NOTUSED"
   ],
   "bos_token": "<s>",
   "cls_token": "<s>",

 {
   "additional_special_tokens": [
     "<s>NOTUSED",
+    "</s>NOTUSED"
   ],
   "bos_token": "<s>",
   "cls_token": "<s>",

tokenizer.json CHANGED Viewed

@@ -65,15 +65,6 @@
       "rstrip": false,
       "normalized": false,
       "special": true
-    },
-    {
-      "id": 32005,
-      "content": "<unk>NOTUSED",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
     }
   ],
   "normalizer": {
@@ -89,8 +80,7 @@
       {
         "type": "Metaspace",
         "replacement": "▁",
-        "prepend_scheme": "always",
-        "split": true
       }
     ]
   },
@@ -178,8 +168,7 @@
   "decoder": {
     "type": "Metaspace",
     "replacement": "▁",
-    "prepend_scheme": "always",
-    "split": true
   },
   "model": {
     "type": "Unigram",
@@ -128205,7 +128194,6 @@
         "<mask>",
         0.0
       ]
-    ],
-    "byte_fallback": false
   }
 }

       "rstrip": false,
       "normalized": false,
       "special": true
     }
   ],
   "normalizer": {
       {
         "type": "Metaspace",
         "replacement": "▁",
+        "add_prefix_space": true
       }
     ]
   },
   "decoder": {
     "type": "Metaspace",
     "replacement": "▁",
+    "add_prefix_space": true
   },
   "model": {
     "type": "Unigram",
         "<mask>",
         0.0
       ]
+    ]
   }
 }

tokenizer_config.json CHANGED Viewed

@@ -1,83 +1,24 @@
 {
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<s>NOTUSED",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<pad>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "</s>NOTUSED",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "4": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "5": {
-      "content": "<s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "6": {
-      "content": "</s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "32004": {
-      "content": "<mask>",
-      "lstrip": true,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "32005": {
-      "content": "<unk>NOTUSED",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
   "additional_special_tokens": [
     "<s>NOTUSED",
-    "</s>NOTUSED",
-    "<unk>NOTUSED"
   ],
   "bos_token": "<s>",
-  "clean_up_tokenization_spaces": true,
   "cls_token": "<s>",
   "eos_token": "</s>",
-  "mask_token": "<mask>",
   "model_max_length": 512,
   "pad_token": "<pad>",
   "sep_token": "</s>",
   "tokenizer_class": "CamembertTokenizer",
   "unk_token": "<unk>"
 }

 {
   "additional_special_tokens": [
     "<s>NOTUSED",
+    "</s>NOTUSED"
   ],
   "bos_token": "<s>",
   "cls_token": "<s>",
   "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
   "model_max_length": 512,
+  "name_or_path": "camembert-base",
   "pad_token": "<pad>",
   "sep_token": "</s>",
+  "special_tokens_map_file": null,
   "tokenizer_class": "CamembertTokenizer",
   "unk_token": "<unk>"
 }