AlexHT_Hung committed
Commit 9c962ca
Parent(s): a4300b1
Remove dummy tokens, add func_start, func_end

Files changed:
- README.md +5 -3
- added_tokens.json +2 -2
- tokenizer.json +3 -29
- tokenizer_config.json +2 -2
README.md CHANGED
@@ -1,8 +1,10 @@
 
 The expanded Mistral vocabulary only includes the intersection with the Ministry of Education's list of 4808 commonly used characters
 
-25 dummy tokens were appended at the end; padding to a multiple of 64 can improve training efficiency
-These can serve as reserved space for special tokens in the future
+~~25 dummy tokens were appended at the end; padding to a multiple of 64 can improve training efficiency~~
+~~These can serve as reserved space for special tokens in the future~~
+- Removed the dummy tokens
+- Added `<|func_start|>`, `<|func_end|>`
 
 
 ```python
@@ -16,7 +18,7 @@
 )
 
 print('vocab size:', tokenizer.vocab_size)
-#vocab size:
+#vocab size: 35686
 
 print(tokenizer.tokenize('今天天氣真好!'))
 #['▁', '今', '天', '天', '氣', '真', '好', '!']
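For context, a minimal sketch of verifying the state described in the updated README with the `transformers` API; the repo id passed to `from_pretrained` is a placeholder, not the actual model path:

```python
from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual tokenizer path for this commit.
tokenizer = AutoTokenizer.from_pretrained("your-org/extended-mistral-tokenizer")

# Base vocabulary after the dummy tokens were removed (per the README output above).
print('vocab size:', tokenizer.vocab_size)                # expected: 35686

# The two function-call markers should sit right after the base vocabulary.
print(tokenizer.convert_tokens_to_ids('<|func_end|>'))    # expected: 35686
print(tokenizer.convert_tokens_to_ids('<|func_start|>'))  # expected: 35687
```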
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "<|func_end|>": 
-  "<|func_start|>": 
+  "<|func_end|>": 35686,
+  "<|func_start|>": 35687
 }
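A quick sanity check of the new mapping, as a sketch; it assumes added_tokens.json from this commit is available in the current directory:

```python
import json

# Assumes added_tokens.json from this commit is in the current directory.
with open("added_tokens.json", encoding="utf-8") as f:
    added = json.load(f)

# The dummy tokens are gone; only the two function-call markers remain.
assert added == {"<|func_end|>": 35686, "<|func_start|>": 35687}
```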
tokenizer.json CHANGED
@@ -31,7 +31,7 @@
       "special": true
     },
     {
-      "id": 
+      "id": 35686,
       "content": "<|func_end|>",
       "single_word": true,
       "lstrip": true,
@@ -40,7 +40,7 @@
       "special": false
     },
     {
-      "id": 
+      "id": 35687,
       "content": "<|func_start|>",
       "single_word": true,
       "lstrip": true,
@@ -35838,33 +35838,7 @@
       "賅": 35682,
       "簞": 35683,
       "鼴": 35684,
-      "躂": 35685,
-      "<DUMMY_0>": 35686,
-      "<DUMMY_1>": 35687,
-      "<DUMMY_2>": 35688,
-      "<DUMMY_3>": 35689,
-      "<DUMMY_4>": 35690,
-      "<DUMMY_5>": 35691,
-      "<DUMMY_6>": 35692,
-      "<DUMMY_7>": 35693,
-      "<DUMMY_8>": 35694,
-      "<DUMMY_9>": 35695,
-      "<DUMMY_10>": 35696,
-      "<DUMMY_11>": 35697,
-      "<DUMMY_12>": 35698,
-      "<DUMMY_13>": 35699,
-      "<DUMMY_14>": 35700,
-      "<DUMMY_15>": 35701,
-      "<DUMMY_16>": 35702,
-      "<DUMMY_17>": 35703,
-      "<DUMMY_18>": 35704,
-      "<DUMMY_19>": 35705,
-      "<DUMMY_20>": 35706,
-      "<DUMMY_21>": 35707,
-      "<DUMMY_22>": 35708,
-      "<DUMMY_23>": 35709,
-      "<DUMMY_24>": 35710,
-      "<DUMMY_25>": 35711
+      "躂": 35685
     },
     "merges": [
       "▁ t",
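The same ids can be read back through the `tokenizers` library; a sketch assuming tokenizer.json from this commit is available locally, with expected values taken from the diff above:

```python
from tokenizers import Tokenizer

# Assumes tokenizer.json from this commit is in the current directory.
tok = Tokenizer.from_file("tokenizer.json")

print(tok.token_to_id("躂"))              # expected: 35685, last regular vocab entry
print(tok.token_to_id("<|func_end|>"))    # expected: 35686
print(tok.token_to_id("<|func_start|>"))  # expected: 35687
print(tok.get_vocab_size())               # expected: 35688, including added tokens
```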
tokenizer_config.json CHANGED
@@ -26,7 +26,7 @@
     "single_word": false,
     "special": true
   },
-  "
+  "35686": {
     "content": "<|func_end|>",
     "lstrip": true,
     "normalized": false,
@@ -34,7 +34,7 @@
     "single_word": true,
     "special": false
   },
-  "
+  "35687": {
     "content": "<|func_start|>",
     "lstrip": true,
     "normalized": false,
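To confirm the two files stay in sync, a sketch that cross-checks added_tokens.json against tokenizer_config.json; it assumes the entries shown above live under the `added_tokens_decoder` key, as in recent transformers-format configs:

```python
import json

# Assumes both files from this commit are in the current directory.
with open("added_tokens.json", encoding="utf-8") as f:
    added = json.load(f)
with open("tokenizer_config.json", encoding="utf-8") as f:
    config = json.load(f)

# Each added-token id should appear in added_tokens_decoder (keyed by the id
# as a string) with the same token content.
decoder = config["added_tokens_decoder"]
for token, idx in added.items():
    assert decoder[str(idx)]["content"] == token
```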