AlexHT_Hung committed
Commit 9c962ca
Parent(s): a4300b1
Remove dummy tokens, add func_start, func_end

Files changed:
- README.md +5 -3
- added_tokens.json +2 -2
- tokenizer.json +3 -29
- tokenizer_config.json +2 -2
README.md CHANGED
@@ -1,8 +1,10 @@
 
 The expanded Mistral vocabulary only includes the intersection with the Ministry of Education's list of 4808 commonly used characters
 
-25 dummy tokens were appended at the end; padding to a multiple of 64 can improve training efficiency
-These can serve as reserved space for special tokens in the future
+~~25 dummy tokens were appended at the end; padding to a multiple of 64 can improve training efficiency~~
+~~These can serve as reserved space for special tokens in the future~~
+- Removed the dummy tokens
+- Added `<|func_start|>`, `<|func_end|>`
 
 
 ```python
@@ -16,7 +18,7 @@
 )
 
 print('vocab size:', tokenizer.vocab_size)
-#vocab size:
+#vocab size: 35686
 
 print(tokenizer.tokenize('今天天氣真好!'))
 #['▁', '今', '天', '天', '氣', '真', '好', '!']
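For context, a minimal sketch of verifying the state described in the updated README with the `transformers` API; the repo id passed to `from_pretrained` is a placeholder, not the actual model path:

```python
from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual tokenizer path for this commit.
tokenizer = AutoTokenizer.from_pretrained("your-org/extended-mistral-tokenizer")

# Base vocabulary after the dummy tokens were removed (per the README output above).
print('vocab size:', tokenizer.vocab_size)                # expected: 35686

# The two function-call markers should sit right after the base vocabulary.
print(tokenizer.convert_tokens_to_ids('<|func_end|>'))    # expected: 35686
print(tokenizer.convert_tokens_to_ids('<|func_start|>'))  # expected: 35687
```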
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "<|func_end|>": 
-  "<|func_start|>": 
+  "<|func_end|>": 35686,
+  "<|func_start|>": 35687
 }
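A quick sanity check of the new mapping, as a sketch; it assumes added_tokens.json from this commit is available in the current directory:

```python
import json

# Assumes added_tokens.json from this commit is in the current directory.
with open("added_tokens.json", encoding="utf-8") as f:
    added = json.load(f)

# The dummy tokens are gone; only the two function-call markers remain.
assert added == {"<|func_end|>": 35686, "<|func_start|>": 35687}
```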
tokenizer.json CHANGED
@@ -31,7 +31,7 @@
       "special": true
     },
     {
-      "id": 
+      "id": 35686,
       "content": "<|func_end|>",
       "single_word": true,
       "lstrip": true,
@@ -40,7 +40,7 @@
       "special": false
     },
     {
-      "id": 
+      "id": 35687,
       "content": "<|func_start|>",
       "single_word": true,
       "lstrip": true,
@@ -35838,33 +35838,7 @@
       "賅": 35682,
       "簞": 35683,
       "鼴": 35684,
-      "躂": 35685,
-      "<DUMMY_0>": 35686,
-      "<DUMMY_1>": 35687,
-      "<DUMMY_2>": 35688,
-      "<DUMMY_3>": 35689,
-      "<DUMMY_4>": 35690,
-      "<DUMMY_5>": 35691,
-      "<DUMMY_6>": 35692,
-      "<DUMMY_7>": 35693,
-      "<DUMMY_8>": 35694,
-      "<DUMMY_9>": 35695,
-      "<DUMMY_10>": 35696,
-      "<DUMMY_11>": 35697,
-      "<DUMMY_12>": 35698,
-      "<DUMMY_13>": 35699,
-      "<DUMMY_14>": 35700,
-      "<DUMMY_15>": 35701,
-      "<DUMMY_16>": 35702,
-      "<DUMMY_17>": 35703,
-      "<DUMMY_18>": 35704,
-      "<DUMMY_19>": 35705,
-      "<DUMMY_20>": 35706,
-      "<DUMMY_21>": 35707,
-      "<DUMMY_22>": 35708,
-      "<DUMMY_23>": 35709,
-      "<DUMMY_24>": 35710,
-      "<DUMMY_25>": 35711
+      "躂": 35685
     },
     "merges": [
       "▁ t",
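The same ids can be read back through the `tokenizers` library; a sketch assuming tokenizer.json from this commit is available locally, with expected values taken from the diff above:

```python
from tokenizers import Tokenizer

# Assumes tokenizer.json from this commit is in the current directory.
tok = Tokenizer.from_file("tokenizer.json")

print(tok.token_to_id("躂"))              # expected: 35685, last regular vocab entry
print(tok.token_to_id("<|func_end|>"))    # expected: 35686
print(tok.token_to_id("<|func_start|>"))  # expected: 35687
print(tok.get_vocab_size())               # expected: 35688, including added tokens
```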
tokenizer_config.json CHANGED
@@ -26,7 +26,7 @@
     "single_word": false,
     "special": true
   },
-  "
+  "35686": {
     "content": "<|func_end|>",
     "lstrip": true,
     "normalized": false,
@@ -34,7 +34,7 @@
     "single_word": true,
     "special": false
   },
-  "
+  "35687": {
     "content": "<|func_start|>",
     "lstrip": true,
     "normalized": false,
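To confirm the two files stay in sync, a sketch that cross-checks added_tokens.json against tokenizer_config.json; it assumes the entries shown above live under the `added_tokens_decoder` key, as in recent transformers-format configs:

```python
import json

# Assumes both files from this commit are in the current directory.
with open("added_tokens.json", encoding="utf-8") as f:
    added = json.load(f)
with open("tokenizer_config.json", encoding="utf-8") as f:
    config = json.load(f)

# Each added-token id should appear in added_tokens_decoder (keyed by the id
# as a string) with the same token content.
decoder = config["added_tokens_decoder"]
for token, idx in added.items():
    assert decoder[str(idx)]["content"] == token
```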