AlexHT_Hung committed on
Commit 9c962ca
1 Parent(s): a4300b1

Remove dummy tokens, add func_start, func_end

Files changed (4)
  1. README.md +5 -3
  2. added_tokens.json +2 -2
  3. tokenizer.json +3 -29
  4. tokenizer_config.json +2 -2
README.md CHANGED
@@ -1,8 +1,10 @@
 
  The extended Mistral vocabulary only contains the intersection with the 4,808 commonly used characters from the Ministry of Education
 
- 25 dummy tokens were appended at the end, padding the vocabulary to a multiple of 64 to improve training efficiency
- They can serve as reserved space for future special tokens
 
 
  ```python
@@ -16,7 +18,7 @@ tokenizer = AutoTokenizer.from_pretrained(
  )
 
  print('vocab size:', tokenizer.vocab_size)
- #vocab size: 35712
 
  print(tokenizer.tokenize('今天天氣真好!'))
  #['▁', '今', '天', '天', '氣', '真', '好', '!']
 
  The extended Mistral vocabulary only contains the intersection with the 4,808 commonly used characters from the Ministry of Education
 
+ ~~25 dummy tokens were appended at the end, padding the vocabulary to a multiple of 64 to improve training efficiency~~
+ ~~They can serve as reserved space for future special tokens~~
+ - Removed the dummy tokens
+ - Added `<|func_start|>`, `<|func_end|>`
 
 
  ```python
 
  )
 
  print('vocab size:', tokenizer.vocab_size)
+ #vocab size: 35686
 
  print(tokenizer.tokenize('今天天氣真好!'))
  #['▁', '今', '天', '天', '氣', '真', '好', '!']
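To see the effect of this change in practice, here is a minimal sketch (not part of the repo) that loads the updated tokenizer and wraps a function-call string with the new markers. The repo id and the example function call are placeholders, since the actual `from_pretrained` argument is truncated in the hunk above.

```python
from transformers import AutoTokenizer

# Hypothetical repo id -- the real from_pretrained argument is not shown in the diff.
tokenizer = AutoTokenizer.from_pretrained("AlexHT_Hung/extended-mistral-tokenizer")

# The markers are configured with single_word=True, so surrounding whitespace
# keeps them matching as standalone added tokens rather than parts of a word.
text = "<|func_start|> get_weather(city='Taipei') <|func_end|>"
print(tokenizer.tokenize(text))
# Each marker should appear as a single piece instead of being split into characters.
```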
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
  {
- "<|func_end|>": 35712,
- "<|func_start|>": 35713
  }
 
  {
+ "<|func_end|>": 35686,
+ "<|func_start|>": 35687
  }
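A small check, assuming a local copy of the repo files and the same placeholder repo id, that the new `added_tokens.json` mapping agrees with what the loaded tokenizer reports:

```python
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AlexHT_Hung/extended-mistral-tokenizer")  # placeholder id

# added_tokens.json maps tokens appended on top of the base vocab to their ids.
with open("added_tokens.json") as f:
    added = json.load(f)

print(added)                        # {'<|func_end|>': 35686, '<|func_start|>': 35687}
print(tokenizer.get_added_vocab())  # should agree with the file (plus any other added special tokens)
print(tokenizer.convert_tokens_to_ids(["<|func_start|>", "<|func_end|>"]))  # [35687, 35686]
```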
tokenizer.json CHANGED
@@ -31,7 +31,7 @@
  "special": true
  },
  {
- "id": 35712,
  "content": "<|func_end|>",
  "single_word": true,
  "lstrip": true,
@@ -40,7 +40,7 @@
  "special": false
  },
  {
- "id": 35713,
  "content": "<|func_start|>",
  "single_word": true,
  "lstrip": true,
@@ -35838,33 +35838,7 @@
  "賅": 35682,
  "簞": 35683,
  "鼴": 35684,
- "躂": 35685,
- "<DUMMY_0>": 35686,
- "<DUMMY_1>": 35687,
- "<DUMMY_2>": 35688,
- "<DUMMY_3>": 35689,
- "<DUMMY_4>": 35690,
- "<DUMMY_5>": 35691,
- "<DUMMY_6>": 35692,
- "<DUMMY_7>": 35693,
- "<DUMMY_8>": 35694,
- "<DUMMY_9>": 35695,
- "<DUMMY_10>": 35696,
- "<DUMMY_11>": 35697,
- "<DUMMY_12>": 35698,
- "<DUMMY_13>": 35699,
- "<DUMMY_14>": 35700,
- "<DUMMY_15>": 35701,
- "<DUMMY_16>": 35702,
- "<DUMMY_17>": 35703,
- "<DUMMY_18>": 35704,
- "<DUMMY_19>": 35705,
- "<DUMMY_20>": 35706,
- "<DUMMY_21>": 35707,
- "<DUMMY_22>": 35708,
- "<DUMMY_23>": 35709,
- "<DUMMY_24>": 35710,
- "<DUMMY_25>": 35711
  },
  "merges": [
  "▁ t",
  "special": true
  },
  {
+ "id": 35686,
  "content": "<|func_end|>",
  "single_word": true,
  "lstrip": true,
 
  "special": false
  },
  {
+ "id": 35687,
  "content": "<|func_start|>",
  "single_word": true,
  "lstrip": true,
 
  "賅": 35682,
  "簞": 35683,
  "鼴": 35684,
+ "躂": 35685
  },
  "merges": [
  "▁ t",
tokenizer_config.json CHANGED
@@ -26,7 +26,7 @@
  "single_word": false,
  "special": true
  },
- "35712": {
  "content": "<|func_end|>",
  "lstrip": true,
  "normalized": false,
@@ -34,7 +34,7 @@
  "single_word": true,
  "special": false
  },
- "35713": {
  "content": "<|func_start|>",
  "lstrip": true,
  "normalized": false,
 
  "single_word": false,
  "special": true
  },
+ "35686": {
  "content": "<|func_end|>",
  "lstrip": true,
  "normalized": false,
 
  "single_word": true,
  "special": false
  },
+ "35687": {
  "content": "<|func_start|>",
  "lstrip": true,
  "normalized": false,