Jubliano committed on
Commit
296d082
1 Parent(s): 0759654

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +2 -2
  2. tokenizer_config.json +4 -4
  3. vocab.json +21 -41
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "</s>": 135,
3
- "<s>": 134
4
  }
 
1
  {
2
+ "</s>": 115,
3
+ "<s>": 114
4
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "132": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": false
10
  },
11
- "133": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": false
18
  },
19
- "134": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "135": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "112": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "113": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "114": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "115": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
vocab.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- " ": 130,
3
- "(": 118,
4
- ")": 114,
5
- ".": 112,
6
- "2": 126,
7
- "4": 129,
8
- "5": 123,
9
- "6": 124,
10
- "[PAD]": 133,
11
- "[UNK]": 132,
12
  "a": 84,
13
  "b": 1,
14
  "c": 6,
@@ -31,12 +31,12 @@
31
  "t": 2,
32
  "u": 65,
33
  "v": 30,
34
- "w": 119,
35
  "x": 41,
36
  "y": 61,
37
  "z": 34,
38
- "|": 131,
39
- "ä": 117,
40
  "æ": 82,
41
  "ç": 39,
42
  "ð": 32,
@@ -48,24 +48,24 @@
48
  "ɑ": 86,
49
  "ɒ": 87,
50
  "ɔ": 81,
51
- "ɕ": 113,
52
  "ɖ": 5,
53
- "ɗ": 125,
54
  "ɘ": 71,
55
  "ə": 75,
56
- "ɚ": 115,
57
  "ɛ": 76,
58
  "ɜ": 78,
59
  "ɞ": 79,
60
  "ɟ": 7,
61
- "ɡ": 121,
62
  "ɢ": 11,
63
  "ɣ": 42,
64
  "ɤ": 73,
65
  "ɦ": 48,
66
  "ɨ": 62,
67
  "ɪ": 66,
68
- "ɫ": 127,
69
  "ɬ": 49,
70
  "ɭ": 57,
71
  "ɮ": 50,
@@ -94,7 +94,7 @@
94
  "ʎ": 58,
95
  "ʏ": 67,
96
  "ʐ": 38,
97
- "ʑ": 122,
98
  "ʒ": 36,
99
  "ʔ": 13,
100
  "ʕ": 46,
@@ -105,32 +105,12 @@
105
  "ʰ": 88,
106
  "ʲ": 90,
107
  "ʷ": 89,
108
- "ː": 116,
109
  "ˠ": 91,
110
  "ˤ": 92,
111
- "̃": 111,
112
- "̈": 100,
113
- "̊": 94,
114
- "̘": 106,
115
- "̙": 107,
116
- "̜": 97,
117
- "̝": 105,
118
- "̞": 104,
119
- "̟": 98,
120
- "̠": 99,
121
- "̤": 108,
122
- "̥": 93,
123
- "̩": 102,
124
- "̪": 128,
125
- "̬": 95,
126
- "̯": 103,
127
- "̰": 109,
128
- "̹": 96,
129
- "̼": 110,
130
- "̽": 101,
131
  "β": 28,
132
  "θ": 31,
133
  "χ": 43,
134
- "ᵝ": 120,
135
  "ⱱ": 24
136
  }
 
1
  {
2
+ " ": 110,
3
+ "(": 99,
4
+ ")": 95,
5
+ ".": 93,
6
+ "2": 107,
7
+ "4": 109,
8
+ "5": 104,
9
+ "6": 105,
10
+ "[PAD]": 113,
11
+ "[UNK]": 112,
12
  "a": 84,
13
  "b": 1,
14
  "c": 6,
 
31
  "t": 2,
32
  "u": 65,
33
  "v": 30,
34
+ "w": 100,
35
  "x": 41,
36
  "y": 61,
37
  "z": 34,
38
+ "|": 111,
39
+ "ä": 98,
40
  "æ": 82,
41
  "ç": 39,
42
  "ð": 32,
 
48
  "ɑ": 86,
49
  "ɒ": 87,
50
  "ɔ": 81,
51
+ "ɕ": 94,
52
  "ɖ": 5,
53
+ "ɗ": 106,
54
  "ɘ": 71,
55
  "ə": 75,
56
+ "ɚ": 96,
57
  "ɛ": 76,
58
  "ɜ": 78,
59
  "ɞ": 79,
60
  "ɟ": 7,
61
+ "ɡ": 102,
62
  "ɢ": 11,
63
  "ɣ": 42,
64
  "ɤ": 73,
65
  "ɦ": 48,
66
  "ɨ": 62,
67
  "ɪ": 66,
68
+ "ɫ": 108,
69
  "ɬ": 49,
70
  "ɭ": 57,
71
  "ɮ": 50,
 
94
  "ʎ": 58,
95
  "ʏ": 67,
96
  "ʐ": 38,
97
+ "ʑ": 103,
98
  "ʒ": 36,
99
  "ʔ": 13,
100
  "ʕ": 46,
 
105
  "ʰ": 88,
106
  "ʲ": 90,
107
  "ʷ": 89,
108
+ "ː": 97,
109
  "ˠ": 91,
110
  "ˤ": 92,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  "β": 28,
112
  "θ": 31,
113
  "χ": 43,
114
+ "ᵝ": 101,
115
  "ⱱ": 24
116
  }