Jubliano committed on
Commit
30fa4ef
1 Parent(s): ceb96b2

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +3 -3
  2. tokenizer_config.json +5 -5
  3. vocab.json +47 -17
added_tokens.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "</s>": 106,
3
- "<s>": 105,
4
- "[UNK]": 107
5
  }
 
1
  {
2
+ "</s>": 136,
3
+ "<s>": 135,
4
+ "[UNK]": 137
5
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "103": {
4
  "content": "[PAD]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": false
10
  },
11
- "104": {
12
  "content": "[UNK",
13
  "lstrip": true,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": false
18
  },
19
- "105": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "106": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
@@ -32,7 +32,7 @@
32
  "single_word": false,
33
  "special": true
34
  },
35
- "107": {
36
  "content": "[UNK]",
37
  "lstrip": false,
38
  "normalized": false,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "133": {
4
  "content": "[PAD]",
5
  "lstrip": true,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "134": {
12
  "content": "[UNK",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "135": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "136": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
 
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "137": {
36
  "content": "[UNK]",
37
  "lstrip": false,
38
  "normalized": false,
vocab.json CHANGED
@@ -1,8 +1,14 @@
1
  {
2
- " ": 99,
3
- ".": 88,
4
- "[PAD]": 103,
5
- "[UNK": 104,
 
 
 
 
 
 
6
  "a": 84,
7
  "b": 1,
8
  "c": 6,
@@ -25,43 +31,43 @@
25
  "t": 2,
26
  "u": 65,
27
  "v": 30,
28
- "w": 93,
29
  "x": 41,
30
  "y": 61,
31
  "z": 34,
32
- "|": 102,
33
- "ä": 92,
34
  "æ": 82,
35
  "ç": 39,
36
  "ð": 32,
37
  "ø": 70,
38
  "ħ": 45,
39
- "ĩ": 101,
40
  "ŋ": 19,
41
  "œ": 77,
42
- "ũ": 100,
43
  "ɐ": 83,
44
  "ɑ": 86,
45
  "ɒ": 87,
46
  "ɔ": 81,
47
- "ɕ": 89,
48
  "ɖ": 5,
49
- "ɗ": 97,
50
  "ɘ": 71,
51
  "ə": 75,
52
- "ɚ": 90,
53
  "ɛ": 76,
54
  "ɜ": 78,
55
  "ɞ": 79,
56
  "ɟ": 7,
57
- "ɡ": 95,
58
  "ɢ": 11,
59
  "ɣ": 42,
60
  "ɤ": 73,
61
  "ɦ": 48,
62
  "ɨ": 62,
63
  "ɪ": 66,
64
- "ɫ": 98,
65
  "ɬ": 49,
66
  "ɭ": 57,
67
  "ɮ": 50,
@@ -90,7 +96,7 @@
90
  "ʎ": 58,
91
  "ʏ": 67,
92
  "ʐ": 38,
93
- "ʑ": 96,
94
  "ʒ": 36,
95
  "ʔ": 13,
96
  "ʕ": 46,
@@ -98,10 +104,34 @@
98
  "ʝ": 40,
99
  "ʟ": 59,
100
  "ʡ": 12,
101
- "ː": 91,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  "β": 28,
103
  "θ": 31,
104
  "χ": 43,
105
- "ᵝ": 94,
106
  "ⱱ": 24
107
  }
 
1
  {
2
+ " ": 110,
3
+ "(": 99,
4
+ ")": 95,
5
+ ".": 93,
6
+ "2": 107,
7
+ "4": 109,
8
+ "5": 104,
9
+ "6": 105,
10
+ "[PAD]": 133,
11
+ "[UNK": 134,
12
  "a": 84,
13
  "b": 1,
14
  "c": 6,
 
31
  "t": 2,
32
  "u": 65,
33
  "v": 30,
34
+ "w": 100,
35
  "x": 41,
36
  "y": 61,
37
  "z": 34,
38
+ "|": 132,
39
+ "ä": 98,
40
  "æ": 82,
41
  "ç": 39,
42
  "ð": 32,
43
  "ø": 70,
44
  "ħ": 45,
45
+ "ĩ": 131,
46
  "ŋ": 19,
47
  "œ": 77,
48
+ "ũ": 130,
49
  "ɐ": 83,
50
  "ɑ": 86,
51
  "ɒ": 87,
52
  "ɔ": 81,
53
+ "ɕ": 94,
54
  "ɖ": 5,
55
+ "ɗ": 106,
56
  "ɘ": 71,
57
  "ə": 75,
58
+ "ɚ": 96,
59
  "ɛ": 76,
60
  "ɜ": 78,
61
  "ɞ": 79,
62
  "ɟ": 7,
63
+ "ɡ": 102,
64
  "ɢ": 11,
65
  "ɣ": 42,
66
  "ɤ": 73,
67
  "ɦ": 48,
68
  "ɨ": 62,
69
  "ɪ": 66,
70
+ "ɫ": 108,
71
  "ɬ": 49,
72
  "ɭ": 57,
73
  "ɮ": 50,
 
96
  "ʎ": 58,
97
  "ʏ": 67,
98
  "ʐ": 38,
99
+ "ʑ": 103,
100
  "ʒ": 36,
101
  "ʔ": 13,
102
  "ʕ": 46,
 
104
  "ʝ": 40,
105
  "ʟ": 59,
106
  "ʡ": 12,
107
+ "ʰ": 88,
108
+ "ʲ": 90,
109
+ "ʷ": 89,
110
+ "ː": 97,
111
+ "ˠ": 91,
112
+ "ˤ": 92,
113
+ "̃": 129,
114
+ "̈": 118,
115
+ "̊": 112,
116
+ "̘": 124,
117
+ "̙": 125,
118
+ "̜": 115,
119
+ "̝": 123,
120
+ "̞": 122,
121
+ "̟": 116,
122
+ "̠": 117,
123
+ "̤": 126,
124
+ "̥": 111,
125
+ "̩": 120,
126
+ "̬": 113,
127
+ "̯": 121,
128
+ "̰": 127,
129
+ "̹": 114,
130
+ "̼": 128,
131
+ "̽": 119,
132
  "β": 28,
133
  "θ": 31,
134
  "χ": 43,
135
+ "ᵝ": 101,
136
  "ⱱ": 24
137
  }