woodlee309 committed on
Commit
088f82f
1 Parent(s): 910c121

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,3 +1,3 @@
1
  {
2
- "<unk>": 177
3
  }
 
1
  {
2
+ "<unk>": 38
3
  }
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "pad_token": {
3
- "content": "_",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
 
1
  {
2
  "pad_token": {
3
+ "content": "k",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -2,14 +2,14 @@
2
  "add_blank": true,
3
  "added_tokens_decoder": {
4
  "0": {
5
- "content": "_",
6
  "lstrip": false,
7
  "normalized": false,
8
  "rstrip": false,
9
  "single_word": false,
10
  "special": true
11
  },
12
- "177": {
13
  "content": "<unk>",
14
  "lstrip": false,
15
  "normalized": false,
@@ -20,11 +20,11 @@
20
  },
21
  "clean_up_tokenization_spaces": true,
22
  "is_uroman": false,
23
- "language": null,
24
- "model_max_length": 1000000000000000019884624838656,
25
  "normalize": true,
26
- "pad_token": "_",
27
- "phonemize": true,
28
  "tokenizer_class": "VitsTokenizer",
29
  "unk_token": "<unk>",
30
  "verbose": false
 
2
  "add_blank": true,
3
  "added_tokens_decoder": {
4
  "0": {
5
+ "content": "k",
6
  "lstrip": false,
7
  "normalized": false,
8
  "rstrip": false,
9
  "single_word": false,
10
  "special": true
11
  },
12
+ "38": {
13
  "content": "<unk>",
14
  "lstrip": false,
15
  "normalized": false,
 
20
  },
21
  "clean_up_tokenization_spaces": true,
22
  "is_uroman": false,
23
+ "language": "eng",
24
+ "model_max_length": 4096,
25
  "normalize": true,
26
+ "pad_token": "k",
27
+ "phonemize": false,
28
  "tokenizer_class": "VitsTokenizer",
29
  "unk_token": "<unk>",
30
  "verbose": false
vocab.json CHANGED
@@ -1,179 +1,40 @@
1
  {
2
- " ": 16,
3
- "!": 5,
4
- "\"": 11,
5
- "'": 176,
6
- ",": 3,
7
- ".": 4,
8
- ":": 2,
9
- ";": 1,
10
- "?": 6,
11
- "A": 17,
12
- "B": 18,
13
- "C": 19,
14
- "D": 20,
15
- "E": 21,
16
- "F": 22,
17
- "G": 23,
18
- "H": 24,
19
- "I": 25,
20
- "J": 26,
21
- "K": 27,
22
- "L": 28,
23
- "M": 29,
24
- "N": 30,
25
- "O": 31,
26
- "P": 32,
27
- "Q": 33,
28
- "R": 34,
29
- "S": 35,
30
- "T": 36,
31
- "U": 37,
32
- "V": 38,
33
- "W": 39,
34
- "X": 40,
35
- "Y": 41,
36
- "Z": 42,
37
- "_": 0,
38
- "a": 43,
39
- "b": 44,
40
- "c": 45,
41
- "d": 46,
42
- "e": 47,
43
- "f": 48,
44
- "g": 49,
45
- "h": 50,
46
- "i": 51,
47
- "j": 52,
48
- "k": 53,
49
- "l": 54,
50
- "m": 55,
51
- "n": 56,
52
- "o": 57,
53
- "p": 58,
54
- "q": 59,
55
- "r": 60,
56
- "s": 61,
57
- "t": 62,
58
- "u": 63,
59
- "v": 64,
60
- "w": 65,
61
- "x": 66,
62
- "y": 67,
63
- "z": 68,
64
- "¡": 7,
65
- "«": 12,
66
- "»": 13,
67
- "¿": 8,
68
- "æ": 72,
69
- "ç": 78,
70
- "ð": 81,
71
- "ø": 116,
72
- "ħ": 98,
73
- "ŋ": 112,
74
- "œ": 120,
75
- "ǀ": 152,
76
- "ǁ": 153,
77
- "ǂ": 154,
78
- "ǃ": 155,
79
- "ɐ": 70,
80
- "ɑ": 69,
81
- "ɒ": 71,
82
- "ɓ": 73,
83
- "ɔ": 76,
84
- "ɕ": 77,
85
- "ɖ": 80,
86
- "ɗ": 79,
87
- "ɘ": 84,
88
- "ə": 83,
89
- "ɚ": 85,
90
- "ɛ": 86,
91
- "ɜ": 87,
92
- "ɝ": 88,
93
- "ɞ": 89,
94
- "ɟ": 90,
95
- "ɠ": 93,
96
- "ɡ": 92,
97
- "ɢ": 94,
98
- "ɣ": 139,
99
- "ɤ": 140,
100
- "ɥ": 99,
101
- "ɦ": 96,
102
- "ɧ": 97,
103
- "ɨ": 101,
104
- "ɪ": 102,
105
- "ɫ": 106,
106
- "ɬ": 105,
107
- "ɭ": 104,
108
- "ɮ": 107,
109
- "ɯ": 110,
110
- "ɰ": 111,
111
- "ɱ": 109,
112
- "ɲ": 114,
113
- "ɳ": 113,
114
- "ɴ": 115,
115
- "ɵ": 117,
116
- "ɶ": 121,
117
- "ɸ": 118,
118
- "ɹ": 123,
119
- "ɺ": 124,
120
- "ɻ": 126,
121
- "ɽ": 129,
122
- "ɾ": 125,
123
- "ʀ": 127,
124
- "ʁ": 128,
125
- "ʂ": 130,
126
- "ʃ": 131,
127
- "ʄ": 91,
128
- "ʈ": 132,
129
- "ʉ": 134,
130
- "ʊ": 135,
131
- "ʋ": 136,
132
- "ʌ": 138,
133
- "ʍ": 141,
134
- "ʎ": 143,
135
- "ʏ": 144,
136
- "ʐ": 146,
137
- "ʑ": 145,
138
- "ʒ": 147,
139
- "ʔ": 148,
140
- "ʕ": 150,
141
- "ʘ": 122,
142
- "ʙ": 74,
143
- "ʛ": 95,
144
- "ʜ": 100,
145
- "ʝ": 103,
146
- "ʟ": 108,
147
- "ʡ": 149,
148
- "ʢ": 151,
149
- "ʤ": 82,
150
- "ʧ": 133,
151
- "ʰ": 162,
152
- "ʱ": 163,
153
- "ʲ": 164,
154
- "ʴ": 161,
155
- "ʷ": 165,
156
- "ʼ": 160,
157
- "ˈ": 156,
158
- "ˌ": 157,
159
- "ː": 158,
160
- "ˑ": 159,
161
- "˞": 168,
162
- "ˠ": 166,
163
- "ˤ": 167,
164
- "̩": 175,
165
- "β": 75,
166
- "θ": 119,
167
- "χ": 142,
168
- "ᵻ": 177,
169
- "—": 9,
170
- "“": 14,
171
- "”": 15,
172
- "…": 10,
173
- "↑": 170,
174
- "→": 171,
175
- "↓": 169,
176
- "↗": 172,
177
- "↘": 173,
178
- "ⱱ": 137
179
  }
 
1
  {
2
+ " ": 19,
3
+ "'": 1,
4
+ "-": 14,
5
+ "0": 23,
6
+ "1": 15,
7
+ "2": 28,
8
+ "3": 11,
9
+ "4": 27,
10
+ "5": 35,
11
+ "6": 36,
12
+ "_": 30,
13
+ "a": 26,
14
+ "b": 24,
15
+ "c": 12,
16
+ "d": 5,
17
+ "e": 7,
18
+ "f": 20,
19
+ "g": 37,
20
+ "h": 6,
21
+ "i": 18,
22
+ "j": 16,
23
+ "k": 0,
24
+ "l": 21,
25
+ "m": 17,
26
+ "n": 29,
27
+ "o": 22,
28
+ "p": 13,
29
+ "q": 34,
30
+ "r": 25,
31
+ "s": 8,
32
+ "t": 33,
33
+ "u": 4,
34
+ "v": 32,
35
+ "w": 9,
36
+ "x": 31,
37
+ "y": 3,
38
+ "z": 2,
39
+ "": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  }