mimba committed on
Commit
d7bd582
·
verified ·
1 Parent(s): 4afe0b6

Upload tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +123 -126
tokenizer.json CHANGED
@@ -26,7 +26,7 @@
26
  {
27
  "type": "Replace",
28
  "pattern": {
29
- "Regex": "[^ -\\\"$-.0-;?A-Za-zÀÁÂÉÈÊËÌÍÎÏÒÓÔÙÚÛŸŃŊŋƆɔƐɛʉǎǐǒǔḿẅ’ʼ£́̂̌]"
30
  },
31
  "content": ""
32
  }
@@ -43,131 +43,128 @@
43
  "model": {
44
  "type": "WordLevel",
45
  "vocab": {
46
- "A": 0,
47
- "B": 1,
48
- "C": 2,
49
- "E": 3,
50
- "F": 4,
51
- "G": 5,
52
- "H": 6,
53
- "I": 7,
54
- "J": 8,
55
- "K": 9,
56
- "L": 10,
57
- "M": 11,
58
- "N": 12,
59
- "O": 13,
60
- "P": 14,
61
- "R": 15,
62
- "S": 16,
63
- "T": 17,
64
- "V": 18,
65
- "W": 19,
66
- "Y": 20,
67
- "Z": 21,
68
- "À": 22,
69
- "Á": 23,
70
- "É": 24,
71
- "Ń": 25,
72
- "Ŋ": 26,
73
- "Ɔ": 27,
74
- "Ǎ": 28,
75
- "Ǐ": 29,
76
- "Ǒ": 30,
77
- "Ǔ": 31,
78
- "a": 32,
79
- "b": 33,
80
- "c": 34,
81
- "d": 35,
82
- "e": 36,
83
- "f": 37,
84
- "g": 38,
85
- "h": 39,
86
- "i": 40,
87
- "j": 41,
88
- "k": 42,
89
- "l": 43,
90
- "m": 44,
91
- "n": 45,
92
- "o": 46,
93
- "p": 47,
94
- "r": 48,
95
- "s": 49,
96
- "t": 50,
97
- "u": 51,
98
- "v": 52,
99
- "w": 53,
100
- "y": 54,
101
- "z": 55,
102
- "à": 56,
103
- "á": 57,
104
- "â": 58,
105
- "è": 59,
106
- "é": 60,
107
- "ê": 61,
108
- "ì": 62,
109
- "í": 63,
110
- "î": 64,
111
- "ï": 65,
112
- "ò": 66,
113
- "ó": 67,
114
- "ô": 68,
115
- "ù": 69,
116
- "ú": 70,
117
- "û": 71,
118
- "ÿ": 72,
119
- "ě": 73,
120
- "ń": 74,
121
- "ŋ": 75,
122
- "ɔ": 76,
123
- "ɛ": 77,
124
- "ʉ": 78,
125
- "ǎ": 79,
126
- "ǐ": 80,
127
- "ǒ": 81,
128
- "ǔ": 82,
129
- "ḿ": 83,
130
- "": 84,
131
- "": 85,
132
- "ʼ": 86,
133
- " ": 87,
134
- "!": 88,
135
- "\"": 89,
136
- "#": 90,
137
- "$": 91,
138
- "%": 92,
139
- "&": 93,
140
- "'": 94,
141
- "(": 95,
142
- ")": 96,
143
- "*": 97,
144
- "+": 98,
145
- ",": 99,
146
- "-": 100,
147
- ".": 101,
148
- "/": 102,
149
- "0": 103,
150
- "1": 104,
151
- "2": 105,
152
- "3": 106,
153
- "4": 107,
154
- "5": 108,
155
- "6": 109,
156
- "7": 110,
157
- "8": 111,
158
- "9": 112,
159
- ":": 113,
160
- ";": 114,
161
- "?": 115,
162
- "@": 116,
163
- "£": 117,
164
- "̀": 118,
165
- "́": 119,
166
- "̂": 120,
167
- "̄": 121,
168
- "̆": 122,
169
- "̌": 123
170
  },
171
- "unk_token": "́"
172
  }
173
  }
 
26
  {
27
  "type": "Replace",
28
  "pattern": {
29
+ "Regex": "[^a-zA-Z0-9\\s\\-\\!\\\"\\$\\%\\(\\)\\*\\+\\,\\.\\/\\:\\;\\?\\@\\_ÀÁÂÉÈÊËÌÍÎÏÒÓÔÙÚÛŸŃŊŋƆɔƐɛʉǎǐǒǔḿẅ’ʼ£̀́̂̌]"
30
  },
31
  "content": ""
32
  }
 
43
  "model": {
44
  "type": "WordLevel",
45
  "vocab": {
46
+ " ": 0,
47
+ "!": 1,
48
+ "\"": 2,
49
+ "$": 3,
50
+ "%": 4,
51
+ "&": 5,
52
+ "'": 6,
53
+ "(": 7,
54
+ ")": 8,
55
+ "*": 9,
56
+ "+": 10,
57
+ ",": 11,
58
+ "-": 12,
59
+ ".": 13,
60
+ "/": 14,
61
+ "0": 15,
62
+ "1": 16,
63
+ "2": 17,
64
+ "3": 18,
65
+ "4": 19,
66
+ "5": 20,
67
+ "6": 21,
68
+ "7": 22,
69
+ "8": 23,
70
+ "9": 24,
71
+ ":": 25,
72
+ ";": 26,
73
+ "?": 27,
74
+ "@": 28,
75
+ "A": 29,
76
+ "B": 30,
77
+ "C": 31,
78
+ "D": 32,
79
+ "E": 33,
80
+ "F": 34,
81
+ "G": 35,
82
+ "H": 36,
83
+ "I": 37,
84
+ "J": 38,
85
+ "K": 39,
86
+ "L": 40,
87
+ "M": 41,
88
+ "N": 42,
89
+ "O": 43,
90
+ "P": 44,
91
+ "Q": 45,
92
+ "R": 46,
93
+ "S": 47,
94
+ "T": 48,
95
+ "U": 49,
96
+ "V": 50,
97
+ "W": 51,
98
+ "X": 52,
99
+ "Y": 53,
100
+ "Z": 54,
101
+ "a": 55,
102
+ "b": 56,
103
+ "c": 57,
104
+ "d": 58,
105
+ "e": 59,
106
+ "f": 60,
107
+ "g": 61,
108
+ "h": 62,
109
+ "i": 63,
110
+ "j": 64,
111
+ "k": 65,
112
+ "l": 66,
113
+ "m": 67,
114
+ "n": 68,
115
+ "o": 69,
116
+ "p": 70,
117
+ "q": 71,
118
+ "r": 72,
119
+ "s": 73,
120
+ "t": 74,
121
+ "u": 75,
122
+ "v": 76,
123
+ "w": 77,
124
+ "x": 78,
125
+ "y": 79,
126
+ "z": 80,
127
+ "À": 81,
128
+ "Á": 82,
129
+ "Â": 83,
130
+ "É": 84,
131
+ "È": 85,
132
+ "Ê": 86,
133
+ "Ë": 87,
134
+ "Ì": 88,
135
+ "Í": 89,
136
+ "Î": 90,
137
+ "Ï": 91,
138
+ "Ò": 92,
139
+ "Ó": 93,
140
+ "Ô": 94,
141
+ "Ù": 95,
142
+ "Ú": 96,
143
+ "Û": 97,
144
+ "Ÿ": 98,
145
+ "Ń": 99,
146
+ "Ŋ": 100,
147
+ "ŋ": 101,
148
+ "Ɔ": 102,
149
+ "ɔ": 103,
150
+ "Ɛ": 104,
151
+ "ɛ": 105,
152
+ "ʉ": 106,
153
+ "ǎ": 107,
154
+ "ǐ": 108,
155
+ "ǒ": 109,
156
+ "ǔ": 110,
157
+ "ḿ": 111,
158
+ "": 112,
159
+ "": 113,
160
+ "ʼ": 114,
161
+ "£": 115,
162
+ "_": 116,
163
+ "̀": 117,
164
+ "́": 118,
165
+ "̂": 119,
166
+ "̌": 120
 
 
 
167
  },
168
+ "unk_token": ""
169
  }
170
  }