mimba committed on
Commit
b952f93
·
verified ·
1 Parent(s): 0508890

Upload tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +173 -0
tokenizer.json ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [],
6
+ "normalizer": {
7
+ "type": "Sequence",
8
+ "normalizers": [
9
+ {
10
+ "type": "NFKD"
11
+ },
12
+ {
13
+ "type": "Replace",
14
+ "pattern": {
15
+ "Regex": "\\s+"
16
+ },
17
+ "content": " "
18
+ },
19
+ {
20
+ "type": "Replace",
21
+ "pattern": {
22
+ "Regex": "[–—]"
23
+ },
24
+ "content": "-"
25
+ },
26
+ {
27
+ "type": "Replace",
28
+ "pattern": {
29
+ "Regex": "[^ -\\\"$-.0-;?A-Za-zÀÁÂÉÈÊËÌÍÎÏÒÓÔÙÚÛŸŃŊŋƆɔƐɛʉǎǐǒǔḿẅ’ʼ£́̂̌]"
30
+ },
31
+ "content": ""
32
+ }
33
+ ]
34
+ },
35
+ "pre_tokenizer": {
36
+ "type": "FixedLength",
37
+ "length": 1
38
+ },
39
+ "post_processor": null,
40
+ "decoder": {
41
+ "type": "Fuse"
42
+ },
43
+ "model": {
44
+ "type": "WordLevel",
45
+ "vocab": {
46
+ "A": 0,
47
+ "B": 1,
48
+ "C": 2,
49
+ "E": 3,
50
+ "F": 4,
51
+ "G": 5,
52
+ "H": 6,
53
+ "I": 7,
54
+ "J": 8,
55
+ "K": 9,
56
+ "L": 10,
57
+ "M": 11,
58
+ "N": 12,
59
+ "O": 13,
60
+ "P": 14,
61
+ "R": 15,
62
+ "S": 16,
63
+ "T": 17,
64
+ "V": 18,
65
+ "W": 19,
66
+ "Y": 20,
67
+ "Z": 21,
68
+ "À": 22,
69
+ "Á": 23,
70
+ "É": 24,
71
+ "Ń": 25,
72
+ "Ŋ": 26,
73
+ "Ɔ": 27,
74
+ "Ǎ": 28,
75
+ "Ǐ": 29,
76
+ "Ǒ": 30,
77
+ "Ǔ": 31,
78
+ "a": 32,
79
+ "b": 33,
80
+ "c": 34,
81
+ "d": 35,
82
+ "e": 36,
83
+ "f": 37,
84
+ "g": 38,
85
+ "h": 39,
86
+ "i": 40,
87
+ "j": 41,
88
+ "k": 42,
89
+ "l": 43,
90
+ "m": 44,
91
+ "n": 45,
92
+ "o": 46,
93
+ "p": 47,
94
+ "r": 48,
95
+ "s": 49,
96
+ "t": 50,
97
+ "u": 51,
98
+ "v": 52,
99
+ "w": 53,
100
+ "y": 54,
101
+ "z": 55,
102
+ "à": 56,
103
+ "á": 57,
104
+ "â": 58,
105
+ "è": 59,
106
+ "é": 60,
107
+ "ê": 61,
108
+ "ì": 62,
109
+ "í": 63,
110
+ "î": 64,
111
+ "ï": 65,
112
+ "ò": 66,
113
+ "ó": 67,
114
+ "ô": 68,
115
+ "ù": 69,
116
+ "ú": 70,
117
+ "û": 71,
118
+ "ÿ": 72,
119
+ "ě": 73,
120
+ "ń": 74,
121
+ "ŋ": 75,
122
+ "ɔ": 76,
123
+ "ɛ": 77,
124
+ "ʉ": 78,
125
+ "ǎ": 79,
126
+ "ǐ": 80,
127
+ "ǒ": 81,
128
+ "ǔ": 82,
129
+ "ḿ": 83,
130
+ "ẅ": 84,
131
+ "’": 85,
132
+ "ʼ": 86,
133
+ " ": 87,
134
+ "!": 88,
135
+ "\"": 89,
136
+ "#": 90,
137
+ "$": 91,
138
+ "%": 92,
139
+ "&": 93,
140
+ "'": 94,
141
+ "(": 95,
142
+ ")": 96,
143
+ "*": 97,
144
+ "+": 98,
145
+ ",": 99,
146
+ "-": 100,
147
+ ".": 101,
148
+ "/": 102,
149
+ "0": 103,
150
+ "1": 104,
151
+ "2": 105,
152
+ "3": 106,
153
+ "4": 107,
154
+ "5": 108,
155
+ "6": 109,
156
+ "7": 110,
157
+ "8": 111,
158
+ "9": 112,
159
+ ":": 113,
160
+ ";": 114,
161
+ "?": 115,
162
+ "@": 116,
163
+ "£": 117,
164
+ "̀": 118,
165
+ "́": 119,
166
+ "̂": 120,
167
+ "̄": 121,
168
+ "̆": 122,
169
+ "̌": 123
170
+ },
171
+ "unk_token": "́"
172
+ }
173
+ }