dotan1111 commited on
Commit
57453a9
1 Parent(s): b55e757

Upload tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +210 -0
tokenizer.json ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<UNK>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ }
15
+ ],
16
+ "normalizer": {
17
+ "type": "Lowercase"
18
+ },
19
+ "pre_tokenizer": {
20
+ "type": "Whitespace"
21
+ },
22
+ "post_processor": null,
23
+ "decoder": null,
24
+ "model": {
25
+ "type": "BPE",
26
+ "dropout": null,
27
+ "unk_token": "<UNK>",
28
+ "continuing_subword_prefix": null,
29
+ "end_of_word_suffix": null,
30
+ "fuse_unk": false,
31
+ "vocab": {
32
+ "<UNK>": 0,
33
+ "a": 1,
34
+ "b": 2,
35
+ "c": 3,
36
+ "d": 4,
37
+ "e": 5,
38
+ "f": 6,
39
+ "g": 7,
40
+ "h": 8,
41
+ "i": 9,
42
+ "k": 10,
43
+ "l": 11,
44
+ "m": 12,
45
+ "n": 13,
46
+ "o": 14,
47
+ "p": 15,
48
+ "q": 16,
49
+ "r": 17,
50
+ "s": 18,
51
+ "t": 19,
52
+ "u": 20,
53
+ "v": 21,
54
+ "w": 22,
55
+ "x": 23,
56
+ "y": 24,
57
+ "z": 25,
58
+ "aa": 26,
59
+ "ll": 27,
60
+ "la": 28,
61
+ "gg": 29,
62
+ "rr": 30,
63
+ "va": 31,
64
+ "ga": 32,
65
+ "ra": 33,
66
+ "lv": 34,
67
+ "pa": 35,
68
+ "lg": 36,
69
+ "sa": 37,
70
+ "lr": 38,
71
+ "ea": 39,
72
+ "vv": 40,
73
+ "da": 41,
74
+ "ta": 42,
75
+ "ls": 43,
76
+ "lp": 44,
77
+ "ia": 45,
78
+ "gr": 46,
79
+ "ld": 47,
80
+ "le": 48,
81
+ "ss": 49,
82
+ "gv": 50,
83
+ "lt": 51,
84
+ "gs": 52,
85
+ "er": 53,
86
+ "gt": 54,
87
+ "gd": 55,
88
+ "pp": 56,
89
+ "li": 57,
90
+ "vr": 58,
91
+ "ge": 59,
92
+ "qa": 60,
93
+ "fa": 61,
94
+ "lk": 62,
95
+ "vt": 63,
96
+ "vs": 64,
97
+ "gi": 65,
98
+ "vd": 66,
99
+ "ve": 67,
100
+ "lf": 68,
101
+ "pr": 69,
102
+ "ka": 70,
103
+ "dr": 71,
104
+ "ps": 72,
105
+ "lq": 73,
106
+ "ee": 74,
107
+ "tt": 75,
108
+ "gk": 76,
109
+ "na": 77,
110
+ "sr": 78,
111
+ "pd": 79,
112
+ "vi": 80,
113
+ "pe": 81,
114
+ "gf": 82,
115
+ "ln": 83,
116
+ "pt": 84,
117
+ "gq": 85,
118
+ "ha": 86,
119
+ "st": 87,
120
+ "dd": 88,
121
+ "qr": 89,
122
+ "gp": 90,
123
+ "ei": 91,
124
+ "ya": 92,
125
+ "kk": 93,
126
+ "gn": 94,
127
+ "lh": 95,
128
+ "vp": 96,
129
+ "tr": 97,
130
+ "vf": 98,
131
+ "si": 99
132
+ },
133
+ "merges": [
134
+ "a a",
135
+ "l l",
136
+ "l a",
137
+ "g g",
138
+ "r r",
139
+ "v a",
140
+ "g a",
141
+ "r a",
142
+ "l v",
143
+ "p a",
144
+ "l g",
145
+ "s a",
146
+ "l r",
147
+ "e a",
148
+ "v v",
149
+ "d a",
150
+ "t a",
151
+ "l s",
152
+ "l p",
153
+ "i a",
154
+ "g r",
155
+ "l d",
156
+ "l e",
157
+ "s s",
158
+ "g v",
159
+ "l t",
160
+ "g s",
161
+ "e r",
162
+ "g t",
163
+ "g d",
164
+ "p p",
165
+ "l i",
166
+ "v r",
167
+ "g e",
168
+ "q a",
169
+ "f a",
170
+ "l k",
171
+ "v t",
172
+ "v s",
173
+ "g i",
174
+ "v d",
175
+ "v e",
176
+ "l f",
177
+ "p r",
178
+ "k a",
179
+ "d r",
180
+ "p s",
181
+ "l q",
182
+ "e e",
183
+ "t t",
184
+ "g k",
185
+ "n a",
186
+ "s r",
187
+ "p d",
188
+ "v i",
189
+ "p e",
190
+ "g f",
191
+ "l n",
192
+ "p t",
193
+ "g q",
194
+ "h a",
195
+ "s t",
196
+ "d d",
197
+ "q r",
198
+ "g p",
199
+ "e i",
200
+ "y a",
201
+ "k k",
202
+ "g n",
203
+ "l h",
204
+ "v p",
205
+ "t r",
206
+ "v f",
207
+ "s i"
208
+ ]
209
+ }
210
+ }