grosenthal committed on
Commit
fda9a35
1 Parent(s): 014fc80

Upload tokenizer

Browse files
Files changed (5) hide show
  1. added_tokens.json +445 -0
  2. source.spm +0 -0
  3. target.spm +0 -0
  4. tokenizer_config.json +2 -2
  5. vocab.json +0 -0
added_tokens.json ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "CASE_-ve": 65119,
3
+ "CASE_Chium": 65187,
4
+ "CASE_Domini": 65225,
5
+ "CASE_Erant": 65208,
6
+ "CASE_Erat": 65394,
7
+ "CASE_Erit": 65019,
8
+ "CASE_Erunt": 65189,
9
+ "CASE_Gallis": 65375,
10
+ "CASE_a": 65145,
11
+ "CASE_abam": 65353,
12
+ "CASE_abamini": 65047,
13
+ "CASE_abamur": 65091,
14
+ "CASE_abamus": 65197,
15
+ "CASE_abant": 65376,
16
+ "CASE_abantur": 65365,
17
+ "CASE_abar": 65087,
18
+ "CASE_abare": 65028,
19
+ "CASE_abaris": 65254,
20
+ "CASE_abas": 65215,
21
+ "CASE_abat": 65210,
22
+ "CASE_abatis": 65148,
23
+ "CASE_abatur": 65190,
24
+ "CASE_abere": 65329,
25
+ "CASE_aberis": 65355,
26
+ "CASE_abimini": 65135,
27
+ "CASE_abimur": 65272,
28
+ "CASE_abimus": 65193,
29
+ "CASE_abis": 65037,
30
+ "CASE_abit": 65084,
31
+ "CASE_abitis": 65359,
32
+ "CASE_abitur": 65199,
33
+ "CASE_abo": 65413,
34
+ "CASE_abor": 65083,
35
+ "CASE_abunt": 65222,
36
+ "CASE_abuntur": 65082,
37
+ "CASE_abus": 65290,
38
+ "CASE_ac": 65104,
39
+ "CASE_accersendus": 65001,
40
+ "CASE_ad": 65318,
41
+ "CASE_ae": 65267,
42
+ "CASE_aec": 65410,
43
+ "CASE_ai": 65195,
44
+ "CASE_am": 65102,
45
+ "CASE_amini": 65122,
46
+ "CASE_amore": 65044,
47
+ "CASE_amur": 65137,
48
+ "CASE_amus": 65229,
49
+ "CASE_an": 65095,
50
+ "CASE_anc": 65060,
51
+ "CASE_anda": 65311,
52
+ "CASE_andae": 65053,
53
+ "CASE_andam": 65024,
54
+ "CASE_andarum": 65227,
55
+ "CASE_andas": 65027,
56
+ "CASE_ande": 65156,
57
+ "CASE_andi": 65319,
58
+ "CASE_andis": 65282,
59
+ "CASE_ando": 65221,
60
+ "CASE_andorum": 65045,
61
+ "CASE_andos": 65274,
62
+ "CASE_andum": 65256,
63
+ "CASE_andus": 65160,
64
+ "CASE_ans": 65219,
65
+ "CASE_ant": 65432,
66
+ "CASE_ante": 65246,
67
+ "CASE_antem": 65379,
68
+ "CASE_antes": 65025,
69
+ "CASE_anti": 65356,
70
+ "CASE_antia": 65100,
71
+ "CASE_antibus": 65377,
72
+ "CASE_antis": 65308,
73
+ "CASE_antium": 65239,
74
+ "CASE_anto": 65245,
75
+ "CASE_antum": 65074,
76
+ "CASE_antur": 65056,
77
+ "CASE_ar": 65317,
78
+ "CASE_arce": 65166,
79
+ "CASE_are": 65350,
80
+ "CASE_arem": 65436,
81
+ "CASE_aremini": 65134,
82
+ "CASE_aremur": 65280,
83
+ "CASE_aremus": 65105,
84
+ "CASE_arent": 65258,
85
+ "CASE_arentur": 65275,
86
+ "CASE_arer": 65341,
87
+ "CASE_arere": 65427,
88
+ "CASE_areris": 65315,
89
+ "CASE_ares": 65198,
90
+ "CASE_aret": 65062,
91
+ "CASE_aretis": 65063,
92
+ "CASE_aretur": 65079,
93
+ "CASE_ari": 65211,
94
+ "CASE_arier": 65276,
95
+ "CASE_aris": 65098,
96
+ "CASE_armis": 65362,
97
+ "CASE_arum": 65106,
98
+ "CASE_arun": 65069,
99
+ "CASE_as": 65327,
100
+ "CASE_at": 65278,
101
+ "CASE_ate": 65302,
102
+ "CASE_atis": 65011,
103
+ "CASE_ato": 65396,
104
+ "CASE_ator": 65223,
105
+ "CASE_atote": 65109,
106
+ "CASE_atur": 65200,
107
+ "CASE_autem": 65174,
108
+ "CASE_bam": 65124,
109
+ "CASE_bamus": 65249,
110
+ "CASE_bant": 65155,
111
+ "CASE_bantur": 65403,
112
+ "CASE_bas": 65188,
113
+ "CASE_bat": 65435,
114
+ "CASE_batur": 65112,
115
+ "CASE_bimus": 65228,
116
+ "CASE_bis": 65206,
117
+ "CASE_bit": 65192,
118
+ "CASE_bitis": 65434,
119
+ "CASE_bo": 65264,
120
+ "CASE_bunt": 65441,
121
+ "CASE_circuitu": 65247,
122
+ "CASE_d": 65054,
123
+ "CASE_diebus": 65387,
124
+ "CASE_dolo": 65131,
125
+ "CASE_e": 65404,
126
+ "CASE_eam": 65234,
127
+ "CASE_eamini": 65214,
128
+ "CASE_eamur": 65399,
129
+ "CASE_eamus": 65186,
130
+ "CASE_eant": 65077,
131
+ "CASE_eantur": 65354,
132
+ "CASE_ear": 65163,
133
+ "CASE_eare": 65041,
134
+ "CASE_earis": 65370,
135
+ "CASE_eas": 65409,
136
+ "CASE_eat": 65291,
137
+ "CASE_eatis": 65400,
138
+ "CASE_eatur": 65080,
139
+ "CASE_ebam": 65059,
140
+ "CASE_ebamini": 65433,
141
+ "CASE_ebamur": 65115,
142
+ "CASE_ebamus": 65180,
143
+ "CASE_ebant": 65401,
144
+ "CASE_ebantur": 65136,
145
+ "CASE_ebar": 65342,
146
+ "CASE_ebare": 65049,
147
+ "CASE_ebaris": 65064,
148
+ "CASE_ebas": 65414,
149
+ "CASE_ebat": 65237,
150
+ "CASE_ebatis": 65382,
151
+ "CASE_ebatur": 65111,
152
+ "CASE_ebere": 65031,
153
+ "CASE_eberis": 65381,
154
+ "CASE_ebimur": 65061,
155
+ "CASE_ebimus": 65038,
156
+ "CASE_ebis": 65051,
157
+ "CASE_ebit": 65110,
158
+ "CASE_ebitis": 65162,
159
+ "CASE_ebitur": 65076,
160
+ "CASE_ebo": 65391,
161
+ "CASE_ebor": 65292,
162
+ "CASE_ebunt": 65286,
163
+ "CASE_ebuntur": 65316,
164
+ "CASE_ebus": 65277,
165
+ "CASE_ei": 65325,
166
+ "CASE_eidem": 65301,
167
+ "CASE_eis": 65141,
168
+ "CASE_em": 65339,
169
+ "CASE_eme": 65232,
170
+ "CASE_emini": 65123,
171
+ "CASE_emur": 65191,
172
+ "CASE_emus": 65002,
173
+ "CASE_en": 65175,
174
+ "CASE_enda": 65333,
175
+ "CASE_endae": 65118,
176
+ "CASE_endam": 65066,
177
+ "CASE_endarum": 65057,
178
+ "CASE_endas": 65009,
179
+ "CASE_ende": 65086,
180
+ "CASE_endi": 65368,
181
+ "CASE_endis": 65058,
182
+ "CASE_endo": 65182,
183
+ "CASE_endorum": 65425,
184
+ "CASE_endos": 65297,
185
+ "CASE_endum": 65073,
186
+ "CASE_endus": 65361,
187
+ "CASE_ens": 65238,
188
+ "CASE_ent": 65213,
189
+ "CASE_ente": 65030,
190
+ "CASE_entem": 65005,
191
+ "CASE_entes": 65426,
192
+ "CASE_enti": 65020,
193
+ "CASE_entia": 65133,
194
+ "CASE_entibus": 65032,
195
+ "CASE_entis": 65298,
196
+ "CASE_entium": 65094,
197
+ "CASE_ento": 65439,
198
+ "CASE_entum": 65142,
199
+ "CASE_entur": 65392,
200
+ "CASE_enus": 65430,
201
+ "CASE_eo": 65261,
202
+ "CASE_eor": 65130,
203
+ "CASE_er": 65429,
204
+ "CASE_eram": 65366,
205
+ "CASE_eramus": 65420,
206
+ "CASE_erant": 65373,
207
+ "CASE_eras": 65411,
208
+ "CASE_erat": 65294,
209
+ "CASE_eratis": 65386,
210
+ "CASE_ere": 65230,
211
+ "CASE_erem": 65268,
212
+ "CASE_eremini": 65348,
213
+ "CASE_eremur": 65008,
214
+ "CASE_eremus": 65389,
215
+ "CASE_erent": 65120,
216
+ "CASE_erentur": 65374,
217
+ "CASE_erer": 65204,
218
+ "CASE_erere": 65138,
219
+ "CASE_ereris": 65139,
220
+ "CASE_eres": 65344,
221
+ "CASE_eret": 65357,
222
+ "CASE_eretis": 65143,
223
+ "CASE_eretur": 65022,
224
+ "CASE_eri": 65203,
225
+ "CASE_erier": 65405,
226
+ "CASE_erim": 65241,
227
+ "CASE_erimus": 65326,
228
+ "CASE_erint": 65273,
229
+ "CASE_eris": 65402,
230
+ "CASE_erit": 65114,
231
+ "CASE_eritis": 65154,
232
+ "CASE_ero": 65068,
233
+ "CASE_erum": 65287,
234
+ "CASE_erunt": 65067,
235
+ "CASE_es": 65107,
236
+ "CASE_ese": 65320,
237
+ "CASE_esse": 65216,
238
+ "CASE_essem": 65281,
239
+ "CASE_essemus": 65126,
240
+ "CASE_essent": 65034,
241
+ "CASE_esset": 65251,
242
+ "CASE_est": 65364,
243
+ "CASE_estis": 65157,
244
+ "CASE_esto": 65071,
245
+ "CASE_et": 65014,
246
+ "CASE_ete": 65171,
247
+ "CASE_etis": 65089,
248
+ "CASE_eto": 65248,
249
+ "CASE_etor": 65125,
250
+ "CASE_etote": 65220,
251
+ "CASE_etur": 65380,
252
+ "CASE_fore": 65017,
253
+ "CASE_forent": 65078,
254
+ "CASE_foret": 65212,
255
+ "CASE_germanus": 65424,
256
+ "CASE_i": 65334,
257
+ "CASE_ia": 65159,
258
+ "CASE_iant": 65201,
259
+ "CASE_ias": 65218,
260
+ "CASE_iat": 65322,
261
+ "CASE_ibe": 65099,
262
+ "CASE_ibi": 65161,
263
+ "CASE_ibus": 65081,
264
+ "CASE_ic": 65415,
265
+ "CASE_id": 65035,
266
+ "CASE_iens": 65039,
267
+ "CASE_ier": 65367,
268
+ "CASE_ieri": 65129,
269
+ "CASE_ies": 65253,
270
+ "CASE_ihi": 65422,
271
+ "CASE_ii": 65178,
272
+ "CASE_illa": 65305,
273
+ "CASE_ille": 65043,
274
+ "CASE_im": 65183,
275
+ "CASE_imini": 65007,
276
+ "CASE_imur": 65092,
277
+ "CASE_imus": 65442,
278
+ "CASE_int": 65299,
279
+ "CASE_ipsius": 65385,
280
+ "CASE_ire": 65283,
281
+ "CASE_irem": 65128,
282
+ "CASE_iremini": 65352,
283
+ "CASE_iremur": 65065,
284
+ "CASE_iremus": 65345,
285
+ "CASE_irent": 65384,
286
+ "CASE_irentur": 65328,
287
+ "CASE_irer": 65252,
288
+ "CASE_irere": 65412,
289
+ "CASE_ireris": 65226,
290
+ "CASE_ires": 65335,
291
+ "CASE_iret": 65243,
292
+ "CASE_iretis": 65314,
293
+ "CASE_iretur": 65431,
294
+ "CASE_iri": 65269,
295
+ "CASE_irier": 65101,
296
+ "CASE_iris": 65257,
297
+ "CASE_is": 65158,
298
+ "CASE_isse": 65151,
299
+ "CASE_issem": 65108,
300
+ "CASE_issemus": 65397,
301
+ "CASE_issent": 65330,
302
+ "CASE_isses": 65150,
303
+ "CASE_isset": 65423,
304
+ "CASE_issetis": 65010,
305
+ "CASE_isti": 65428,
306
+ "CASE_istis": 65070,
307
+ "CASE_it": 65096,
308
+ "CASE_ite": 65018,
309
+ "CASE_itis": 65408,
310
+ "CASE_ito": 65388,
311
+ "CASE_itor": 65165,
312
+ "CASE_itote": 65378,
313
+ "CASE_itur": 65097,
314
+ "CASE_ium": 65103,
315
+ "CASE_ius": 65021,
316
+ "CASE_jus": 65176,
317
+ "CASE_latuit": 65418,
318
+ "CASE_le": 65371,
319
+ "CASE_lem": 65055,
320
+ "CASE_lent": 65240,
321
+ "CASE_les": 65360,
322
+ "CASE_let": 65194,
323
+ "CASE_litore": 65289,
324
+ "CASE_ma": 65116,
325
+ "CASE_mae": 65271,
326
+ "CASE_maerens": 65012,
327
+ "CASE_magistratu": 65346,
328
+ "CASE_mam": 65144,
329
+ "CASE_marum": 65259,
330
+ "CASE_mas": 65209,
331
+ "CASE_me": 65393,
332
+ "CASE_mi": 65121,
333
+ "CASE_mini": 65338,
334
+ "CASE_mis": 65040,
335
+ "CASE_mo": 65260,
336
+ "CASE_morum": 65036,
337
+ "CASE_mos": 65172,
338
+ "CASE_mum": 65284,
339
+ "CASE_mur": 65164,
340
+ "CASE_mus": 65205,
341
+ "CASE_non": 65383,
342
+ "CASE_o": 65146,
343
+ "CASE_obis": 65417,
344
+ "CASE_obus": 65085,
345
+ "CASE_oc": 65421,
346
+ "CASE_od": 65304,
347
+ "CASE_oe": 65390,
348
+ "CASE_om": 65306,
349
+ "CASE_on": 65293,
350
+ "CASE_or": 65168,
351
+ "CASE_ora": 65310,
352
+ "CASE_ore": 65416,
353
+ "CASE_orem": 65336,
354
+ "CASE_ores": 65196,
355
+ "CASE_ori": 65349,
356
+ "CASE_oribus": 65052,
357
+ "CASE_oris": 65279,
358
+ "CASE_orum": 65437,
359
+ "CASE_orun": 65140,
360
+ "CASE_os": 65262,
361
+ "CASE_pax": 65440,
362
+ "CASE_praeclarior": 65177,
363
+ "CASE_quae": 65149,
364
+ "CASE_qui": 65324,
365
+ "CASE_quod": 65013,
366
+ "CASE_re": 65184,
367
+ "CASE_rem": 65015,
368
+ "CASE_remur": 65242,
369
+ "CASE_remus": 65332,
370
+ "CASE_rent": 65075,
371
+ "CASE_rentur": 65323,
372
+ "CASE_res": 65312,
373
+ "CASE_ret": 65233,
374
+ "CASE_retis": 65406,
375
+ "CASE_retur": 65285,
376
+ "CASE_ri": 65016,
377
+ "CASE_rier": 65266,
378
+ "CASE_ris": 65363,
379
+ "CASE_s": 65236,
380
+ "CASE_se": 65372,
381
+ "CASE_sem": 65117,
382
+ "CASE_semus": 65050,
383
+ "CASE_sent": 65270,
384
+ "CASE_ses": 65309,
385
+ "CASE_set": 65343,
386
+ "CASE_setis": 65331,
387
+ "CASE_t": 65295,
388
+ "CASE_te": 65288,
389
+ "CASE_tis": 65358,
390
+ "CASE_to": 65170,
391
+ "CASE_tote": 65231,
392
+ "CASE_tristis": 65398,
393
+ "CASE_tuo": 65202,
394
+ "CASE_tur": 65003,
395
+ "CASE_u": 65244,
396
+ "CASE_ua": 65179,
397
+ "CASE_ubi": 65395,
398
+ "CASE_ubus": 65072,
399
+ "CASE_ud": 65004,
400
+ "CASE_ui": 65321,
401
+ "CASE_um": 65250,
402
+ "CASE_umus": 65296,
403
+ "CASE_unc": 65088,
404
+ "CASE_unda": 65169,
405
+ "CASE_undae": 65029,
406
+ "CASE_undam": 65048,
407
+ "CASE_undarum": 65224,
408
+ "CASE_undas": 65300,
409
+ "CASE_unde": 65006,
410
+ "CASE_undi": 65438,
411
+ "CASE_undis": 65153,
412
+ "CASE_undo": 65127,
413
+ "CASE_undorum": 65207,
414
+ "CASE_undos": 65033,
415
+ "CASE_undum": 65217,
416
+ "CASE_undus": 65347,
417
+ "CASE_unt": 65026,
418
+ "CASE_unte": 65307,
419
+ "CASE_untem": 65337,
420
+ "CASE_untes": 65181,
421
+ "CASE_unti": 65265,
422
+ "CASE_untia": 65313,
423
+ "CASE_untibus": 65303,
424
+ "CASE_untis": 65046,
425
+ "CASE_untium": 65132,
426
+ "CASE_unto": 65093,
427
+ "CASE_untur": 65147,
428
+ "CASE_ura": 65090,
429
+ "CASE_urae": 65185,
430
+ "CASE_uram": 65443,
431
+ "CASE_urarum": 65152,
432
+ "CASE_uras": 65235,
433
+ "CASE_ure": 65351,
434
+ "CASE_uri": 65167,
435
+ "CASE_uris": 65263,
436
+ "CASE_uro": 65255,
437
+ "CASE_urorum": 65369,
438
+ "CASE_uros": 65173,
439
+ "CASE_urum": 65419,
440
+ "CASE_urus": 65340,
441
+ "CASE_us": 65407,
442
+ "CASE_ut": 65023,
443
+ "CASE_uum": 65113,
444
+ "CASE_vetus": 65042
445
+ }
source.spm CHANGED
Binary files a/source.spm and b/source.spm differ
 
target.spm CHANGED
Binary files a/target.spm and b/target.spm differ
 
tokenizer_config.json CHANGED
@@ -3,10 +3,10 @@
3
  "model_max_length": 512,
4
  "pad_token": "<pad>",
5
  "separate_vocabs": false,
6
- "source_lang": "en",
7
  "sp_model_kwargs": {},
8
  "special_tokens_map_file": null,
9
- "target_lang": "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la",
10
  "tokenizer_class": "MarianTokenizer",
11
  "unk_token": "<unk>"
12
  }
 
3
  "model_max_length": 512,
4
  "pad_token": "<pad>",
5
  "separate_vocabs": false,
6
+ "source_lang": "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la",
7
  "sp_model_kwargs": {},
8
  "special_tokens_map_file": null,
9
+ "target_lang": "en",
10
  "tokenizer_class": "MarianTokenizer",
11
  "unk_token": "<unk>"
12
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff