laverdes committed on
Commit
29b0e23
1 Parent(s): a9e7935

feat: new tokenizer and config

Browse files
Files changed (3) hide show
  1. added_tokens.json +34 -38
  2. special_tokens_map.json +18 -22
  3. tokenizer.json +33 -69
added_tokens.json CHANGED
@@ -1,47 +1,43 @@
1
  {
2
- "</s_Abstract>": 57566,
3
- "</s_Address>": 57552,
4
- "</s_Advertisement>": 57564,
5
- "</s_Author>": 57550,
6
- "</s_Caption>": 57536,
7
- "</s_Chart>": 57562,
8
- "</s_Field-Name>": 57556,
9
- "</s_Footer>": 57534,
10
- "</s_Formula>": 57568,
11
- "</s_Header>": 57542,
12
- "</s_Headline>": 57546,
13
- "</s_Link>": 57540,
14
- "</s_List-item>": 57532,
15
  "</s_Metadata>": 57558,
16
- "</s_Misc>": 57530,
17
- "</s_Page number>": 57548,
18
- "</s_Subheadline>": 57528,
19
- "</s_Table>": 57538,
20
  "</s_Text>": 57526,
21
- "</s_Threading>": 57554,
22
- "</s_Title>": 57560,
23
  "</s_Value>": 57544,
24
- "<s_Abstract>": 57565,
25
- "<s_Address>": 57551,
26
- "<s_Advertisement>": 57563,
27
- "<s_Author>": 57549,
28
- "<s_Caption>": 57535,
29
- "<s_Chart>": 57561,
30
- "<s_Field-Name>": 57555,
31
- "<s_Footer>": 57533,
32
- "<s_Formula>": 57567,
33
- "<s_Header>": 57541,
34
- "<s_Headline>": 57545,
35
- "<s_Link>": 57539,
36
- "<s_List-item>": 57531,
37
  "<s_Metadata>": 57557,
38
- "<s_Misc>": 57529,
39
- "<s_Page number>": 57547,
40
- "<s_Subheadline>": 57527,
41
- "<s_Table>": 57537,
42
  "<s_Text>": 57525,
43
- "<s_Threading>": 57553,
44
- "<s_Title>": 57559,
45
  "<s_Value>": 57543,
46
  "<s_iitcdip>": 57523,
47
  "<s_synthdog>": 57524,
 
1
  {
2
+ "</s_Abstract>": 57564,
3
+ "</s_Address>": 57538,
4
+ "</s_Advertisement>": 57556,
5
+ "</s_Author>": 57546,
6
+ "</s_Caption>": 57530,
7
+ "</s_Chart>": 57560,
8
+ "</s_Field-Name>": 57548,
9
+ "</s_Footer>": 57542,
10
+ "</s_Header>": 57552,
11
+ "</s_Headline>": 57536,
12
+ "</s_Link>": 57528,
13
+ "</s_List-item>": 57540,
 
14
  "</s_Metadata>": 57558,
15
+ "</s_Misc>": 57534,
16
+ "</s_Page number>": 57562,
17
+ "</s_Subheadline>": 57532,
18
+ "</s_Table>": 57550,
19
  "</s_Text>": 57526,
20
+ "</s_Title>": 57554,
 
21
  "</s_Value>": 57544,
22
+ "<s_Abstract>": 57563,
23
+ "<s_Address>": 57537,
24
+ "<s_Advertisement>": 57555,
25
+ "<s_Author>": 57545,
26
+ "<s_Caption>": 57529,
27
+ "<s_Chart>": 57559,
28
+ "<s_Field-Name>": 57547,
29
+ "<s_Footer>": 57541,
30
+ "<s_Header>": 57551,
31
+ "<s_Headline>": 57535,
32
+ "<s_Link>": 57527,
33
+ "<s_List-item>": 57539,
 
34
  "<s_Metadata>": 57557,
35
+ "<s_Misc>": 57533,
36
+ "<s_Page number>": 57561,
37
+ "<s_Subheadline>": 57531,
38
+ "<s_Table>": 57549,
39
  "<s_Text>": 57525,
40
+ "<s_Title>": 57553,
 
41
  "<s_Value>": 57543,
42
  "<s_iitcdip>": 57523,
43
  "<s_synthdog>": 57524,
special_tokens_map.json CHANGED
@@ -2,48 +2,44 @@
2
  "additional_special_tokens": [
3
  "<s_Text>",
4
  "</s_Text>",
 
 
 
 
5
  "<s_Subheadline>",
6
  "</s_Subheadline>",
7
  "<s_Misc>",
8
  "</s_Misc>",
 
 
 
 
9
  "<s_List-item>",
10
  "</s_List-item>",
11
  "<s_Footer>",
12
  "</s_Footer>",
13
- "<s_Caption>",
14
- "</s_Caption>",
15
- "<s_Table>",
16
- "</s_Table>",
17
- "<s_Link>",
18
- "</s_Link>",
19
- "<s_Header>",
20
- "</s_Header>",
21
  "<s_Value>",
22
  "</s_Value>",
23
- "<s_Headline>",
24
- "</s_Headline>",
25
- "<s_Page number>",
26
- "</s_Page number>",
27
  "<s_Author>",
28
  "</s_Author>",
29
- "<s_Address>",
30
- "</s_Address>",
31
- "<s_Threading>",
32
- "</s_Threading>",
33
  "<s_Field-Name>",
34
  "</s_Field-Name>",
35
- "<s_Metadata>",
36
- "</s_Metadata>",
 
 
37
  "<s_Title>",
38
  "</s_Title>",
39
- "<s_Chart>",
40
- "</s_Chart>",
41
  "<s_Advertisement>",
42
  "</s_Advertisement>",
 
 
 
 
 
 
43
  "<s_Abstract>",
44
  "</s_Abstract>",
45
- "<s_Formula>",
46
- "</s_Formula>",
47
  "<s>",
48
  "</s>"
49
  ],
 
2
  "additional_special_tokens": [
3
  "<s_Text>",
4
  "</s_Text>",
5
+ "<s_Link>",
6
+ "</s_Link>",
7
+ "<s_Caption>",
8
+ "</s_Caption>",
9
  "<s_Subheadline>",
10
  "</s_Subheadline>",
11
  "<s_Misc>",
12
  "</s_Misc>",
13
+ "<s_Headline>",
14
+ "</s_Headline>",
15
+ "<s_Address>",
16
+ "</s_Address>",
17
  "<s_List-item>",
18
  "</s_List-item>",
19
  "<s_Footer>",
20
  "</s_Footer>",
 
 
 
 
 
 
 
 
21
  "<s_Value>",
22
  "</s_Value>",
 
 
 
 
23
  "<s_Author>",
24
  "</s_Author>",
 
 
 
 
25
  "<s_Field-Name>",
26
  "</s_Field-Name>",
27
+ "<s_Table>",
28
+ "</s_Table>",
29
+ "<s_Header>",
30
+ "</s_Header>",
31
  "<s_Title>",
32
  "</s_Title>",
 
 
33
  "<s_Advertisement>",
34
  "</s_Advertisement>",
35
+ "<s_Metadata>",
36
+ "</s_Metadata>",
37
+ "<s_Chart>",
38
+ "</s_Chart>",
39
+ "<s_Page number>",
40
+ "</s_Page number>",
41
  "<s_Abstract>",
42
  "</s_Abstract>",
 
 
43
  "<s>",
44
  "</s>"
45
  ],
tokenizer.json CHANGED
@@ -109,7 +109,7 @@
109
  },
110
  {
111
  "id": 57527,
112
- "content": "<s_Subheadline>",
113
  "single_word": false,
114
  "lstrip": false,
115
  "rstrip": false,
@@ -118,7 +118,7 @@
118
  },
119
  {
120
  "id": 57528,
121
- "content": "</s_Subheadline>",
122
  "single_word": false,
123
  "lstrip": false,
124
  "rstrip": false,
@@ -127,7 +127,7 @@
127
  },
128
  {
129
  "id": 57529,
130
- "content": "<s_Misc>",
131
  "single_word": false,
132
  "lstrip": false,
133
  "rstrip": false,
@@ -136,7 +136,7 @@
136
  },
137
  {
138
  "id": 57530,
139
- "content": "</s_Misc>",
140
  "single_word": false,
141
  "lstrip": false,
142
  "rstrip": false,
@@ -145,7 +145,7 @@
145
  },
146
  {
147
  "id": 57531,
148
- "content": "<s_List-item>",
149
  "single_word": false,
150
  "lstrip": false,
151
  "rstrip": false,
@@ -154,7 +154,7 @@
154
  },
155
  {
156
  "id": 57532,
157
- "content": "</s_List-item>",
158
  "single_word": false,
159
  "lstrip": false,
160
  "rstrip": false,
@@ -163,7 +163,7 @@
163
  },
164
  {
165
  "id": 57533,
166
- "content": "<s_Footer>",
167
  "single_word": false,
168
  "lstrip": false,
169
  "rstrip": false,
@@ -172,7 +172,7 @@
172
  },
173
  {
174
  "id": 57534,
175
- "content": "</s_Footer>",
176
  "single_word": false,
177
  "lstrip": false,
178
  "rstrip": false,
@@ -181,7 +181,7 @@
181
  },
182
  {
183
  "id": 57535,
184
- "content": "<s_Caption>",
185
  "single_word": false,
186
  "lstrip": false,
187
  "rstrip": false,
@@ -190,7 +190,7 @@
190
  },
191
  {
192
  "id": 57536,
193
- "content": "</s_Caption>",
194
  "single_word": false,
195
  "lstrip": false,
196
  "rstrip": false,
@@ -199,7 +199,7 @@
199
  },
200
  {
201
  "id": 57537,
202
- "content": "<s_Table>",
203
  "single_word": false,
204
  "lstrip": false,
205
  "rstrip": false,
@@ -208,7 +208,7 @@
208
  },
209
  {
210
  "id": 57538,
211
- "content": "</s_Table>",
212
  "single_word": false,
213
  "lstrip": false,
214
  "rstrip": false,
@@ -217,7 +217,7 @@
217
  },
218
  {
219
  "id": 57539,
220
- "content": "<s_Link>",
221
  "single_word": false,
222
  "lstrip": false,
223
  "rstrip": false,
@@ -226,7 +226,7 @@
226
  },
227
  {
228
  "id": 57540,
229
- "content": "</s_Link>",
230
  "single_word": false,
231
  "lstrip": false,
232
  "rstrip": false,
@@ -235,7 +235,7 @@
235
  },
236
  {
237
  "id": 57541,
238
- "content": "<s_Header>",
239
  "single_word": false,
240
  "lstrip": false,
241
  "rstrip": false,
@@ -244,7 +244,7 @@
244
  },
245
  {
246
  "id": 57542,
247
- "content": "</s_Header>",
248
  "single_word": false,
249
  "lstrip": false,
250
  "rstrip": false,
@@ -271,7 +271,7 @@
271
  },
272
  {
273
  "id": 57545,
274
- "content": "<s_Headline>",
275
  "single_word": false,
276
  "lstrip": false,
277
  "rstrip": false,
@@ -280,7 +280,7 @@
280
  },
281
  {
282
  "id": 57546,
283
- "content": "</s_Headline>",
284
  "single_word": false,
285
  "lstrip": false,
286
  "rstrip": false,
@@ -289,7 +289,7 @@
289
  },
290
  {
291
  "id": 57547,
292
- "content": "<s_Page number>",
293
  "single_word": false,
294
  "lstrip": false,
295
  "rstrip": false,
@@ -298,7 +298,7 @@
298
  },
299
  {
300
  "id": 57548,
301
- "content": "</s_Page number>",
302
  "single_word": false,
303
  "lstrip": false,
304
  "rstrip": false,
@@ -307,7 +307,7 @@
307
  },
308
  {
309
  "id": 57549,
310
- "content": "<s_Author>",
311
  "single_word": false,
312
  "lstrip": false,
313
  "rstrip": false,
@@ -316,7 +316,7 @@
316
  },
317
  {
318
  "id": 57550,
319
- "content": "</s_Author>",
320
  "single_word": false,
321
  "lstrip": false,
322
  "rstrip": false,
@@ -325,7 +325,7 @@
325
  },
326
  {
327
  "id": 57551,
328
- "content": "<s_Address>",
329
  "single_word": false,
330
  "lstrip": false,
331
  "rstrip": false,
@@ -334,7 +334,7 @@
334
  },
335
  {
336
  "id": 57552,
337
- "content": "</s_Address>",
338
  "single_word": false,
339
  "lstrip": false,
340
  "rstrip": false,
@@ -343,7 +343,7 @@
343
  },
344
  {
345
  "id": 57553,
346
- "content": "<s_Threading>",
347
  "single_word": false,
348
  "lstrip": false,
349
  "rstrip": false,
@@ -352,7 +352,7 @@
352
  },
353
  {
354
  "id": 57554,
355
- "content": "</s_Threading>",
356
  "single_word": false,
357
  "lstrip": false,
358
  "rstrip": false,
@@ -361,7 +361,7 @@
361
  },
362
  {
363
  "id": 57555,
364
- "content": "<s_Field-Name>",
365
  "single_word": false,
366
  "lstrip": false,
367
  "rstrip": false,
@@ -370,7 +370,7 @@
370
  },
371
  {
372
  "id": 57556,
373
- "content": "</s_Field-Name>",
374
  "single_word": false,
375
  "lstrip": false,
376
  "rstrip": false,
@@ -397,7 +397,7 @@
397
  },
398
  {
399
  "id": 57559,
400
- "content": "<s_Title>",
401
  "single_word": false,
402
  "lstrip": false,
403
  "rstrip": false,
@@ -406,7 +406,7 @@
406
  },
407
  {
408
  "id": 57560,
409
- "content": "</s_Title>",
410
  "single_word": false,
411
  "lstrip": false,
412
  "rstrip": false,
@@ -415,7 +415,7 @@
415
  },
416
  {
417
  "id": 57561,
418
- "content": "<s_Chart>",
419
  "single_word": false,
420
  "lstrip": false,
421
  "rstrip": false,
@@ -424,7 +424,7 @@
424
  },
425
  {
426
  "id": 57562,
427
- "content": "</s_Chart>",
428
  "single_word": false,
429
  "lstrip": false,
430
  "rstrip": false,
@@ -433,24 +433,6 @@
433
  },
434
  {
435
  "id": 57563,
436
- "content": "<s_Advertisement>",
437
- "single_word": false,
438
- "lstrip": false,
439
- "rstrip": false,
440
- "normalized": false,
441
- "special": true
442
- },
443
- {
444
- "id": 57564,
445
- "content": "</s_Advertisement>",
446
- "single_word": false,
447
- "lstrip": false,
448
- "rstrip": false,
449
- "normalized": false,
450
- "special": true
451
- },
452
- {
453
- "id": 57565,
454
  "content": "<s_Abstract>",
455
  "single_word": false,
456
  "lstrip": false,
@@ -459,31 +441,13 @@
459
  "special": true
460
  },
461
  {
462
- "id": 57566,
463
  "content": "</s_Abstract>",
464
  "single_word": false,
465
  "lstrip": false,
466
  "rstrip": false,
467
  "normalized": false,
468
  "special": true
469
- },
470
- {
471
- "id": 57567,
472
- "content": "<s_Formula>",
473
- "single_word": false,
474
- "lstrip": false,
475
- "rstrip": false,
476
- "normalized": false,
477
- "special": true
478
- },
479
- {
480
- "id": 57568,
481
- "content": "</s_Formula>",
482
- "single_word": false,
483
- "lstrip": false,
484
- "rstrip": false,
485
- "normalized": false,
486
- "special": true
487
  }
488
  ],
489
  "normalizer": {
 
109
  },
110
  {
111
  "id": 57527,
112
+ "content": "<s_Link>",
113
  "single_word": false,
114
  "lstrip": false,
115
  "rstrip": false,
 
118
  },
119
  {
120
  "id": 57528,
121
+ "content": "</s_Link>",
122
  "single_word": false,
123
  "lstrip": false,
124
  "rstrip": false,
 
127
  },
128
  {
129
  "id": 57529,
130
+ "content": "<s_Caption>",
131
  "single_word": false,
132
  "lstrip": false,
133
  "rstrip": false,
 
136
  },
137
  {
138
  "id": 57530,
139
+ "content": "</s_Caption>",
140
  "single_word": false,
141
  "lstrip": false,
142
  "rstrip": false,
 
145
  },
146
  {
147
  "id": 57531,
148
+ "content": "<s_Subheadline>",
149
  "single_word": false,
150
  "lstrip": false,
151
  "rstrip": false,
 
154
  },
155
  {
156
  "id": 57532,
157
+ "content": "</s_Subheadline>",
158
  "single_word": false,
159
  "lstrip": false,
160
  "rstrip": false,
 
163
  },
164
  {
165
  "id": 57533,
166
+ "content": "<s_Misc>",
167
  "single_word": false,
168
  "lstrip": false,
169
  "rstrip": false,
 
172
  },
173
  {
174
  "id": 57534,
175
+ "content": "</s_Misc>",
176
  "single_word": false,
177
  "lstrip": false,
178
  "rstrip": false,
 
181
  },
182
  {
183
  "id": 57535,
184
+ "content": "<s_Headline>",
185
  "single_word": false,
186
  "lstrip": false,
187
  "rstrip": false,
 
190
  },
191
  {
192
  "id": 57536,
193
+ "content": "</s_Headline>",
194
  "single_word": false,
195
  "lstrip": false,
196
  "rstrip": false,
 
199
  },
200
  {
201
  "id": 57537,
202
+ "content": "<s_Address>",
203
  "single_word": false,
204
  "lstrip": false,
205
  "rstrip": false,
 
208
  },
209
  {
210
  "id": 57538,
211
+ "content": "</s_Address>",
212
  "single_word": false,
213
  "lstrip": false,
214
  "rstrip": false,
 
217
  },
218
  {
219
  "id": 57539,
220
+ "content": "<s_List-item>",
221
  "single_word": false,
222
  "lstrip": false,
223
  "rstrip": false,
 
226
  },
227
  {
228
  "id": 57540,
229
+ "content": "</s_List-item>",
230
  "single_word": false,
231
  "lstrip": false,
232
  "rstrip": false,
 
235
  },
236
  {
237
  "id": 57541,
238
+ "content": "<s_Footer>",
239
  "single_word": false,
240
  "lstrip": false,
241
  "rstrip": false,
 
244
  },
245
  {
246
  "id": 57542,
247
+ "content": "</s_Footer>",
248
  "single_word": false,
249
  "lstrip": false,
250
  "rstrip": false,
 
271
  },
272
  {
273
  "id": 57545,
274
+ "content": "<s_Author>",
275
  "single_word": false,
276
  "lstrip": false,
277
  "rstrip": false,
 
280
  },
281
  {
282
  "id": 57546,
283
+ "content": "</s_Author>",
284
  "single_word": false,
285
  "lstrip": false,
286
  "rstrip": false,
 
289
  },
290
  {
291
  "id": 57547,
292
+ "content": "<s_Field-Name>",
293
  "single_word": false,
294
  "lstrip": false,
295
  "rstrip": false,
 
298
  },
299
  {
300
  "id": 57548,
301
+ "content": "</s_Field-Name>",
302
  "single_word": false,
303
  "lstrip": false,
304
  "rstrip": false,
 
307
  },
308
  {
309
  "id": 57549,
310
+ "content": "<s_Table>",
311
  "single_word": false,
312
  "lstrip": false,
313
  "rstrip": false,
 
316
  },
317
  {
318
  "id": 57550,
319
+ "content": "</s_Table>",
320
  "single_word": false,
321
  "lstrip": false,
322
  "rstrip": false,
 
325
  },
326
  {
327
  "id": 57551,
328
+ "content": "<s_Header>",
329
  "single_word": false,
330
  "lstrip": false,
331
  "rstrip": false,
 
334
  },
335
  {
336
  "id": 57552,
337
+ "content": "</s_Header>",
338
  "single_word": false,
339
  "lstrip": false,
340
  "rstrip": false,
 
343
  },
344
  {
345
  "id": 57553,
346
+ "content": "<s_Title>",
347
  "single_word": false,
348
  "lstrip": false,
349
  "rstrip": false,
 
352
  },
353
  {
354
  "id": 57554,
355
+ "content": "</s_Title>",
356
  "single_word": false,
357
  "lstrip": false,
358
  "rstrip": false,
 
361
  },
362
  {
363
  "id": 57555,
364
+ "content": "<s_Advertisement>",
365
  "single_word": false,
366
  "lstrip": false,
367
  "rstrip": false,
 
370
  },
371
  {
372
  "id": 57556,
373
+ "content": "</s_Advertisement>",
374
  "single_word": false,
375
  "lstrip": false,
376
  "rstrip": false,
 
397
  },
398
  {
399
  "id": 57559,
400
+ "content": "<s_Chart>",
401
  "single_word": false,
402
  "lstrip": false,
403
  "rstrip": false,
 
406
  },
407
  {
408
  "id": 57560,
409
+ "content": "</s_Chart>",
410
  "single_word": false,
411
  "lstrip": false,
412
  "rstrip": false,
 
415
  },
416
  {
417
  "id": 57561,
418
+ "content": "<s_Page number>",
419
  "single_word": false,
420
  "lstrip": false,
421
  "rstrip": false,
 
424
  },
425
  {
426
  "id": 57562,
427
+ "content": "</s_Page number>",
428
  "single_word": false,
429
  "lstrip": false,
430
  "rstrip": false,
 
433
  },
434
  {
435
  "id": 57563,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  "content": "<s_Abstract>",
437
  "single_word": false,
438
  "lstrip": false,
 
441
  "special": true
442
  },
443
  {
444
+ "id": 57564,
445
  "content": "</s_Abstract>",
446
  "single_word": false,
447
  "lstrip": false,
448
  "rstrip": false,
449
  "normalized": false,
450
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  }
452
  ],
453
  "normalizer": {