dickdiss committed
Commit 6a87b14 · 1 parent: 06c54ad

Upload tokenizer

special_tokens_map.json CHANGED
@@ -16,7 +16,7 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|endoftext|>",
+  "pad_token": "<unk>",
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
 {
   "version": "1.0",
-  "truncation": null,
+  "truncation": {
+    "direction": "Right",
+    "max_length": 2048,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
   "padding": null,
   "added_tokens": [
     {
@@ -404,12 +409,6 @@
           "id": "A",
           "type_id": 0
         }
-      },
-      {
-        "SpecialToken": {
-          "id": "<|endoftext|>",
-          "type_id": 0
-        }
       }
     ],
     "pair": [
@@ -425,12 +424,6 @@
           "type_id": 0
         }
       },
-      {
-        "SpecialToken": {
-          "id": "<|endoftext|>",
-          "type_id": 0
-        }
-      },
       {
         "SpecialToken": {
           "id": "<s>",
@@ -442,12 +435,6 @@
           "id": "B",
           "type_id": 1
         }
-      },
-      {
-        "SpecialToken": {
-          "id": "<|endoftext|>",
-          "type_id": 1
-        }
       }
     ],
     "special_tokens": {
@@ -459,15 +446,6 @@
         "tokens": [
           "<s>"
         ]
-      },
-      "<|endoftext|>": {
-        "id": "<|endoftext|>",
-        "ids": [
-          32000
-        ],
-        "tokens": [
-          "<|endoftext|>"
-        ]
       }
     }
   },
@@ -503,6 +481,7 @@
     "end_of_word_suffix": null,
     "fuse_unk": true,
     "byte_fallback": true,
+    "ignore_merges": false,
     "vocab": {
       "<unk>": 0,
       "<s>": 1,
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "add_bos_token": true,
-  "add_eos_token": true,
+  "add_eos_token": false,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
@@ -340,8 +340,8 @@
   "eos_token": "<|endoftext|>",
   "legacy": false,
   "model_max_length": 4096,
-  "pad_token": "<|endoftext|>",
-  "padding_side": "left",
+  "pad_token": "<unk>",
+  "padding_side": "right",
   "sp_model_kwargs": {},
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",