varadhbhatnagar committed
Commit 047e774
Parent: e183059

Upload tokenizer

Files changed (5)
  1. merges.txt +0 -0
  2. special_tokens_map.json +49 -108
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +61 -114
  5. vocab.json +0 -0
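
This commit replaces the Pegasus-style tokenizer files with BART-style ones (note the tokenizer_class change in tokenizer_config.json below). A minimal sketch of loading the result with transformers; "user/repo" is a placeholder, since the actual repository id is not shown on this page:

from transformers import AutoTokenizer

# Load the tokenizer as of this commit; a short commit hash is accepted as revision.
tok = AutoTokenizer.from_pretrained("user/repo", revision="047e774")
# With the new config this should resolve to a BART tokenizer
# (BartTokenizerFast when the fast backend is available).
print(type(tok).__name__)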
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,110 +1,51 @@
 {
-  "additional_special_tokens": [
-    "<mask_1>",
-    "<unk_2>",
-    "<unk_3>",
-    "<unk_4>",
-    "<unk_5>",
-    "<unk_6>",
-    "<unk_7>",
-    "<unk_8>",
-    "<unk_9>",
-    "<unk_10>",
-    "<unk_11>",
-    "<unk_12>",
-    "<unk_13>",
-    "<unk_14>",
-    "<unk_15>",
-    "<unk_16>",
-    "<unk_17>",
-    "<unk_18>",
-    "<unk_19>",
-    "<unk_20>",
-    "<unk_21>",
-    "<unk_22>",
-    "<unk_23>",
-    "<unk_24>",
-    "<unk_25>",
-    "<unk_26>",
-    "<unk_27>",
-    "<unk_28>",
-    "<unk_29>",
-    "<unk_30>",
-    "<unk_31>",
-    "<unk_32>",
-    "<unk_33>",
-    "<unk_34>",
-    "<unk_35>",
-    "<unk_36>",
-    "<unk_37>",
-    "<unk_38>",
-    "<unk_39>",
-    "<unk_40>",
-    "<unk_41>",
-    "<unk_42>",
-    "<unk_43>",
-    "<unk_44>",
-    "<unk_45>",
-    "<unk_46>",
-    "<unk_47>",
-    "<unk_48>",
-    "<unk_49>",
-    "<unk_50>",
-    "<unk_51>",
-    "<unk_52>",
-    "<unk_53>",
-    "<unk_54>",
-    "<unk_55>",
-    "<unk_56>",
-    "<unk_57>",
-    "<unk_58>",
-    "<unk_59>",
-    "<unk_60>",
-    "<unk_61>",
-    "<unk_62>",
-    "<unk_63>",
-    "<unk_64>",
-    "<unk_65>",
-    "<unk_66>",
-    "<unk_67>",
-    "<unk_68>",
-    "<unk_69>",
-    "<unk_70>",
-    "<unk_71>",
-    "<unk_72>",
-    "<unk_73>",
-    "<unk_74>",
-    "<unk_75>",
-    "<unk_76>",
-    "<unk_77>",
-    "<unk_78>",
-    "<unk_79>",
-    "<unk_80>",
-    "<unk_81>",
-    "<unk_82>",
-    "<unk_83>",
-    "<unk_84>",
-    "<unk_85>",
-    "<unk_86>",
-    "<unk_87>",
-    "<unk_88>",
-    "<unk_89>",
-    "<unk_90>",
-    "<unk_91>",
-    "<unk_92>",
-    "<unk_93>",
-    "<unk_94>",
-    "<unk_95>",
-    "<unk_96>",
-    "<unk_97>",
-    "<unk_98>",
-    "<unk_99>",
-    "<unk_100>",
-    "<unk_101>",
-    "<unk_102>"
-  ],
-  "eos_token": "</s>",
-  "mask_token": "<mask_2>",
-  "pad_token": "<pad>",
-  "unk_token": "<unk>"
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
 }
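
Net effect of the change above: the Pegasus-specific <mask_1>/<mask_2> and <unk_2> through <unk_102> scheme is gone, and the file now carries BART's standard special tokens. A quick sanity check, assuming the tokenizer loaded in the sketch near the top of this page:

# The string form of the special tokens map should now match BART's defaults.
assert tok.special_tokens_map == {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "sep_token": "</s>",
    "pad_token": "<pad>",
    "cls_token": "<s>",
    "mask_token": "<mask>",
}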
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,118 +1,65 @@
 {
-  "additional_special_tokens": [
-    "<mask_1>",
-    "<unk_2>",
-    "<unk_3>",
-    "<unk_4>",
-    "<unk_5>",
-    "<unk_6>",
-    "<unk_7>",
-    "<unk_8>",
-    "<unk_9>",
-    "<unk_10>",
-    "<unk_11>",
-    "<unk_12>",
-    "<unk_13>",
-    "<unk_14>",
-    "<unk_15>",
-    "<unk_16>",
-    "<unk_17>",
-    "<unk_18>",
-    "<unk_19>",
-    "<unk_20>",
-    "<unk_21>",
-    "<unk_22>",
-    "<unk_23>",
-    "<unk_24>",
-    "<unk_25>",
-    "<unk_26>",
-    "<unk_27>",
-    "<unk_28>",
-    "<unk_29>",
-    "<unk_30>",
-    "<unk_31>",
-    "<unk_32>",
-    "<unk_33>",
-    "<unk_34>",
-    "<unk_35>",
-    "<unk_36>",
-    "<unk_37>",
-    "<unk_38>",
-    "<unk_39>",
-    "<unk_40>",
-    "<unk_41>",
-    "<unk_42>",
-    "<unk_43>",
-    "<unk_44>",
-    "<unk_45>",
-    "<unk_46>",
-    "<unk_47>",
-    "<unk_48>",
-    "<unk_49>",
-    "<unk_50>",
-    "<unk_51>",
-    "<unk_52>",
-    "<unk_53>",
-    "<unk_54>",
-    "<unk_55>",
-    "<unk_56>",
-    "<unk_57>",
-    "<unk_58>",
-    "<unk_59>",
-    "<unk_60>",
-    "<unk_61>",
-    "<unk_62>",
-    "<unk_63>",
-    "<unk_64>",
-    "<unk_65>",
-    "<unk_66>",
-    "<unk_67>",
-    "<unk_68>",
-    "<unk_69>",
-    "<unk_70>",
-    "<unk_71>",
-    "<unk_72>",
-    "<unk_73>",
-    "<unk_74>",
-    "<unk_75>",
-    "<unk_76>",
-    "<unk_77>",
-    "<unk_78>",
-    "<unk_79>",
-    "<unk_80>",
-    "<unk_81>",
-    "<unk_82>",
-    "<unk_83>",
-    "<unk_84>",
-    "<unk_85>",
-    "<unk_86>",
-    "<unk_87>",
-    "<unk_88>",
-    "<unk_89>",
-    "<unk_90>",
-    "<unk_91>",
-    "<unk_92>",
-    "<unk_93>",
-    "<unk_94>",
-    "<unk_95>",
-    "<unk_96>",
-    "<unk_97>",
-    "<unk_98>",
-    "<unk_99>",
-    "<unk_100>",
-    "<unk_101>",
-    "<unk_102>"
-  ],
-  "eos_token": "</s>",
-  "full_tokenizer_file": null,
-  "mask_token": "<mask_2>",
-  "mask_token_sent": "<mask_1>",
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
   "model_max_length": 1024,
-  "name_or_path": "sshleifer/distill-pegasus-cnn-16-4",
-  "offset": 103,
-  "pad_token": "<pad>",
-  "sp_model_kwargs": {},
+  "name_or_path": "sshleifer/distilbart-cnn-12-6",
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
   "special_tokens_map_file": null,
-  "tokenizer_class": "PegasusTokenizer",
-  "unk_token": "<unk>"
+  "tokenizer_class": "BartTokenizer",
+  "trim_offsets": true,
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
 }
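
Each "__type": "AddedToken" entry above deserializes to a transformers AddedToken, whose flags control how the token is matched. For instance, lstrip: true on <mask> lets the token absorb whitespace to its left, which is what makes mid-sentence "<mask>" infilling work for BART. A sketch of the equivalent construction in code:

from transformers import AddedToken

# Mirrors the mask_token entry in the new config: lstrip=True absorbs the
# space before "<mask>"; the remaining flags match the JSON above.
mask = AddedToken("<mask>", lstrip=True, rstrip=False, normalized=True, single_word=False)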
vocab.json ADDED
The diff for this file is too large to render. See raw diff