dipudl committed
Commit
0da63e8
1 Parent(s): 655d821

Upload tokenizer

Files changed (5)
  1. merges.txt +0 -0
  2. special_tokens_map.json +153 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +167 -0
  5. vocab.json +0 -0
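
Together these five files form a complete Hugging Face tokenizer: vocab.json and merges.txt hold the byte-level BPE model, tokenizer.json is the serialized fast tokenizer, and special_tokens_map.json plus tokenizer_config.json declare the special tokens and loading options. A minimal loading sketch, assuming the files have been downloaded into one local directory ("./tokenizer_dir" is a placeholder, not part of this commit):

from transformers import AutoTokenizer

# Load from the directory holding the five uploaded files.
tokenizer = AutoTokenizer.from_pretrained("./tokenizer_dir")

ids = tokenizer("def add(a, b): return a + b")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))  # <s> ... </s> around the BPE pieces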
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,153 @@
+ {
+   "additional_special_tokens": [
+     "<extra_id_99>",
+     "<extra_id_98>",
+     "<extra_id_97>",
+     "<extra_id_96>",
+     "<extra_id_95>",
+     "<extra_id_94>",
+     "<extra_id_93>",
+     "<extra_id_92>",
+     "<extra_id_91>",
+     "<extra_id_90>",
+     "<extra_id_89>",
+     "<extra_id_88>",
+     "<extra_id_87>",
+     "<extra_id_86>",
+     "<extra_id_85>",
+     "<extra_id_84>",
+     "<extra_id_83>",
+     "<extra_id_82>",
+     "<extra_id_81>",
+     "<extra_id_80>",
+     "<extra_id_79>",
+     "<extra_id_78>",
+     "<extra_id_77>",
+     "<extra_id_76>",
+     "<extra_id_75>",
+     "<extra_id_74>",
+     "<extra_id_73>",
+     "<extra_id_72>",
+     "<extra_id_71>",
+     "<extra_id_70>",
+     "<extra_id_69>",
+     "<extra_id_68>",
+     "<extra_id_67>",
+     "<extra_id_66>",
+     "<extra_id_65>",
+     "<extra_id_64>",
+     "<extra_id_63>",
+     "<extra_id_62>",
+     "<extra_id_61>",
+     "<extra_id_60>",
+     "<extra_id_59>",
+     "<extra_id_58>",
+     "<extra_id_57>",
+     "<extra_id_56>",
+     "<extra_id_55>",
+     "<extra_id_54>",
+     "<extra_id_53>",
+     "<extra_id_52>",
+     "<extra_id_51>",
+     "<extra_id_50>",
+     "<extra_id_49>",
+     "<extra_id_48>",
+     "<extra_id_47>",
+     "<extra_id_46>",
+     "<extra_id_45>",
+     "<extra_id_44>",
+     "<extra_id_43>",
+     "<extra_id_42>",
+     "<extra_id_41>",
+     "<extra_id_40>",
+     "<extra_id_39>",
+     "<extra_id_38>",
+     "<extra_id_37>",
+     "<extra_id_36>",
+     "<extra_id_35>",
+     "<extra_id_34>",
+     "<extra_id_33>",
+     "<extra_id_32>",
+     "<extra_id_31>",
+     "<extra_id_30>",
+     "<extra_id_29>",
+     "<extra_id_28>",
+     "<extra_id_27>",
+     "<extra_id_26>",
+     "<extra_id_25>",
+     "<extra_id_24>",
+     "<extra_id_23>",
+     "<extra_id_22>",
+     "<extra_id_21>",
+     "<extra_id_20>",
+     "<extra_id_19>",
+     "<extra_id_18>",
+     "<extra_id_17>",
+     "<extra_id_16>",
+     "<extra_id_15>",
+     "<extra_id_14>",
+     "<extra_id_13>",
+     "<extra_id_12>",
+     "<extra_id_11>",
+     "<extra_id_10>",
+     "<extra_id_9>",
+     "<extra_id_8>",
+     "<extra_id_7>",
+     "<extra_id_6>",
+     "<extra_id_5>",
+     "<extra_id_4>",
+     "<extra_id_3>",
+     "<extra_id_2>",
+     "<extra_id_1>",
+     "<extra_id_0>"
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
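
The 100 <extra_id_N> entries above are T5-style sentinel tokens; given that the config below points at codet5-base, they are presumably used for span-corruption pretraining as in T5/CodeT5. A hedged sketch checking that each sentinel is registered as a single atomic token rather than being split by BPE ("./tokenizer_dir" is again a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer_dir")

# An added special token maps to exactly one vocabulary id ...
sentinel_id = tokenizer.convert_tokens_to_ids("<extra_id_0>")

# ... and is matched verbatim in raw text before BPE runs.
ids = tokenizer("x = <extra_id_0>", add_special_tokens=False)["input_ids"]
assert sentinel_id in ids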
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,167 @@
+ {
+   "add_prefix_space": false,
+   "additional_special_tokens": [
+     "<extra_id_99>",
+     "<extra_id_98>",
+     "<extra_id_97>",
+     "<extra_id_96>",
+     "<extra_id_95>",
+     "<extra_id_94>",
+     "<extra_id_93>",
+     "<extra_id_92>",
+     "<extra_id_91>",
+     "<extra_id_90>",
+     "<extra_id_89>",
+     "<extra_id_88>",
+     "<extra_id_87>",
+     "<extra_id_86>",
+     "<extra_id_85>",
+     "<extra_id_84>",
+     "<extra_id_83>",
+     "<extra_id_82>",
+     "<extra_id_81>",
+     "<extra_id_80>",
+     "<extra_id_79>",
+     "<extra_id_78>",
+     "<extra_id_77>",
+     "<extra_id_76>",
+     "<extra_id_75>",
+     "<extra_id_74>",
+     "<extra_id_73>",
+     "<extra_id_72>",
+     "<extra_id_71>",
+     "<extra_id_70>",
+     "<extra_id_69>",
+     "<extra_id_68>",
+     "<extra_id_67>",
+     "<extra_id_66>",
+     "<extra_id_65>",
+     "<extra_id_64>",
+     "<extra_id_63>",
+     "<extra_id_62>",
+     "<extra_id_61>",
+     "<extra_id_60>",
+     "<extra_id_59>",
+     "<extra_id_58>",
+     "<extra_id_57>",
+     "<extra_id_56>",
+     "<extra_id_55>",
+     "<extra_id_54>",
+     "<extra_id_53>",
+     "<extra_id_52>",
+     "<extra_id_51>",
+     "<extra_id_50>",
+     "<extra_id_49>",
+     "<extra_id_48>",
+     "<extra_id_47>",
+     "<extra_id_46>",
+     "<extra_id_45>",
+     "<extra_id_44>",
+     "<extra_id_43>",
+     "<extra_id_42>",
+     "<extra_id_41>",
+     "<extra_id_40>",
+     "<extra_id_39>",
+     "<extra_id_38>",
+     "<extra_id_37>",
+     "<extra_id_36>",
+     "<extra_id_35>",
+     "<extra_id_34>",
+     "<extra_id_33>",
+     "<extra_id_32>",
+     "<extra_id_31>",
+     "<extra_id_30>",
+     "<extra_id_29>",
+     "<extra_id_28>",
+     "<extra_id_27>",
+     "<extra_id_26>",
+     "<extra_id_25>",
+     "<extra_id_24>",
+     "<extra_id_23>",
+     "<extra_id_22>",
+     "<extra_id_21>",
+     "<extra_id_20>",
+     "<extra_id_19>",
+     "<extra_id_18>",
+     "<extra_id_17>",
+     "<extra_id_16>",
+     "<extra_id_15>",
+     "<extra_id_14>",
+     "<extra_id_13>",
+     "<extra_id_12>",
+     "<extra_id_11>",
+     "<extra_id_10>",
+     "<extra_id_9>",
+     "<extra_id_8>",
+     "<extra_id_7>",
+     "<extra_id_6>",
+     "<extra_id_5>",
+     "<extra_id_4>",
+     "<extra_id_3>",
+     "<extra_id_2>",
+     "<extra_id_1>",
+     "<extra_id_0>"
+   ],
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 512,
+   "name_or_path": "/content/kaggle/working/codet5-base",
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "special_tokens_map_file": "/root/.cache/huggingface/transformers/5941df5e4315c5ab63b7b2ac791fb0bf0f209744a055c06b43b5274849137cdd.b9905d0575bde443a20834122b6e2d48e853b2e36444ce98ddeb43c38097eb3f",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
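
Two fields above change runtime behaviour directly: tokenizer_class tells AutoTokenizer which class to instantiate, and model_max_length caps encoded sequences at 512 when truncation is requested. A short sketch under the same placeholder-path assumption:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer_dir")
print(tokenizer.model_max_length)  # 512, read from tokenizer_config.json

# Inputs longer than the cap are cut down when truncation is enabled.
ids = tokenizer("x " * 10_000, truncation=True)["input_ids"]
assert len(ids) <= 512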
vocab.json ADDED
The diff for this file is too large to render. See raw diff
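
vocab.json and merges.txt are the two halves of the byte-level BPE model itself: the token-to-id table and the ordered merge rules. Outside transformers, the lower-level tokenizers library can consume them directly; a hedged sketch (file paths are placeholders, and the added special tokens from the configs above are omitted here):

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel

# Rebuild just the core BPE model from the two raw files.
bpe = BPE.from_file("vocab.json", "merges.txt")
tok = Tokenizer(bpe)
tok.pre_tokenizer = ByteLevel(add_prefix_space=False)  # matches add_prefix_space above

print(tok.encode("def add(a, b):").tokens)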