sagawa committed on
Commit
c74fe0a
1 Parent(s): 55ea870

Upload 8 files

Browse files
Files changed (8) hide show
  1. README.md +72 -0
  2. config.json +32 -0
  3. merges.txt +63 -0
  4. pytorch_model.bin +3 -0
  5. special_tokens_map.json +51 -0
  6. tokenizer.json +535 -0
  7. tokenizer_config.json +67 -0
  8. vocab.json +1 -0
README.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - generated_from_trainer
5
+ datasets:
6
+ - sagawa/pubchem-10m-canonicalized
7
+ metrics:
8
+ - accuracy
9
+ model-index:
10
+ - name: PubChem10m-deberta-base-output
11
+ results:
12
+ - task:
13
+ name: Masked Language Modeling
14
+ type: fill-mask
15
+ dataset:
16
+ name: sagawa/pubchem-10m-canonicalized
17
+ type: sagawa/pubchem-10m-canonicalized
18
+ metrics:
19
+ - name: Accuracy
20
+ type: accuracy
21
+ value: 0.9741235263046233
22
+ ---
23
+
24
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
25
+ should probably proofread and complete it, then remove this comment. -->
26
+
27
+ # PubChem10m-deberta-base-output
28
+
29
+ This model is a fine-tuned version of [microsoft/deberta-base](https://huggingface.co/microsoft/deberta-base) on the sagawa/pubchem-10m-canonicalized dataset.
30
+ It achieves the following results on the evaluation set:
31
+ - Loss: 0.0698
32
+ - Accuracy: 0.9741
33
+
34
+ ## Model description
35
+
36
+ More information needed
37
+
38
+ ## Intended uses & limitations
39
+
40
+ More information needed
41
+
42
+ ## Training and evaluation data
43
+
44
+ More information needed
45
+
46
+ ## Training procedure
47
+
48
+ ### Training hyperparameters
49
+
50
+ The following hyperparameters were used during training:
51
+ - learning_rate: 5e-05
52
+ - train_batch_size: 30
53
+ - eval_batch_size: 48
54
+ - seed: 42
55
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
56
+ - lr_scheduler_type: linear
57
+ - num_epochs: 10.0
58
+
59
+ ### Training results
60
+
61
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
62
+ |:-------------:|:-----:|:------:|:---------------:|:--------:|
63
+ | 0.0855 | 3.68 | 100000 | 0.0801 | 0.9708 |
64
+ | 0.0733 | 7.37 | 200000 | 0.0702 | 0.9740 |
65
+
66
+
67
+ ### Framework versions
68
+
69
+ - Transformers 4.22.0.dev0
70
+ - Pytorch 1.12.0
71
+ - Datasets 2.4.1.dev0
72
+ - Tokenizers 0.11.6
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/deberta-base",
3
+ "architectures": [
4
+ "NewDebertaForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-07,
13
+ "max_position_embeddings": 512,
14
+ "max_relative_positions": -1,
15
+ "model_type": "deberta",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "pooler_dropout": 0,
20
+ "pooler_hidden_act": "gelu",
21
+ "pooler_hidden_size": 768,
22
+ "pos_att_type": [
23
+ "c2p",
24
+ "p2c"
25
+ ],
26
+ "position_biased_input": false,
27
+ "relative_attention": true,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.22.0.dev0",
30
+ "type_vocab_size": 0,
31
+ "vocab_size": 323
32
+ }
merges.txt ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version: 0.2 - Trained by `huggingface/tokenizers`
2
+ Ġ c
3
+ Ġ C
4
+ Ġ (
5
+ Ġ )
6
+ Ġ 1
7
+ Ġ O
8
+ Ġ 2
9
+ Ġ =
10
+ Ġ N
11
+ Ġ n
12
+ Ġ 3
13
+ Ġ ]
14
+ Ġ [
15
+ Ġ H
16
+ Ġ +
17
+ Ġ -
18
+ Ġ F
19
+ Ġ S
20
+ Ġ 4
21
+ Ġ l
22
+ Ġ s
23
+ Ġ B
24
+ Ġ o
25
+ Ġ r
26
+ Ġ 5
27
+ Ġ #
28
+ Ġ 6
29
+ Ġ i
30
+ Ġ P
31
+ Ġ I
32
+ Ġ 7
33
+ Ġ 8
34
+ Ġ %
35
+ Ġ 9
36
+ Ġ 0
37
+ Ġ e
38
+ Ġ A
39
+ Ġ p
40
+ Ġ G
41
+ Ġ T
42
+ Ġ b
43
+ Ġ g
44
+ Ġ W
45
+ Ġ a
46
+ Ġ t
47
+ Ġ R
48
+ Ġ u
49
+ Ġ V
50
+ Ġ M
51
+ Ġ Z
52
+ Ġ h
53
+ Ġ d
54
+ Ġ X
55
+ Ġ U
56
+ Ġ Y
57
+ Ġ f
58
+ Ġ K
59
+ Ġ L
60
+ Ġ E
61
+ Ġ m
62
+ Ġ D
63
+ Ġ y
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f94cc428f048d2567ec71b4f92fbcdb74f169f0b293aee8fcbc53fc0abaf8751
3
+ size 403426925
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": true,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[CLS]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": true,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SEP]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": true,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[UNK]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": true,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": true,
47
+ "rstrip": false,
48
+ "normalized": true,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": null,
53
+ "pre_tokenizer": {
54
+ "type": "ByteLevel",
55
+ "add_prefix_space": false,
56
+ "trim_offsets": true
57
+ },
58
+ "post_processor": {
59
+ "type": "TemplateProcessing",
60
+ "single": [
61
+ {
62
+ "SpecialToken": {
63
+ "id": "[CLS]",
64
+ "type_id": 0
65
+ }
66
+ },
67
+ {
68
+ "Sequence": {
69
+ "id": "A",
70
+ "type_id": 0
71
+ }
72
+ },
73
+ {
74
+ "SpecialToken": {
75
+ "id": "[SEP]",
76
+ "type_id": 0
77
+ }
78
+ }
79
+ ],
80
+ "pair": [
81
+ {
82
+ "SpecialToken": {
83
+ "id": "[CLS]",
84
+ "type_id": 0
85
+ }
86
+ },
87
+ {
88
+ "Sequence": {
89
+ "id": "A",
90
+ "type_id": 0
91
+ }
92
+ },
93
+ {
94
+ "SpecialToken": {
95
+ "id": "[SEP]",
96
+ "type_id": 0
97
+ }
98
+ },
99
+ {
100
+ "Sequence": {
101
+ "id": "B",
102
+ "type_id": 1
103
+ }
104
+ },
105
+ {
106
+ "SpecialToken": {
107
+ "id": "[SEP]",
108
+ "type_id": 1
109
+ }
110
+ }
111
+ ],
112
+ "special_tokens": {
113
+ "[CLS]": {
114
+ "id": "[CLS]",
115
+ "ids": [
116
+ 1
117
+ ],
118
+ "tokens": [
119
+ "[CLS]"
120
+ ]
121
+ },
122
+ "[SEP]": {
123
+ "id": "[SEP]",
124
+ "ids": [
125
+ 2
126
+ ],
127
+ "tokens": [
128
+ "[SEP]"
129
+ ]
130
+ }
131
+ }
132
+ },
133
+ "decoder": {
134
+ "type": "ByteLevel",
135
+ "add_prefix_space": true,
136
+ "trim_offsets": true
137
+ },
138
+ "model": {
139
+ "type": "BPE",
140
+ "dropout": null,
141
+ "unk_token": null,
142
+ "continuing_subword_prefix": "",
143
+ "end_of_word_suffix": "",
144
+ "fuse_unk": false,
145
+ "vocab": {
146
+ "[PAD]": 0,
147
+ "[CLS]": 1,
148
+ "[SEP]": 2,
149
+ "[UNK]": 3,
150
+ "[MASK]": 4,
151
+ "!": 5,
152
+ "\"": 6,
153
+ "#": 7,
154
+ "$": 8,
155
+ "%": 9,
156
+ "&": 10,
157
+ "'": 11,
158
+ "(": 12,
159
+ ")": 13,
160
+ "*": 14,
161
+ "+": 15,
162
+ ",": 16,
163
+ "-": 17,
164
+ ".": 18,
165
+ "/": 19,
166
+ "0": 20,
167
+ "1": 21,
168
+ "2": 22,
169
+ "3": 23,
170
+ "4": 24,
171
+ "5": 25,
172
+ "6": 26,
173
+ "7": 27,
174
+ "8": 28,
175
+ "9": 29,
176
+ ":": 30,
177
+ ";": 31,
178
+ "<": 32,
179
+ "=": 33,
180
+ ">": 34,
181
+ "?": 35,
182
+ "@": 36,
183
+ "A": 37,
184
+ "B": 38,
185
+ "C": 39,
186
+ "D": 40,
187
+ "E": 41,
188
+ "F": 42,
189
+ "G": 43,
190
+ "H": 44,
191
+ "I": 45,
192
+ "J": 46,
193
+ "K": 47,
194
+ "L": 48,
195
+ "M": 49,
196
+ "N": 50,
197
+ "O": 51,
198
+ "P": 52,
199
+ "Q": 53,
200
+ "R": 54,
201
+ "S": 55,
202
+ "T": 56,
203
+ "U": 57,
204
+ "V": 58,
205
+ "W": 59,
206
+ "X": 60,
207
+ "Y": 61,
208
+ "Z": 62,
209
+ "[": 63,
210
+ "\\": 64,
211
+ "]": 65,
212
+ "^": 66,
213
+ "_": 67,
214
+ "`": 68,
215
+ "a": 69,
216
+ "b": 70,
217
+ "c": 71,
218
+ "d": 72,
219
+ "e": 73,
220
+ "f": 74,
221
+ "g": 75,
222
+ "h": 76,
223
+ "i": 77,
224
+ "j": 78,
225
+ "k": 79,
226
+ "l": 80,
227
+ "m": 81,
228
+ "n": 82,
229
+ "o": 83,
230
+ "p": 84,
231
+ "q": 85,
232
+ "r": 86,
233
+ "s": 87,
234
+ "t": 88,
235
+ "u": 89,
236
+ "v": 90,
237
+ "w": 91,
238
+ "x": 92,
239
+ "y": 93,
240
+ "z": 94,
241
+ "{": 95,
242
+ "|": 96,
243
+ "}": 97,
244
+ "~": 98,
245
+ "¡": 99,
246
+ "¢": 100,
247
+ "£": 101,
248
+ "¤": 102,
249
+ "¥": 103,
250
+ "¦": 104,
251
+ "§": 105,
252
+ "¨": 106,
253
+ "©": 107,
254
+ "ª": 108,
255
+ "«": 109,
256
+ "¬": 110,
257
+ "®": 111,
258
+ "¯": 112,
259
+ "°": 113,
260
+ "±": 114,
261
+ "²": 115,
262
+ "³": 116,
263
+ "´": 117,
264
+ "µ": 118,
265
+ "¶": 119,
266
+ "·": 120,
267
+ "¸": 121,
268
+ "¹": 122,
269
+ "º": 123,
270
+ "»": 124,
271
+ "¼": 125,
272
+ "½": 126,
273
+ "¾": 127,
274
+ "¿": 128,
275
+ "À": 129,
276
+ "Á": 130,
277
+ "Â": 131,
278
+ "Ã": 132,
279
+ "Ä": 133,
280
+ "Å": 134,
281
+ "Æ": 135,
282
+ "Ç": 136,
283
+ "È": 137,
284
+ "É": 138,
285
+ "Ê": 139,
286
+ "Ë": 140,
287
+ "Ì": 141,
288
+ "Í": 142,
289
+ "Î": 143,
290
+ "Ï": 144,
291
+ "Ð": 145,
292
+ "Ñ": 146,
293
+ "Ò": 147,
294
+ "Ó": 148,
295
+ "Ô": 149,
296
+ "Õ": 150,
297
+ "Ö": 151,
298
+ "×": 152,
299
+ "Ø": 153,
300
+ "Ù": 154,
301
+ "Ú": 155,
302
+ "Û": 156,
303
+ "Ü": 157,
304
+ "Ý": 158,
305
+ "Þ": 159,
306
+ "ß": 160,
307
+ "à": 161,
308
+ "á": 162,
309
+ "â": 163,
310
+ "ã": 164,
311
+ "ä": 165,
312
+ "å": 166,
313
+ "æ": 167,
314
+ "ç": 168,
315
+ "è": 169,
316
+ "é": 170,
317
+ "ê": 171,
318
+ "ë": 172,
319
+ "ì": 173,
320
+ "í": 174,
321
+ "î": 175,
322
+ "ï": 176,
323
+ "ð": 177,
324
+ "ñ": 178,
325
+ "ò": 179,
326
+ "ó": 180,
327
+ "ô": 181,
328
+ "õ": 182,
329
+ "ö": 183,
330
+ "÷": 184,
331
+ "ø": 185,
332
+ "ù": 186,
333
+ "ú": 187,
334
+ "û": 188,
335
+ "ü": 189,
336
+ "ý": 190,
337
+ "þ": 191,
338
+ "ÿ": 192,
339
+ "Ā": 193,
340
+ "ā": 194,
341
+ "Ă": 195,
342
+ "ă": 196,
343
+ "Ą": 197,
344
+ "ą": 198,
345
+ "Ć": 199,
346
+ "ć": 200,
347
+ "Ĉ": 201,
348
+ "ĉ": 202,
349
+ "Ċ": 203,
350
+ "ċ": 204,
351
+ "Č": 205,
352
+ "č": 206,
353
+ "Ď": 207,
354
+ "ď": 208,
355
+ "Đ": 209,
356
+ "đ": 210,
357
+ "Ē": 211,
358
+ "ē": 212,
359
+ "Ĕ": 213,
360
+ "ĕ": 214,
361
+ "Ė": 215,
362
+ "ė": 216,
363
+ "Ę": 217,
364
+ "ę": 218,
365
+ "Ě": 219,
366
+ "ě": 220,
367
+ "Ĝ": 221,
368
+ "ĝ": 222,
369
+ "Ğ": 223,
370
+ "ğ": 224,
371
+ "Ġ": 225,
372
+ "ġ": 226,
373
+ "Ģ": 227,
374
+ "ģ": 228,
375
+ "Ĥ": 229,
376
+ "ĥ": 230,
377
+ "Ħ": 231,
378
+ "ħ": 232,
379
+ "Ĩ": 233,
380
+ "ĩ": 234,
381
+ "Ī": 235,
382
+ "ī": 236,
383
+ "Ĭ": 237,
384
+ "ĭ": 238,
385
+ "Į": 239,
386
+ "į": 240,
387
+ "İ": 241,
388
+ "ı": 242,
389
+ "IJ": 243,
390
+ "ij": 244,
391
+ "Ĵ": 245,
392
+ "ĵ": 246,
393
+ "Ķ": 247,
394
+ "ķ": 248,
395
+ "ĸ": 249,
396
+ "Ĺ": 250,
397
+ "ĺ": 251,
398
+ "Ļ": 252,
399
+ "ļ": 253,
400
+ "Ľ": 254,
401
+ "ľ": 255,
402
+ "Ŀ": 256,
403
+ "ŀ": 257,
404
+ "Ł": 258,
405
+ "ł": 259,
406
+ "Ń": 260,
407
+ "Ġc": 261,
408
+ "ĠC": 262,
409
+ "Ġ(": 263,
410
+ "Ġ)": 264,
411
+ "Ġ1": 265,
412
+ "ĠO": 266,
413
+ "Ġ2": 267,
414
+ "Ġ=": 268,
415
+ "ĠN": 269,
416
+ "Ġn": 270,
417
+ "Ġ3": 271,
418
+ "Ġ]": 272,
419
+ "Ġ[": 273,
420
+ "ĠH": 274,
421
+ "Ġ+": 275,
422
+ "Ġ-": 276,
423
+ "ĠF": 277,
424
+ "ĠS": 278,
425
+ "Ġ4": 279,
426
+ "Ġl": 280,
427
+ "Ġs": 281,
428
+ "ĠB": 282,
429
+ "Ġo": 283,
430
+ "Ġr": 284,
431
+ "Ġ5": 285,
432
+ "Ġ#": 286,
433
+ "Ġ6": 287,
434
+ "Ġi": 288,
435
+ "ĠP": 289,
436
+ "ĠI": 290,
437
+ "Ġ7": 291,
438
+ "Ġ8": 292,
439
+ "Ġ%": 293,
440
+ "Ġ9": 294,
441
+ "Ġ0": 295,
442
+ "Ġe": 296,
443
+ "ĠA": 297,
444
+ "Ġp": 298,
445
+ "ĠG": 299,
446
+ "ĠT": 300,
447
+ "Ġb": 301,
448
+ "Ġg": 302,
449
+ "ĠW": 303,
450
+ "Ġa": 304,
451
+ "Ġt": 305,
452
+ "ĠR": 306,
453
+ "Ġu": 307,
454
+ "ĠV": 308,
455
+ "ĠM": 309,
456
+ "ĠZ": 310,
457
+ "Ġh": 311,
458
+ "Ġd": 312,
459
+ "ĠX": 313,
460
+ "ĠU": 314,
461
+ "ĠY": 315,
462
+ "Ġf": 316,
463
+ "ĠK": 317,
464
+ "ĠL": 318,
465
+ "ĠE": 319,
466
+ "Ġm": 320,
467
+ "ĠD": 321,
468
+ "Ġy": 322
469
+ },
470
+ "merges": [
471
+ "Ġ c",
472
+ "Ġ C",
473
+ "Ġ (",
474
+ "Ġ )",
475
+ "Ġ 1",
476
+ "Ġ O",
477
+ "Ġ 2",
478
+ "Ġ =",
479
+ "Ġ N",
480
+ "Ġ n",
481
+ "Ġ 3",
482
+ "Ġ ]",
483
+ "Ġ [",
484
+ "Ġ H",
485
+ "Ġ +",
486
+ "Ġ -",
487
+ "Ġ F",
488
+ "Ġ S",
489
+ "Ġ 4",
490
+ "Ġ l",
491
+ "Ġ s",
492
+ "Ġ B",
493
+ "Ġ o",
494
+ "Ġ r",
495
+ "Ġ 5",
496
+ "Ġ #",
497
+ "Ġ 6",
498
+ "Ġ i",
499
+ "Ġ P",
500
+ "Ġ I",
501
+ "Ġ 7",
502
+ "Ġ 8",
503
+ "Ġ %",
504
+ "Ġ 9",
505
+ "Ġ 0",
506
+ "Ġ e",
507
+ "Ġ A",
508
+ "Ġ p",
509
+ "Ġ G",
510
+ "Ġ T",
511
+ "Ġ b",
512
+ "Ġ g",
513
+ "Ġ W",
514
+ "Ġ a",
515
+ "Ġ t",
516
+ "Ġ R",
517
+ "Ġ u",
518
+ "Ġ V",
519
+ "Ġ M",
520
+ "Ġ Z",
521
+ "Ġ h",
522
+ "Ġ d",
523
+ "Ġ X",
524
+ "Ġ U",
525
+ "Ġ Y",
526
+ "Ġ f",
527
+ "Ġ K",
528
+ "Ġ L",
529
+ "Ġ E",
530
+ "Ġ m",
531
+ "Ġ D",
532
+ "Ġ y"
533
+ ]
534
+ }
535
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "[CLS]",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "cls_token": {
13
+ "__type": "AddedToken",
14
+ "content": "[CLS]",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "do_lower_case": false,
21
+ "eos_token": {
22
+ "__type": "AddedToken",
23
+ "content": "[SEP]",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false
28
+ },
29
+ "errors": "replace",
30
+ "mask_token": {
31
+ "__type": "AddedToken",
32
+ "content": "[MASK]",
33
+ "lstrip": true,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ "model_max_length": 512,
39
+ "name_or_path": "./PubChem10m-deberta-base",
40
+ "pad_token": {
41
+ "__type": "AddedToken",
42
+ "content": "[PAD]",
43
+ "lstrip": false,
44
+ "normalized": true,
45
+ "rstrip": false,
46
+ "single_word": false
47
+ },
48
+ "sep_token": {
49
+ "__type": "AddedToken",
50
+ "content": "[SEP]",
51
+ "lstrip": false,
52
+ "normalized": true,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "special_tokens_map_file": null,
57
+ "tokenizer_class": "DebertaTokenizer",
58
+ "unk_token": {
59
+ "__type": "AddedToken",
60
+ "content": "[UNK]",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ "vocab_type": "gpt2"
67
+ }
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"[PAD]":0,"[CLS]":1,"[SEP]":2,"[UNK]":3,"[MASK]":4,"!":5,"\"":6,"#":7,"$":8,"%":9,"&":10,"'":11,"(":12,")":13,"*":14,"+":15,",":16,"-":17,".":18,"/":19,"0":20,"1":21,"2":22,"3":23,"4":24,"5":25,"6":26,"7":27,"8":28,"9":29,":":30,";":31,"<":32,"=":33,">":34,"?":35,"@":36,"A":37,"B":38,"C":39,"D":40,"E":41,"F":42,"G":43,"H":44,"I":45,"J":46,"K":47,"L":48,"M":49,"N":50,"O":51,"P":52,"Q":53,"R":54,"S":55,"T":56,"U":57,"V":58,"W":59,"X":60,"Y":61,"Z":62,"[":63,"\\":64,"]":65,"^":66,"_":67,"`":68,"a":69,"b":70,"c":71,"d":72,"e":73,"f":74,"g":75,"h":76,"i":77,"j":78,"k":79,"l":80,"m":81,"n":82,"o":83,"p":84,"q":85,"r":86,"s":87,"t":88,"u":89,"v":90,"w":91,"x":92,"y":93,"z":94,"{":95,"|":96,"}":97,"~":98,"¡":99,"¢":100,"£":101,"¤":102,"¥":103,"¦":104,"§":105,"¨":106,"©":107,"ª":108,"«":109,"¬":110,"®":111,"¯":112,"°":113,"±":114,"²":115,"³":116,"´":117,"µ":118,"¶":119,"·":120,"¸":121,"¹":122,"º":123,"»":124,"¼":125,"½":126,"¾":127,"¿":128,"À":129,"Á":130,"Â":131,"Ã":132,"Ä":133,"Å":134,"Æ":135,"Ç":136,"È":137,"É":138,"Ê":139,"Ë":140,"Ì":141,"Í":142,"Î":143,"Ï":144,"Ð":145,"Ñ":146,"Ò":147,"Ó":148,"Ô":149,"Õ":150,"Ö":151,"×":152,"Ø":153,"Ù":154,"Ú":155,"Û":156,"Ü":157,"Ý":158,"Þ":159,"ß":160,"à":161,"á":162,"â":163,"ã":164,"ä":165,"å":166,"æ":167,"ç":168,"è":169,"é":170,"ê":171,"ë":172,"ì":173,"í":174,"î":175,"ï":176,"ð":177,"ñ":178,"ò":179,"ó":180,"ô":181,"õ":182,"ö":183,"÷":184,"ø":185,"ù":186,"ú":187,"û":188,"ü":189,"ý":190,"þ":191,"ÿ":192,"Ā":193,"ā":194,"Ă":195,"ă":196,"Ą":197,"ą":198,"Ć":199,"ć":200,"Ĉ":201,"ĉ":202,"Ċ":203,"ċ":204,"Č":205,"č":206,"Ď":207,"ď":208,"Đ":209,"đ":210,"Ē":211,"ē":212,"Ĕ":213,"ĕ":214,"Ė":215,"ė":216,"Ę":217,"ę":218,"Ě":219,"ě":220,"Ĝ":221,"ĝ":222,"Ğ":223,"ğ":224,"Ġ":225,"ġ":226,"Ģ":227,"ģ":228,"Ĥ":229,"ĥ":230,"Ħ":231,"ħ":232,"Ĩ":233,"ĩ":234,"Ī":235,"ī":236,"Ĭ":237,"ĭ":238,"Į":239,"į":240,"İ":241,"ı":242,"IJ":243,"ij":244,"Ĵ":245,"ĵ":246,"Ķ":247,"ķ":248,"ĸ":249,"Ĺ":250,"ĺ":251,"Ļ":252,"ļ":253,"Ľ":254,"ľ":255,"Ŀ":256,"ŀ":257,"Ł":258,"ł":259,"Ń
":260,"Ġc":261,"ĠC":262,"Ġ(":263,"Ġ)":264,"Ġ1":265,"ĠO":266,"Ġ2":267,"Ġ=":268,"ĠN":269,"Ġn":270,"Ġ3":271,"Ġ]":272,"Ġ[":273,"ĠH":274,"Ġ+":275,"Ġ-":276,"ĠF":277,"ĠS":278,"Ġ4":279,"Ġl":280,"Ġs":281,"ĠB":282,"Ġo":283,"Ġr":284,"Ġ5":285,"Ġ#":286,"Ġ6":287,"Ġi":288,"ĠP":289,"ĠI":290,"Ġ7":291,"Ġ8":292,"Ġ%":293,"Ġ9":294,"Ġ0":295,"Ġe":296,"ĠA":297,"Ġp":298,"ĠG":299,"ĠT":300,"Ġb":301,"Ġg":302,"ĠW":303,"Ġa":304,"Ġt":305,"ĠR":306,"Ġu":307,"ĠV":308,"ĠM":309,"ĠZ":310,"Ġh":311,"Ġd":312,"ĠX":313,"ĠU":314,"ĠY":315,"Ġf":316,"ĠK":317,"ĠL":318,"ĠE":319,"Ġm":320,"ĠD":321,"Ġy":322}